def __call__(self, x, transformed_classifier):
    """
    Run defensive distillation: fit `transformed_classifier` on the soft labels that the already trained
    classifier predicts for `x`, and return it.

    :param x: Dataset for training the transformed classifier.
    :type x: `np.ndarray`
    :param transformed_classifier: Classifier to be fitted for increased robustness. The loss it uses in
                                   `fit` must support soft labels, i.e. probability labels.
    :type transformed_classifier: :class:`.Classifier`
    :return: The transformed classifier.
    :rtype: :class:`.Classifier`
    :raises ValueError: If any output row of either classifier is not a probability vector.
    """

    def _every_row_is_probability(outputs):
        # Count the rows accepted by `is_probability` and compare with the number of rows
        return np.sum([is_probability(row) for row in outputs]) == outputs.shape[0]

    # The trained classifier provides the soft labels, so it has to output probabilities
    soft_labels = self.classifier.predict(x=x, batch_size=self.batch_size)
    if not _every_row_is_probability(soft_labels):
        raise ValueError("The input trained classifier do not produce probability outputs.")

    # The classifier to be transformed has to output probabilities as well
    candidate_outputs = transformed_classifier.predict(x=x, batch_size=self.batch_size)
    if not _every_row_is_probability(candidate_outputs):
        raise ValueError("The input transformed classifier do not produce probability outputs.")

    # Fit the transformed classifier on the soft labels
    transformed_classifier.fit(x=x, y=soft_labels, batch_size=self.batch_size, nb_epochs=self.nb_epochs)

    return transformed_classifier
def test_is_probability(self):
    """Check `is_probability` on one accepted vector and on four vectors that must be rejected."""
    self.assertTrue(is_probability(np.array([0.1, 0.3, 0.6])))

    # Rejected vectors: entries summing to more than one, an entry above one, negative entries
    rejected_vectors = (
        [0.1, 0.3, 0.8],
        [0.1, 0.3, 1.8],
        [-1.1, 0.3, 1.8],
        [-1.1, 0.3, 0.7],
    )
    for entries in rejected_vectors:
        self.assertFalse(is_probability(np.array(entries)))
def __call__(self, preds):
    """
    Perform model postprocessing and return postprocessed output.

    Gaussian noise with zero mean and standard deviation `self.scale` is added to `preds`. If `preds` has
    more than one column and every row is a probability vector, negative entries are set to zero and each
    row is re-normalised by its own sum. If `preds` has a single column, negative entries are set to zero.
    Multi-column output that is not a probability (e.g. logits) is returned with the noise added only.

    :param preds: model output to be postprocessed, indexed as (sample, class).
    :type preds: `np.ndarray`
    :return: Postprocessed model output.
    :rtype: `np.ndarray`
    """
    # Generate random noise
    noise = np.random.normal(loc=0.0, scale=self.scale, size=preds.shape)

    # Add noise to a copy so that the caller's array is left untouched
    post_preds = preds.copy()
    post_preds += noise

    if preds.shape[1] > 1:
        # Check if model output is logits or probability
        are_probability = [is_probability(x) for x in preds]
        all_probability = np.sum(are_probability) == preds.shape[0]

        # Finally normalize probability output
        if all_probability:
            post_preds[post_preds < 0.0] = 0.0
            # keepdims=True keeps the sums as a column of shape (nb_samples, 1) so that every row is divided
            # by its own sum. A flat (nb_samples,) vector would be broadcast along the class axis instead.
            sums = np.sum(post_preds, axis=1, keepdims=True)
            # NOTE(review): a row whose entries were all clipped to zero still divides by zero here.
            post_preds /= sums
    else:
        post_preds[post_preds < 0.0] = 0.0

    return post_preds
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array.

    The inputs are processed in batches of `self.batch_size`. Each batch is perturbed iteratively, for at
    most `self.max_iter` steps, until the predicted class of every sample differs from its original
    prediction. The accumulated perturbation is then scaled by `1 + self.epsilon` (overshoot).

    :param x: An array with the original inputs to be attacked.
    :param y: An array with the original labels to be predicted. Only used for logging the success rate.
    :return: An array holding the adversarial examples.
    """
    x_adv = x.astype(ART_NUMPY_DTYPE)
    preds = self.estimator.predict(x, batch_size=self.batch_size)

    # Only the first prediction is inspected to decide whether the model outputs probabilities
    if is_probability(preds[0]):
        logger.warning(
            "It seems that the attacked model is predicting probabilities. DeepFool expects logits as model output "
            "to achieve its full attack strength.")

    # Determine the class labels for which to compute the gradients
    use_grads_subset = self.nb_grads < self.estimator.nb_classes
    if use_grads_subset:
        # TODO compute set of unique labels per batch
        # Union (over all samples) of the `nb_grads` highest scoring classes of each sample
        grad_labels = np.argsort(-preds, axis=1)[:, :self.nb_grads]
        labels_set = np.unique(grad_labels)
    else:
        labels_set = np.arange(self.estimator.nb_classes)
    # `labels_set` is sorted in both branches, so the identity permutation is a valid sorter
    sorter = np.arange(len(labels_set))

    # Pick a small scalar to avoid division by 0 (note that 10e-8 equals 1e-7)
    tol = 10e-8

    # Compute perturbation with implicit batching
    for batch_id in trange(int(
            np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="DeepFool", disable=not self.verbose):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (
            batch_id + 1) * self.batch_size
        # Work on a copy; `x_adv` still holds the unperturbed inputs of this batch until the overshoot step
        batch = x_adv[batch_index_1:batch_index_2].copy()

        # Get predictions and gradients for batch
        f_batch = preds[batch_index_1:batch_index_2]
        # Originally predicted class per sample; the attack stops for a sample once its prediction differs
        fk_hat = np.argmax(f_batch, axis=1)
        if use_grads_subset:
            # Compute gradients only for top predicted classes
            grd = np.array([
                self.estimator.class_gradient(batch, label=_) for _ in labels_set
            ])
            # NOTE(review): presumably rearranges to (sample, label, ...) like the all-classes branch - confirm
            grd = np.squeeze(np.swapaxes(grd, 0, 2), axis=0)
        else:
            # Compute gradients for all classes
            grd = self.estimator.class_gradient(batch)

        # Every sample of the batch starts as active (not yet misclassified)
        active_indices = np.arange(len(batch))
        current_step = 0
        while active_indices.size > 0 and current_step < self.max_iter:
            # Compute difference in predictions and gradients only for selected top predictions
            # Position of every sample's original class inside `labels_set`
            labels_indices = sorter[np.searchsorted(labels_set, fk_hat, sorter=sorter)]
            grad_diff = grd - grd[np.arange(len(grd)), labels_indices][:, None]
            f_diff = f_batch[:, labels_set] - f_batch[np.arange(len(f_batch)), labels_indices][:, None]

            # Choose coordinate and compute perturbation
            norm = np.linalg.norm(grad_diff.reshape(
                len(grad_diff), len(labels_set), -1), axis=2) + tol
            value = np.abs(f_diff) / norm
            # Exclude the original class itself from the search for the smallest ratio
            value[np.arange(len(value)), labels_indices] = np.inf
            l_var = np.argmin(value, axis=1)
            absolute1 = abs(f_diff[np.arange(len(f_diff)), l_var])
            draddiff = grad_diff[np.arange(len(grad_diff)), l_var].reshape(len(grad_diff), -1)
            pow1 = (pow(
                np.linalg.norm(draddiff, axis=1),
                2,
            ) + tol)
            r_var = absolute1 / pow1
            # Append singleton axes so that the per-sample factor broadcasts against the gradient
            r_var = r_var.reshape((-1, ) + (1, ) * (len(x.shape) - 1))
            r_var = r_var * grad_diff[np.arange(len(grad_diff)), l_var]

            # Add perturbation and clip result
            if self.estimator.clip_values is not None:
                # The step is scaled by the width of the clip range before it is added
                batch[active_indices] = np.clip(
                    batch[active_indices] + r_var[active_indices] *
                    (self.estimator.clip_values[1] - self.estimator.clip_values[0]),
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                )
            else:
                batch[active_indices] += r_var[active_indices]

            # Recompute prediction for new x
            f_batch = self.estimator.predict(batch)
            fk_i_hat = np.argmax(f_batch, axis=1)

            # Recompute gradients for new x
            if use_grads_subset:
                # Compute gradients only for (originally) top predicted classes
                grd = np.array([
                    self.estimator.class_gradient(batch, label=_) for _ in labels_set
                ])
                grd = np.squeeze(np.swapaxes(grd, 0, 2), axis=0)
            else:
                # Compute gradients for all classes
                grd = self.estimator.class_gradient(batch)

            # Stop if misclassification has been achieved
            # Only samples still predicted as their original class remain active
            active_indices = np.where(fk_i_hat == fk_hat)[0]
            current_step += 1

        # Apply overshoot parameter
        # Result is original + (1 + epsilon) * (perturbed - original)
        x_adv1 = x_adv[batch_index_1:batch_index_2]
        x_adv2 = (1 + self.epsilon) * (batch - x_adv[batch_index_1:batch_index_2])
        x_adv[batch_index_1:batch_index_2] = x_adv1 + x_adv2
        if self.estimator.clip_values is not None:
            # Clip in place, writing into the slice of `x_adv`
            np.clip(
                x_adv[batch_index_1:batch_index_2],
                self.estimator.clip_values[0],
                self.estimator.clip_values[1],
                out=x_adv[batch_index_1:batch_index_2],
            )

    logger.info(
        "Success rate of DeepFool attack: %.2f%%",
        100 * compute_success(
            self.estimator, x, y, x_adv, batch_size=self.batch_size),
    )
    return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Train the adversarial patch on `x` and `y` and return the patch together with its mask.

    :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
    :param y: An array with the original true labels.
    :param mask: A boolean array of shape equal to the shape of a single samples (1, H, W) or the shape of `x`
                 (N, H, W) without their channel dimensions. Any features for which the mask is True can be the
                 center location of the patch during sampling.
    :type mask: `np.ndarray`
    :param reset_patch: If `True` reset patch to initial values of mean of minimal and maximal clip value, else if
                        `False` (default) restart from previous patch values created by previous call to `generate`
                        or mean of minimal and maximal clip value if first call to `generate`.
    :type reset_patch: bool
    :param shuffle: Shuffle the dataset before batching (keyword argument, default `True`).
    :type shuffle: bool
    :return: An array with adversarial patch and an array of the patch mask.
    """
    import tensorflow as tf  # lgtm [py/repeated-import]

    shuffle = kwargs.get("shuffle", True)

    mask = kwargs.get("mask")
    if mask is not None:
        mask = mask.copy()
    mask = self._check_mask(mask=mask, x=x)
    with_mask = mask is not None

    if kwargs.get("reset_patch"):
        self.reset_patch(initial_patch_value=self._initial_value)

    y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

    # check if logits or probabilities, using the prediction for the first sample only
    first_prediction = self.estimator.predict(x=x[[0]])
    self.use_logits = not is_probability(first_prediction)

    # One pipeline for all cases: slice, optionally shuffle, batch, repeat
    tensors = (x, y, mask) if with_mask else (x, y)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.batch(self.batch_size).repeat(math.ceil(x.shape[0] / self.batch_size))

    for _ in trange(self.max_iter, desc="Adversarial Patch TensorFlow v2", disable=not self.verbose):
        for element in dataset:
            if with_mask:
                images, target, mask_i = element
            else:
                images, target = element
                mask_i = None
            self._train_step(images=images, target=target, mask=mask_i)

    patch = self._patch.numpy()
    patch_mask = self._get_circular_patch_mask(nb_samples=1).numpy()[0]
    return patch, patch_mask
def PDTP(  # pylint: disable=C0103
    target_estimator: "Classifier",
    extra_estimator: "Classifier",
    x: np.ndarray,
    y: np.ndarray,
    indexes: Optional[np.ndarray] = None,
    num_iter: Optional[int] = 10,
) -> np.ndarray:
    """
    Compute the pointwise differential training privacy metric for the given classifier and training set.

    For every selected sample, `extra_estimator` is reset and re-fitted on the training set without that
    sample. The predictions of both models on `x` are mapped to the centers of 100 equally wide probability
    bins, and the largest ratio between the binned predictions (in either direction) is the leakage of that
    sample for one iteration.

    | Paper link: https://arxiv.org/abs/1712.09136

    :param target_estimator: The classifier to be analyzed.
    :param extra_estimator: Another classifier of the same type as the target classifier, but not yet fit.
    :param x: The training data of the classifier.
    :param y: Target values (class labels) of `x`, one-hot-encoded of shape (nb_samples, nb_classes) or indices of
              shape (nb_samples,).
    :param indexes: the subset of indexes of `x` to compute the PDTP metric on. If not supplied (`None`), PDTP
                    will be computed for all samples in `x`.
    :param num_iter: the number of iterations of PDTP computation to run for each sample. If not supplied
                     (`None`), defaults to 10. The result is the average across iterations.
    :return: an array containing the average PDTP value for each selected sample, in the order of `indexes`.
             The higher the value, the higher the privacy leakage for that sample.
    :raises ValueError: If an estimator type is unsupported, if the shapes of `x`, `y` and the classifier input
                        do not match, if the output cannot be turned into probabilities, or if
                        `extra_estimator` does not implement `reset`.
    """
    from art.estimators.classification.pytorch import PyTorchClassifier
    from art.estimators.classification.tensorflow import TensorFlowV2Classifier
    from art.estimators.classification.scikitlearn import ScikitlearnClassifier

    supported_classifiers = (PyTorchClassifier, TensorFlowV2Classifier, ScikitlearnClassifier)

    if not isinstance(target_estimator, supported_classifiers) or not isinstance(
        extra_estimator, supported_classifiers
    ):
        raise ValueError("PDTP metric only supports classifiers of type PyTorch, TensorFlowV2 and ScikitLearn.")
    if target_estimator.input_shape[0] != x.shape[1]:
        raise ValueError("Shape of x does not match input_shape of classifier")
    y = check_and_transform_label_format(y, target_estimator.nb_classes)
    if y.shape[0] != x.shape[0]:
        raise ValueError("Number of rows in x and y do not match")

    if num_iter is None:
        num_iter = 10

    # Compare against None explicitly: the truth value of an array with several elements is ambiguous, and
    # a one-element array holding index 0 would otherwise be mistaken for "not supplied".
    if indexes is None:
        indexes = range(x.shape[0])

    # 101 bin edges 0.00, 0.01, ..., 1.00 delimiting 100 bins
    bins = np.array(np.arange(0.0, 1.01, 0.01).round(decimals=2))
    last_bin_index = len(bins) - 1

    def _bin_centers(probabilities: np.ndarray) -> np.ndarray:
        """Map each probability to the center of its bin."""
        bin_indexes = np.digitize(probabilities, bins)
        # A probability of exactly 1.0 falls past the last edge; clamp it into the last bin so that the
        # lookup below cannot run out of bounds.
        bin_indexes = np.minimum(bin_indexes, last_bin_index)
        # `bins[i]` is the upper edge of bin i, half a bin width below is its center
        return bins[bin_indexes] - 0.005

    results = []

    for _ in range(num_iter):
        iter_results = []

        # get probabilities from original model
        pred = target_estimator.predict(x)
        if not is_probability(pred):
            try:
                pred = scipy.special.softmax(pred, axis=1)
            except Exception as exc:
                raise ValueError("PDTP metric only supports classifiers that output logits or probabilities.") from exc
        pred_bin = _bin_centers(pred)

        for row in indexes:
            # create new model without sample in training data
            alt_x = np.delete(x, row, 0)
            alt_y = np.delete(y, row, 0)
            try:
                extra_estimator.reset()
            except NotImplementedError as exc:
                raise ValueError(
                    "PDTP metric can only be applied to classifiers that implement the reset method."
                ) from exc
            extra_estimator.fit(alt_x, alt_y)

            # get probabilities from new model
            alt_pred = extra_estimator.predict(x)
            if not is_probability(alt_pred):
                alt_pred = scipy.special.softmax(alt_pred, axis=1)
            alt_pred_bin = _bin_centers(alt_pred)

            # largest ratio in either direction over all samples and classes
            ratio_1 = pred_bin / alt_pred_bin
            ratio_2 = alt_pred_bin / pred_bin
            iter_results.append(max(ratio_1.max(), ratio_2.max()))

        results.append(iter_results)

    # `results` holds one list per iteration; transpose it to one tuple per sample and average each
    avg_per_sample = np.array([sum(values) / len(values) for values in zip(*results)])

    # return leakage per sample
    return avg_per_sample
def __init__(
    self,
    estimator: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
    norm: Union[int, float, str] = np.inf,
    eps: float = 0.3,
    eps_step: float = 0.1,
    max_iter: int = 100,
    targeted: bool = False,
    nb_random_init: int = 5,
    batch_size: int = 32,
    loss_type: Optional[str] = None,
    verbose: bool = True,
):
    """
    Create a :class:`.AutoProjectedGradientDescent` instance.

    If `loss_type` is `None` the provided estimator is attacked as it is. Otherwise a new estimator of the same
    framework (TensorFlow v1, TensorFlow v2 or PyTorch) is created around the same model with the requested
    loss, and that estimator is attacked instead.

    :param estimator: A trained estimator.
    :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2.
    :param eps: Maximum perturbation that the attacker can introduce.
    :param eps_step: Attack step size (input variation) at each iteration.
    :param max_iter: The maximum number of iterations.
    :param targeted: Indicates whether the attack is targeted (True) or untargeted (False).
    :param nb_random_init: Number of random initialisations within the epsilon ball. For nb_random_init=0
        starting at the original input.
    :param batch_size: Size of the batch on which adversarial samples are generated.
    :param loss_type: Defines the loss to attack. Available options: None (Use loss defined by estimator),
        "cross_entropy", or "difference_logits_ratio"
    :param verbose: Show progress bars.
    :raises ValueError: If `loss_type` is not in `self._predefined_losses`, if the estimator predicts
        probabilities where logits are required, if `loss_type="difference_logits_ratio"` is requested for a
        `TensorFlowClassifier`, or if the estimator type is not supported for the requested loss.
    :raises NotImplementedError: If `loss_type="cross_entropy"` is requested for a `TensorFlowClassifier` that
        predicts probabilities.
    """
    from art.estimators.classification import TensorFlowClassifier, TensorFlowV2Classifier, PyTorchClassifier

    if loss_type not in self._predefined_losses:
        raise ValueError(
            "The argument loss_type has an invalid value. The following options for `loss_type` are currently "
            "supported: {}".format(self._predefined_losses)
        )

    if loss_type is None:
        # Keep the estimator's own loss. A single all-ones input is used as a probe to find out whether the
        # estimator predicts probabilities.
        if hasattr(estimator, "predict") and is_probability(
            estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
        ):
            raise ValueError(
                "AutoProjectedGradientDescent is expecting logits as estimator output, the provided "
                "estimator seems to predict probabilities."
            )

        estimator_apgd = estimator
    else:
        # Build a new estimator of the same framework around the same model with the requested loss
        if isinstance(estimator, TensorFlowClassifier):
            import tensorflow as tf

            if loss_type == "cross_entropy":
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    raise NotImplementedError("Cross-entropy loss is not implemented for probability output.")

                # Loss tensor built from the estimator's output and label placeholder
                self._loss_object = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(
                        y_pred=estimator._output, y_true=estimator._labels_ph, from_logits=True
                    )
                )

            elif loss_type == "difference_logits_ratio":
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. "
                        "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                    )

                # This loss is always rejected for TensorFlow v1 estimators; the draft implementation is
                # kept below as commented-out code for reference.
                raise ValueError(
                    "The loss `difference_logits_ratio` has not been validate completely. It seems that the "
                    "commented implemented below is failing to selected the second largest logit for cases "
                    "where the largest logit is the true logit. For future work `difference_logits_ratio` and "
                    "loss_fn should return the same loss value."
                )

                # def difference_logits_ratio(y_true, y_pred):
                #     i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                #     i_y_pred_arg = tf.argsort(y_pred, axis=1)
                #     # Not completely sure if the following line is correct.
                #     # `i_y_pred_arg[:, -2], i_y_pred_arg[:, -1]` seems closer to the output of `loss_fn` than
                #     # `i_y_pred_arg[:, -1], i_y_pred_arg[:, -2]`
                #     i_z_i = tf.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -2],
                #                      i_y_pred_arg[:, -1])
                #
                #     z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                #     z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                #     z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                #     z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)
                #
                #     z_1 = tf.linalg.diag_part(z_1)
                #     z_3 = tf.linalg.diag_part(z_3)
                #     z_i = tf.linalg.diag_part(z_i)
                #     z_y = tf.linalg.diag_part(z_y)
                #
                #     dlr = -(z_y - z_i) / (z_1 - z_3)
                #
                #     return tf.reduce_mean(dlr)
                #
                # def loss_fn(y_true, y_pred):
                #     i_y_true = np.argmax(y_true, axis=1)
                #     i_y_pred_arg = np.argsort(y_pred, axis=1)
                #     i_z_i = np.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -1],
                #                      i_y_pred_arg[:, -2])
                #
                #     z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                #     z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                #     z_i = y_pred[:, i_z_i]
                #     z_y = y_pred[:, i_y_true]
                #
                #     z_1 = np.diag(z_1)
                #     z_3 = np.diag(z_3)
                #     z_i = np.diag(z_i)
                #     z_y = np.diag(z_y)
                #
                #     dlr = -(z_y - z_i) / (z_1 - z_3)
                #
                #     return np.mean(dlr)
                #
                # self._loss_fn = loss_fn
                # self._loss_object = difference_logits_ratio(y_true=estimator._labels_ph,
                #                                             y_pred=estimator._output)

            # Re-use the graph pieces and settings of the original estimator; only the loss is replaced
            estimator_apgd = TensorFlowClassifier(
                input_ph=estimator._input_ph,
                output=estimator._output,
                labels_ph=estimator._labels_ph,
                train=estimator._train,
                loss=self._loss_object,
                learning=estimator._learning,
                sess=estimator._sess,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
                feed_dict=estimator._feed_dict,
            )

        elif isinstance(estimator, TensorFlowV2Classifier):
            import tensorflow as tf

            if loss_type == "cross_entropy":
                # Unlike the other frameworks, probability output is accepted here via `from_logits=False`
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
                else:
                    self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
            elif loss_type == "difference_logits_ratio":
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. "
                        "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                    )

                class difference_logits_ratio:
                    # Callable loss computing the mean of -(z_y - z_i) / (z_1 - z_3) over the batch

                    def __init__(self):
                        self.reduction = "mean"

                    def __call__(self, y_true, y_pred):
                        # Index of the true class per sample
                        i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                        # Class indices ordered by ascending logit; column -1 is the largest logit
                        i_y_pred_arg = tf.argsort(y_pred, axis=1)
                        # Per sample: the class with the largest logit, unless that is the true class, in
                        # which case the class with the second largest logit
                        i_z_i_list = list()

                        for i in range(y_true.shape[0]):
                            if i_y_pred_arg[i, -1] != i_y_true[i]:
                                i_z_i_list.append(i_y_pred_arg[i, -1])
                            else:
                                i_z_i_list.append(i_y_pred_arg[i, -2])

                        i_z_i = tf.stack(i_z_i_list)

                        # Gathering along axis 1 yields one column per sample; the diagonal taken below
                        # picks, for each sample, the logit at that sample's own index
                        z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                        z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                        z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                        z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                        z_1 = tf.linalg.diag_part(z_1)
                        z_3 = tf.linalg.diag_part(z_3)
                        z_i = tf.linalg.diag_part(z_i)
                        z_y = tf.linalg.diag_part(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return tf.reduce_mean(dlr)

                # Two separate instances are created for the two attributes
                self._loss_fn = difference_logits_ratio()
                self._loss_object = difference_logits_ratio()

            # Same model and settings as the original estimator; only the loss object is replaced
            estimator_apgd = TensorFlowV2Classifier(
                model=estimator.model,
                nb_classes=estimator.nb_classes,
                input_shape=estimator.input_shape,
                loss_object=self._loss_object,
                train_step=estimator._train_step,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
            )
        elif isinstance(estimator, PyTorchClassifier):
            import torch

            if loss_type == "cross_entropy":
                if is_probability(
                    estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
                ):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. If loss_type='cross_entropy' "
                        "the estimator has to to predict logits."
                    )

                self._loss_object = torch.nn.CrossEntropyLoss(reduction="mean")
            elif loss_type == "difference_logits_ratio":
                if is_probability(
                    estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=ART_NUMPY_DTYPE))
                ):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. "
                        "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                    )

                class difference_logits_ratio:
                    # Callable loss computing the mean of -(z_y - z_i) / (z_1 - z_3) over the batch

                    def __init__(self):
                        self.reduction = "mean"

                    # Note the argument order (y_pred, y_true), opposite to the TensorFlow v2 variant above
                    def __call__(self, y_pred, y_true):  # type: ignore
                        # Accept NumPy arrays as well as tensors
                        if isinstance(y_true, np.ndarray):
                            y_true = torch.from_numpy(y_true)
                        if isinstance(y_pred, np.ndarray):
                            y_pred = torch.from_numpy(y_pred)

                        y_true = y_true.float()

                        # Index of the true class per sample
                        i_y_true = torch.argmax(y_true, axis=1)
                        # Class indices ordered by ascending logit; column -1 is the largest logit
                        i_y_pred_arg = torch.argsort(y_pred, axis=1)
                        # Per sample: the class with the largest logit, unless that is the true class, in
                        # which case the class with the second largest logit
                        i_z_i_list = list()

                        for i in range(y_true.shape[0]):
                            if i_y_pred_arg[i, -1] != i_y_true[i]:
                                i_z_i_list.append(i_y_pred_arg[i, -1])
                            else:
                                i_z_i_list.append(i_y_pred_arg[i, -2])

                        i_z_i = torch.stack(i_z_i_list)

                        # Column indexing yields one column per sample; the diagonal taken below picks,
                        # for each sample, the logit at that sample's own index
                        z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                        z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                        z_i = y_pred[:, i_z_i]
                        z_y = y_pred[:, i_y_true]

                        z_1 = torch.diagonal(z_1)
                        z_3 = torch.diagonal(z_3)
                        z_i = torch.diagonal(z_i)
                        z_y = torch.diagonal(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return torch.mean(dlr.float())

                self._loss_object = difference_logits_ratio()

            # Same model and settings as the original estimator; the loss is replaced and no optimizer is set
            estimator_apgd = PyTorchClassifier(
                model=estimator.model,
                loss=self._loss_object,
                input_shape=estimator.input_shape,
                nb_classes=estimator.nb_classes,
                optimizer=None,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
                device_type=estimator._device,
            )

        else:
            raise ValueError("The loss type {} is not supported for the provided estimator.".format(loss_type))

    super().__init__(estimator=estimator_apgd)
    self.norm = norm
    self.eps = eps
    self.eps_step = eps_step
    self.max_iter = max_iter
    self.targeted = targeted
    self.nb_random_init = nb_random_init
    self.batch_size = batch_size
    self.loss_type = loss_type
    self.verbose = verbose
    self._check_params()
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate an adversarial patch and return the patch and its mask in arrays.

    If `y` is `None` the labels are set to the estimator's predictions for `x` and `self.targeted` is set to
    `False`; otherwise `self.targeted` is set to `True`.

    :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
    :param y: An array with the original true labels.
    :param mask: A boolean array of shape equal to the shape of a single samples (1, H, W) or the shape of `x`
                 (N, H, W) without their channel dimensions. Any features for which the mask is True can be the
                 center location of the patch during sampling.
    :type mask: `np.ndarray`
    :param reset_patch: If `True` reset patch to initial values of mean of minimal and maximal clip value, else if
                        `False` (default) restart from previous patch values created by previous call to `generate`
                        or mean of minimal and maximal clip value if first call to `generate`.
    :type reset_patch: bool
    :param shuffle: Shuffle the dataset before batching (keyword argument, default `True`).
    :type shuffle: bool
    :return: An array with adversarial patch and an array of the patch mask.
    """
    import tensorflow as tf  # lgtm [py/repeated-import]

    shuffle = kwargs.get("shuffle", True)
    mask = kwargs.get("mask")
    if mask is not None:
        mask = mask.copy()
    mask = self._check_mask(mask=mask, x=x)

    if y is None:  # pragma: no cover
        logger.info("Setting labels to estimator predictions and running untargeted attack because `y=None`.")
        y = to_categorical(np.argmax(self.estimator.predict(x=x), axis=1), nb_classes=self.estimator.nb_classes)
        self.targeted = False
    else:
        self.targeted = True

    if kwargs.get("reset_patch"):
        self.reset_patch(initial_patch_value=self._initial_value)

    y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

    # check if logits or probabilities, using the prediction for the first sample only
    y_pred = self.estimator.predict(x=x[[0]])
    self.use_logits = not is_probability(y_pred)

    # Build the dataset once: the mask is an optional third component and shuffling an optional step
    tensors = (x, y) if mask is None else (x, y, mask)
    dataset = tf.data.Dataset.from_tensor_slices(tensors)
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.batch(self.batch_size)

    for i_iter in trange(self.max_iter, desc="Adversarial Patch TensorFlow v2", disable=not self.verbose):
        for element in dataset:
            if mask is None:
                images, target = element
                mask_i = None
            else:
                images, target, mask_i = element
            self._train_step(images=images, target=target, mask=mask_i)

        if self.summary_writer is not None:  # pragma: no cover
            # The patch is transposed so that its last axis comes first before it is logged as an image
            self.summary_writer.add_image(
                "patch",
                self._patch.numpy().transpose((2, 0, 1)),
                global_step=i_iter,
            )

            if hasattr(self.estimator, "compute_losses"):
                x_patched = self._random_overlay(images=x, patch=self._patch, mask=mask)
                losses = self.estimator.compute_losses(x=x_patched, y=y)

                for key, value in losses.items():
                    self.summary_writer.add_scalar(
                        "loss/{}".format(key),
                        np.mean(value),
                        global_step=i_iter,
                    )

    return (
        self._patch.numpy(),
        self._get_circular_patch_mask(nb_samples=1).numpy()[0],
    )
def generate(self, inputs: np.ndarray, labels: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Create adversarial examples for `inputs` and return them in a numpy array.

    :param inputs: Array with original inputs (to be attacked)
    :param labels: Array with original labels (to be predicted)
    :return: Array containing adversarial examples
    """
    adv_inputs = inputs.astype(ART_NUMPY_DTYPE)
    predictions = self.estimator.predict(inputs, batch_size=self.batch_size)

    if is_probability(predictions[0]):
        logger.warning("Targeted model should output logits, not probabilities for predictions.")

    # Class labels for which gradients are computed
    use_grad_subset, labels_set = self.define_class_labels(predictions)
    sorter = np.arange(len(labels_set))

    # Perturb the inputs one batch at a time
    nb_batches = int(np.ceil(adv_inputs.shape[0] / float(self.batch_size)))
    for batch_nb in trange(nb_batches, desc="DeepFool", disable=not self.show_prog):
        batch_idx_1 = batch_nb * self.batch_size
        batch_idx_2 = (batch_nb + 1) * self.batch_size
        batch = adv_inputs[batch_idx_1:batch_idx_2].copy()

        # Predictions and gradients of the unperturbed batch
        f_batch, fk_hat = self.batch_predict(predictions, batch_idx_1, batch_idx_2)
        grads = self.batch_gradient(batch, use_grad_subset, labels_set)

        # All samples are active until their predicted class changes
        active_idxs = np.arange(len(batch))
        step = 0

        while active_idxs.size > 0 and step < self.total_iter:
            # Differences of gradients and predictions relative to the originally predicted class
            labels_idxs = sorter[np.searchsorted(labels_set, fk_hat, sorter=sorter)]
            reference_grads = grads[np.arange(len(grads)), labels_idxs]
            grad_dif = grads - reference_grads[:, None]
            reference_preds = f_batch[np.arange(len(f_batch)), labels_idxs]
            f_dif = f_batch[:, labels_set] - reference_preds[:, None]

            # Perturbation for the selected coordinate
            r_var = self.perturbation(adv_inputs, labels_set, labels_idxs, grad_dif, f_dif)

            # Apply the perturbation to the active samples, clipping if clip values are defined
            if self.estimator.clip_values is None:
                batch[active_idxs] += r_var[active_idxs]
            else:
                clip_range = self.estimator.clip_values[1] - self.estimator.clip_values[0]
                batch[active_idxs] = np.clip(
                    batch[active_idxs] + r_var[active_idxs] * clip_range,
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                )

            # Predictions and gradients of the perturbed batch
            f_batch = self.estimator.predict(batch)
            fk_i_hat = np.argmax(f_batch, axis=1)
            grads = self.batch_gradient(batch, use_grad_subset, labels_set)

            # Samples that kept their original predicted class remain active
            active_idxs = np.where(fk_i_hat == fk_hat)[0]
            step += 1

        # Apply overshoot parameters
        adv_inputs[batch_idx_1:batch_idx_2] = self.overshoot(adv_inputs, batch_idx_1, batch_idx_2, batch, batch_nb)
        if self.estimator.clip_values is not None:
            np.clip(
                adv_inputs[batch_idx_1:batch_idx_2],
                self.estimator.clip_values[0],
                self.estimator.clip_values[1],
                out=adv_inputs[batch_idx_1:batch_idx_2],
            )

    success_rate = 100 * compute_success(self.estimator, inputs, labels, adv_inputs, batch_size=self.batch_size)
    logger.info("DeepFool attack success rate: %.2f%%", success_rate)
    return adv_inputs
def generate(  # type: ignore
    self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate an adversarial patch and return the patch and its mask in arrays.

    If `y` is a NumPy array, `x`, `y` and the optional mask are wrapped in a tensor dataset. Otherwise `y` is
    indexed per sample and every entry is read through its "boxes", "labels" and "scores" keys (object
    detection targets).

    :param x: An array with the original input images of shape NCHW or input videos of shape NFCHW.
    :param y: An array with the original true labels.
    :param mask: An boolean array of shape equal to the shape of a single samples (1, H, W) or the shape of `x`
                 (N, H, W) without their channel dimensions. Any features for which the mask is True can be the
                 center location of the patch during sampling.
    :type mask: `np.ndarray`
    :param shuffle: Shuffle the data in every pass over the dataset (keyword argument, default `True`).
    :type shuffle: bool
    :return: An array with adversarial patch and an array of the patch mask.
    :raises ValueError: If a mask is provided while `self.patch_location` is not `None`.
    """
    import torch  # lgtm [py/repeated-import]

    shuffle = kwargs.get("shuffle", True)
    mask = kwargs.get("mask")
    if mask is not None:
        mask = mask.copy()
    mask = self._check_mask(mask=mask, x=x)

    if self.patch_location is not None and mask is not None:
        raise ValueError("Masks can only be used if the `patch_location` is `None`.")

    if y is None:  # pragma: no cover
        logger.info("Setting labels to estimator predictions and running untargeted attack because `y=None`.")
        y = to_categorical(np.argmax(self.estimator.predict(x=x), axis=1), nb_classes=self.estimator.nb_classes)

    if hasattr(self.estimator, "nb_classes"):
        y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

        # check if logits or probabilities, using the prediction for the first sample only
        y_pred = self.estimator.predict(x=x[[0]])
        self.use_logits = not is_probability(y_pred)

    if isinstance(y, np.ndarray):
        tensors = [torch.Tensor(x), torch.Tensor(y)]
        if mask is not None:
            tensors.append(torch.Tensor(mask))
        dataset = torch.utils.data.TensorDataset(*tensors)
    else:

        class ObjectDetectionDataset(torch.utils.data.Dataset):
            """
            Object detection dataset in PyTorch.
            """

            def __init__(self, x, y):
                self.x = x
                self.y = y

            def __len__(self):
                return self.x.shape[0]

            def __getitem__(self, idx):
                img = torch.from_numpy(self.x[idx])

                target = {}
                target["boxes"] = torch.from_numpy(self.y[idx]["boxes"])
                target["labels"] = torch.from_numpy(self.y[idx]["labels"])
                target["scores"] = torch.from_numpy(self.y[idx]["scores"])

                return img, target

        class ObjectDetectionDatasetMask(torch.utils.data.Dataset):
            """
            Object detection dataset in PyTorch, additionally returning the mask of every sample.
            """

            def __init__(self, x, y, mask):
                self.x = x
                self.y = y
                self.mask = mask

            def __len__(self):
                return self.x.shape[0]

            def __getitem__(self, idx):
                img = torch.from_numpy(self.x[idx])

                # Read the labels stored on the dataset instead of the enclosing function's `y`,
                # consistent with `ObjectDetectionDataset`
                target = {}
                target["boxes"] = torch.from_numpy(self.y[idx]["boxes"])
                target["labels"] = torch.from_numpy(self.y[idx]["labels"])
                target["scores"] = torch.from_numpy(self.y[idx]["scores"])
                mask_i = torch.from_numpy(self.mask[idx])

                return img, target, mask_i

        if mask is None:
            dataset = ObjectDetectionDataset(x, y)
        else:
            dataset = ObjectDetectionDatasetMask(x, y, mask)

    # The loader settings are the same for every kind of dataset
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=self.batch_size,
        shuffle=shuffle,
        drop_last=False,
    )

    def _target_to_device(target):
        """Move a label tensor, or the tensors of an object detection target, to the estimator's device."""
        if isinstance(target, torch.Tensor):
            return target.to(self.estimator.device)
        for key in ("boxes", "labels", "scores"):
            target[key] = target[key].to(self.estimator.device)
        return target

    for i_iter in trange(self.max_iter, desc="Adversarial Patch PyTorch", disable=not self.verbose):
        for element in data_loader:
            if mask is None:
                images, target = element
                mask_i = None
            else:
                images, target, mask_i = element

            images = images.to(self.estimator.device)
            target = _target_to_device(target)
            if mask_i is not None:
                mask_i = mask_i.to(self.estimator.device)

            self._train_step(images=images, target=target, mask=mask_i)

        # Write summary
        if self.summary_writer is not None:  # pragma: no cover
            x_patched = (
                self._random_overlay(
                    images=torch.from_numpy(x).to(self.estimator.device), patch=self._patch, mask=mask
                )
                .detach()
                .cpu()
                .numpy()
            )

            self.summary_writer.update(
                batch_id=0,
                global_step=i_iter,
                grad=None,
                patch=self._patch,
                estimator=self.estimator,
                x=x_patched,
                y=y,
                targeted=self.targeted,
            )

    if self.summary_writer is not None:
        self.summary_writer.reset()

    return (
        self._patch.detach().cpu().numpy(),
        self._get_circular_patch_mask(nb_samples=1).cpu().numpy()[0],
    )
def __init__(
    self,
    estimator: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
    norm: Union[int, float, str] = np.inf,
    eps: float = 0.3,
    eps_step: float = 0.1,
    max_iter: int = 100,
    targeted: bool = False,
    nb_random_init: int = 5,
    batch_size: int = 32,
    loss_type: Optional[str] = None,
):
    """
    Create a :class:`.AutoProjectedGradientDescent` instance.

    Depending on the framework of `estimator` (TensorFlow v1, TensorFlow v2 or PyTorch) a copy of the estimator is
    built around the loss selected by `loss_type`; that copy is what is handed to the parent constructor. For any
    other estimator type `None` is handed to the parent constructor.

    :param estimator: A trained estimator.
    :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2.
    :param eps: Maximum perturbation that the attacker can introduce.
    :param eps_step: Attack step size (input variation) at each iteration.
    :param max_iter: The maximum number of iterations.
    :param targeted: Indicates whether the attack is targeted (True) or untargeted (False).
    :param nb_random_init: Number of random initialisations within the epsilon ball. For nb_random_init=0 starting
                           at the original input.
    :param batch_size: Size of the batch on which adversarial samples are generated.
    :param loss_type: Loss to optimise. Supported values: `None` (reuse the loss object of `estimator`),
                      "cross_entropy" or "difference_logits_ratio".
    :raises ValueError: If `loss_type` is not supported, or if the estimator outputs probabilities where logits
                        are required.
    :raises NotImplementedError: If `loss_type="cross_entropy"` is requested for a TensorFlow v1 estimator that
                                 outputs probabilities.
    """
    from art.estimators.classification import TensorFlowClassifier, TensorFlowV2Classifier, PyTorchClassifier

    invalid_loss_type_msg = (
        "The argument loss_type has an invalid value. The following options for loss_type are "
        "supported: {}".format([None, "cross_entropy", "difference_logits_ratio"])
    )
    dlr_needs_logits_msg = (
        "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
        "the estimator has to to predict logits."
    )

    def _outputs_probabilities(dtype=None) -> bool:
        # Probe the estimator with a single all-ones sample and test whether the prediction is a probability vector.
        probe = np.ones(shape=(1, *estimator.input_shape), dtype=dtype)
        return is_probability(estimator.predict(x=probe))

    def _make_tf_difference_logits_ratio(tf):
        # Build the TensorFlow difference-of-logits-ratio loss (shared by the TF v1 and TF v2 branches).
        def difference_logits_ratio(y_true, y_pred):
            i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
            i_y_pred_arg = tf.argsort(y_pred, axis=1)
            # Index of the largest logit that does not belong to the true class: the top-1 class when the
            # prediction is wrong, otherwise the runner-up. `tf.not_equal` is used so that the comparison is
            # element-wise regardless of whether tensor equality is enabled.
            i_z_i = tf.where(
                tf.not_equal(i_y_pred_arg[:, -1], i_y_true), i_y_pred_arg[:, -1], i_y_pred_arg[:, -2]
            )

            z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
            z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
            z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
            z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

            # The gathers above produce (N, N) matrices; entry (k, k) is the value selected for sample k.
            z_1 = tf.linalg.diag_part(z_1)
            z_3 = tf.linalg.diag_part(z_3)
            z_i = tf.linalg.diag_part(z_i)
            z_y = tf.linalg.diag_part(z_y)

            dlr = -(z_y - z_i) / (z_1 - z_3)

            return tf.reduce_mean(dlr)

        return difference_logits_ratio

    if isinstance(estimator, TensorFlowClassifier):
        import tensorflow as tf

        if loss_type == "cross_entropy":
            if _outputs_probabilities():
                raise NotImplementedError("Cross-entropy loss is not implemented for probability output.")

            self._loss_object = tf.reduce_mean(
                tf.keras.losses.categorical_crossentropy(
                    y_pred=estimator._output, y_true=estimator._labels_ph, from_logits=True
                )
            )

            def loss_fn(y_true, y_pred):
                # NumPy counterpart of the graph loss, evaluated on logits shifted by their row maximum.
                # NOTE(review): the log-sum-exp term is broadcast over the class axis and the mean is taken
                # over all elements - confirm this is the intended cross-entropy value.
                y_pred_norm = y_pred - np.amax(y_pred, axis=1, keepdims=True)
                loss_value = -(y_true * y_pred_norm - np.log(np.sum(np.exp(y_pred_norm), axis=1, keepdims=True)))
                return np.mean(loss_value)

            self._loss_fn = loss_fn

        elif loss_type == "difference_logits_ratio":
            if _outputs_probabilities():
                raise ValueError(dlr_needs_logits_msg)

            difference_logits_ratio = _make_tf_difference_logits_ratio(tf)

            def loss_fn(y_true, y_pred):
                # NumPy counterpart of the graph difference-of-logits-ratio loss.
                i_y_true = np.argmax(y_true, axis=1)
                i_y_pred_arg = np.argsort(y_pred, axis=1)
                i_z_i = np.where(i_y_pred_arg[:, -1] != i_y_true, i_y_pred_arg[:, -1], i_y_pred_arg[:, -2])

                rows = np.arange(y_pred.shape[0])
                z_1 = y_pred[rows, i_y_pred_arg[:, -1]]
                z_3 = y_pred[rows, i_y_pred_arg[:, -3]]
                z_i = y_pred[rows, i_z_i]
                z_y = y_pred[rows, i_y_true]

                dlr = -(z_y - z_i) / (z_1 - z_3)

                return np.mean(dlr)

            self._loss_fn = loss_fn
            self._loss_object = difference_logits_ratio(y_true=estimator._labels_ph, y_pred=estimator._output)

        elif loss_type is None:
            # NOTE(review): `_loss_fn` is not assigned on this path - confirm no caller requires it.
            self._loss_object = estimator._loss_object
        else:
            raise ValueError(invalid_loss_type_msg)

        estimator_apgd = TensorFlowClassifier(
            input_ph=estimator._input_ph,
            output=estimator._output,
            labels_ph=estimator._labels_ph,
            train=estimator._train,
            loss=self._loss_object,
            learning=estimator._learning,
            sess=estimator._sess,
            channels_first=estimator.channels_first,
            clip_values=estimator.clip_values,
            preprocessing_defences=estimator.preprocessing_defences,
            postprocessing_defences=estimator.postprocessing_defences,
            preprocessing=estimator.preprocessing,
            feed_dict=estimator._feed_dict,
        )

    elif isinstance(estimator, TensorFlowV2Classifier):
        import tensorflow as tf

        if loss_type == "cross_entropy":
            # The Keras loss handles both output types; only `from_logits` differs.
            outputs_logits = not _outputs_probabilities()
            self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=outputs_logits)
            self._loss_fn = self._loss_object

        elif loss_type == "difference_logits_ratio":
            if _outputs_probabilities():
                raise ValueError(dlr_needs_logits_msg)

            difference_logits_ratio = _make_tf_difference_logits_ratio(tf)

            self._loss_fn = difference_logits_ratio
            self._loss_object = difference_logits_ratio

        elif loss_type is None:
            # NOTE(review): `_loss_fn` is not assigned on this path - confirm no caller requires it.
            self._loss_object = estimator._loss_object
        else:
            raise ValueError(invalid_loss_type_msg)

        estimator_apgd = TensorFlowV2Classifier(
            model=estimator.model,
            nb_classes=estimator.nb_classes,
            input_shape=estimator.input_shape,
            loss_object=self._loss_object,
            train_step=estimator._train_step,
            channels_first=estimator.channels_first,
            clip_values=estimator.clip_values,
            preprocessing_defences=estimator.preprocessing_defences,
            postprocessing_defences=estimator.postprocessing_defences,
            preprocessing=estimator.preprocessing,
        )

    elif isinstance(estimator, PyTorchClassifier):
        import torch

        if loss_type == "cross_entropy":
            if _outputs_probabilities(dtype=np.float32):
                raise ValueError(
                    "The provided estimator seems to predict probabilities. If loss_type='cross_entropy' "
                    "the estimator has to to predict logits."
                )

            def loss_fn(y_true, y_pred):
                # NumPy-facing wrapper: one-hot labels are converted to class indices for `CrossEntropyLoss`.
                return torch.nn.CrossEntropyLoss()(
                    torch.from_numpy(y_pred), torch.from_numpy(np.argmax(y_true, axis=1))
                )

            self._loss_fn = loss_fn
            self._loss_object = torch.nn.CrossEntropyLoss()

        elif loss_type == "difference_logits_ratio":
            if _outputs_probabilities(dtype=ART_NUMPY_DTYPE):
                raise ValueError(dlr_needs_logits_msg)

            # Positional order is (y_pred, y_true); both NumPy arrays and tensors are accepted.
            def difference_logits_ratio(y_pred, y_true):  # type: ignore
                if isinstance(y_true, np.ndarray):
                    y_true = torch.from_numpy(y_true)
                if isinstance(y_pred, np.ndarray):
                    y_pred = torch.from_numpy(y_pred)

                y_true = y_true.float()

                i_y_true = torch.argmax(y_true, dim=1)
                i_y_pred_arg = torch.argsort(y_pred, dim=1)
                # Largest logit not belonging to the true class: top-1 if the prediction is wrong, else top-2.
                i_z_i = torch.where(i_y_pred_arg[:, -1] != i_y_true, i_y_pred_arg[:, -1], i_y_pred_arg[:, -2])

                def _select(indices):
                    # Pick y_pred[k, indices[k]] for every sample k.
                    return y_pred.gather(1, indices.unsqueeze(1)).squeeze(1)

                z_1 = _select(i_y_pred_arg[:, -1])
                z_3 = _select(i_y_pred_arg[:, -3])
                z_i = _select(i_z_i)
                z_y = _select(i_y_true)

                dlr = -(z_y - z_i) / (z_1 - z_3)

                return torch.mean(dlr.float())

            self._loss_fn = difference_logits_ratio
            self._loss_object = difference_logits_ratio

        elif loss_type is None:
            # NOTE(review): `_loss_fn` is not assigned on this path - confirm no caller requires it.
            self._loss_object = estimator._loss_object
        else:
            raise ValueError(invalid_loss_type_msg)

        estimator_apgd = PyTorchClassifier(
            model=estimator.model,
            loss=self._loss_object,
            input_shape=estimator.input_shape,
            nb_classes=estimator.nb_classes,
            optimizer=None,
            channels_first=estimator.channels_first,
            clip_values=estimator.clip_values,
            preprocessing_defences=estimator.preprocessing_defences,
            postprocessing_defences=estimator.postprocessing_defences,
            preprocessing=estimator.preprocessing,
            device_type=estimator._device,
        )

    else:
        estimator_apgd = None

    super().__init__(estimator=estimator_apgd)
    self.norm = norm
    self.eps = eps
    self.eps_step = eps_step
    self.max_iter = max_iter
    self.targeted = targeted
    self.nb_random_init = nb_random_init
    self.batch_size = batch_size
    self.loss_type = loss_type
    self._check_params()
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate an adversarial patch and return the patch and its mask in arrays.

    :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
    :param y: An array with labels. If `None`, the estimator's own predictions are used as labels and the attack
              runs untargeted; otherwise the attack runs targeted.
    :param mask: An boolean array of shape equal to the shape of a single samples (1, H, W) or the shape of `x`
                 (N, H, W) without their channel dimensions. Any features for which the mask is True can be the
                 center location of the patch during sampling.
    :type mask: `np.ndarray`
    :param shuffle: Keyword argument forwarded to the data loader (default `True`).
    :return: An array with adversarial patch and an array of the patch mask.
    """
    import torch  # lgtm [py/repeated-import]

    shuffle = kwargs.get("shuffle", True)
    mask = kwargs.get("mask")
    if mask is not None:
        # Work on a copy so the caller's array is never modified.
        mask = mask.copy()
    mask = self._check_mask(mask=mask, x=x)

    if y is None:
        logger.info("Setting labels to estimator predictions and running untargeted attack because `y=None`.")
        y = to_categorical(np.argmax(self.estimator.predict(x=x), axis=1), nb_classes=self.estimator.nb_classes)
        self.targeted = False
    else:
        self.targeted = True

    y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

    # check if logits or probabilities
    y_pred = self.estimator.predict(x=x[[0]])
    self.use_logits = not is_probability(y_pred)

    # The mask, when given, travels through the data loader as a third tensor so it stays aligned with the samples.
    tensors = [torch.Tensor(x), torch.Tensor(y)]
    if mask is not None:
        tensors.append(torch.Tensor(mask))
    dataset = torch.utils.data.TensorDataset(*tensors)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=self.batch_size,
        shuffle=shuffle,
        drop_last=False,
    )

    for i_iter in trange(self.max_iter, desc="Adversarial Patch PyTorch", disable=not self.verbose):
        if mask is None:
            for images, target in data_loader:
                _ = self._train_step(images=images, target=target, mask=None)
        else:
            for images, target, mask_i in data_loader:
                _ = self._train_step(images=images, target=target, mask=mask_i)

        if self.summary_writer is not None:
            self.summary_writer.add_image(
                "patch",
                self._patch,
                global_step=i_iter,
            )

            if hasattr(self.estimator, "compute_losses"):
                x_patched = self._random_overlay(
                    images=torch.from_numpy(x).to(self.estimator.device), patch=self._patch, mask=mask
                )
                losses = self.estimator.compute_losses(x=x_patched, y=torch.from_numpy(y).to(self.estimator.device))

                for key, value in losses.items():
                    self.summary_writer.add_scalar(
                        "loss/{}".format(key),
                        np.mean(value.detach().cpu().numpy()),
                        global_step=i_iter,
                    )

    return (
        self._patch.detach().cpu().numpy(),
        # `.cpu()` is required before `.numpy()` whenever the mask tensor lives on a non-CPU device.
        self._get_circular_patch_mask(nb_samples=1).cpu().numpy()[0],
    )
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate targeted adversarial samples and return them in an array.

    Each sample is pushed iteratively towards the class given by the arg-max of its row in `y` until the estimator
    predicts that class or `max_iter` steps have been taken.

    :param x: An array with the original inputs to be attacked.
    :param y: An array with the target labels; the arg-max along axis 1 is used as target class. Must not be None.
    :return: An array holding the adversarial examples.
    :raises ValueError: If `y` is None, or for a binary classifier with a single output.
    """
    # Validate the labels before doing any expensive work on the estimator.
    if y is None:
        raise ValueError("Labels `y` cannot be None.")

    x_adv = x.astype(ART_NUMPY_DTYPE)
    preds = self.estimator.predict(x, batch_size=self.batch_size)

    if self.estimator.nb_classes == 2 and preds.shape[1] == 1:
        raise ValueError(  # pragma: no cover
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    if is_probability(preds[0]):
        logger.warning(
            "It seems that the attacked model is predicting probabilities. DeepFool expects logits as model output "
            "to achieve its full attack strength."
        )

    # Determine the class labels for which to compute the gradients
    labels_set = np.arange(self.estimator.nb_classes)
    sorter = np.arange(len(labels_set))

    # Pick a small scalar to avoid division by 0
    tol = 10e-8

    clip_values = self.estimator.clip_values
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))

    # Compute perturbation with implicit batching
    for batch_id in trange(nb_batches, desc="TargetedDeepFool_simple", disable=not self.verbose):
        batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
        batch = x_adv[batch_index_1:batch_index_2].copy()

        # Get predictions, labels, and gradients for batch
        f_batch = preds[batch_index_1:batch_index_2]
        target_classes = np.argmax(y[batch_index_1:batch_index_2], axis=1)
        fk_hat = np.argmax(f_batch, axis=1)

        # Compute gradients for all classes
        grd = self.estimator.class_gradient(batch)

        # Exclude the inputs that are already classified as their target label.
        active_indices = np.where(fk_hat != target_classes)[0]
        target_labels_indices = sorter[np.searchsorted(labels_set, target_classes, sorter=sorter)]
        rows = np.arange(len(batch))

        current_step = 0
        while active_indices.size > 0 and current_step < self.max_iter:
            # Differences of gradients and predictions relative to the currently predicted class
            pred_labels_indices = sorter[np.searchsorted(labels_set, fk_hat, sorter=sorter)]
            grad_diff = grd - grd[rows, pred_labels_indices][:, None]
            f_diff = f_batch[:, labels_set] - f_batch[rows, pred_labels_indices][:, None]

            # The perturbation always points towards the target class:
            # r = |f_target - f_pred| / (||grad_target - grad_pred||^2 + tol) * (grad_target - grad_pred)
            l_var = target_labels_indices
            target_grad_diff = grad_diff[rows, l_var]
            gap = abs(f_diff[rows, l_var])
            squared_norm = np.linalg.norm(target_grad_diff.reshape(len(grad_diff), -1), axis=1) ** 2 + tol
            r_var = gap / squared_norm
            r_var = r_var.reshape((-1,) + (1,) * (x.ndim - 1))
            r_var = r_var * target_grad_diff

            # Add perturbation and clip result
            if clip_values is not None:
                batch[active_indices] = np.clip(
                    batch[active_indices] + r_var[active_indices] * (clip_values[1] - clip_values[0]),
                    clip_values[0],
                    clip_values[1],
                )
            else:
                batch[active_indices] += r_var[active_indices]

            # Recompute prediction for new x
            f_batch = self.estimator.predict(batch)
            fk_hat = np.argmax(f_batch, axis=1)

            # Recompute gradients for new x
            grd = self.estimator.class_gradient(batch)

            # Stop once every sample is classified as its target
            active_indices = np.where(fk_hat != target_classes)[0]

            current_step += 1

        # Apply overshoot parameter
        start_points = x_adv[batch_index_1:batch_index_2]
        x_adv[batch_index_1:batch_index_2] = start_points + (1 + self.epsilon) * (batch - start_points)
        if clip_values is not None:
            np.clip(
                x_adv[batch_index_1:batch_index_2],
                clip_values[0],
                clip_values[1],
                out=x_adv[batch_index_1:batch_index_2],
            )

    logger.info(
        "Success rate of TargetedDeepFool(simple) attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, targeted=True, batch_size=self.batch_size),
    )
    return x_adv
def check_softmax(model, images):
    """
    Check whether every row of the model output for `images` is a probability vector.

    :param model: Callable applied to `images`; its result must support `.detach().cpu().numpy()`.
    :param images: Batch of inputs passed to `model` unchanged.
    :return: `True` if `is_probability` holds for every row of the output (also for an empty batch), else `False`.
    """
    outputs = model(images).detach().cpu().numpy()
    # Stops at the first row that is not a probability vector.
    return all(is_probability(row) for row in outputs)