def generate(self, x, y=None, **kwargs): """Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :type x: `np.ndarray` :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ ''' y = check_and_transform_label_format(y, self.classifier.nb_classes()) if y is None: # Throw error if attack is targeted, but no targets are provided if self.targeted: raise ValueError('Target labels `y` need to be provided for a targeted attack.') # Use model predictions as correct outputs logger.info('Using model predictions as correct labels for FGM.') y = get_labels_np_array(self.classifier.predict(x, batch_size=self.batch_size)) y = y / np.sum(y, axis=1, keepdims=True) ''' # Return adversarial examples computed with minimal perturbation if option is active if self.minimal: logger.info('Performing minimal perturbation FGM.') adv_x_best = self._minimal_perturbation(x, y) rate_best = 100 * compute_success(self.classifier, x, y, adv_x_best, self.targeted, batch_size=self.batch_size) else: adv_x_best = None rate_best = None for _ in range(max(1, self.num_random_init)): adv_x = self._compute(x, x, y, self.eps, self.eps, self._project, self.num_random_init > 0) if self.num_random_init > 1: rate = 100 * compute_success(self.classifier, x, y, adv_x, self.targeted, batch_size=self.batch_size) if rate_best is None or rate > rate_best or adv_x_best is None: rate_best = rate adv_x_best = adv_x else: adv_x_best = adv_x return adv_x_best
def generate(self, x, y=None, **kwargs): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :type x: `np.ndarray` :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ y = check_and_transform_label_format(y, self.classifier.nb_classes()) if y is None: # Throw error if attack is targeted, but no targets are provided if self.targeted: raise ValueError('Target labels `y` need to be provided for a targeted attack.') # Use model predictions as correct outputs targets = get_labels_np_array(self.classifier.predict(x, batch_size=self.batch_size)) else: targets = y adv_x_best = None rate_best = None for _ in range(max(1, self.num_random_init)): adv_x = x.astype(NUMPY_DTYPE) for i_max_iter in range(self.max_iter): adv_x = self._compute(adv_x, x, targets, self.eps, self.eps_step, self._project, self.num_random_init > 0 and i_max_iter == 0) if self.num_random_init > 1: rate = 100 * compute_success(self.classifier, x, targets, adv_x, self.targeted, batch_size=self.batch_size) if rate_best is None or rate > rate_best or adv_x_best is None: rate_best = rate adv_x_best = adv_x else: adv_x_best = adv_x logger.info('Success rate of attack: %.2f%%', rate_best if rate_best is not None else 100 * compute_success(self.classifier, x, y, adv_x, self.targeted, batch_size=self.batch_size)) return adv_x_best
def test_compute_success(self):
    class DummyClassifier:
        def predict(self, x, batch_size):
            return x

    classifier = DummyClassifier()
    x_clean = np.array([[0, 1], [1, 0]])
    x_adv = np.array([[1, 0], [0, 1]])
    labels = np.array([[1, 0], [0, 1]])

    attack_success_targeted = compute_success(classifier, x_clean, labels, x_adv, targeted=True)
    attack_success_untargeted = compute_success(classifier, x_clean, labels, x_adv, targeted=False)

    self.assertEqual(attack_success_targeted, 1.0)
    self.assertEqual(attack_success_untargeted, 1.0)
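The test above pins down the semantics that `compute_success` is expected to have: for targeted attacks, the fraction of adversarial samples classified as the target labels; for untargeted attacks, the fraction whose predicted class differs from the prediction on the clean input. Below is a minimal sketch consistent with that test; `compute_success_sketch` is an illustrative stand-in, not the actual helper from `art.utils`, which presumably also handles additional label formats and larger batches.

import numpy as np


def compute_success_sketch(classifier, x_clean, labels, x_adv, targeted=False, batch_size=1):
    """Fraction of adversarial samples that fool `classifier` (illustrative stand-in)."""
    adv_preds = np.argmax(classifier.predict(x_adv, batch_size=batch_size), axis=1)
    if targeted:
        # Targeted success: the adversarial sample is classified as the requested target class.
        attack_success = adv_preds == np.argmax(labels, axis=1)
    else:
        # Untargeted success: the predicted class changed relative to the clean input.
        clean_preds = np.argmax(classifier.predict(x_clean, batch_size=batch_size), axis=1)
        attack_success = adv_preds != clean_preds
    return np.sum(attack_success) / x_adv.shape[0]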
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the original class labels. :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) x_adv = x.astype(ART_NUMPY_DTYPE) # Assert that, if attack is targeted, y is provided: if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)) if self.estimator.nb_classes == 2 and y.shape[1] == 1: raise ValueError( "This attack has not yet been tested for binary classification with a single output classifier." ) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size))) for batch_id in trange(nb_batches, desc="EAD", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x_adv[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] x_adv[batch_index_1:batch_index_2] = self._generate_batch( x_batch, y_batch) # Apply clip if self.estimator.clip_values is not None: x_adv = np.clip(x_adv, self.estimator.clip_values[0], self.estimator.clip_values[1]) # Compute success rate of the EAD attack logger.info( "Success rate of EAD attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) # Check that `y` is provided for targeted attacks if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)) if self.estimator.nb_classes == 2 and y.shape[1] == 1: raise ValueError( "This attack has not yet been tested for binary classification with a single output classifier." ) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size))) x_adv = [] for batch_id in trange(nb_batches, desc="ZOO", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] res = self._generate_batch(x_batch, y_batch) x_adv.append(res) x_adv = np.vstack(x_adv) # Apply clip if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values np.clip(x_adv, clip_min, clip_max, out=x_adv) # Log success rate of the ZOO attack logger.info( "Success rate of ZOO attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
def generate(self, x, y=None): """Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :type x: `np.ndarray` :param y: The labels for the data `x`. Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. Labels should be one-hot-encoded. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ if y is None: # Throw error if attack is targeted, but no targets are provided if self.targeted: raise ValueError( 'Target labels `y` need to be provided for a targeted attack.' ) # Use model predictions as correct outputs logger.info('Using model predictions as correct labels for FGM.') y = get_labels_np_array(self.classifier.predict(x)) y = y / np.sum(y, axis=1, keepdims=True) # Return adversarial examples computed with minimal perturbation if option is active if self.minimal: logger.info('Performing minimal perturbation FGM.') adv_x_best = self._minimal_perturbation(x, y) rate_best = 100 * compute_success(self.classifier, x, y, adv_x_best, self.targeted) else: adv_x_best = None rate_best = 0.0 for i_random_init in range(max(1, self.num_random_init)): adv_x = self._compute(x, y, self.eps, self.eps, self.num_random_init > 0) rate = 100 * compute_success(self.classifier, x, y, adv_x, self.targeted) if rate > rate_best or adv_x_best is None: rate_best = rate adv_x_best = adv_x logger.info('Success rate of FGM attack: %.2f%%', rate_best) return adv_x_best
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial examples and return them as an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes, return_one_hot=False) x_adv = x.copy() for index in trange(x_adv.shape[0], desc="Decision tree attack", disable=not self.verbose): path = self.estimator.get_decision_path(x_adv[index]) legitimate_class = np.argmax(self.estimator.predict(x_adv[index].reshape(1, -1))) position = -2 adv_path = [-1] ancestor = path[position] while np.abs(position) < (len(path) - 1) or adv_path[0] == -1: ancestor = path[position] current_child = path[position + 1] # search in right subtree if current_child == self.estimator.get_left_child(ancestor): if y is None: adv_path = self._df_subtree(self.estimator.get_right_child(ancestor), legitimate_class) else: adv_path = self._df_subtree( self.estimator.get_right_child(ancestor), legitimate_class, y[index], ) else: # search in left subtree if y is None: adv_path = self._df_subtree(self.estimator.get_left_child(ancestor), legitimate_class) else: adv_path = self._df_subtree( self.estimator.get_left_child(ancestor), legitimate_class, y[index], ) position = position - 1 # we are going the decision path upwards adv_path.append(ancestor) # we figured out which is the way to the target, now perturb # first one is leaf-> no threshold, cannot be perturbed for i in range(1, 1 + len(adv_path[1:])): go_for = adv_path[i - 1] threshold = self.estimator.get_threshold_at_node(adv_path[i]) feature = self.estimator.get_feature_at_node(adv_path[i]) # only perturb if the feature is actually wrong if x_adv[index][feature] > threshold and go_for == self.estimator.get_left_child(adv_path[i]): x_adv[index][feature] = threshold - self.offset elif x_adv[index][feature] <= threshold and go_for == self.estimator.get_right_child(adv_path[i]): x_adv[index][feature] = threshold + self.offset logger.info( "Success rate of decision tree attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv), ) return x_adv
def generate(self, x, y=None, **kwargs): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). :type y: `np.ndarray` :param x_adv_init: Initial array to act as initial adversarial examples. Same shape as `x`. :type x_adv_init: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ y = check_and_transform_label_format(y, self.classifier.nb_classes()) # Get clip_min and clip_max from the classifier or infer them from data if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None: clip_min, clip_max = self.classifier.clip_values else: clip_min, clip_max = np.min(x), np.max(x) # Prediction from the original images preds = np.argmax(self.classifier.predict(x, batch_size=self.batch_size), axis=1) # Prediction from the initial adversarial examples if not None x_adv_init = kwargs.get('x_adv_init') if x_adv_init is not None: init_preds = np.argmax(self.classifier.predict(x_adv_init, batch_size=self.batch_size), axis=1) else: init_preds = [None] * len(x) x_adv_init = [None] * len(x) # Assert that, if attack is targeted, y is provided if self.targeted and y is None: raise ValueError('Target labels `y` need to be provided for a targeted attack.') # Some initial setups x_adv = x.astype(NUMPY_DTYPE) if y is not None: y = np.argmax(y, axis=1) # Generate the adversarial samples for ind, val in enumerate(x_adv): if self.targeted: x_adv[ind] = self._perturb(x=val, y=y[ind], y_p=preds[ind], init_pred=init_preds[ind], adv_init=x_adv_init[ind], clip_min=clip_min, clip_max=clip_max) else: x_adv[ind] = self._perturb(x=val, y=-1, y_p=preds[ind], init_pred=init_preds[ind], adv_init=x_adv_init[ind], clip_min=clip_min, clip_max=clip_max) if y is not None: y = to_categorical(y, self.classifier.nb_classes()) logger.info('Success rate of HopSkipJump attack: %.2f%%', 100 * compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size)) return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in a Numpy array. :param x: An array with the original inputs to be attacked. :param y: An array with the original labels to be predicted. :return: An array holding the adversarial examples. """ x_adv = x.astype(ART_NUMPY_DTYPE) # Initialize variables y_pred = self.estimator.predict(x, batch_size=self.batch_size) pred_class = np.argmax(y_pred, axis=1) # Compute perturbation with implicit batching for batch_id in trange( int(np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="NewtonFool", disable=not self.verbose ): batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size batch = x_adv[batch_index_1:batch_index_2] # Main algorithm for each batch norm_batch = np.linalg.norm(np.reshape(batch, (batch.shape[0], -1)), axis=1) l_batch = pred_class[batch_index_1:batch_index_2] l_b = to_categorical(l_batch, self.estimator.nb_classes).astype(bool) # Main loop of the algorithm for _ in range(self.max_iter): # Compute score score = self.estimator.predict(batch)[l_b] # Compute the gradients and norm grads = self.estimator.class_gradient(batch, label=l_batch) if grads.shape[1] == 1: grads = np.squeeze(grads, axis=1) norm_grad = np.linalg.norm(np.reshape(grads, (batch.shape[0], -1)), axis=1) # Theta theta = self._compute_theta(norm_batch, score, norm_grad) # Perturbation di_batch = self._compute_pert(theta, grads, norm_grad) # Update xi and perturbation batch += di_batch # Apply clip if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values x_adv[batch_index_1:batch_index_2] = np.clip(batch, clip_min, clip_max) else: x_adv[batch_index_1:batch_index_2] = batch logger.info( "Success rate of NewtonFool attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, batch_size=self.batch_size), ) return x_adv
def generate(self, x, y=None, **kwargs): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ y = check_and_transform_label_format(y, self.classifier.nb_classes()) # Check that `y` is provided for targeted attacks if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.classifier.predict(x, batch_size=self.batch_size)) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size))) x_adv = [] for batch_id in range(nb_batches): logger.debug("Processing batch %i out of %i", batch_id, nb_batches) batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] res = self._generate_batch(x_batch, y_batch) x_adv.append(res) x_adv = np.vstack(x_adv) # Apply clip if hasattr(self.classifier, "clip_values") and self.classifier.clip_values is not None: clip_min, clip_max = self.classifier.clip_values np.clip(x_adv, clip_min, clip_max, out=x_adv) # Log success rate of the ZOO attack logger.info( "Success rate of ZOO attack: %.2f%%", 100 * compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
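The implicit batching used above (and in most of the other `generate` methods in this section) is plain index arithmetic. The short, self-contained example below shows that the final, possibly smaller batch is handled automatically, because slicing past the end of a NumPy array is simply truncated.

import numpy as np

x = np.arange(10)  # 10 samples
batch_size = 4
nb_batches = int(np.ceil(x.shape[0] / float(batch_size)))  # -> 3 batches

for batch_id in range(nb_batches):
    batch_index_1, batch_index_2 = batch_id * batch_size, (batch_id + 1) * batch_size
    batch = x[batch_index_1:batch_index_2]
    print(batch_id, batch)  # the last batch has only 2 elements: [8 9]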
def generate(self, x, y=None, **kwargs): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the original class labels. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ y = check_and_transform_label_format(y, self.classifier.nb_classes()) x_adv = x.astype(ART_NUMPY_DTYPE) # Assert that, if attack is targeted, y is provided: if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.classifier.predict(x, batch_size=self.batch_size)) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size))) for batch_id in range(nb_batches): logger.debug("Processing batch %i out of %i", batch_id, nb_batches) batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x_adv[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] x_adv[batch_index_1:batch_index_2] = self._generate_batch( x_batch, y_batch) # Apply clip if hasattr(self.classifier, "clip_values") and self.classifier.clip_values is not None: x_adv = np.clip(x_adv, self.classifier.clip_values[0], self.classifier.clip_values[1]) # Compute success rate of the EAD attack logger.info( "Success rate of EAD attack: %.2f%%", 100 * compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
def generate(self, x, y=None): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the original class labels. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ # ZOO can probably be extended to feature vectors if no zooming or resizing is applied if len(x.shape) == 2: raise ValueError( 'Feature vectors detected. The ZOO attack can only be applied to data with spatial' 'dimensions.') # Check that `y` is provided for targeted attacks if self.targeted and y is None: raise ValueError( 'Target labels `y` need to be provided for a targeted attack.') # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array(self.classifier.predict(x, logits=False)) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size))) x_adv = [] for batch_id in range(nb_batches): logger.debug('Processing batch %i out of %i', batch_id, nb_batches) batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] res = self._generate_batch(x_batch, y_batch) x_adv.append(res) x_adv = np.vstack(x_adv) # Apply clip if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None: clip_min, clip_max = self.classifier.clip_values np.clip(x_adv, clip_min, clip_max, out=x_adv) # Log success rate of the ZOO attack logger.info( 'Success rate of ZOO attack: %.2f%%', 100 * compute_success(self.classifier, x, y, x_adv, self.targeted)) return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param cost_matrix: A non-negative cost matrix. :type cost_matrix: `np.ndarray` :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) x_adv = x.copy().astype(ART_NUMPY_DTYPE) if y is None: # Throw error if attack is targeted, but no targets are provided if self.targeted: raise ValueError("Target labels `y` need to be provided for a targeted attack.") # Use model predictions as correct outputs targets = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size)) else: targets = y if self.estimator.nb_classes == 2 and targets.shape[1] == 1: raise ValueError( "This attack has not yet been tested for binary classification with a single output classifier." ) # Compute the cost matrix if needed cost_matrix = kwargs.get("cost_matrix") if cost_matrix is None: cost_matrix = self._compute_cost_matrix(self.p, self.kernel_size) # Compute perturbation with implicit batching nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size))) for batch_id in trange(nb_batches, desc="Wasserstein", disable=not self.verbose): logger.debug("Processing batch %i out of %i", batch_id, nb_batches) batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size batch = x_adv[batch_index_1:batch_index_2] batch_labels = targets[batch_index_1:batch_index_2] x_adv[batch_index_1:batch_index_2] = self._generate_batch(batch, batch_labels, cost_matrix) logger.info( "Success rate of attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
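The `cost_matrix` accepted above is only described as a non-negative cost matrix; when none is given it is built by `self._compute_cost_matrix(self.p, self.kernel_size)`. As a hedged illustration (an assumption about that helper, not its actual implementation), a typical ground cost for a Wasserstein attack is the p-norm distance between pixel coordinates inside a `kernel_size` x `kernel_size` window:

import numpy as np


def example_cost_matrix(p: int, kernel_size: int) -> np.ndarray:
    """Illustrative p-norm cost between coordinates of a kernel_size x kernel_size window."""
    coords = np.array([(i, j) for i in range(kernel_size) for j in range(kernel_size)], dtype=float)
    diff = coords[:, None, :] - coords[None, :, :]
    cost = np.linalg.norm(diff, ord=p, axis=-1)
    return cost  # shape (kernel_size**2, kernel_size**2), zeros on the diagonal


cost = example_cost_matrix(p=2, kernel_size=5)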
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial examples and return them as an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). :return: An array holding the adversarial examples. """ x_adv = copy.copy(x) def minfun(x, args): # minimize L2 norm return np.sum(np.sqrt((x - args["orig"]) ** 2)) def constraint_conf(x, args): # constraint for confidence pred = args["classifier"].predict(x.reshape(1, -1))[0, 0] if args["class_zero"]: pred = 1.0 - pred return (pred - args["conf"]).reshape(-1) def constraint_unc(x, args): # constraint for uncertainty cur_unc = (args["classifier"].predict_uncertainty(x.reshape(1, -1))).reshape(-1) return (args["max_uncertainty"] - cur_unc)[0] bounds = [] # adding bounds, to not go away from original data for i in range(np.shape(x)[1]): bounds.append((self.min_val, self.max_val)) for i in trange(x.shape[0], desc="HCLU"): # go through data amd craft # get properties for attack max_uncertainty = self.unc_increase * self.estimator.predict_uncertainty(x_adv[i].reshape(1, -1)) class_zero = not self.estimator.predict(x_adv[i].reshape(1, -1))[0, 0] < 0.5 init_args = { "classifier": self.estimator, "class_zero": class_zero, "max_uncertainty": max_uncertainty, "conf": self.conf, } constr_conf = {"type": "ineq", "fun": constraint_conf, "args": (init_args,)} constr_unc = {"type": "ineq", "fun": constraint_unc, "args": (init_args,)} args = {"args": init_args, "orig": x[i].reshape(-1)} # finally, run optimization x_adv[i] = minimize(minfun, x_adv[i], args=args, bounds=bounds, constraints=[constr_conf, constr_unc],)["x"] logger.info( "Success rate of HCLU attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv), ) return x_adv
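The HCLU routine above relies on SciPy's constrained optimizer, where a constraint dictionary with `"type": "ineq"` means the optimizer keeps `fun(x) >= 0`; that is why `constraint_conf` and `constraint_unc` return margin-style values. A tiny self-contained example of that convention, independent of ART:

import numpy as np
from scipy.optimize import minimize

# Minimize x0^2 + x1^2 subject to x0 + x1 >= 1 (expressed as fun(x) >= 0).
constraint = {"type": "ineq", "fun": lambda x: x[0] + x[1] - 1.0}
result = minimize(lambda x: np.sum(x ** 2), x0=np.array([2.0, 0.0]), constraints=[constraint])
print(result.x)  # approximately [0.5, 0.5]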
def poison(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Iteratively finds optimal attack points starting at values at `x`.

    :param x: An array with the points that initialize attack points.
    :param y: The target labels for the attack.
    :return: A tuple holding the `(poisoning_examples, poisoning_labels)`.
    """
    if y is None:
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")
    y_attack = np.copy(y)

    num_poison = len(x)
    if num_poison == 0:
        raise ValueError("Must input at least one poison point")

    num_features = len(x[0])
    train_data = np.copy(self.x_train)
    train_labels = np.copy(self.y_train)
    all_poison = []

    for attack_point, attack_label in tqdm(zip(x, y_attack), desc="SVM poisoning"):
        poison = self.generate_attack_point(attack_point, attack_label)
        all_poison.append(poison)
        train_data = np.vstack([train_data, poison])
        train_labels = np.vstack([train_labels, attack_label])

    x_adv = np.array(all_poison).reshape((num_poison, num_features))
    targeted = y is not None

    logger.info(
        "Success rate of poisoning attack SVM attack: %.2f%%",
        100 * compute_success(self.estimator, x, y, x_adv, targeted=targeted),
    )

    return x_adv, y_attack
def generate(self, x, **kwargs): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the original class labels. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ # Parse and save attack-specific parameters params_cpy = dict(kwargs) y = params_cpy.pop(str('y'), None) self.set_params(**params_cpy) # Check that `y` is provided for targeted attacks if self.targeted and y is None: raise ValueError('Target labels `y` need to be provided for a targeted attack.') # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array(self.classifier.predict(x, logits=False)) # Compute adversarial examples with implicit batching nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size))) x_adv = [] for batch_id in range(nb_batches): logger.debug('Processing batch %i out of %i', batch_id, nb_batches) batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size x_batch = x[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] res = self._generate_batch(x_batch, y_batch) x_adv.append(res) # Apply clip x_adv = np.vstack(x_adv) x_adv = np.clip(x_adv, self.classifier.clip_values[0], self.classifier.clip_values[1]) # Log success rate of the ZOO attack logger.info('Success rate of ZOO attack: %.2f%%', 100 * compute_success(self.classifier, x, y, x_adv, self.targeted)) return x_adv
def poison(self, x, y=None, **kwargs):
    """
    Iteratively finds optimal attack points starting at values at `x`.

    :param x: An array with the points that initialize attack points.
    :type x: `np.ndarray`
    :param y: The target labels for the attack.
    :type y: `np.ndarray`
    :return: A tuple holding the (poisoning examples, poisoning labels).
    :rtype: `(np.ndarray, np.ndarray)`
    """
    if y is None:
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")
    y_attack = np.copy(y)

    num_poison = len(x)
    if num_poison == 0:
        raise ValueError("Must input at least one poison point")

    num_features = len(x[0])
    train_data = np.copy(self.x_train)
    train_labels = np.copy(self.y_train)
    all_poison = []

    for attack_point, attack_label in zip(x, y_attack):
        poison = self.generate_attack_point(attack_point, attack_label)
        all_poison.append(poison)
        train_data = np.vstack([train_data, poison])
        train_labels = np.vstack([train_labels, attack_label])

    x_adv = np.array(all_poison).reshape((num_poison, num_features))
    targeted = y is not None

    logger.info(
        "Success rate of poisoning attack SVM attack: %.2f%%",
        100 * compute_success(self.classifier, x, y, x_adv, targeted=targeted),
    )

    return x_adv, y_attack
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). If `self.targeted` is true, then `y_val` represents the target labels. Otherwise, the targets are the original class labels. :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) x_adv = x.astype(ART_NUMPY_DTYPE) if self.estimator.clip_values is not None: clip_min_per_pixel, clip_max_per_pixel = self.estimator.clip_values else: clip_min_per_pixel, clip_max_per_pixel = np.amin(x), np.amax(x) # Assert that, if attack is targeted, y_val is provided: if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)) # Compute perturbation with implicit batching nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size))) for batch_id in trange(nb_batches, desc="C&W L_inf", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x_adv[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] # Determine values for later clipping clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel, clip_max_per_pixel) clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel, clip_max_per_pixel) # The optimization is performed in tanh space to keep the # adversarial images bounded from clip_min and clip_max. 
x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother) # Initialize perturbation in tanh space: x_adv_batch = x_batch.copy() x_adv_batch_tanh = x_batch_tanh.copy() # Initialize optimization: z_logits, loss = self._loss(x_adv_batch, y_batch) attack_success = loss <= 0 learning_rate = self.learning_rate * np.ones(x_batch.shape[0]) for i_iter in range(self.max_iter): logger.debug("Iteration step %i out of %i", i_iter, self.max_iter) logger.debug("Average Loss: %f", np.mean(loss)) logger.debug( "Successful attack samples: %i out of %i", int(np.sum(attack_success)), x_batch.shape[0], ) # only continue optimization for those samples where attack hasn't succeeded yet: active = ~attack_success if np.sum(active) == 0: break # compute gradient: logger.debug("Compute loss gradient") perturbation_tanh = -self._loss_gradient( z_logits[active], y_batch[active], x_adv_batch[active], x_adv_batch_tanh[active], clip_min[active], clip_max[active], ) # perform line search to optimize perturbation # first, halve the learning rate until perturbation actually decreases the loss: prev_loss = loss.copy() best_loss = loss.copy() best_lr = np.zeros(x_batch.shape[0]) halving = np.zeros(x_batch.shape[0]) for i_halve in range(self.max_halving): logger.debug( "Perform halving iteration %i out of %i", i_halve, self.max_halving, ) do_halving = loss[active] >= prev_loss[active] logger.debug("Halving to be performed on %i samples", int(np.sum(do_halving))) if np.sum(do_halving) == 0: break active_and_do_halving = active.copy() active_and_do_halving[active] = do_halving lr_mult = learning_rate[active_and_do_halving] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] adv_10 = x_adv_batch_tanh[active_and_do_halving] new_x_adv_batch_tanh = adv_10 + lr_mult * perturbation_tanh[ do_halving] new_x_adv_batch = tanh_to_original( new_x_adv_batch_tanh, clip_min[active_and_do_halving], clip_max[active_and_do_halving], ) _, loss[active_and_do_halving] = self._loss( new_x_adv_batch, y_batch[active_and_do_halving]) logger.debug("New Average Loss: %f", np.mean(loss)) logger.debug("Loss: %s", str(loss)) logger.debug("Prev_loss: %s", str(prev_loss)) logger.debug("Best_loss: %s", str(best_loss)) best_lr[loss < best_loss] = learning_rate[loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] learning_rate[active_and_do_halving] /= 2 halving[active_and_do_halving] += 1 learning_rate[active] *= 2 # if no halving was actually required, double the learning rate as long as this # decreases the loss: for i_double in range(self.max_doubling): logger.debug( "Perform doubling iteration %i out of %i", i_double, self.max_doubling, ) do_doubling = (halving[active] == 1) & (loss[active] <= best_loss[active]) logger.debug( "Doubling to be performed on %i samples", int(np.sum(do_doubling)), ) if np.sum(do_doubling) == 0: break active_and_do_doubling = active.copy() active_and_do_doubling[active] = do_doubling learning_rate[active_and_do_doubling] *= 2 lr_mult = learning_rate[active_and_do_doubling] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] x_adv15 = x_adv_batch_tanh[active_and_do_doubling] new_x_adv_batch_tanh = x_adv15 + lr_mult * perturbation_tanh[ do_doubling] new_x_adv_batch = tanh_to_original( new_x_adv_batch_tanh, clip_min[active_and_do_doubling], clip_max[active_and_do_doubling], ) _, loss[active_and_do_doubling] = self._loss( new_x_adv_batch, y_batch[active_and_do_doubling]) logger.debug("New Average Loss: %f", np.mean(loss)) best_lr[loss < best_loss] = 
learning_rate[loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] learning_rate[halving == 1] /= 2 update_adv = best_lr[active] > 0 logger.debug( "Number of adversarial samples to be finally updated: %i", int(np.sum(update_adv)), ) if np.sum(update_adv) > 0: active_and_update_adv = active.copy() active_and_update_adv[active] = update_adv best_lr_mult = best_lr[active_and_update_adv] for _ in range(len(x.shape) - 1): best_lr_mult = best_lr_mult[:, np.newaxis] best_13 = best_lr_mult * perturbation_tanh[update_adv] x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[ active_and_update_adv] + best_13 x_adv_batch[active_and_update_adv] = tanh_to_original( x_adv_batch_tanh[active_and_update_adv], clip_min[active_and_update_adv], clip_max[active_and_update_adv], ) ( z_logits[active_and_update_adv], loss[active_and_update_adv], ) = self._loss( x_adv_batch[active_and_update_adv], y_batch[active_and_update_adv], ) attack_success = loss <= 0 # Update depending on attack success: x_adv_batch[~attack_success] = x_batch[~attack_success] x_adv[batch_index_1:batch_index_2] = x_adv_batch logger.info( "Success rate of C&W L_inf attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
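For context on the `original_to_tanh` / `tanh_to_original` pair used throughout the Carlini-Wagner routines above: the optimization runs over an unconstrained tanh variable so that mapping back always lands inside `[clip_min, clip_max]`. The round trip below is a sketch of that standard reparameterization under stated assumptions; the `TANH_SMOOTHER` constant and the exact formulas in ART's helpers may differ slightly.

import numpy as np

TANH_SMOOTHER = 0.999999  # keeps arctanh away from +/- infinity at the box boundary


def original_to_tanh_sketch(x, clip_min, clip_max, smoother=TANH_SMOOTHER):
    x_01 = (x - clip_min) / (clip_max - clip_min)      # rescale to [0, 1]
    return np.arctanh((x_01 * 2.0 - 1.0) * smoother)   # unconstrained variable


def tanh_to_original_sketch(x_tanh, clip_min, clip_max, smoother=TANH_SMOOTHER):
    x_01 = (np.tanh(x_tanh) / smoother + 1.0) / 2.0    # back to [0, 1]
    return x_01 * (clip_max - clip_min) + clip_min     # always inside the box


x = np.array([0.0, 0.25, 1.0])
assert np.allclose(tanh_to_original_sketch(original_to_tanh_sketch(x, 0.0, 1.0), 0.0, 1.0), x)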
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. :param x_adv_init: Initial array to act as initial adversarial examples. Same shape as `x`. :type x_adv_init: `np.ndarray` :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes, return_one_hot=False) # Get clip_min and clip_max from the classifier or infer them from data if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values else: clip_min, clip_max = np.min(x), np.max(x) # Prediction from the original images preds = np.argmax(self.estimator.predict(x, batch_size=self.batch_size), axis=1) # Prediction from the initial adversarial examples if not None x_adv_init = kwargs.get("x_adv_init") if x_adv_init is not None: init_preds = np.argmax(self.estimator.predict( x_adv_init, batch_size=self.batch_size), axis=1) else: init_preds = [None] * len(x) x_adv_init = [None] * len(x) # Assert that, if attack is targeted, y is provided if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # Some initial setups x_adv = x.astype(ART_NUMPY_DTYPE) # Generate the adversarial samples for ind, val in enumerate( tqdm(x_adv, desc="Boundary attack", disable=not self.verbose)): if self.targeted: x_adv[ind] = self._perturb( x=val, y=y[ind], y_p=preds[ind], init_pred=init_preds[ind], adv_init=x_adv_init[ind], clip_min=clip_min, clip_max=clip_max, ) else: x_adv[ind] = self._perturb( x=val, y=-1, y_p=preds[ind], init_pred=init_preds[ind], adv_init=x_adv_init[ind], clip_min=clip_min, clip_max=clip_max, ) if y is not None: y = to_categorical(y, self.estimator.nb_classes) logger.info( "Success rate of Boundary attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations. Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any features for which the mask is zero will not be adversarially perturbed. :type mask: `np.ndarray` :return: An array holding the adversarial examples. """ mask = self._get_mask(x, **kwargs) # Ensure eps is broadcastable self._check_compatibility_input_and_eps(x=x) # Check whether random eps is enabled self._random_eps() if isinstance(self.estimator, ClassifierMixin): # Set up targets targets = self._set_targets(x, y) # Start to compute adversarial examples adv_x = x.astype(ART_NUMPY_DTYPE) for batch_id in range(int(np.ceil(x.shape[0] / float(self.batch_size)))): self._batch_id = batch_id for rand_init_num in trange( max(1, self.num_random_init), desc="PGD - Random Initializations", disable=not self.verbose ): batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size batch_index_2 = min(batch_index_2, x.shape[0]) batch = x[batch_index_1:batch_index_2] batch_labels = targets[batch_index_1:batch_index_2] mask_batch = mask if mask is not None: if len(mask.shape) == len(x.shape): mask_batch = mask[batch_index_1:batch_index_2] for i_max_iter in trange( self.max_iter, desc="PGD - Iterations", leave=False, disable=not self.verbose ): self._i_max_iter = i_max_iter batch = self._compute( batch, x[batch_index_1:batch_index_2], batch_labels, mask_batch, self.eps, self.eps_step, self._project, self.num_random_init > 0 and i_max_iter == 0, self._batch_id, ) if rand_init_num == 0: # initial (and possibly only) random restart: we only have this set of # adversarial examples for now adv_x[batch_index_1:batch_index_2] = np.copy(batch) else: # replace adversarial examples if they are successful attack_success = compute_success_array( self.estimator, # type: ignore x[batch_index_1:batch_index_2], targets[batch_index_1:batch_index_2], batch, self.targeted, batch_size=self.batch_size, ) adv_x[batch_index_1:batch_index_2][attack_success] = batch[attack_success] logger.info( "Success rate of attack: %.2f%%", 100 * compute_success( self.estimator, # type: ignore x, targets, adv_x, self.targeted, batch_size=self.batch_size, # type: ignore ), ) else: if self.num_random_init > 0: # pragma: no cover raise ValueError("Random initialisation is only supported for classification.") # Set up targets targets = self._set_targets(x, y, classifier_mixin=False) # Start to compute adversarial examples if x.dtype == object: adv_x = x.copy() else: adv_x = x.astype(ART_NUMPY_DTYPE) for i_max_iter in trange(self.max_iter, desc="PGD - Iterations", disable=not self.verbose): self._i_max_iter = i_max_iter adv_x = self._compute( adv_x, x, targets, mask, self.eps, self.eps_step, self._project, self.num_random_init > 0 and i_max_iter == 0, ) if self.summary_writer is not None: self.summary_writer.reset() return adv_x
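The `mask` argument documented above only needs to be broadcastable to `x`; features where the mask is zero receive no perturbation. The self-contained snippet below illustrates the broadcasting semantics the docstring describes (no ART code involved):

import numpy as np

x = np.zeros((2, 4), dtype=np.float32)  # two samples, four features
perturbation = np.full_like(x, 0.1)

# Per-feature mask shared by all samples (shape (4,) broadcasts against (2, 4)).
mask = np.array([1, 0, 1, 0], dtype=np.float32)

x_adv = x + mask * perturbation
print(x_adv)  # columns 1 and 3 stay untouched for every sample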
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param mask: An array with a mask to be applied to the adversarial perturbations. Shape needs to be broadcastable to the shape of x. Any features for which the mask is zero will not be adversarially perturbed. :type mask: `np.ndarray` :return: An array holding the adversarial examples. """ import tensorflow as tf # lgtm [py/repeated-import] # Check whether random eps is enabled self._random_eps() # Set up targets targets = self._set_targets(x, y) # Get the mask mask = self._get_mask(x, **kwargs) # Create dataset if mask is not None: # Here we need to make a distinction: if the masks are different for each input, we need to index # those for the current batch. Otherwise (i.e. mask is meant to be broadcasted), keep it as it is. if len(mask.shape) == len(x.shape): dataset = tf.data.Dataset.from_tensor_slices(( x.astype(ART_NUMPY_DTYPE), targets.astype(ART_NUMPY_DTYPE), mask.astype(ART_NUMPY_DTYPE), )).batch(self.batch_size, drop_remainder=False) else: dataset = tf.data.Dataset.from_tensor_slices(( x.astype(ART_NUMPY_DTYPE), targets.astype(ART_NUMPY_DTYPE), np.array([mask.astype(ART_NUMPY_DTYPE)] * x.shape[0]), )).batch(self.batch_size, drop_remainder=False) else: dataset = tf.data.Dataset.from_tensor_slices(( x.astype(ART_NUMPY_DTYPE), targets.astype(ART_NUMPY_DTYPE), )).batch(self.batch_size, drop_remainder=False) # Start to compute adversarial examples adv_x_best = None rate_best = None for _ in range(max(1, self.num_random_init)): adv_x = x.astype(ART_NUMPY_DTYPE) data_loader = iter(dataset) # Compute perturbation with batching for (batch_id, batch_all) in enumerate(data_loader): if mask is not None: (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], batch_all[2] else: (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], None batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size adv_x[batch_index_1:batch_index_2] = self._generate_batch( batch, batch_labels, mask_batch) if self.num_random_init > 1: rate = 100 * compute_success(self.estimator, x, targets, adv_x, self.targeted, batch_size=self.batch_size) if rate_best is None or rate > rate_best or adv_x_best is None: rate_best = rate adv_x_best = adv_x else: adv_x_best = adv_x logger.info( "Success rate of attack: %.2f%%", rate_best if rate_best is not None else 100 * compute_success(self.estimator, x, y, adv_x_best, self.targeted, batch_size=self.batch_size), ) return adv_x_best
def generate(self, x, y=None, **kwargs): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape `(nb_samples,)`. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ y = check_and_transform_label_format(y, self.classifier.nb_classes()) # Initialize variables dims = list(x.shape[1:]) self._nb_features = np.product(dims) x_adv = np.reshape(x.astype(ART_NUMPY_DTYPE), (-1, self._nb_features)) preds = np.argmax(self.classifier.predict(x, batch_size=self.batch_size), axis=1) # Determine target classes for attack if y is None: # Randomly choose target from the incorrect classes for each sample from art.utils import random_targets targets = np.argmax(random_targets(preds, self.classifier.nb_classes()), axis=1) else: targets = np.argmax(y, axis=1) # Compute perturbation with implicit batching for batch_id in range(int(np.ceil(x_adv.shape[0] / float(self.batch_size)))): batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size batch = x_adv[batch_index_1:batch_index_2] # Main algorithm for each batch # Initialize the search space; optimize to remove features that can't be changed search_space = np.zeros(batch.shape) if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None: clip_min, clip_max = self.classifier.clip_values if self.theta > 0: search_space[batch < clip_max] = 1 else: search_space[batch > clip_min] = 1 # Get current predictions current_pred = preds[batch_index_1:batch_index_2] target = targets[batch_index_1:batch_index_2] active_indices = np.where(current_pred != target)[0] all_feat = np.zeros_like(batch) while active_indices.size != 0: # Compute saliency map feat_ind = self._saliency_map(np.reshape(batch, [batch.shape[0]] + dims)[active_indices], target[active_indices], search_space[active_indices]) # Update used features all_feat[active_indices, feat_ind[:, 0]] = 1 all_feat[active_indices, feat_ind[:, 1]] = 1 # Apply attack with clipping if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None: # Prepare update depending of theta if self.theta > 0: clip_func, clip_value = np.minimum, clip_max else: clip_func, clip_value = np.maximum, clip_min # Update adversarial examples tmp_batch = batch[active_indices] tmp_batch[np.arange(len(active_indices)), feat_ind[:, 0]] = \ clip_func(clip_value, tmp_batch[np.arange(len(active_indices)), feat_ind[:, 0]] + self.theta) tmp_batch[np.arange(len(active_indices)), feat_ind[:, 1]] = \ clip_func(clip_value, tmp_batch[np.arange(len(active_indices)), feat_ind[:, 1]] + self.theta) batch[active_indices] = tmp_batch # Remove indices from search space if max/min values were reached search_space[batch == clip_value] = 0 # Apply attack without clipping else: tmp_batch = batch[active_indices] tmp_batch[np.arange(len(active_indices)), feat_ind[:, 0]] += self.theta tmp_batch[np.arange(len(active_indices)), feat_ind[:, 1]] += self.theta batch[active_indices] = tmp_batch # Recompute model prediction current_pred = np.argmax(self.classifier.predict(np.reshape(batch, [batch.shape[0]] + dims)), axis=1) # Update active_indices active_indices = np.where((current_pred != target) * (np.sum(all_feat, axis=1) / self._nb_features <= self.gamma) * (np.sum(search_space, axis=1) > 0))[0] x_adv[batch_index_1:batch_index_2] = batch x_adv = 
np.reshape(x_adv, x.shape) logger.info('Success rate of JSMA attack: %.2f%%', 100 * compute_success(self.classifier, x, y, x_adv, batch_size=self.batch_size)) return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param mask: An array with a mask to be applied to the adversarial perturbations. Shape needs to be broadcastable to the shape of x. Any features for which the mask is zero will not be adversarially perturbed. :type mask: `np.ndarray` :return: An array holding the adversarial examples. """ # Check whether random eps is enabled self._random_eps() if isinstance(self.estimator, ClassifierMixin): # Set up targets targets = self._set_targets(x, y) # Get the mask mask = self._get_mask(x, **kwargs) # Start to compute adversarial examples adv_x_best = None rate_best = None for _ in trange(max(1, self.num_random_init), desc="PGD - Random Initializations", disable=not self.verbose): adv_x = x.astype(ART_NUMPY_DTYPE) for i_max_iter in trange(self.max_iter, desc="PGD - Iterations", leave=False, disable=not self.verbose): adv_x = self._compute( adv_x, x, targets, mask, self.eps, self.eps_step, self._project, self.num_random_init > 0 and i_max_iter == 0, ) if self.num_random_init > 1: rate = 100 * compute_success( self.estimator, x, targets, adv_x, self.targeted, batch_size=self.batch_size, # type: ignore ) if rate_best is None or rate > rate_best or adv_x_best is None: rate_best = rate adv_x_best = adv_x else: adv_x_best = adv_x logger.info( "Success rate of attack: %.2f%%", rate_best if rate_best is not None else 100 * compute_success( self.estimator, x, y, adv_x_best, self.targeted, batch_size=self.batch_size, # type: ignore ), ) else: if self.num_random_init > 0: raise ValueError( "Random initialisation is only supported for classification." ) # Set up targets targets = self._set_targets(x, y, classifier_mixin=False) # Get the mask mask = self._get_mask(x, **kwargs) # Start to compute adversarial examples if x.dtype == np.object: adv_x = x.copy() else: adv_x = x.astype(ART_NUMPY_DTYPE) for i_max_iter in trange(self.max_iter, desc="PGD - Iterations", disable=not self.verbose): adv_x = self._compute( adv_x, x, targets, mask, self.eps, self.eps_step, self._project, self.num_random_init > 0 and i_max_iter == 0, ) adv_x_best = adv_x return adv_x_best
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: An array with the original labels to be predicted. :return: An array holding the adversarial examples. """ x_adv = x.astype(ART_NUMPY_DTYPE) preds = self.estimator.predict(x_adv, batch_size=self.batch_size) if self.estimator.nb_classes == 2 and preds.shape[1] == 1: raise ValueError( "This attack has not yet been tested for binary classification with a single output classifier." ) if (preds < 0.0).any() or (preds > 1.0).any(): raise TypeError( "This attack requires a classifier predicting probabilities in the range [0, 1] as output." "Values smaller than 0.0 or larger than 1.0 have been detected." ) # preds_rescaled = self._rescale(preds) # Rescaling needs more testing preds_rescaled = preds # Compute perturbation with implicit batching for batch_id in trange( int(np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="VAT", disable=not self.verbose ): batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size batch = x_adv[batch_index_1:batch_index_2] batch = batch.reshape((batch.shape[0], -1)) # Main algorithm for each batch var_d = np.random.randn(*batch.shape).astype(ART_NUMPY_DTYPE) # Main loop of the algorithm for _ in range(self.max_iter): var_d = self._normalize(var_d) preds_new = self.estimator.predict((batch + var_d).reshape((-1,) + self.estimator.input_shape)) if (preds_new < 0.0).any() or (preds_new > 1.0).any(): raise TypeError( "This attack requires a classifier predicting probabilities in the range [0, 1] as " "output. Values smaller than 0.0 or larger than 1.0 have been detected." ) # preds_new_rescaled = self._rescale(preds_new) # Rescaling needs more testing preds_new_rescaled = preds_new from scipy.stats import entropy kl_div1 = entropy( np.transpose(preds_rescaled[batch_index_1:batch_index_2]), np.transpose(preds_new_rescaled), ) var_d_new = np.zeros(var_d.shape).astype(ART_NUMPY_DTYPE) for current_index in range(var_d.shape[1]): var_d[:, current_index] += self.finite_diff preds_new = self.estimator.predict((batch + var_d).reshape((-1,) + self.estimator.input_shape)) if (preds_new < 0.0).any() or (preds_new > 1.0).any(): raise TypeError( "This attack requires a classifier predicting probabilities in the range [0, 1]" "as output. Values smaller than 0.0 or larger than 1.0 have been detected." ) # preds_new_rescaled = self._rescale(preds_new) # Rescaling needs more testing preds_new_rescaled = preds_new kl_div2 = entropy( np.transpose(preds_rescaled[batch_index_1:batch_index_2]), np.transpose(preds_new_rescaled), ) var_d_new[:, current_index] = (kl_div2 - kl_div1) / self.finite_diff var_d[:, current_index] -= self.finite_diff var_d = var_d_new # Apply perturbation and clip if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values x_adv[batch_index_1:batch_index_2] = np.clip( batch + self.eps * self._normalize(var_d), clip_min, clip_max ).reshape((-1,) + self.estimator.input_shape) else: x_adv[batch_index_1:batch_index_2] = (batch + self.eps * self._normalize(var_d)).reshape( (-1,) + self.estimator.input_shape ) logger.info( "Success rate of virtual adversarial attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, batch_size=self.batch_size), ) return x_adv
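The VAT loop above measures how much the prediction distribution moves by calling `scipy.stats.entropy` with two arguments, which computes the Kullback-Leibler divergence; with 2-D inputs it reduces along axis 0, which is why the predictions are transposed so that each column corresponds to one sample. A small self-contained check of that behaviour:

import numpy as np
from scipy.stats import entropy

# Two samples, three classes; rows are samples.
p = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
q = np.array([[0.6, 0.3, 0.1], [0.1, 0.7, 0.2]])

# Transposing makes the classes run along axis 0, giving one KL value per sample.
kl_per_sample = entropy(np.transpose(p), np.transpose(q))
print(kl_per_sample.shape)  # (2,)
assert np.allclose(kl_per_sample, np.sum(p * np.log(p / q), axis=1))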
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations. Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any features for which the mask is zero will not be adversarially perturbed. :type mask: `np.ndarray` :return: An array holding the adversarial examples. """ mask = self._get_mask(x, **kwargs) # Ensure eps is broadcastable self._check_compatibility_input_and_eps(x=x) if isinstance(self.estimator, ClassifierMixin): if y is not None: y = check_and_transform_label_format(y, self.estimator.nb_classes) if y is None: # Throw error if attack is targeted, but no targets are provided if self.targeted: # pragma: no cover raise ValueError( "Target labels `y` need to be provided for a targeted attack." ) # Use model predictions as correct outputs logger.info( "Using model predictions as correct labels for FGM.") y_array = get_labels_np_array( self.estimator.predict( x, batch_size=self.batch_size)) # type: ignore else: y_array = y if self.estimator.nb_classes > 2: y_array = y_array / np.sum(y_array, axis=1, keepdims=True) # Return adversarial examples computed with minimal perturbation if option is active adv_x_best = x if self.minimal: logger.info("Performing minimal perturbation FGM.") adv_x_best = self._minimal_perturbation(x, y_array, mask) rate_best = 100 * compute_success( self.estimator, # type: ignore x, y_array, adv_x_best, self.targeted, batch_size=self.batch_size, # type: ignore ) else: rate_best = 0.0 for _ in range(max(1, self.num_random_init)): adv_x = self._compute( x, x, y_array, mask, self.eps, self.eps, self._project, self.num_random_init > 0, ) if self.num_random_init > 1: rate = 100 * compute_success( self.estimator, # type: ignore x, y_array, adv_x, self.targeted, batch_size=self.batch_size, # type: ignore ) if rate > rate_best: rate_best = rate adv_x_best = adv_x else: adv_x_best = adv_x logger.info( "Success rate of FGM attack: %.2f%%", rate_best if rate_best is not None else 100 * compute_success( self.estimator, # type: ignore x, y_array, adv_x_best, self.targeted, batch_size=self.batch_size, ), ) else: if self.minimal: # pragma: no cover raise ValueError( "Minimal perturbation is only supported for classification." ) if y is None: # Throw error if attack is targeted, but no targets are provided if self.targeted: # pragma: no cover raise ValueError( "Target labels `y` need to be provided for a targeted attack." ) # Use model predictions as correct outputs logger.info( "Using model predictions as correct labels for FGM.") y_array = self.estimator.predict(x, batch_size=self.batch_size) else: y_array = y adv_x_best = self._compute( x, x, y_array, None, self.eps, self.eps, self._project, self.num_random_init > 0, ) if self.summary_writer is not None: self.summary_writer.reset() return adv_x_best
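`self._minimal_perturbation(x, y_array, mask)` is only referenced above. As a hedged sketch of the idea (an assumption about that helper, not its actual code), minimal-perturbation FGM grows the step size per sample until the classifier's decision flips and keeps the smallest successful perturbation:

import numpy as np


def minimal_perturbation_sketch(x, y_true, predict_fn, grad_sign_fn, eps_step=0.05, eps_max=0.5):
    """Illustrative per-sample search for the smallest successful FGM step (not ART's implementation)."""
    x_adv = x.copy()
    active = np.ones(len(x), dtype=bool)  # samples that are still correctly classified
    eps = eps_step
    while active.any() and eps <= eps_max:
        candidate = x[active] + eps * grad_sign_fn(x[active], y_true[active])
        x_adv[active] = candidate
        still_correct = predict_fn(x_adv) == y_true  # untargeted: stop once the label flips
        active = active & still_correct
        eps += eps_step
    return x_adv


# Toy 1-D "classifier": label is 1 once the feature reaches 0.3; the gradient sign pushes features upward.
x_toy = np.array([[0.1], [0.25]])
y_toy = np.array([0, 0])
x_adv_toy = minimal_perturbation_sketch(
    x_toy, y_toy,
    predict_fn=lambda a: (a[:, 0] >= 0.3).astype(int),
    grad_sign_fn=lambda a, y: np.ones_like(a),
)
# The second sample flips after a single 0.05 step; the first keeps growing eps until 0.2.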
def generate(self, x, y=None): """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :type x: `np.ndarray` :param y: If `self.targeted` is true, then `y_val` represents the target labels. Otherwise, the targets are the original class labels. :type y: `np.ndarray` :return: An array holding the adversarial examples. :rtype: `np.ndarray` """ x_adv = x #.astype(NUMPY_DTYPE) if hasattr(self.classifier, 'clip_values') and self.classifier.clip_values is not None: clip_min, clip_max = self.classifier.clip_values else: clip_min, clip_max = np.amin(x), np.amax(x) # Assert that, if attack is targeted, y_val is provided: if self.targeted and y is None: raise ValueError( 'Target labels `y` need to be provided for a targeted attack.') # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array(self.classifier.predict(x, logits=False)) # Compute perturbation with implicit batching nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size))) for batch_id in range(nb_batches): logger.debug('Processing batch %i out of %i', batch_id, nb_batches) batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x_adv[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] # The optimization is performed in tanh space to keep the adversarial images bounded in correct range x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother) # Initialize binary search: c = self.initial_const * np.ones(x_batch.shape[0]) c_lower_bound = np.zeros(x_batch.shape[0]) c_double = (np.ones(x_batch.shape[0]) > 0) # Initialize placeholders for best l2 distance and attack found so far best_l2dist = np.inf * np.ones(x_batch.shape[0]) best_x_adv_batch = x_batch.copy() for bss in range(self.binary_search_steps): logger.debug('Binary search step %i out of %i (c_mean==%f)', bss, self.binary_search_steps, np.mean(c)) nb_active = int(np.sum(c < self._c_upper_bound)) logger.debug( 'Number of samples with c < _c_upper_bound: %i out of %i', nb_active, x_batch.shape[0]) if nb_active == 0: break lr = self.learning_rate * np.ones(x_batch.shape[0]) # Initialize perturbation in tanh space: x_adv_batch = x_batch.copy() x_adv_batch_tanh = x_batch_tanh.copy() z, l2dist, loss = self._loss(x_batch, x_adv_batch, y_batch, c) attack_success = (loss - l2dist <= 0) overall_attack_success = attack_success for it in range(self.max_iter): logger.debug('Iteration step %i out of %i', it, self.max_iter) logger.debug('Average Loss: %f', np.mean(loss)) logger.debug('Average L2Dist: %f', np.mean(l2dist)) logger.debug('Average Margin Loss: %f', np.mean(loss - l2dist)) logger.debug( 'Current number of succeeded attacks: %i out of %i', int(np.sum(attack_success)), len(attack_success)) improved_adv = attack_success & (l2dist < best_l2dist) logger.debug('Number of improved L2 distances: %i', int(np.sum(improved_adv))) if np.sum(improved_adv) > 0: best_l2dist[improved_adv] = l2dist[improved_adv] best_x_adv_batch[improved_adv] = x_adv_batch[ improved_adv] active = (c < self._c_upper_bound) & (lr > 0) nb_active = int(np.sum(active)) logger.debug( 'Number of samples with c < _c_upper_bound and lr > 0: %i out of %i', nb_active, x_batch.shape[0]) if nb_active == 0: break # compute gradient: logger.debug('Compute loss gradient') perturbation_tanh = -self._loss_gradient( z[active], y_batch[active], x_batch[active], x_adv_batch[active], x_adv_batch_tanh[active], c[active], clip_min, clip_max) # 
perform line search to optimize perturbation # first, halve the learning rate until perturbation actually decreases the loss: prev_loss = loss.copy() best_loss = loss.copy() best_lr = np.zeros(x_batch.shape[0]) halving = np.zeros(x_batch.shape[0]) for h in range(self.max_halving): logger.debug('Perform halving iteration %i out of %i', h, self.max_halving) do_halving = (loss[active] >= prev_loss[active]) logger.debug('Halving to be performed on %i samples', int(np.sum(do_halving))) if np.sum(do_halving) == 0: break active_and_do_halving = active.copy() active_and_do_halving[active] = do_halving lr_mult = lr[active_and_do_halving] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_halving] + \ lr_mult * perturbation_tanh[do_halving] new_x_adv_batch = tanh_to_original( new_x_adv_batch_tanh, clip_min, clip_max, self._tanh_smoother) _, l2dist[active_and_do_halving], loss[ active_and_do_halving] = self._loss( x_batch[active_and_do_halving], new_x_adv_batch, y_batch[active_and_do_halving], c[active_and_do_halving]) logger.debug('New Average Loss: %f', np.mean(loss)) logger.debug('New Average L2Dist: %f', np.mean(l2dist)) logger.debug('New Average Margin Loss: %f', np.mean(loss - l2dist)) best_lr[loss < best_loss] = lr[loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] lr[active_and_do_halving] /= 2 halving[active_and_do_halving] += 1 lr[active] *= 2 # if no halving was actually required, double the learning rate as long as this # decreases the loss: for d in range(self.max_doubling): logger.debug('Perform doubling iteration %i out of %i', d, self.max_doubling) do_doubling = (halving[active] == 1) & ( loss[active] <= best_loss[active]) logger.debug('Doubling to be performed on %i samples', int(np.sum(do_doubling))) if np.sum(do_doubling) == 0: break active_and_do_doubling = active.copy() active_and_do_doubling[active] = do_doubling lr[active_and_do_doubling] *= 2 lr_mult = lr[active_and_do_doubling] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_doubling] + \ lr_mult * perturbation_tanh[do_doubling] new_x_adv_batch = tanh_to_original( new_x_adv_batch_tanh, clip_min, clip_max, self._tanh_smoother) _, l2dist[active_and_do_doubling], loss[ active_and_do_doubling] = self._loss( x_batch[active_and_do_doubling], new_x_adv_batch, y_batch[active_and_do_doubling], c[active_and_do_doubling]) logger.debug('New Average Loss: %f', np.mean(loss)) logger.debug('New Average L2Dist: %f', np.mean(l2dist)) logger.debug('New Average Margin Loss: %f', np.mean(loss - l2dist)) best_lr[loss < best_loss] = lr[loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] lr[halving == 1] /= 2 update_adv = (best_lr[active] > 0) logger.debug( 'Number of adversarial samples to be finally updated: %i', int(np.sum(update_adv))) if np.sum(update_adv) > 0: active_and_update_adv = active.copy() active_and_update_adv[active] = update_adv best_lr_mult = best_lr[active_and_update_adv] for _ in range(len(x.shape) - 1): best_lr_mult = best_lr_mult[:, np.newaxis] x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + \ best_lr_mult * perturbation_tanh[update_adv] x_adv_batch[active_and_update_adv] = tanh_to_original( x_adv_batch_tanh[active_and_update_adv], clip_min, clip_max, self._tanh_smoother) z[active_and_update_adv], l2dist[active_and_update_adv], loss[active_and_update_adv] = \ self._loss(x_batch[active_and_update_adv], 
x_adv_batch[active_and_update_adv], y_batch[active_and_update_adv], c[active_and_update_adv]) attack_success = (loss - l2dist <= 0) overall_attack_success = overall_attack_success | attack_success # Update depending on attack success: improved_adv = attack_success & (l2dist < best_l2dist) logger.debug('Number of improved L2 distances: %i', int(np.sum(improved_adv))) if np.sum(improved_adv) > 0: best_l2dist[improved_adv] = l2dist[improved_adv] best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv] c_double[overall_attack_success] = False c[overall_attack_success] = (c_lower_bound + c)[overall_attack_success] / 2 c_old = c c[~overall_attack_success & c_double] *= 2 c[~overall_attack_success & ~c_double] += (c - c_lower_bound)[~overall_attack_success & ~c_double] / 2 c_lower_bound[~overall_attack_success] = c_old[ ~overall_attack_success] x_adv[batch_index_1:batch_index_2] = best_x_adv_batch rate = 100 * compute_success(self.classifier, x, y, x_adv, self.targeted) TrackedCW.tracked_x.append( (x_adv, rate, batch_id, best_l2dist.mean())) logger.info( 'Success rate of C&W L_2 attack: %.2f%%', 100 * compute_success(self.classifier, x, y, x_adv, self.targeted)) return x_adv
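# A hedged sketch of the per-sample binary search over the constant c performed
# at the end of each search step above. `update_constants` is an illustrative
# stand-in; it mirrors the bookkeeping in the code above, including the fact
# that `c_old` is an alias of `c` rather than a copy.
import numpy as np


def update_constants(c, c_lower_bound, c_double, overall_attack_success):
    """One binary-search update of the trade-off constant c for every sample."""
    # Attack succeeded: stop doubling and bisect c towards its lower bound.
    c_double[overall_attack_success] = False
    c[overall_attack_success] = (c_lower_bound + c)[overall_attack_success] / 2
    # Attack failed: double c while no success has ever been seen, otherwise
    # increase it by half its distance to the lower bound.
    c_old = c
    c[~overall_attack_success & c_double] *= 2
    c[~overall_attack_success & ~c_double] += (c - c_lower_bound)[~overall_attack_success & ~c_double] / 2
    c_lower_bound[~overall_attack_success] = c_old[~overall_attack_success]
    return c, c_lower_bound, c_double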
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: An array with the original labels to be predicted. :return: An array holding the adversarial examples. """ x_adv = x.astype(ART_NUMPY_DTYPE) preds = self.estimator.predict(x, batch_size=self.batch_size) if is_probability(preds[0]): logger.warning( "It seems that the attacked model is predicting probabilities. DeepFool expects logits as model output " "to achieve its full attack strength.") # Determine the class labels for which to compute the gradients use_grads_subset = self.nb_grads < self.estimator.nb_classes if use_grads_subset: # TODO compute set of unique labels per batch grad_labels = np.argsort(-preds, axis=1)[:, :self.nb_grads] labels_set = np.unique(grad_labels) else: labels_set = np.arange(self.estimator.nb_classes) sorter = np.arange(len(labels_set)) # Pick a small scalar to avoid division by 0 tol = 10e-8 # Compute perturbation with implicit batching for batch_id in trange(int( np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="DeepFool", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size batch = x_adv[batch_index_1:batch_index_2].copy() # Get predictions and gradients for batch f_batch = preds[batch_index_1:batch_index_2] fk_hat = np.argmax(f_batch, axis=1) if use_grads_subset: # Compute gradients only for top predicted classes grd = np.array([ self.estimator.class_gradient(batch, label=_) for _ in labels_set ]) grd = np.squeeze(np.swapaxes(grd, 0, 2), axis=0) else: # Compute gradients for all classes grd = self.estimator.class_gradient(batch) # Get current predictions active_indices = np.arange(len(batch)) current_step = 0 while active_indices.size > 0 and current_step < self.max_iter: # Compute difference in predictions and gradients only for selected top predictions labels_indices = sorter[np.searchsorted(labels_set, fk_hat, sorter=sorter)] grad_diff = grd - grd[np.arange(len(grd)), labels_indices][:, None] f_diff = f_batch[:, labels_set] - f_batch[np.arange(len(f_batch)), labels_indices][:, None] # Choose coordinate and compute perturbation norm = np.linalg.norm(grad_diff.reshape( len(grad_diff), len(labels_set), -1), axis=2) + tol value = np.abs(f_diff) / norm value[np.arange(len(value)), labels_indices] = np.inf l_var = np.argmin(value, axis=1) absolute1 = abs(f_diff[np.arange(len(f_diff)), l_var]) draddiff = grad_diff[np.arange(len(grad_diff)), l_var].reshape(len(grad_diff), -1) pow1 = (pow( np.linalg.norm(draddiff, axis=1), 2, ) + tol) r_var = absolute1 / pow1 r_var = r_var.reshape((-1, ) + (1, ) * (len(x.shape) - 1)) r_var = r_var * grad_diff[np.arange(len(grad_diff)), l_var] # Add perturbation and clip result if self.estimator.clip_values is not None: batch[active_indices] = np.clip( batch[active_indices] + r_var[active_indices] * (self.estimator.clip_values[1] - self.estimator.clip_values[0]), self.estimator.clip_values[0], self.estimator.clip_values[1], ) else: batch[active_indices] += r_var[active_indices] # Recompute prediction for new x f_batch = self.estimator.predict(batch) fk_i_hat = np.argmax(f_batch, axis=1) # Recompute gradients for new x if use_grads_subset: # Compute gradients only for (originally) top predicted classes grd = np.array([ self.estimator.class_gradient(batch, label=_) for _ in labels_set ]) grd = np.squeeze(np.swapaxes(grd, 0, 2), axis=0) else: # 
Compute gradients for all classes grd = self.estimator.class_gradient(batch) # Stop if misclassification has been achieved active_indices = np.where(fk_i_hat == fk_hat)[0] current_step += 1 # Apply overshoot parameter x_adv1 = x_adv[batch_index_1:batch_index_2] x_adv2 = (1 + self.epsilon) * (batch - x_adv[batch_index_1:batch_index_2]) x_adv[batch_index_1:batch_index_2] = x_adv1 + x_adv2 if self.estimator.clip_values is not None: np.clip( x_adv[batch_index_1:batch_index_2], self.estimator.clip_values[0], self.estimator.clip_values[1], out=x_adv[batch_index_1:batch_index_2], ) logger.info( "Success rate of DeepFool attack: %.2f%%", 100 * compute_success( self.estimator, x, y, x_adv, batch_size=self.batch_size), ) return x_adv
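# A hedged sketch of one DeepFool update for a single sample, given `logits` of
# shape (n_classes,) and `grads` of shape (n_classes, n_features) for the
# candidate classes. The overshoot factor (1 + epsilon) is applied afterwards,
# as in the method above. `deepfool_step` is an illustrative name, not ART API.
import numpy as np


def deepfool_step(logits, grads, current_class, tol=1e-8):
    """Return the minimal perturbation that crosses the nearest decision boundary."""
    w = grads - grads[current_class]            # gradient differences w_k - w_khat
    f = logits - logits[current_class]          # logit differences  f_k - f_khat
    w_norm = np.linalg.norm(w, axis=1)
    ratio = np.abs(f) / (w_norm + tol)          # distance to each boundary
    ratio[current_class] = np.inf               # never pick the current class
    l_star = np.argmin(ratio)                   # closest boundary
    return np.abs(f[l_star]) / (w_norm[l_star] ** 2 + tol) * w[l_star]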
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs. :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`. :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations. Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any features for which the mask is zero will not be adversarially perturbed. :type mask: `np.ndarray` :return: An array holding the adversarial examples. """ import tensorflow as tf # lgtm [py/repeated-import] mask = self._get_mask(x, **kwargs) # Ensure eps is broadcastable self._check_compatibility_input_and_eps(x=x) # Check whether random eps is enabled self._random_eps() # Set up targets targets = self._set_targets(x, y) # Create dataset if mask is not None: # Here we need to make a distinction: if the masks are different for each input, we need to index # those for the current batch. Otherwise (i.e. mask is meant to be broadcasted), keep it as it is. if len(mask.shape) == len(x.shape): dataset = tf.data.Dataset.from_tensor_slices(( x.astype(ART_NUMPY_DTYPE), targets.astype(ART_NUMPY_DTYPE), mask.astype(ART_NUMPY_DTYPE), )).batch(self.batch_size, drop_remainder=False) else: dataset = tf.data.Dataset.from_tensor_slices(( x.astype(ART_NUMPY_DTYPE), targets.astype(ART_NUMPY_DTYPE), np.array([mask.astype(ART_NUMPY_DTYPE)] * x.shape[0]), )).batch(self.batch_size, drop_remainder=False) else: dataset = tf.data.Dataset.from_tensor_slices(( x.astype(ART_NUMPY_DTYPE), targets.astype(ART_NUMPY_DTYPE), )).batch(self.batch_size, drop_remainder=False) # Start to compute adversarial examples adv_x = x.astype(ART_NUMPY_DTYPE) data_loader = iter(dataset) # Compute perturbation with batching for (batch_id, batch_all) in enumerate( tqdm(data_loader, desc="PGD - Batches", leave=False, disable=not self.verbose)): if mask is not None: (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], batch_all[2] else: (batch, batch_labels, mask_batch) = batch_all[0], batch_all[1], None batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size # Compute batch_eps and batch_eps_step if isinstance(self.eps, np.ndarray): if len(self.eps.shape) == len( x.shape) and self.eps.shape[0] == x.shape[0]: batch_eps = self.eps[batch_index_1:batch_index_2] batch_eps_step = self.eps_step[batch_index_1:batch_index_2] else: batch_eps = self.eps batch_eps_step = self.eps_step else: batch_eps = self.eps batch_eps_step = self.eps_step for rand_init_num in range(max(1, self.num_random_init)): if rand_init_num == 0: # first iteration: use the adversarial examples as they are the only ones we have now adv_x[batch_index_1:batch_index_2] = self._generate_batch( x=batch, targets=batch_labels, mask=mask_batch, eps=batch_eps, eps_step=batch_eps_step) else: adversarial_batch = self._generate_batch( x=batch, targets=batch_labels, mask=mask_batch, eps=batch_eps, eps_step=batch_eps_step) attack_success = compute_success_array( self.estimator, batch, batch_labels, adversarial_batch, self.targeted, batch_size=self.batch_size, ) # return 
the successful adversarial examples adv_x[batch_index_1:batch_index_2][ attack_success] = adversarial_batch[attack_success] logger.info( "Success rate of attack: %.2f%%", 100 * compute_success(self.estimator, x, y, adv_x, self.targeted, batch_size=self.batch_size), ) return adv_x
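# A hedged sketch of the dataset construction used above, assuming NumPy inputs
# `x` and `targets` and an optional `mask` broadcastable to `x`. The method
# above replicates a lower-dimensional mask explicitly; np.broadcast_to is used
# here as an equivalent way to materialise one copy per sample.
# `make_pgd_dataset` is an illustrative name, not the ART API.
import numpy as np
import tensorflow as tf


def make_pgd_dataset(x, targets, mask=None, batch_size=32):
    x = x.astype(np.float32)
    targets = targets.astype(np.float32)
    if mask is None:
        tensors = (x, targets)
    elif mask.shape == x.shape:
        tensors = (x, targets, mask.astype(np.float32))
    else:
        tensors = (x, targets, np.broadcast_to(mask, x.shape).astype(np.float32))
    return tf.data.Dataset.from_tensor_slices(tensors).batch(batch_size, drop_remainder=False)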
def generate(self, x, y=None):
    """
    Generate adversarial samples and return them in an array.

    :param x: An array with the original inputs.
    :type x: `np.ndarray`
    :param y: The labels for the data `x`. Only provide this parameter if you'd like to use true labels when
              crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the
              "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236).
              Labels should be one-hot-encoded. Default is `None`.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    from art.utils import compute_success, get_labels_np_array, projection

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError('Target labels `y` need to be provided for a targeted attack.')

        # Use model predictions as correct outputs
        targets = get_labels_np_array(self.classifier.predict(x))
    else:
        targets = y

    adv_x_best = None
    rate_best = 0.0

    for _ in range(max(1, self.num_random_init)):
        # Work on a float copy so the original inputs are left untouched
        adv_x = x.astype(NUMPY_DTYPE)
        noise = np.zeros_like(x)

        for i_max_iter in range(self.max_iter):
            # x, x_init, y, eps, eps_step, project, random_init
            adv_x = self._compute(adv_x, x, targets, self.eps, self.eps_step, self._project,
                                  self.num_random_init > 0 and i_max_iter == 0)

            # if self._project:
            #     noise = projection(adv_x - x, self.eps, self.norm)
            #     adv_x = x + noise

            rate = 100 * compute_success(self.classifier, x, targets, adv_x, self.targeted)
            # logger.info('Success rate of attack step: %.2f%%', rate)

            # Track the accumulated perturbation and its per-sample norm
            noise = adv_x - x
            noise_norm = 0
            ind = tuple(range(1, len(noise.shape)))
            if self.norm == np.inf:
                noise_norm = np.max(np.abs(noise), axis=ind, keepdims=True)
            elif self.norm == 1:
                noise_norm = np.sum(np.abs(noise), axis=ind, keepdims=True)
            elif self.norm == 2:
                noise_norm = np.sqrt(np.sum(np.square(noise), axis=ind, keepdims=True))

            TrackedPGD.tracked_x.append((adv_x, rate, i_max_iter, noise_norm))

            if rate >= 100:
                break

        rate = 100 * compute_success(self.classifier, x, targets, adv_x, self.targeted)
        if rate > rate_best or adv_x_best is None:
            rate_best = rate
            adv_x_best = adv_x

        if rate >= 100:
            break

    logger.info('Success rate of attack: %.2f%%', rate_best)

    return adv_x_best
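# A hedged sketch of the projection step that the commented-out lines above
# refer to: pull the accumulated perturbation back into the eps-ball before it
# is re-added to x. Only the L-inf and L2 cases are shown; `project_perturbation`
# is an illustrative name, not ART's `projection` helper.
import numpy as np


def project_perturbation(noise, eps, norm):
    if norm == np.inf:
        return np.clip(noise, -eps, eps)                        # clamp each feature independently
    if norm == 2:
        flat = noise.reshape(len(noise), -1)
        norms = np.linalg.norm(flat, axis=1, keepdims=True)
        factor = np.minimum(1.0, eps / (norms + 1e-12))         # shrink only if outside the ball
        return (flat * factor).reshape(noise.shape)
    raise NotImplementedError("Only the L-inf and L2 projections are sketched here.")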
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray: """ Generate adversarial samples and return them in an array. :param x: An array with the original inputs to be attacked. :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. If `self.targeted` is true, then `y_val` represents the target labels. Otherwise, the targets are the original class labels. :return: An array holding the adversarial examples. """ y = check_and_transform_label_format(y, self.estimator.nb_classes) x_adv = x.astype(ART_NUMPY_DTYPE) if self.estimator.clip_values is not None: clip_min, clip_max = self.estimator.clip_values else: clip_min, clip_max = np.amin(x), np.amax(x) # Assert that, if attack is targeted, y_val is provided: if self.targeted and y is None: raise ValueError( "Target labels `y` need to be provided for a targeted attack.") # No labels provided, use model prediction as correct class if y is None: y = get_labels_np_array( self.estimator.predict(x, batch_size=self.batch_size)) # Compute perturbation with implicit batching nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size))) for batch_id in trange(nb_batches, desc="C&W L_2", disable=not self.verbose): batch_index_1, batch_index_2 = batch_id * self.batch_size, ( batch_id + 1) * self.batch_size x_batch = x_adv[batch_index_1:batch_index_2] y_batch = y[batch_index_1:batch_index_2] # The optimization is performed in tanh space to keep the adversarial images bounded in correct range x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max, self._tanh_smoother) # Initialize binary search: c_current = self.initial_const * np.ones(x_batch.shape[0]) c_lower_bound = np.zeros(x_batch.shape[0]) c_double = np.ones(x_batch.shape[0]) > 0 # Initialize placeholders for best l2 distance and attack found so far best_l2dist = np.inf * np.ones(x_batch.shape[0]) best_x_adv_batch = x_batch.copy() for bss in range(self.binary_search_steps): logger.debug( "Binary search step %i out of %i (c_mean==%f)", bss, self.binary_search_steps, np.mean(c_current), ) nb_active = int(np.sum(c_current < self._c_upper_bound)) logger.debug( "Number of samples with c_current < _c_upper_bound: %i out of %i", nb_active, x_batch.shape[0], ) if nb_active == 0: break learning_rate = self.learning_rate * np.ones(x_batch.shape[0]) # Initialize perturbation in tanh space: x_adv_batch = x_batch.copy() x_adv_batch_tanh = x_batch_tanh.copy() z_logits, l2dist, loss = self._loss(x_batch, x_adv_batch, y_batch, c_current) attack_success = loss - l2dist <= 0 overall_attack_success = attack_success for i_iter in range(self.max_iter): logger.debug("Iteration step %i out of %i", i_iter, self.max_iter) logger.debug("Average Loss: %f", np.mean(loss)) logger.debug("Average L2Dist: %f", np.mean(l2dist)) logger.debug("Average Margin Loss: %f", np.mean(loss - l2dist)) logger.debug( "Current number of succeeded attacks: %i out of %i", int(np.sum(attack_success)), len(attack_success), ) improved_adv = attack_success & (l2dist < best_l2dist) logger.debug("Number of improved L2 distances: %i", int(np.sum(improved_adv))) if np.sum(improved_adv) > 0: best_l2dist[improved_adv] = l2dist[improved_adv] best_x_adv_batch[improved_adv] = x_adv_batch[ improved_adv] active = (c_current < self._c_upper_bound) & (learning_rate > 0) nb_active = int(np.sum(active)) logger.debug( "Number of samples with c_current < _c_upper_bound and learning_rate > 0: 
%i out of %i", nb_active, x_batch.shape[0], ) if nb_active == 0: break # compute gradient: logger.debug("Compute loss gradient") perturbation_tanh = -self._loss_gradient( z_logits[active], y_batch[active], x_batch[active], x_adv_batch[active], x_adv_batch_tanh[active], c_current[active], clip_min, clip_max, ) # perform line search to optimize perturbation # first, halve the learning rate until perturbation actually decreases the loss: prev_loss = loss.copy() best_loss = loss.copy() best_lr = np.zeros(x_batch.shape[0]) halving = np.zeros(x_batch.shape[0]) for i_halve in range(self.max_halving): logger.debug( "Perform halving iteration %i out of %i", i_halve, self.max_halving, ) do_halving = loss[active] >= prev_loss[active] logger.debug( "Halving to be performed on %i samples", int(np.sum(do_halving)), ) if np.sum(do_halving) == 0: break active_and_do_halving = active.copy() active_and_do_halving[active] = do_halving lr_mult = learning_rate[active_and_do_halving] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] x_adv1 = x_adv_batch_tanh[active_and_do_halving] new_x_adv_batch_tanh = x_adv1 + lr_mult * perturbation_tanh[ do_halving] new_x_adv_batch = tanh_to_original( new_x_adv_batch_tanh, clip_min, clip_max) _, l2dist[active_and_do_halving], loss[ active_and_do_halving] = self._loss( x_batch[active_and_do_halving], new_x_adv_batch, y_batch[active_and_do_halving], c_current[active_and_do_halving], ) logger.debug("New Average Loss: %f", np.mean(loss)) logger.debug("New Average L2Dist: %f", np.mean(l2dist)) logger.debug("New Average Margin Loss: %f", np.mean(loss - l2dist)) best_lr[loss < best_loss] = learning_rate[ loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] learning_rate[active_and_do_halving] /= 2 halving[active_and_do_halving] += 1 learning_rate[active] *= 2 # if no halving was actually required, double the learning rate as long as this # decreases the loss: for i_double in range(self.max_doubling): logger.debug( "Perform doubling iteration %i out of %i", i_double, self.max_doubling, ) do_doubling = (halving[active] == 1) & ( loss[active] <= best_loss[active]) logger.debug( "Doubling to be performed on %i samples", int(np.sum(do_doubling)), ) if np.sum(do_doubling) == 0: break active_and_do_doubling = active.copy() active_and_do_doubling[active] = do_doubling learning_rate[active_and_do_doubling] *= 2 lr_mult = learning_rate[active_and_do_doubling] for _ in range(len(x.shape) - 1): lr_mult = lr_mult[:, np.newaxis] x_adv2 = x_adv_batch_tanh[active_and_do_doubling] new_x_adv_batch_tanh = x_adv2 + lr_mult * perturbation_tanh[ do_doubling] new_x_adv_batch = tanh_to_original( new_x_adv_batch_tanh, clip_min, clip_max) _, l2dist[active_and_do_doubling], loss[ active_and_do_doubling] = self._loss( x_batch[active_and_do_doubling], new_x_adv_batch, y_batch[active_and_do_doubling], c_current[active_and_do_doubling], ) logger.debug("New Average Loss: %f", np.mean(loss)) logger.debug("New Average L2Dist: %f", np.mean(l2dist)) logger.debug("New Average Margin Loss: %f", np.mean(loss - l2dist)) best_lr[loss < best_loss] = learning_rate[ loss < best_loss] best_loss[loss < best_loss] = loss[loss < best_loss] learning_rate[halving == 1] /= 2 update_adv = best_lr[active] > 0 logger.debug( "Number of adversarial samples to be finally updated: %i", int(np.sum(update_adv)), ) if np.sum(update_adv) > 0: active_and_update_adv = active.copy() active_and_update_adv[active] = update_adv best_lr_mult = best_lr[active_and_update_adv] for _ in range(len(x.shape) - 1): 
best_lr_mult = best_lr_mult[:, np.newaxis] x_adv4 = x_adv_batch_tanh[active_and_update_adv] best_lr1 = best_lr_mult * perturbation_tanh[update_adv] x_adv_batch_tanh[ active_and_update_adv] = x_adv4 + best_lr1 x_adv6 = x_adv_batch_tanh[active_and_update_adv] x_adv_batch[active_and_update_adv] = tanh_to_original( x_adv6, clip_min, clip_max) ( z_logits[active_and_update_adv], l2dist[active_and_update_adv], loss[active_and_update_adv], ) = self._loss( x_batch[active_and_update_adv], x_adv_batch[active_and_update_adv], y_batch[active_and_update_adv], c_current[active_and_update_adv], ) attack_success = loss - l2dist <= 0 overall_attack_success = overall_attack_success | attack_success # Update depending on attack success: improved_adv = attack_success & (l2dist < best_l2dist) logger.debug("Number of improved L2 distances: %i", int(np.sum(improved_adv))) if np.sum(improved_adv) > 0: best_l2dist[improved_adv] = l2dist[improved_adv] best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv] c_double[overall_attack_success] = False c_current[overall_attack_success] = ( c_lower_bound + c_current)[overall_attack_success] / 2 c_old = c_current c_current[~overall_attack_success & c_double] *= 2 c_current1 = (c_current - c_lower_bound)[~overall_attack_success & ~c_double] c_current[~overall_attack_success & ~c_double] += c_current1 / 2 c_lower_bound[~overall_attack_success] = c_old[ ~overall_attack_success] x_adv[batch_index_1:batch_index_2] = best_x_adv_batch logger.info( "Success rate of C&W L_2 attack: %.2f%%", 100 * compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size), ) return x_adv
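# A hedged sketch of the tanh change of variables used by both C&W
# implementations above to keep the optimisation unconstrained while the image
# stays inside [clip_min, clip_max]. `to_tanh`/`from_tanh` are illustrative
# stand-ins for ART's original_to_tanh/tanh_to_original and may differ in how
# the smoothing constant is applied.
import numpy as np


def to_tanh(x, clip_min, clip_max, smoother=1e-6):
    x01 = (np.clip(x, clip_min, clip_max) - clip_min) / (clip_max - clip_min)  # rescale to [0, 1]
    return np.arctanh((x01 * 2 - 1) * (1 - smoother))                          # map to the real line


def from_tanh(x_tanh, clip_min, clip_max, smoother=1e-6):
    x01 = (np.tanh(x_tanh) / (1 - smoother) + 1) / 2                           # back to [0, 1]
    return x01 * (clip_max - clip_min) + clip_min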