def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Infer membership in the training set of the target estimator.

    A record is inferred to be a member when the class predicted by the estimator equals its true label.

    :param x: Input records to attack.
    :param y: True labels for `x`.
    :param probabilities: a boolean indicating whether to return the predicted probabilities per class, or just
                          the predicted class.
    :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member,
             or class probabilities.
    :raises ValueError: If `y` is missing, if the shapes of `x`, `y` and the estimator input do not agree, or if
                        the label transformation returns `None`.
    """
    if y is None:  # pragma: no cover
        raise ValueError("MembershipInferenceBlackBoxRuleBased requires true labels `y`.")

    if self.estimator.input_shape is not None:  # pragma: no cover
        if self.estimator.input_shape[0] != x.shape[1]:
            raise ValueError("Shape of x does not match input_shape of classifier")

    probabilities = kwargs.get("probabilities", False)

    y = check_and_transform_label_format(y, len(np.unique(y)), return_one_hot=True)
    if y is None:
        raise ValueError("None value detected.")
    if y.shape[0] != x.shape[0]:  # pragma: no cover
        raise ValueError("Number of rows in x and y do not match")

    # get model's predictions for x
    y_pred = self.estimator.predict(x=x)
    predicted_class = (np.argmax(y, axis=1) == np.argmax(y_pred, axis=1)).astype(int)

    if not probabilities:
        return predicted_class

    # use y_pred as the probability if binary classification, otherwise just use 1
    if y_pred.shape[1] == 2:
        pred_prob = np.max(y_pred, axis=1)
        # Pair every row with its own column. `prob[:, predicted_class]` would instead select those columns
        # for *all* rows and broadcast the values, leaving every row with identical content.
        rows = np.arange(predicted_class.shape[0])
        prob = np.zeros((predicted_class.shape[0], 2))
        prob[rows, predicted_class] = pred_prob
        prob[rows, 1 - predicted_class] = 1.0 - pred_prob
        return prob

    # simply returns probability 1 for the predicted class and 0 for the other class
    prob = check_and_transform_label_format(predicted_class, return_one_hot=True)
    if prob is None:
        # Fail explicitly instead of hitting an unbound local on return
        raise ValueError("None value detected.")
    return prob
def fit(self, x: np.ndarray, y: Optional[np.ndarray] = None) -> None:
    """
    Train the attack model to predict the attacked feature from the remaining features and the output of the
    target estimator.

    :param x: Input to training process. Includes all features used to train the original model.
    :param y: True labels for x. When provided they are appended as additional attack-model inputs.
    """
    # Validate x against the estimator and the configured attack feature
    input_shape = self.estimator.input_shape
    if input_shape is not None and input_shape[0] != x.shape[1]:
        raise ValueError("Shape of x does not match input_shape of model")

    single_index = isinstance(self.attack_feature, int)
    if single_index and self.attack_feature >= x.shape[1]:
        raise ValueError("`attack_feature` must be a valid index to a feature in x")

    # Collect the estimator's output for x as a single column
    if ClassifierMixin in type(self.estimator).__mro__:
        predicted_labels = [np.argmax(row) for row in self.estimator.predict(x)]
        predictions = np.array(predicted_labels).reshape(-1, 1)
    elif self.scale_range is not None:
        # Regression model, outputs (and labels) rescaled into the configured range
        raw_output = self.estimator.predict(x).reshape(-1, 1)
        predictions = minmax_scale(raw_output, feature_range=self.scale_range)
        if y is not None:
            y = minmax_scale(y, feature_range=self.scale_range)
    else:
        # Regression model, outputs (and labels) multiplied by the normalisation factor
        predictions = self.estimator.predict(x).reshape(-1, 1) * self.prediction_normal_factor
        if y is not None:
            y = y * self.prediction_normal_factor

    # Extract the attacked feature and encode it as the attack model's target
    attacked_column = x[:, self.attack_feature]
    self._values = get_feature_values(attacked_column, single_index)
    encode = float_to_categorical if single_index else floats_to_one_hot
    attack_targets = check_and_transform_label_format(
        encode(attacked_column), len(np.unique(attacked_column)), return_one_hot=True
    )

    # Assemble the attack model's training inputs
    remaining_features = np.delete(x, self.attack_feature, 1)
    x_train = np.concatenate((remaining_features, predictions), axis=1).astype(np.float32)
    if y is not None:
        y = check_and_transform_label_format(y, return_one_hot=True)
        x_train = np.concatenate((x_train, y), axis=1)

    self.attack_model.fit(x_train, attack_targets)
def test_check_and_transform_label_format(self):
    """
    Check `check_and_transform_label_format` for index and one-hot inputs, with and without one-hot output,
    and verify that label arrays with more than two dimensions are rejected.
    """
    one_hot_multi = np.array([[0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1]])
    one_hot_binary = np.array([[0, 1], [1, 0], [0, 1]])
    index_multi = np.expand_dims(np.argmax(one_hot_multi, axis=1), axis=1)
    index_binary = np.expand_dims(np.argmax(one_hot_binary, axis=1), axis=1)

    # (input labels, nb_classes, return_one_hot, expected output)
    cases = [
        # index labels of shape (nb_samples,)
        (np.array([3, 1, 4]), 5, True, one_hot_multi),
        # index labels of shape (nb_samples, 1)
        (np.array([[3], [1], [4]]), 5, True, one_hot_multi),
        # binary index labels of shape (nb_samples, 1)
        (np.array([[1], [0], [1]]), 2, True, one_hot_binary),
        # binary labels that are already one-hot encoded
        (np.array([[0, 1], [1, 0], [0, 1]]), 2, True, one_hot_binary),
        # one-hot labels of shape (nb_samples, nb_classes)
        (np.array([[0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1]]), 5, True, one_hot_multi),
        # one-hot labels of shape (nb_samples, nb_classes), index output requested
        (np.array([[0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1]]), 5, False, index_multi),
        # binary index labels of shape (nb_samples, 1), index output requested
        (np.array([[1], [0], [1]]), 2, False, index_binary),
    ]

    for labels, nb_classes, return_one_hot, expected in cases:
        transformed = check_and_transform_label_format(
            labels, nb_classes=nb_classes, return_one_hot=return_one_hot
        )
        np.testing.assert_array_equal(transformed, expected)

    # ValueError for len(labels.shape) > 2
    too_many_dims = np.array([[[0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1]]])
    with self.assertRaises(ValueError):
        check_and_transform_label_format(too_many_dims)
def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Infer membership in the training set of the target estimator.

    A record counts as a member when the estimator's predicted class equals its true label.

    :param x: Input records to attack.
    :param y: True labels for `x`.
    :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member.
    """
    if y is None:
        raise ValueError("MembershipInferenceBlackBoxRuleBased requires true labels `y`.")
    if self.estimator.input_shape[0] != x.shape[1]:
        raise ValueError("Shape of x does not match input_shape of classifier")

    # Normalise the labels to one-hot first, then reduce them to a column of class indices
    nb_classes = len(np.unique(y))
    one_hot = check_and_transform_label_format(y, nb_classes, return_one_hot=True)
    y = np.array(list(map(np.argmax, one_hot))).reshape(-1, 1)
    if y.shape[0] != x.shape[0]:
        raise ValueError("Number of rows in x and y do not match")

    # Class index predicted by the estimator for every record
    predicted = np.array(list(map(np.argmax, self.estimator.predict(x)))).reshape(-1, 1)

    membership = [int(bool(label == y[row])) for row, label in enumerate(predicted)]
    return np.asarray(membership)
def fit(self, x: np.ndarray) -> None:
    """
    Train the attack model to predict the attacked feature from the remaining features and the class predicted
    by the target estimator.

    :param x: Input to training process. Includes all features used to train the original model.
    """
    nb_features = x.shape[1]
    if self.estimator.input_shape[0] != nb_features:
        raise ValueError("Shape of x does not match input_shape of classifier")
    if self.attack_feature >= nb_features:
        raise ValueError("attack_feature must be a valid index to a feature in x")

    # Class index predicted by the estimator for every record, as a column
    predicted_labels = [np.argmax(row) for row in self.estimator.predict(x)]
    predictions = np.array(predicted_labels).reshape(-1, 1)

    # The attacked feature becomes the attack model's (one-hot) target
    attacked_column = x[:, self.attack_feature]
    attack_targets = check_and_transform_label_format(
        float_to_categorical(attacked_column), len(np.unique(attacked_column)), return_one_hot=True
    )

    # Remaining features plus the predictions form the attack model's input
    remaining_features = np.delete(x, self.attack_feature, 1)
    x_train = np.concatenate((remaining_features, predictions), axis=1).astype(np.float32)

    self.attack_model.fit(x_train, attack_targets)
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Craft adversarial samples batch by batch and return them in a single array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,).
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        # Targeted attacks cannot fall back to model predictions
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")
        # No labels provided, use model prediction as correct class
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    if self.estimator.nb_classes == 2 and y.shape[1] == 1:
        raise ValueError(
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    # Compute adversarial examples with implicit batching
    nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
    adv_batches = []
    for batch_id in trange(nb_batches, desc="ZOO", disable=not self.verbose):
        start = batch_id * self.batch_size
        stop = start + self.batch_size
        adv_batches.append(self._generate_batch(x[start:stop], y[start:stop]))
    x_adv = np.vstack(adv_batches)

    # Clip in place to the estimator's valid input range, if one is defined
    clip_values = self.estimator.clip_values
    if clip_values is not None:
        clip_min, clip_max = clip_values
        np.clip(x_adv, clip_min, clip_max, out=x_adv)

    # Log success rate of the ZOO attack
    success_rate = compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size)
    logger.info("Success rate of ZOO attack: %.2f%%", 100 * success_rate)

    return x_adv
def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: int = 10, **kwargs) -> None:
    """
    Fit the classifier on the training set `(x, y)` by calling the configured training step on every batch.

    :param x: Training data.
    :param y: Labels, one-hot-encoded of shape (nb_samples, nb_classes) or index labels of shape
              (nb_samples,).
    :param batch_size: Size of batches.
    :param nb_epochs: Number of epochs to use for training.
    :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently supported for
           TensorFlow and providing it takes no effect.
    :raises TypeError: If no training step function has been defined.
    """
    import tensorflow as tf  # lgtm [py/repeated-import]

    if self._train_step is None:
        raise TypeError(
            "The training function `train_step` is required for fitting a model but it has not been "
            "defined."
        )

    one_hot_labels = check_and_transform_label_format(y, self.nb_classes)

    # Apply preprocessing
    x_preprocessed, y_preprocessed = self._apply_preprocessing(x, one_hot_labels, fit=True)

    # Reduce one-hot labels to class indices when the model requires it
    if self._reduce_labels:
        y_preprocessed = np.argmax(y_preprocessed, axis=1)

    dataset = tf.data.Dataset.from_tensor_slices((x_preprocessed, y_preprocessed))
    dataset = dataset.shuffle(10000).batch(batch_size)

    for _ in range(nb_epochs):
        for batch_images, batch_labels in dataset:
            self._train_step(self.model, batch_images, batch_labels)
def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: int = 20, **kwargs) -> None:
    """
    Fit the classifier on the training set `(x, y)` through the wrapped model's `fit_generator`.

    :param x: Training data.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or index labels of
              shape (nb_samples,).
    :param batch_size: Size of batches.
    :param nb_epochs: Number of epochs to use for training.
    :param kwargs: Dictionary of framework-specific arguments. These should be parameters supported by the
           `fit_generator` function in Keras and will be passed to this function as such. Including the number of
           epochs or the number of steps per epoch as part of this argument will result in as error.
    """
    one_hot_labels = check_and_transform_label_format(y, self.nb_classes)

    # Apply preprocessing
    x_preprocessed, y_preprocessed = self._apply_preprocessing(x, one_hot_labels, fit=True)

    # Loss functions that do not take one-hot labels get class indices instead
    if self._reduce_labels:
        y_preprocessed = np.argmax(y_preprocessed, axis=1)

    batch_generator = generator_fit(x_preprocessed, y_preprocessed, batch_size)
    # Always run at least one step, even when there are fewer samples than one batch
    full_batches = int(x_preprocessed.shape[0] / batch_size)
    steps_per_epoch = max(full_batches, 1)

    self._model.fit_generator(batch_generator, steps_per_epoch=steps_per_epoch, epochs=nb_epochs, **kwargs)
def _set_targets(self, x: np.ndarray, y: np.ndarray, classifier_mixin: bool = True) -> np.ndarray:
    """
    Validate the provided targets, or derive them from the estimator's predictions when none are given.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of
              shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking"
              effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param classifier_mixin: Whether the estimator is of type `ClassifierMixin`.
    :return: The targets.
    :raises ValueError: If the attack is targeted and no targets are available.
    """
    if classifier_mixin:
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

    # Explicit targets take precedence over model predictions
    if y is not None:
        return y

    # Throw error if attack is targeted, but no targets are provided
    if self.targeted:  # pragma: no cover
        raise ValueError("Target labels `y` need to be provided for a targeted attack.")

    # Use model predictions as correct outputs
    predictions = self.estimator.predict(x, batch_size=self.batch_size)
    return get_labels_np_array(predictions) if classifier_mixin else predictions
def fit(self, x: np.ndarray, y: np.ndarray) -> None:
    """
    Train the attack model to predict the attacked feature from the remaining features and the scaled labels.

    :param x: Input to training process. Includes all features used to train the original model.
    :param y: True labels of the features.
    """
    single_index = self.single_index_feature
    if single_index and self.attack_feature >= x.shape[1]:
        raise ValueError("attack_feature must be a valid index to a feature in x")

    # The attacked feature becomes the attack model's (one-hot) target
    attacked_feature = x[:, self.attack_feature]
    encode = float_to_categorical if single_index else floats_to_one_hot
    attack_targets = check_and_transform_label_format(
        encode(attacked_feature), len(np.unique(attacked_feature)), return_one_hot=True
    )

    # Remaining features plus the scaled labels form the attack model's input
    scaled_labels = y * self.prediction_normal_factor
    remaining_features = np.delete(x, self.attack_feature, 1)
    x_train = np.concatenate((remaining_features, scaled_labels), axis=1).astype(np.float32)

    self.attack_model.fit(x_train, attack_targets)
def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Infer membership in the training set of the target estimator.

    A record is inferred to be a member when the class predicted by the estimator equals its true label.

    :param x: Input records to attack.
    :param y: True labels for `x`.
    :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member.
    :raises ValueError: If `y` is missing or the shapes of `x`, `y` and the estimator input do not agree.
    """
    if y is None:
        raise ValueError("MembershipInferenceBlackBoxRuleBased requires true labels `y`.")

    if self.estimator.input_shape is not None:
        if self.estimator.input_shape[0] != x.shape[1]:
            raise ValueError("Shape of x does not match input_shape of classifier")

    y = check_and_transform_label_format(y, len(np.unique(y)), return_one_hot=True)
    if y.shape[0] != x.shape[0]:
        raise ValueError("Number of rows in x and y do not match")

    # get model's predictions for x
    y_pred = self.estimator.predict(x=x)

    # Use the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24
    return (np.argmax(y, axis=1) == np.argmax(y_pred, axis=1)).astype(int)
def fit(self, x: np.ndarray) -> None:
    """
    Train the attack model to predict the attacked feature from the remaining features only.

    :param x: Input to training process. Includes all features used to train the original model.
    """
    single_index = isinstance(self.attack_feature, int)
    if single_index and self.attack_feature >= x.shape[1]:
        raise ValueError("attack_feature must be a valid index to a feature in x")

    # Extract the attacked feature and remember its values
    attacked_column = x[:, self.attack_feature]
    self._values = get_feature_values(attacked_column, single_index)

    # Encode the attacked feature as the attack model's (one-hot) target
    encode = float_to_categorical if single_index else floats_to_one_hot
    attack_targets = check_and_transform_label_format(
        encode(attacked_column), len(np.unique(attacked_column)), return_one_hot=True
    )
    if attack_targets is None:
        raise ValueError("None value detected.")

    # Everything except the attacked feature is the attack model's input
    x_train = np.delete(x, self.attack_feature, 1).astype(np.float32)

    self.attack_model.fit(x_train, attack_targets)
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Craft adversarial samples batch by batch and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the
              targets are the original class labels.
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)
    x_adv = x.astype(ART_NUMPY_DTYPE)

    if y is None:
        # A targeted attack needs explicit targets
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")
        # No labels provided, use model prediction as correct class
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    if self.estimator.nb_classes == 2 and y.shape[1] == 1:
        raise ValueError(
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    # Compute adversarial examples with implicit batching, overwriting x_adv slice by slice
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in trange(nb_batches, desc="EAD", disable=not self.verbose):
        start = batch_id * self.batch_size
        stop = start + self.batch_size
        x_adv[start:stop] = self._generate_batch(x_adv[start:stop], y[start:stop])

    # Clip to the estimator's valid input range, if one is defined
    if self.estimator.clip_values is not None:
        lower, upper = self.estimator.clip_values[0], self.estimator.clip_values[1]
        x_adv = np.clip(x_adv, lower, upper)

    # Compute success rate of the EAD attack
    success_rate = compute_success(self.estimator, x, y, x_adv, self.targeted, batch_size=self.batch_size)
    logger.info("Success rate of EAD attack: %.2f%%", 100 * success_rate)

    return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate an adversarial patch and return the patch and its mask in arrays.

    :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
    :param y: An array with the original true labels.
    :param shuffle: Keyword argument; whether to shuffle the dataset before batching (default `True`).
    :return: An array with adversarial patch and an array of the patch mask.
    """
    import tensorflow as tf  # lgtm [py/repeated-import]

    y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

    # Build the input pipeline; shuffling is the only optional stage
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    if kwargs.get("shuffle", True):
        dataset = dataset.shuffle(10000)
    dataset = dataset.batch(self.batch_size).repeat(math.ceil(x.shape[0] / self.batch_size))

    for _ in trange(self.max_iter, desc="Adversarial Patch TensorFlow v2"):
        for images, target in dataset:
            _ = self._train_step(images=images, target=target)

    patch = self._patch.numpy()
    mask = self._get_circular_patch_mask(nb_samples=1).numpy()[0]
    return (patch, mask)
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Craft adversarial samples batch by batch and return them in an array.

    :param x: An array with the original inputs.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of
              shape (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as labels to avoid the "label leaking"
              effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
    :param cost_matrix: A non-negative cost matrix.
    :type cost_matrix: `np.ndarray`
    :return: An array holding the adversarial examples.
    """
    if y is not None:
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

    x_adv = x.copy().astype(ART_NUMPY_DTYPE)

    if y is not None:
        targets = y
    else:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")
        # Use model predictions as correct outputs
        targets = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    if self.estimator.nb_classes == 2 and targets.shape[1] == 1:
        raise ValueError(
            "This attack has not yet been tested for binary classification with a single output classifier."
        )

    # Compute the cost matrix if the caller did not supply one
    cost_matrix = kwargs.get("cost_matrix")
    if cost_matrix is None:
        cost_matrix = self._compute_cost_matrix(self.p, self.kernel_size)

    # Compute perturbation with implicit batching, overwriting x_adv slice by slice
    nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
    for batch_id in trange(nb_batches, desc="Wasserstein", disable=not self.verbose):
        logger.debug("Processing batch %i out of %i", batch_id, nb_batches)
        start = batch_id * self.batch_size
        stop = start + self.batch_size
        x_adv[start:stop] = self._generate_batch(x_adv[start:stop], targets[start:stop], cost_matrix)

    return x_adv
def generate(self, x, y=None, **kwargs):
    """
    Craft adversarial samples one input at a time and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of
              shape (nb_samples,).
    :type y: `np.ndarray`
    :param x_adv_init: Initial array to act as initial adversarial examples. Same shape as `x`.
    :type x_adv_init: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    y = check_and_transform_label_format(y, self.classifier.nb_classes())

    # Take the clipping range from the classifier when it defines one, otherwise infer it from the data
    if getattr(self.classifier, 'clip_values', None) is not None:
        clip_min, clip_max = self.classifier.clip_values
    else:
        clip_min, clip_max = np.min(x), np.max(x)

    # Prediction from the original images
    preds = np.argmax(self.classifier.predict(x, batch_size=self.batch_size), axis=1)

    # Prediction from the initial adversarial examples if not None
    x_adv_init = kwargs.get('x_adv_init')
    if x_adv_init is None:
        init_preds = [None] * len(x)
        x_adv_init = [None] * len(x)
    else:
        init_preds = np.argmax(self.classifier.predict(x_adv_init, batch_size=self.batch_size), axis=1)

    # Assert that, if attack is targeted, y is provided
    if self.targeted and y is None:
        raise ValueError('Target labels `y` need to be provided for a targeted attack.')

    # Some initial setups
    x_adv = x.astype(NUMPY_DTYPE)
    if y is not None:
        y = np.argmax(y, axis=1)

    # Generate the adversarial samples; untargeted attacks pass -1 as the label
    for ind, val in enumerate(x_adv):
        label = y[ind] if self.targeted else -1
        x_adv[ind] = self._perturb(
            x=val,
            y=label,
            y_p=preds[ind],
            init_pred=init_preds[ind],
            adv_init=x_adv_init[ind],
            clip_min=clip_min,
            clip_max=clip_max,
        )

    # Restore one-hot labels for the success-rate computation
    if y is not None:
        y = to_categorical(y, self.classifier.nb_classes())

    success_rate = compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size)
    logger.info('Success rate of HopSkipJump attack: %.2f%%', 100 * success_rate)

    return x_adv
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial examples and return them as an array.

    For every sample the decision path is walked upwards from the leaf, the sibling subtree at each step is
    searched via `_df_subtree`, and the features on the resulting path are moved across their thresholds by
    `self.offset`.

    :param x: An array with the original inputs to be attacked.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,).
    :return: An array holding the adversarial examples.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes, return_one_hot=False)
    x_adv = x.copy()

    for index in trange(x_adv.shape[0], desc="Decision tree attack", disable=not self.verbose):
        path = self.estimator.get_decision_path(x_adv[index])
        legitimate_class = np.argmax(self.estimator.predict(x_adv[index].reshape(1, -1)))
        position = -2
        adv_path = [-1]
        ancestor = path[position]

        while np.abs(position) < (len(path) - 1) or adv_path[0] == -1:
            ancestor = path[position]
            current_child = path[position + 1]

            # Search the subtree on the opposite side of the branch the sample currently takes
            if current_child == self.estimator.get_left_child(ancestor):
                sibling = self.estimator.get_right_child(ancestor)
            else:
                sibling = self.estimator.get_left_child(ancestor)

            # The target class is only passed on when labels were provided
            extra_args = () if y is None else (y[index],)
            adv_path = self._df_subtree(sibling, legitimate_class, *extra_args)

            position = position - 1  # we are going the decision path upwards

        adv_path.append(ancestor)

        # we figured out which is the way to the target, now perturb
        # first one is leaf-> no threshold, cannot be perturbed
        for go_for, node in zip(adv_path, adv_path[1:]):
            threshold = self.estimator.get_threshold_at_node(node)
            feature = self.estimator.get_feature_at_node(node)

            # only perturb if the feature is actually wrong
            if x_adv[index][feature] > threshold and go_for == self.estimator.get_left_child(node):
                x_adv[index][feature] = threshold - self.offset
            elif x_adv[index][feature] <= threshold and go_for == self.estimator.get_right_child(node):
                x_adv[index][feature] = threshold + self.offset

    success_rate = compute_success(self.estimator, x, y, x_adv)
    logger.info("Success rate of decision tree attack: %.2f%%", 100 * success_rate)

    return x_adv
def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Infer membership in the training set of the target estimator.

    Attack features are either the estimator's predictions or its loss values (selected by `self.input_type`).
    Together with the one-hot labels they are passed to the attack model.

    :param x: Input records to attack.
    :param y: True labels for `x`.
    :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member.
    :raises ValueError: If `y` is missing, if the shapes of `x`, `y` and the estimator input do not agree, or if
                        `self.input_type` is neither "prediction" nor "loss".
    """
    if y is None:
        raise ValueError("MembershipInferenceBlackBox requires true labels `y`.")
    if self.estimator.input_shape[0] != x.shape[1]:
        raise ValueError("Shape of x does not match input_shape of classifier")

    y = check_and_transform_label_format(y, len(np.unique(y)), return_one_hot=True)
    if y.shape[0] != x.shape[0]:
        raise ValueError("Number of rows in x and y do not match")

    if self.input_type == "prediction":
        features = self.estimator.predict(x).astype(np.float32)
    elif self.input_type == "loss":
        features = self.estimator.loss(x, y).astype(np.float32).reshape(-1, 1)
    else:
        # Previously this fell through and failed later on an unbound `features`
        raise ValueError("Illegal value for parameter `input_type`.")

    if self.default_model and self.attack_model_type == "nn":
        import torch  # lgtm [py/repeated-import]
        from torch.utils.data import DataLoader  # lgtm [py/repeated-import]

        self.attack_model.eval()
        test_set = self._get_attack_dataset(f_1=features, f_2=y)
        # Do not shuffle at inference time: the i-th result must correspond to the i-th input record
        test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False, num_workers=0)

        # Collect per-batch results and stack once instead of re-stacking after every batch
        batch_results = []
        for input1, input2, _ in test_loader:
            outputs = self.attack_model(input1, input2)
            batch_results.append(torch.round(outputs).detach().cpu().numpy())

        # Use the builtin `int`: the `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24
        inferred = np.vstack(batch_results).reshape(-1).astype(int)
    else:
        inferred = np.array([np.argmax(arr) for arr in self.attack_model.predict(np.c_[features, y])])

    return inferred
def generate(self, x, y=None, **kwargs):
    """
    Craft adversarial samples batch by batch and return them in a single array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,).
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    y = check_and_transform_label_format(y, self.classifier.nb_classes())

    if y is None:
        # Targeted attacks cannot fall back to model predictions
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")
        # No labels provided, use model prediction as correct class
        y = get_labels_np_array(self.classifier.predict(x, batch_size=self.batch_size))

    # Compute adversarial examples with implicit batching
    nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
    adv_batches = []
    for batch_id in range(nb_batches):
        logger.debug("Processing batch %i out of %i", batch_id, nb_batches)
        start = batch_id * self.batch_size
        stop = start + self.batch_size
        adv_batches.append(self._generate_batch(x[start:stop], y[start:stop]))
    x_adv = np.vstack(adv_batches)

    # Clip in place to the classifier's valid input range, if one is defined
    if getattr(self.classifier, "clip_values", None) is not None:
        clip_min, clip_max = self.classifier.clip_values
        np.clip(x_adv, clip_min, clip_max, out=x_adv)

    # Log success rate of the ZOO attack
    success_rate = compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size)
    logger.info("Success rate of ZOO attack: %.2f%%", 100 * success_rate)

    return x_adv
def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Infer membership of input `x` in estimator's training data.

    An untargeted HopSkipJump attack is run on `x`; the L2 distance between each input and its adversarial
    example is compared against `self.distance_threshold_tau`. Inputs that the estimator misclassifies get
    distance 0.

    :param x: Input data.
    :param y: True labels for `x`.

    :Keyword Arguments for HopSkipJump:
        * *norm*: Order of the norm. Possible values: "inf", np.inf or 2.
        * *max_iter*: Maximum number of iterations.
        * *max_eval*: Maximum number of evaluations for estimating gradient.
        * *init_eval*: Initial number of evaluations for estimating gradient.
        * *init_size*: Maximum number of trials for initial generation of adversarial examples.
        * *verbose*: Show progress bars.

    :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member.
    :raises ValueError: If `y` or the distance threshold is missing, or if `classifier` / `targeted` are passed
                        as keyword arguments.
    """
    # Validate the arguments first so that invalid calls fail before the attack module is imported
    if y is None:
        raise ValueError("Argument `y` is None, but this attack requires true labels `y` to be provided.")

    if self.distance_threshold_tau is None:
        # Each fragment ends with a space so the implicitly concatenated message reads correctly
        raise ValueError(
            "No value for distance threshold `distance_threshold_tau` provided. Please set "
            "`distance_threshold_tau` or run method `calibrate_distance_threshold` on known training and test "
            "dataset."
        )

    if "classifier" in kwargs:
        raise ValueError("Keyword `classifier` in kwargs is not supported.")

    if "targeted" in kwargs:
        raise ValueError("Keyword `targeted` in kwargs is not supported.")

    from art.attacks.evasion.hop_skip_jump import HopSkipJump

    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    hsj = HopSkipJump(classifier=self.estimator, targeted=False, **kwargs)
    x_adv = hsj.generate(x=x, y=y)

    # Per-sample L2 distance between the input and its adversarial example
    distance = np.linalg.norm((x_adv - x).reshape((x.shape[0], -1)), ord=2, axis=1)

    # Misclassified inputs are assigned distance 0
    y_pred = self.estimator.predict(x=x)
    distance[np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)] = 0

    is_member = np.where(distance > self.distance_threshold_tau, 1, 0)

    return is_member
def generate(self, x, y=None, **kwargs):
    """
    Craft adversarial samples batch by batch and return them in an array.

    :param x: An array with the original inputs to be attacked.
    :type x: `np.ndarray`
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the
              targets are the original class labels.
    :type y: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    y = check_and_transform_label_format(y, self.classifier.nb_classes())
    x_adv = x.astype(ART_NUMPY_DTYPE)

    if y is None:
        # A targeted attack needs explicit targets
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")
        # No labels provided, use model prediction as correct class
        y = get_labels_np_array(self.classifier.predict(x, batch_size=self.batch_size))

    # Compute adversarial examples with implicit batching, overwriting x_adv slice by slice
    nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
    for batch_id in range(nb_batches):
        logger.debug("Processing batch %i out of %i", batch_id, nb_batches)
        start = batch_id * self.batch_size
        stop = start + self.batch_size
        x_adv[start:stop] = self._generate_batch(x_adv[start:stop], y[start:stop])

    # Clip to the classifier's valid input range, if one is defined
    if getattr(self.classifier, "clip_values", None) is not None:
        lower, upper = self.classifier.clip_values[0], self.classifier.clip_values[1]
        x_adv = np.clip(x_adv, lower, upper)

    # Compute success rate of the EAD attack
    success_rate = compute_success(self.classifier, x, y, x_adv, self.targeted, batch_size=self.batch_size)
    logger.info("Success rate of EAD attack: %.2f%%", 100 * success_rate)

    return x_adv
def infer(self, x: Optional[np.ndarray], y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Reconstruct inputs for the given labels by repeatedly stepping along the estimator's class gradients.

    :param x: An array with the initial input to the victim classifier. If `None`, then initial input will be
              initialized as zero array.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
              (nb_samples,). If `None`, the estimator's predictions for `x` are used.
    :return: The inferred training samples.
    :raises ValueError: If both `x` and `y` are `None`.
    """
    if x is None and y is None:
        raise ValueError("Either `x` or `y` should be provided.")

    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if x is None:
        # One all-zero starting point per requested label.
        x = np.zeros((len(y),) + self.estimator.input_shape)

    if y is None:
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    x_infer = x.astype(ART_NUMPY_DTYPE)
    n_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))

    # Compute inversions with implicit batching
    for batch_id in trange(n_batches, desc="Model inversion", disable=not self.verbose):
        start = batch_id * self.batch_size
        stop = (batch_id + 1) * self.batch_size
        batch = x_infer[start:stop]
        batch_labels = y[start:stop]
        n_rows = len(batch)

        active = np.array([True] * n_rows)
        # Rolling record of the most recent `window_length` costs per sample, initialised to +inf.
        window = np.full((n_rows, self.window_length), np.inf)

        step = 0
        while step < self.max_iter and active.any():
            target_classes = np.argmax(batch_labels, axis=1)

            grads = self.estimator.class_gradient(batch[active], target_classes[active])
            # Drop the class axis (axis 1) of the returned gradients.
            grads = np.reshape(grads, (grads.shape[0],) + grads.shape[2:])
            batch[active] = batch[active] + self.learning_rate * grads

            if self.estimator.clip_values is not None:
                clip_min, clip_max = self.estimator.clip_values
                batch[active] = np.clip(batch[active], clip_min, clip_max)

            # Cost is one minus the predicted score of the target class.
            scores = self.estimator.predict(batch)
            cost = 1 - scores[np.arange(n_rows), target_classes]
            active = np.logical_or(cost <= self.threshold, cost >= window.max(axis=1))

            window[:, step % self.window_length] = cost
            step += 1

        x_infer[start:stop] = batch

    return x_infer
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate adversarial samples and return them in an array. This requires a lot of memory, therefore it accepts
    only a single samples as input, e.g. a batch of size 1.

    Passing `y` has the side effect of setting `self.targeted` to `True`.

    :param x: An array of a single original input sample.
    :param y: An array of a single target label.
    :return: An array with the adversarial examples.
    :raises ValueError: If the attack is targeted and no labels are given, if more than one sample or label is
                        passed, or if `x` does not have exactly four dimensions.
    """
    y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        # Throw error if attack is targeted, but no targets are provided
        if self.targeted:
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        logger.info("Using model predictions as correct labels for FGM.")
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))
    else:
        # Explicit labels are interpreted as targets.
        self.targeted = True

    if x.shape[0] > 1 or y.shape[0] > 1:
        raise ValueError("This attack only accepts a single sample as input.")

    if x.ndim != 4:
        raise ValueError("Unrecognized input dimension. Shadow Attack can only be applied to image data.")

    x = x.astype(ART_NUMPY_DTYPE)

    # Replicate the single sample into a batch of Gaussian-noised copies.
    x_batch = np.repeat(x, repeats=self.batch_size, axis=0).astype(ART_NUMPY_DTYPE)
    x_batch = x_batch + np.random.normal(scale=self.sigma, size=x_batch.shape).astype(ART_NUMPY_DTYPE)
    y_batch = np.repeat(y, repeats=self.batch_size, axis=0)

    # NOTE(review): the subtracted shift is half the clip range, which centres the initial perturbation on zero
    # only when clip_values[0] == 0 - confirm this is intended for other clip ranges.
    perturbation = (
        np.random.uniform(
            low=self.estimator.clip_values[0], high=self.estimator.clip_values[1], size=x.shape
        ).astype(ART_NUMPY_DTYPE)
        - (self.estimator.clip_values[1] - self.estimator.clip_values[0]) / 2
    )

    # The loss gradient's sign is flipped for targeted attacks: (1 - 2 * 1) == -1.
    direction = 1 - 2 * int(self.targeted)

    for _ in trange(self.nb_steps, desc="Shadow attack", disable=not self.verbose):
        # Average the loss gradient over the noisy copies, keeping the leading axis.
        gradients_ce = np.mean(
            self.estimator.loss_gradient(x=x_batch + perturbation, y=y_batch, sampling=False) * direction,
            axis=0,
            keepdims=True,
        )
        gradients = gradients_ce - self._get_regularisation_loss_gradients(perturbation)
        perturbation += self.learning_rate * gradients

    x_p = x + perturbation
    x_adv = np.clip(x_p, a_min=self.estimator.clip_values[0], a_max=self.estimator.clip_values[1]).astype(
        ART_NUMPY_DTYPE
    )

    return x_adv
def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: int = 10, **kwargs) -> None:
    """
    Train the classifier on `(x, y)` by running the stored training op once per mini-batch.

    :param x: Training data.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or index labels of
              shape (nb_samples,).
    :param batch_size: Size of batches.
    :param nb_epochs: Number of epochs to use for training.
    :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently supported for
                   TensorFlow and providing it takes no effect.
    :raises ValueError: If the training op or the labels placeholder is missing.
    """
    # Both the training op and the labels placeholder are required.
    if self._train is None or self._labels_ph is None:
        raise ValueError("Need the training objective and the output placeholder to train the model.")

    y = check_and_transform_label_format(y, self.nb_classes)

    x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y, fit=True)

    # Collapse the labels to class indices when the model expects them in that form.
    if self._reduce_labels:
        y_preprocessed = np.argmax(y_preprocessed, axis=1)

    n_samples = len(x_preprocessed)
    num_batch = int(np.ceil(n_samples / float(batch_size)))
    ind = np.arange(n_samples)

    for _ in range(nb_epochs):
        # New sample order every epoch.
        random.shuffle(ind)

        for m in range(num_batch):
            selected = ind[m * batch_size:(m + 1) * batch_size]

            feed_dict = {
                self._input_ph: x_preprocessed[selected],
                self._labels_ph: y_preprocessed[selected],
            }
            feed_dict.update(self._feed_dict)

            # One optimisation step.
            self._sess.run(self._train, feed_dict=feed_dict)
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    Generate an adversarial patch and return the patch and its mask in arrays.

    `self.patch` is updated on every iteration and clipped to the estimator's `clip_values`.

    :param x: An array with the original input images of shape NHWC or NCHW or input videos of shape NFHWC or
              NFCHW.
    :param y: An array with the original true labels.
    :return: An array with adversarial patch and an array of the patch mask.
    :raises ValueError: If `x` is two-dimensional, i.e. has no spatial dimensions.
    """
    logger.info("Creating adversarial patch.")

    if len(x.shape) == 2:
        raise ValueError(
            "Feature vectors detected. The adversarial patch can only be applied to data with spatial "
            "dimensions."
        )

    y_target = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

    for _ in trange(self.max_iter, desc="Adversarial Patch Numpy"):
        # The patch is placed on every image of `x`; masks and transforms are returned per image.
        patched_images, patch_mask_transformed, transforms = self._augment_images_with_random_patch(x, self.patch)

        num_batches = int(math.ceil(x.shape[0] / self.batch_size))
        patch_gradients = np.zeros_like(self.patch)

        for i_batch in range(num_batches):
            i_batch_start = i_batch * self.batch_size
            i_batch_end = (i_batch + 1) * self.batch_size

            gradients = self.estimator.loss_gradient(
                patched_images[i_batch_start:i_batch_end],
                y_target[i_batch_start:i_batch_end],
            )

            for i_image in range(gradients.shape[0]):
                # `gradients` is batch-local, but the masks and transforms cover all of `x`, so they are
                # addressed with the image's global position.
                i_global = i_batch_start + i_image
                patch_gradients_i = self._reverse_transformation(
                    gradients[i_image, :, :, :],
                    patch_mask_transformed[i_global, :, :, :],
                    transforms[i_global],
                )
                patch_gradients += patch_gradients_i

        # Gradients are summed, not averaged, over all images before the update.
        self.patch -= patch_gradients * self.learning_rate
        self.patch = np.clip(
            self.patch,
            a_min=self.estimator.clip_values[0],
            a_max=self.estimator.clip_values[1],
        )

    return self.patch, self._get_circular_patch_mask()
def compute_loss(  # pylint: disable=W0221
    self,
    x: Union[np.ndarray, "torch.Tensor"],
    y: Union[np.ndarray, "torch.Tensor"],
    reduction: str = "none",
    **kwargs
) -> Union[np.ndarray, "torch.Tensor"]:
    """
    Compute the loss.

    The model is switched to evaluation mode. The loss object's `reduction` attribute is overridden only for
    the duration of the loss call and restored afterwards, even if the call raises.

    :param x: Sample input with shape as expected by the model.
    :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices
              of shape `(nb_samples,)`.
    :param reduction: Specifies the reduction to apply to the output: 'none' | 'mean' | 'sum'.
               'none': no reduction will be applied
               'mean': the sum of the output will be divided by the number of elements in the output,
               'sum': the output will be summed.
    :return: The loss values; a `torch.Tensor` if `x` is a tensor, otherwise a NumPy array.
    """
    import torch  # lgtm [py/repeated-import]

    self._model.eval()

    y = check_and_transform_label_format(y, self.nb_classes)

    # Apply preprocessing
    x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y, fit=False)

    # Check label shape
    y_preprocessed = self.reduce_labels(y_preprocessed)

    if isinstance(x, torch.Tensor):
        inputs_t = x_preprocessed
        labels_t = y_preprocessed
    else:
        # Convert the inputs and labels to tensors on the model's device
        inputs_t = torch.from_numpy(x_preprocessed).to(self._device)
        labels_t = torch.from_numpy(y_preprocessed).to(self._device)

    # The last entry of the model outputs is what gets passed to the loss.
    model_outputs = self._model(inputs_t)

    prev_reduction = self._loss.reduction
    self._loss.reduction = reduction
    try:
        loss = self._loss(model_outputs[-1], labels_t)
    finally:
        # Never leave the shared loss object with the temporary reduction mode.
        self._loss.reduction = prev_reduction

    if isinstance(x, torch.Tensor):
        return loss

    return loss.detach().cpu().numpy()
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Generate an adversarial patch from the inputs `x`.

    `self.patch` is updated on every iteration and clipped to the estimator's `clip_values`. All samples are
    processed, including a trailing batch smaller than `self.batch_size`.

    :param x: An array with the original inputs. `x` is expected to have spatial dimensions.
    :param y: An array with the original labels to be predicted.
    :return: The adversarial patch together with its circular mask, as a 2-tuple. NOTE(review): the signature
             annotation says `np.ndarray`; it is left unchanged here for compatibility.
    :raises ValueError: If `x` is two-dimensional, i.e. has no spatial dimensions.
    """
    logger.info("Creating adversarial patch.")

    if len(x.shape) == 2:
        raise ValueError(
            "Feature vectors detected. The adversarial patch can only be applied to data with spatial "
            "dimensions."
        )

    y_target = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

    for _ in trange(self.max_iter, desc="Adversarial patch"):
        # The patch is placed on every image of `x`; masks and transforms are returned per image.
        patched_images, patch_mask_transformed, transforms = self._augment_images_with_random_patch(x, self.patch)

        # Round up so that the final, possibly smaller, batch is not dropped.
        num_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
        patch_gradients = np.zeros_like(self.patch)

        for i_batch in range(num_batches):
            i_batch_start = i_batch * self.batch_size
            i_batch_end = (i_batch + 1) * self.batch_size

            gradients = self.estimator.loss_gradient(
                patched_images[i_batch_start:i_batch_end],
                y_target[i_batch_start:i_batch_end],
            )

            # Iterate over the rows actually returned; the last batch may be short.
            for i_image in range(gradients.shape[0]):
                # `gradients` is batch-local, but the masks and transforms cover all of `x`, so they are
                # addressed with the image's global position.
                i_global = i_batch_start + i_image
                patch_gradients_i = self._reverse_transformation(
                    gradients[i_image, :, :, :],
                    patch_mask_transformed[i_global, :, :, :],
                    transforms[i_global],
                )
                patch_gradients += patch_gradients_i

        # Gradients are summed, not averaged, over all images before the update.
        self.patch -= patch_gradients * self.learning_rate
        self.patch = np.clip(
            self.patch,
            a_min=self.estimator.clip_values[0],
            a_max=self.estimator.clip_values[1],
        )

    return self.patch, self._get_circular_patch_mask()
def fit(self, x: np.ndarray, y: np.ndarray, batch_size: int = 128, nb_epochs: int = 10, **kwargs) -> None:
    """
    Fit the classifier on the training set `(x, y)`.

    :param x: Training data.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or index labels of
              shape (nb_samples,).
    :param batch_size: Size of batches.
    :param nb_epochs: Number of epochs to use for training.
    :param kwargs: Dictionary of framework-specific arguments. This parameter is not currently supported for
                   PyTorch and providing it takes no effect.
    :return: The labels and the argmax predictions of the last processed mini-batch as a 2-tuple, or `None`
             when no training step was run. NOTE(review): the signature annotation says `None`; it is left
             unchanged here for compatibility.
    :raises ValueError: If no optimizer is available.
    """
    if self._optimizer is None:
        raise ValueError("An optimizer is needed to train the model, but none was provided.")

    # Imported only after validation so that a missing optimizer is reported first.
    import torch  # lgtm [py/repeated-import]

    y = check_and_transform_label_format(y, self.nb_classes)

    # Apply preprocessing
    x_preprocessed, y_preprocessed = self._apply_preprocessing(x, y, fit=True)

    # Check label shape
    y_preprocessed = self.reduce_labels(y_preprocessed)

    num_batch = int(np.ceil(len(x_preprocessed) / float(batch_size)))
    ind = np.arange(len(x_preprocessed))

    # Remain `None` when the loops below never execute (nb_epochs == 0 or no samples).
    o_batch = None
    model_outputs = None

    # Start training
    for _ in range(nb_epochs):
        # Shuffle the examples
        random.shuffle(ind)

        # Train for one epoch
        for m in range(num_batch):
            selected = ind[m * batch_size : (m + 1) * batch_size]
            i_batch = torch.from_numpy(x_preprocessed[selected]).to(self._device)
            o_batch = torch.from_numpy(y_preprocessed[selected]).to(self._device)

            # Zero the parameter gradients
            self._optimizer.zero_grad()

            # Perform prediction
            model_outputs = self._model(i_batch)

            # Form the loss function
            loss = self._loss(model_outputs[-1], o_batch)

            # Actual training
            loss.backward()
            self._optimizer.step()

    if model_outputs is None:
        # Nothing was trained, so there is no last batch to report.
        return None

    targets = o_batch.detach().cpu().numpy()
    # NOTE(review): predictions are read from the first model output while the loss uses the last - confirm.
    predictions = torch.argmax(model_outputs[0].detach(), axis=1).cpu().numpy()
    return targets, predictions
def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
    """
    Produce adversarial versions of the input videos, batch by batch.

    :param x: Original input samples representing videos of format NFHWC.
    :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of
              shape (nb_samples,). If omitted for an untargeted attack, the estimator's predictions are used.
    :return: Adversarial examples.
    :raises ValueError: If the attack is targeted and no labels are available.
    """
    import torch  # lgtm [py/repeated-import]

    if y is not None:
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

    if y is None:
        if self.targeted:  # pragma: no cover
            raise ValueError("Target labels `y` need to be provided for a targeted attack.")

        # Fall back to the estimator's own predictions as the true labels.
        logger.info("Using model predictions as true labels.")
        y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

    samples = torch.from_numpy(x.astype(ART_NUMPY_DTYPE))
    labels = torch.from_numpy(y.astype(ART_NUMPY_DTYPE))
    data_loader = torch.utils.data.DataLoader(
        dataset=torch.utils.data.TensorDataset(samples, labels),
        batch_size=self.batch_size,
        shuffle=False,
        drop_last=False,
    )

    x_adv = x.copy().astype(ART_NUMPY_DTYPE)

    progress = tqdm(
        data_loader,
        desc="OverTheAirFlickeringPyTorch - Batches",
        leave=False,
        disable=not self.verbose,
    )

    # The loader is unshuffled, so batch `k` maps onto rows [k * batch_size, (k + 1) * batch_size).
    for batch_id, batch_all in enumerate(progress):
        batch, batch_labels = batch_all[0], batch_all[1]
        start = batch_id * self.batch_size
        stop = (batch_id + 1) * self.batch_size
        x_adv[start:stop] = self._generate_batch(batch, batch_labels)

    return x_adv
def fit(self, x: np.ndarray, y: np.ndarray) -> None:
    """
    Train the attack model to predict the attacked feature from the remaining features and the labels.

    :param x: Input to training process. Includes all features used to train the original model.
    :param y: True labels of the features.
    :raises ValueError: If an integer `attack_feature` is not a valid column index of `x`, or if the encoded
                        attack targets come back as `None`.
    """
    feature_is_index = isinstance(self.attack_feature, int)

    # An integer attack feature has to address an existing column.
    if feature_is_index and self.attack_feature >= x.shape[1]:
        raise ValueError("attack_feature must be a valid index to a feature in x")

    # The attacked column(s) become the attack model's targets.
    attacked_feature = x[:, self.attack_feature]
    self._values = get_feature_values(attacked_feature, feature_is_index)

    encode = float_to_categorical if feature_is_index else floats_to_one_hot
    y_ready = check_and_transform_label_format(
        encode(attacked_feature), len(np.unique(attacked_feature)), return_one_hot=True
    )
    if y_ready is None:
        raise ValueError("None value detected.")

    # Rescale the labels, either to a fixed range or by a constant factor, then one-hot encode them.
    if self.scale_range is None:
        normalized_labels = y * self.prediction_normal_factor
    else:
        normalized_labels = minmax_scale(y, feature_range=self.scale_range)
    normalized_labels = check_and_transform_label_format(normalized_labels, return_one_hot=True)

    # Attack inputs: every feature except the attacked one, followed by the encoded labels.
    remaining_features = np.delete(x, self.attack_feature, 1)
    x_train = np.concatenate((remaining_features, normalized_labels), axis=1).astype(np.float32)

    self.attack_model.fit(x_train, y_ready)