def _test_mnist_targeted(self, classifier):
        # Get MNIST
        (_, _), (x_test, _) = self.mnist

        # Test targeted FGSM with np.inf norm and minimal perturbation
        attack = FastGradientMethod(classifier, eps=1.0, targeted=True)

        # Craft targets as the second most likely class predicted for each sample
        pred_sort = classifier.predict(x_test).argsort(axis=1)
        y_test_adv = np.zeros((x_test.shape[0], 10))
        for i in range(x_test.shape[0]):
            y_test_adv[i, pred_sort[i, -2]] = 1.0

        attack_params = {"minimal": True, "eps_step": 0.01, "eps": 1.0}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test, y=y_test_adv)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertEqual(y_test_adv.shape, test_y_pred.shape)
        # Check that the attack reaches the target class for at least half of the samples
        self.assertGreaterEqual(
            (y_test_adv.argmax(axis=1) == test_y_pred.argmax(axis=1)).sum(),
            x_test.shape[0] // 2)
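The `minimal=True` mode set above makes FGSM search for the smallest perturbation rather than applying the full `eps` at once: the perturbation grows in `eps_step` increments until each sample reaches its target class or `eps` is exhausted. The sketch below illustrates that search with plain NumPy; `minimal_fgsm_sketch` and its `predict` argument are illustrative stand-ins, not ART API.

import numpy as np

def minimal_fgsm_sketch(x, grad_sign, y_target, predict, eps=1.0, eps_step=0.01):
    """Perturb x with the smallest multiple of eps_step that reaches the target class."""
    x_adv = x.copy()
    active = np.ones(len(x), dtype=bool)    # samples whose perturbation is still growing
    current_eps = eps_step
    while active.any() and current_eps <= eps:
        x_adv[active] = x[active] + current_eps * grad_sign[active]
        reached = predict(x_adv).argmax(axis=1) == y_target.argmax(axis=1)
        active &= ~reached                  # freeze samples that already hit the target
        current_eps += eps_step
    return x_adv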
    def _test_mnist_targeted(self, classifier):
        # Get MNIST
        (_, _), (x_test, y_test) = self.mnist
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

        # Generate random target classes
        nb_classes = np.unique(np.argmax(y_test, axis=1)).shape[0]
        targets = np.random.randint(nb_classes, size=NB_TEST)
        while (targets == np.argmax(y_test, axis=1)).any():
            targets = np.random.randint(nb_classes, size=NB_TEST)

        # Perform attack
        df = SaliencyMapMethod(classifier, theta=1, batch_size=100)
        x_test_adv = df.generate(x_test, y=to_categorical(targets, nb_classes))

        self.assertFalse((x_test == x_test_adv).all())
        self.assertFalse((0. == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == y_pred).all())

        acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
        logger.info('Accuracy on adversarial examples: %.2f%%', (acc * 100))
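The while-loop above redraws the entire target vector whenever any entry collides with the true class. An equivalent variant, sketched below in plain NumPy purely for illustration, resamples only the colliding entries:

import numpy as np

def random_targets_sketch(y_true_idx, nb_classes, rng=np.random):
    """Draw one random target class per sample, each different from the true class."""
    targets = rng.randint(nb_classes, size=y_true_idx.shape[0])
    collisions = targets == y_true_idx
    while collisions.any():
        targets[collisions] = rng.randint(nb_classes, size=int(collisions.sum()))
        collisions = targets == y_true_idx
    return targets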
Example No. 3
    def fit(self, x, y, **kwargs):
        """
        Train a model adversarially. Each attack specified when creating the AdversarialTrainer is applied to all
        samples in the dataset, and only the successful ones (on the source model) are kept for data augmentation.

        :param x: Training set
        :type x: `np.ndarray`
        :param y: Labels
        :type y: `np.ndarray`
        :param kwargs: Dictionary of parameters to be passed on to the `fit` method of the classifier
        :type kwargs: `dict`
        :return: `None`
        """
        x_augmented = list(x.copy())
        y_augmented = list(y.copy())

        # Generate adversarial samples for each attack
        for i, attack in enumerate(self.attacks):
            # Fit the classifier to be used for the attack
            # TODO Do not refit classifier if already fitted
            attack.classifier.fit(x, y, **kwargs)

            # Predict new labels for the adversarial samples generated
            x_adv = attack.generate(x, **self.attacks[attack])
            y_pred = get_labels_np_array(attack.classifier.predict(x_adv))
            x_adv = x_adv[np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)]
            y_adv = y_pred[np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)]

            # Only add successful attacks to augmented dataset
            x_augmented.extend(list(x_adv))
            y_augmented.extend(list(y_adv))

        # Fit the model with the extended dataset
        self.classifier.fit(np.array(x_augmented), np.array(y_augmented),
                            **kwargs)
        self.x = x_augmented
        self.y = y_augmented
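The fit() method above iterates over self.attacks as a mapping from each attack instance to the keyword arguments forwarded to that attack's generate() call. A minimal usage sketch under that assumption follows; the AdversarialTrainer constructor signature is assumed (it is not shown in this snippet), and FastGradientMethod is the same class used in the tests above.

def adversarial_training_sketch(classifier, x_train, y_train):
    # Each attack maps to the kwargs passed to its generate(), mirroring `self.attacks[attack]`
    attacks = {FastGradientMethod(classifier, eps=0.1): {}}
    trainer = AdversarialTrainer(classifier, attacks)  # assumed constructor signature
    trainer.fit(x_train, y_train, nb_epochs=10, batch_size=128)
    return trainer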
Example No. 4
    def _test_mnist_untargeted(self, classifier):
        # Get MNIST
        (_, _), (x_test, y_test) = self.mnist
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

        # Craft adversarial samples with JSMA (batch sizes of 1, 10, 100 and 1000 were
        # benchmarked here previously; 100 is kept as a reasonable default)
        df = SaliencyMapMethod(classifier, theta=1)
        x_test_adv = df.generate(x_test, batch_size=100)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertFalse((0. == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == y_pred).all())

        acc = np.sum(np.argmax(y_pred, axis=1) == np.argmax(
            y_test, axis=1)) / y_test.shape[0]
        logger.info('Accuracy on adversarial examples: %.2f%%', (acc * 100))
    def _set_targets(self, x, y, classifier_mixin=True):
        """
        Check and set up targets.

        :param x: An array with the original inputs.
        :type x: `np.ndarray`
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :type y: `np.ndarray`
        :param classifier_mixin: Whether the estimator is of type `ClassifierMixin`.
        :type classifier_mixin: `bool`
        :return: The targets.
        :rtype: `np.ndarray`
        """
        if classifier_mixin:
            y = check_and_transform_label_format(y, self.estimator.nb_classes)

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:
                raise ValueError(
                    "Target labels `y` need to be provided for a targeted attack."
                )

            # Use model predictions as correct outputs
            if classifier_mixin:
                targets = get_labels_np_array(
                    self.estimator.predict(x, batch_size=self.batch_size))
            else:
                targets = self.estimator.predict(x, batch_size=self.batch_size)

        else:
            targets = y

        return targets
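_set_targets accepts targets either one-hot-encoded or as class indices; for ClassifierMixin estimators, check_and_transform_label_format normalises them to one-hot. A rough stand-alone sketch of that normalisation, for illustration only:

import numpy as np

def to_one_hot_sketch(labels, nb_classes):
    """Turn class indices of shape (nb_samples,) into one-hot targets; pass one-hot through."""
    labels = np.asarray(labels)
    if labels.ndim == 1:
        one_hot = np.zeros((labels.shape[0], nb_classes), dtype=np.float32)
        one_hot[np.arange(labels.shape[0]), labels] = 1.0
        return one_hot
    return labels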
    def _test_mnist_targeted(self, classifier):
        # Get MNIST
        (_, _), (x_test, _) = self.mnist

        # Test targeted BIM with np.inf norm
        attack = BasicIterativeMethod(classifier,
                                      eps=1.0,
                                      eps_step=0.01,
                                      targeted=True)
        # Craft targets as the second most likely class predicted for each sample
        pred_sort = classifier.predict(x_test).argsort(axis=1)
        y_test_adv = np.zeros((x_test.shape[0], 10))
        for i in range(x_test.shape[0]):
            y_test_adv[i, pred_sort[i, -2]] = 1.0
        x_test_adv = attack.generate(x_test, y=y_test_adv)

        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertEqual(y_test_adv.shape, test_y_pred.shape)
        # The targeted attack does not always succeed, especially with small networks;
        # require the target class to be reached for at least half of the samples
        self.assertGreaterEqual(
            (y_test_adv.argmax(axis=1) == test_y_pred.argmax(axis=1)).sum(),
            x_test.shape[0] // 2)
    def _test_backend_mnist(self, classifier, x_train, y_train, x_test,
                            y_test):
        x_test_original = x_test.copy()

        # Test PGD with np.inf norm
        attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info("Accuracy on adversarial train examples: %.2f%%",
                    acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

        # Test PGD with 3 random initialisations
        attack = ProjectedGradientDescent(classifier, num_random_init=3)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info(
            "Accuracy on adversarial train examples with 3 random initialisations: %.2f%%",
            acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info(
            "Accuracy on adversarial test examples with 3 random initialisations: %.2f%%",
            acc * 100)

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        # Test the masking
        attack = ProjectedGradientDescent(classifier, num_random_init=1)
        mask = np.random.binomial(n=1, p=0.5, size=np.prod(x_test.shape))
        mask = mask.reshape(x_test.shape).astype(np.float32)

        x_test_adv = attack.generate(x_test, mask=mask)
        mask_diff = (1 - mask) * (x_test_adv - x_test)
        self.assertAlmostEqual(float(np.max(np.abs(mask_diff))),
                               0.0,
                               delta=0.00001)

        # Test eps as an array matching the full batch shape, x_test.shape
        attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1)

        eps = np.ones(shape=x_test.shape) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps as an array matching a single sample, x_test.shape[1:]
        eps = np.ones(shape=x_test.shape[1:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps as an array of shape x_test.shape[2:]
        eps = np.ones(shape=x_test.shape[2:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps as an array of shape x_test.shape[3:]
        eps = np.ones(shape=x_test.shape[3:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())
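The four eps cases above pass arrays of shape x_test.shape, x_test.shape[1:], x_test.shape[2:] and x_test.shape[3:], all of which broadcast against the input batch. A small self-contained check of that broadcasting, assuming an MNIST-like channels-last batch of shape (100, 28, 28, 1):

import numpy as np

x = np.zeros((100, 28, 28, 1), dtype=np.float32)
for eps_shape in [x.shape, x.shape[1:], x.shape[2:], x.shape[3:]]:
    eps = np.ones(eps_shape, dtype=np.float32)
    perturbed = x + eps * np.sign(np.random.randn(*x.shape))  # broadcasts in every case
    assert perturbed.shape == x.shape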
Example No. 8
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
                  (nb_samples,). If `self.targeted` is true, then `y` represents the target labels.
        :param x_adv_init: Initial array to act as initial adversarial examples. Same shape as `x`.
        :type x_adv_init: `np.ndarray`
        :return: An array holding the adversarial examples.
        """
        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:  # pragma: no cover
                raise ValueError(
                    "Target labels `y` need to be provided for a targeted attack."
                )

            # Use model predictions as correct outputs
            y = get_labels_np_array(
                self.estimator.predict(
                    x, batch_size=self.batch_size))  # type: ignore

        y = check_and_transform_label_format(y,
                                             self.estimator.nb_classes,
                                             return_one_hot=False)

        if y is not None and self.estimator.nb_classes == 2 and y.shape[1] == 1:
            raise ValueError(  # pragma: no cover
                "This attack has not yet been tested for binary classification with a single output classifier."
            )

        # Get clip_min and clip_max from the classifier or infer them from data
        if self.estimator.clip_values is not None:
            clip_min, clip_max = self.estimator.clip_values
        else:
            clip_min, clip_max = np.min(x), np.max(x)

        # Prediction from the original images
        preds = np.argmax(self.estimator.predict(x,
                                                 batch_size=self.batch_size),
                          axis=1)

        # Prediction from the initial adversarial examples if not None
        x_adv_init = kwargs.get("x_adv_init")

        if x_adv_init is not None:
            init_preds = np.argmax(self.estimator.predict(
                x_adv_init, batch_size=self.batch_size),
                                   axis=1)
        else:
            init_preds = [None] * len(x)
            x_adv_init = [None] * len(x)

        # Assert that, if attack is targeted, y is provided
        if self.targeted and y is None:  # pragma: no cover
            raise ValueError(
                "Target labels `y` need to be provided for a targeted attack.")

        # Some initial setups
        x_adv = x.astype(ART_NUMPY_DTYPE)

        # Generate the adversarial samples
        for ind, val in enumerate(
                tqdm(x_adv, desc="Boundary attack", disable=not self.verbose)):
            if self.targeted:
                x_adv[ind] = self._perturb(
                    x=val,
                    y=y[ind],
                    y_p=preds[ind],
                    init_pred=init_preds[ind],
                    adv_init=x_adv_init[ind],
                    clip_min=clip_min,
                    clip_max=clip_max,
                )
            else:
                x_adv[ind] = self._perturb(
                    x=val,
                    y=-1,
                    y_p=preds[ind],
                    init_pred=init_preds[ind],
                    adv_init=x_adv_init[ind],
                    clip_min=clip_min,
                    clip_max=clip_max,
                )

        y = to_categorical(y, self.estimator.nb_classes)

        logger.info(
            "Success rate of Boundary attack: %.2f%%",
            100 * compute_success(self.estimator,
                                  x,
                                  y,
                                  x_adv,
                                  self.targeted,
                                  batch_size=self.batch_size),
        )

        return x_adv
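A minimal usage sketch for the generate() above in the untargeted case: y may be omitted (model predictions are used), and an optional x_adv_init array of the same shape as x can seed the search. Constructor parameter names are assumed from the attributes referenced in the method (targeted, batch_size, verbose); treat them as placeholders.

def boundary_attack_sketch(estimator, x, x_start=None):
    attack = BoundaryAttack(estimator=estimator, targeted=False, verbose=False)  # assumed kwargs
    return attack.generate(x, x_adv_init=x_start)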
    def _test_backend_mnist(self, classifier, x_train, y_train, x_test,
                            y_test):
        x_test_original = x_test.copy()

        # Test PGD with np.inf norm
        attack = ProjectedGradientDescent(classifier,
                                          eps=1,
                                          eps_step=0.1,
                                          max_iter=5)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / len(y_train)
        logger.info("Accuracy on adversarial train examples: %.2f%%",
                    acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test),
                                                        axis=1)) / len(y_test)
        logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

        # Test PGD with 3 random initialisations
        attack = ProjectedGradientDescent(classifier,
                                          num_random_init=3,
                                          max_iter=5)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / len(y_train)
        logger.info(
            "Accuracy on adversarial train examples with 3 random initialisations: %.2f%%",
            acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test),
                                                        axis=1)) / len(y_test)
        logger.info(
            "Accuracy on adversarial test examples with 3 random initialisations: %.2f%%",
            acc * 100)

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(
            np.max(np.abs(np.array(x_test_original) - np.array(x_test)))),
                               0.0,
                               delta=0.00001)
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :return: An array holding the adversarial examples.
        """
        if x.ndim != 4:
            raise ValueError(
                "Unrecognized input dimension. Attack can only be applied to image data."
            )

        x_adv = x.astype(ART_NUMPY_DTYPE)

        y = check_and_transform_label_format(y, self.estimator.nb_classes)

        if y is None:
            # Use model predictions as true labels
            logger.info("Using model predictions as true labels.")
            y = get_labels_np_array(
                self.estimator.predict(x, batch_size=self.batch_size))

        if self.estimator.channels_first:
            channels = x.shape[1]
            height = x.shape[2]
            width = x.shape[3]
        else:
            height = x.shape[1]
            width = x.shape[2]
            channels = x.shape[3]

        for _ in trange(self.nb_restarts,
                        desc="SquareAttack - restarts",
                        disable=not self.verbose):

            # Determine correctly predicted samples
            y_pred = self.estimator.predict(x_adv, batch_size=self.batch_size)
            sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y,
                                                                      axis=1)

            if np.sum(sample_is_robust) == 0:
                break

            # x_robust = x_adv[sample_is_robust]
            x_robust = x[sample_is_robust]
            y_robust = y[sample_is_robust]
            sample_logits_diff_init = self._get_logits_diff(x_robust, y_robust)

            if self.norm in [np.inf, "inf"]:

                if self.estimator.channels_first:
                    size = (x_robust.shape[0], channels, 1, width)
                else:
                    size = (x_robust.shape[0], 1, width, channels)

                # Add vertical stripe perturbations
                x_robust_new = np.clip(
                    x_robust + self.eps * np.random.choice([-1, 1], size=size),
                    a_min=self.estimator.clip_values[0],
                    a_max=self.estimator.clip_values[1],
                ).astype(ART_NUMPY_DTYPE)

                sample_logits_diff_new = self._get_logits_diff(
                    x_robust_new, y_robust)
                logits_diff_improved = (sample_logits_diff_new -
                                        sample_logits_diff_init) < 0.0

                x_robust[logits_diff_improved] = x_robust_new[
                    logits_diff_improved]

                x_adv[sample_is_robust] = x_robust

                for i_iter in trange(self.max_iter,
                                     desc="SquareAttack - iterations",
                                     leave=False,
                                     disable=not self.verbose):

                    percentage_of_elements = self._get_percentage_of_elements(
                        i_iter)

                    # Determine correctly predicted samples
                    y_pred = self.estimator.predict(x_adv,
                                                    batch_size=self.batch_size)
                    sample_is_robust = np.argmax(y_pred,
                                                 axis=1) == np.argmax(y,
                                                                      axis=1)

                    if np.sum(sample_is_robust) == 0:
                        break

                    x_robust = x_adv[sample_is_robust]
                    x_init = x[sample_is_robust]
                    y_robust = y[sample_is_robust]

                    sample_logits_diff_init = self._get_logits_diff(
                        x_robust, y_robust)

                    height_tile = max(
                        int(
                            round(
                                math.sqrt(percentage_of_elements * height *
                                          width))), 1)

                    height_mid = np.random.randint(0, height - height_tile)
                    width_start = np.random.randint(0, width - height_tile)

                    delta_new = np.zeros(self.estimator.input_shape)

                    if self.estimator.channels_first:
                        delta_new[:, height_mid:height_mid + height_tile,
                                  width_start:width_start +
                                  height_tile] = np.random.choice(
                                      [-2 * self.eps, 2 * self.eps],
                                      size=[channels, 1, 1])
                    else:
                        delta_new[height_mid:height_mid + height_tile,
                                  width_start:width_start +
                                  height_tile, :] = np.random.choice(
                                      [-2 * self.eps, 2 * self.eps],
                                      size=[1, 1, channels])

                    x_robust_new = x_robust + delta_new

                    x_robust_new = np.minimum(
                        np.maximum(x_robust_new, x_init - self.eps),
                        x_init + self.eps)

                    x_robust_new = np.clip(
                        x_robust_new,
                        a_min=self.estimator.clip_values[0],
                        a_max=self.estimator.clip_values[1]).astype(
                            ART_NUMPY_DTYPE)

                    sample_logits_diff_new = self._get_logits_diff(
                        x_robust_new, y_robust)
                    logits_diff_improved = (sample_logits_diff_new -
                                            sample_logits_diff_init) < 0.0

                    x_robust[logits_diff_improved] = x_robust_new[
                        logits_diff_improved]

                    x_adv[sample_is_robust] = x_robust

            elif self.norm == 2:

                n_tiles = 5

                height_tile = height // n_tiles

                def _get_perturbation(h):
                    delta = np.zeros([h, h])
                    gaussian_perturbation = np.zeros([h // 2, h])

                    x_c = h // 4
                    y_c = h // 2

                    for i_y in range(y_c):
                        gaussian_perturbation[
                            max(x_c, 0):min(x_c + (2 * i_y + 1), h // 2),
                            max(0, y_c):min(y_c + (2 * i_y + 1), h)] += 1.0 / (
                                (i_y + 1)**2)
                        x_c -= 1
                        y_c -= 1

                    gaussian_perturbation /= np.sqrt(
                        np.sum(gaussian_perturbation**2))

                    delta[:h // 2] = gaussian_perturbation
                    delta[h // 2:h // 2 + gaussian_perturbation.
                          shape[0]] = -gaussian_perturbation

                    delta /= np.sqrt(np.sum(delta**2))

                    if random.random() > 0.5:
                        delta = np.transpose(delta)

                    if random.random() > 0.5:
                        delta = -delta

                    return delta

                delta_init = np.zeros(x_robust.shape, dtype=ART_NUMPY_DTYPE)

                height_start = 0
                for _ in range(n_tiles):
                    width_start = 0
                    for _ in range(n_tiles):
                        if self.estimator.channels_first:
                            perturbation_size = (1, 1, height_tile,
                                                 height_tile)
                            random_size = (x_robust.shape[0], channels, 1, 1)
                        else:
                            perturbation_size = (1, height_tile, height_tile,
                                                 1)
                            random_size = (x_robust.shape[0], 1, 1, channels)

                        perturbation = _get_perturbation(height_tile).reshape(
                            perturbation_size) * np.random.choice(
                                [-1, 1], size=random_size)

                        if self.estimator.channels_first:
                            delta_init[:, :,
                                       height_start:height_start + height_tile,
                                       width_start:width_start +
                                       height_tile] += perturbation
                        else:
                            delta_init[:,
                                       height_start:height_start + height_tile,
                                       width_start:width_start +
                                       height_tile, :] += perturbation
                        width_start += height_tile
                    height_start += height_tile

                x_robust_new = np.clip(
                    x_robust + delta_init / np.sqrt(
                        np.sum(delta_init**2, axis=(1, 2, 3), keepdims=True)) *
                    self.eps,
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                )

                sample_logits_diff_new = self._get_logits_diff(
                    x_robust_new, y_robust)
                logits_diff_improved = (sample_logits_diff_new -
                                        sample_logits_diff_init) < 0.0

                x_robust[logits_diff_improved] = x_robust_new[
                    logits_diff_improved]

                x_adv[sample_is_robust] = x_robust

                for i_iter in trange(self.max_iter,
                                     desc="SquareAttack - iterations",
                                     leave=False,
                                     disable=not self.verbose):

                    percentage_of_elements = self._get_percentage_of_elements(
                        i_iter)

                    # Determine correctly predicted samples
                    y_pred = self.estimator.predict(x_adv,
                                                    batch_size=self.batch_size)
                    sample_is_robust = np.argmax(y_pred,
                                                 axis=1) == np.argmax(y,
                                                                      axis=1)

                    if np.sum(sample_is_robust) == 0:
                        break

                    x_robust = x_adv[sample_is_robust]
                    x_init = x[sample_is_robust]
                    y_robust = y[sample_is_robust]

                    sample_logits_diff_init = self._get_logits_diff(
                        x_robust, y_robust)

                    delta_x_robust_init = x_robust - x_init

                    height_tile = max(
                        int(
                            round(
                                math.sqrt(percentage_of_elements * height *
                                          width))), 3)

                    if height_tile % 2 == 0:
                        height_tile += 1
                    height_tile_2 = height_tile

                    height_start = np.random.randint(0, height - height_tile)
                    width_start = np.random.randint(0, width - height_tile)

                    new_deltas_mask = np.zeros(x_init.shape)
                    if self.estimator.channels_first:
                        new_deltas_mask[:, :, height_start:height_start +
                                        height_tile, width_start:width_start +
                                        height_tile] = 1.0
                        w_1_norm = np.sqrt(
                            np.sum(
                                delta_x_robust_init[:, :,
                                                    height_start:height_start +
                                                    height_tile,
                                                    width_start:width_start +
                                                    height_tile, ]**2,
                                axis=(2, 3),
                                keepdims=True,
                            ))
                    else:
                        new_deltas_mask[:, height_start:height_start +
                                        height_tile, width_start:width_start +
                                        height_tile, :] = 1.0
                        w_1_norm = np.sqrt(
                            np.sum(
                                delta_x_robust_init[:,
                                                    height_start:height_start +
                                                    height_tile,
                                                    width_start:width_start +
                                                    height_tile, :, ]**2,
                                axis=(1, 2),
                                keepdims=True,
                            ))

                    height_2_start = np.random.randint(0,
                                                       height - height_tile_2)
                    width_2_start = np.random.randint(0, width - height_tile_2)

                    new_deltas_mask_2 = np.zeros(x_init.shape)
                    if self.estimator.channels_first:
                        new_deltas_mask_2[:, :, height_2_start:height_2_start +
                                          height_tile_2,
                                          width_2_start:width_2_start +
                                          height_tile_2, ] = 1.0
                    else:
                        new_deltas_mask_2[:, height_2_start:height_2_start +
                                          height_tile_2,
                                          width_2_start:width_2_start +
                                          height_tile_2, :, ] = 1.0

                    norms_x_robust = np.sqrt(
                        np.sum((x_robust - x_init)**2,
                               axis=(1, 2, 3),
                               keepdims=True))
                    w_norm = np.sqrt(
                        np.sum(
                            (delta_x_robust_init * np.maximum(
                                new_deltas_mask, new_deltas_mask_2))**2,
                            axis=(1, 2, 3),
                            keepdims=True,
                        ))

                    if self.estimator.channels_first:
                        new_deltas_size = [
                            x_init.shape[0], channels, height_tile, height_tile
                        ]
                        random_choice_size = [x_init.shape[0], channels, 1, 1]
                        perturbation_size = [1, 1, height_tile, height_tile]
                    else:
                        new_deltas_size = [
                            x_init.shape[0], height_tile, height_tile, channels
                        ]
                        random_choice_size = [x_init.shape[0], 1, 1, channels]
                        perturbation_size = [1, height_tile, height_tile, 1]

                    delta_new = (
                        np.ones(new_deltas_size) * _get_perturbation(
                            height_tile).reshape(perturbation_size) *
                        np.random.choice([-1, 1], size=random_choice_size))

                    if self.estimator.channels_first:
                        delta_new += delta_x_robust_init[:, :, height_start:
                                                         height_start +
                                                         height_tile,
                                                         width_start:
                                                         width_start +
                                                         height_tile] / (
                                                             np.maximum(
                                                                 1e-9,
                                                                 w_1_norm))
                    else:
                        delta_new += delta_x_robust_init[:, height_start:
                                                         height_start +
                                                         height_tile,
                                                         width_start:
                                                         width_start +
                                                         height_tile, :] / (
                                                             np.maximum(
                                                                 1e-9,
                                                                 w_1_norm))

                    diff_norm = (self.eps * np.ones(
                        delta_new.shape))**2 - norms_x_robust**2
                    diff_norm[diff_norm < 0.0] = 0.0

                    if self.estimator.channels_first:
                        delta_new /= np.sqrt(
                            np.sum(delta_new**2, axis=(2, 3), keepdims=True)
                        ) * np.sqrt(diff_norm / channels + w_norm**2)
                        delta_x_robust_init[:, :,
                                            height_2_start:height_2_start +
                                            height_tile_2,
                                            width_2_start:width_2_start +
                                            height_tile_2, ] = 0.0
                        delta_x_robust_init[:, :, height_start:height_start +
                                            height_tile,
                                            width_start:width_start +
                                            height_tile] = delta_new
                    else:
                        delta_new /= np.sqrt(
                            np.sum(delta_new**2, axis=(1, 2), keepdims=True)
                        ) * np.sqrt(diff_norm / channels + w_norm**2)
                        delta_x_robust_init[:, height_2_start:height_2_start +
                                            height_tile_2,
                                            width_2_start:width_2_start +
                                            height_tile_2, :, ] = 0.0
                        delta_x_robust_init[:, height_start:height_start +
                                            height_tile,
                                            width_start:width_start +
                                            height_tile, :] = delta_new

                    x_robust_new = np.clip(
                        x_init + self.eps * delta_x_robust_init / np.sqrt(
                            np.sum(delta_x_robust_init**2,
                                   axis=(1, 2, 3),
                                   keepdims=True)),
                        self.estimator.clip_values[0],
                        self.estimator.clip_values[1],
                    )

                    sample_logits_diff_new = self._get_logits_diff(
                        x_robust_new, y_robust)
                    logits_diff_improved = (sample_logits_diff_new -
                                            sample_logits_diff_init) < 0.0

                    x_robust[logits_diff_improved] = x_robust_new[
                        logits_diff_improved]

                    x_adv[sample_is_robust] = x_robust

        return x_adv
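In both norm branches the square side length follows from the fraction of pixels p returned by _get_percentage_of_elements: the square covers roughly p * height * width pixels, so its side is about sqrt(p * height * width), floored at 1 for L_inf and at an odd value of at least 3 for L_2. A small sketch of that computation, for illustration only:

import math

def tile_side_sketch(p, height, width, norm="inf"):
    side = max(int(round(math.sqrt(p * height * width))), 1 if norm == "inf" else 3)
    if norm != "inf" and side % 2 == 0:
        side += 1  # the L2 branch above keeps the side length odd
    return side

# e.g. for 28x28 inputs and p = 0.05: side 6 under L_inf, side 7 under L_2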
Example No. 11
    def generate(self, x, y=None):
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :type x: `np.ndarray`
        :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are
                the original class labels.
        :type y: `np.ndarray`
        :return: An array holding the adversarial examples.
        :rtype: `np.ndarray`
        """
        x_adv = x  # no copy or dtype cast is taken here, so the input `x` is updated in place below
        if hasattr(self.classifier,
                   'clip_values') and self.classifier.clip_values is not None:
            clip_min, clip_max = self.classifier.clip_values
        else:
            clip_min, clip_max = np.amin(x), np.amax(x)

        # Throw error if attack is targeted, but no targets are provided:
        if self.targeted and y is None:
            raise ValueError(
                'Target labels `y` need to be provided for a targeted attack.')

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(self.classifier.predict(x, logits=False))

        # Compute perturbation with implicit batching
        nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
        for batch_id in range(nb_batches):
            logger.debug('Processing batch %i out of %i', batch_id, nb_batches)

            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            x_batch = x_adv[batch_index_1:batch_index_2]
            y_batch = y[batch_index_1:batch_index_2]

            # The optimization is performed in tanh space to keep the adversarial images bounded in correct range
            x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max,
                                            self._tanh_smoother)

            # Initialize binary search:
            c = self.initial_const * np.ones(x_batch.shape[0])
            c_lower_bound = np.zeros(x_batch.shape[0])
            c_double = (np.ones(x_batch.shape[0]) > 0)

            # Initialize placeholders for best l2 distance and attack found so far
            best_l2dist = np.inf * np.ones(x_batch.shape[0])
            best_x_adv_batch = x_batch.copy()

            for bss in range(self.binary_search_steps):
                logger.debug('Binary search step %i out of %i (c_mean==%f)',
                             bss, self.binary_search_steps, np.mean(c))
                nb_active = int(np.sum(c < self._c_upper_bound))
                logger.debug(
                    'Number of samples with c < _c_upper_bound: %i out of %i',
                    nb_active, x_batch.shape[0])
                if nb_active == 0:
                    break
                lr = self.learning_rate * np.ones(x_batch.shape[0])

                # Initialize perturbation in tanh space:
                x_adv_batch = x_batch.copy()
                x_adv_batch_tanh = x_batch_tanh.copy()

                z, l2dist, loss = self._loss(x_batch, x_adv_batch, y_batch, c)
                attack_success = (loss - l2dist <= 0)
                overall_attack_success = attack_success

                for it in range(self.max_iter):
                    logger.debug('Iteration step %i out of %i', it,
                                 self.max_iter)
                    logger.debug('Average Loss: %f', np.mean(loss))
                    logger.debug('Average L2Dist: %f', np.mean(l2dist))
                    logger.debug('Average Margin Loss: %f',
                                 np.mean(loss - l2dist))
                    logger.debug(
                        'Current number of succeeded attacks: %i out of %i',
                        int(np.sum(attack_success)), len(attack_success))

                    improved_adv = attack_success & (l2dist < best_l2dist)
                    logger.debug('Number of improved L2 distances: %i',
                                 int(np.sum(improved_adv)))
                    if np.sum(improved_adv) > 0:
                        best_l2dist[improved_adv] = l2dist[improved_adv]
                        best_x_adv_batch[improved_adv] = x_adv_batch[
                            improved_adv]

                    active = (c < self._c_upper_bound) & (lr > 0)
                    nb_active = int(np.sum(active))
                    logger.debug(
                        'Number of samples with c < _c_upper_bound and lr > 0: %i out of %i',
                        nb_active, x_batch.shape[0])
                    if nb_active == 0:
                        break

                    # compute gradient:
                    logger.debug('Compute loss gradient')
                    perturbation_tanh = -self._loss_gradient(
                        z[active], y_batch[active], x_batch[active],
                        x_adv_batch[active], x_adv_batch_tanh[active],
                        c[active], clip_min, clip_max)

                    # perform line search to optimize perturbation
                    # first, halve the learning rate until perturbation actually decreases the loss:
                    prev_loss = loss.copy()
                    best_loss = loss.copy()
                    best_lr = np.zeros(x_batch.shape[0])
                    halving = np.zeros(x_batch.shape[0])

                    for h in range(self.max_halving):
                        logger.debug('Perform halving iteration %i out of %i',
                                     h, self.max_halving)
                        do_halving = (loss[active] >= prev_loss[active])
                        logger.debug('Halving to be performed on %i samples',
                                     int(np.sum(do_halving)))
                        if np.sum(do_halving) == 0:
                            break
                        active_and_do_halving = active.copy()
                        active_and_do_halving[active] = do_halving

                        lr_mult = lr[active_and_do_halving]
                        for _ in range(len(x.shape) - 1):
                            lr_mult = lr_mult[:, np.newaxis]

                        new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_halving] + \
                            lr_mult * perturbation_tanh[do_halving]
                        new_x_adv_batch = tanh_to_original(
                            new_x_adv_batch_tanh, clip_min, clip_max,
                            self._tanh_smoother)
                        _, l2dist[active_and_do_halving], loss[
                            active_and_do_halving] = self._loss(
                                x_batch[active_and_do_halving],
                                new_x_adv_batch,
                                y_batch[active_and_do_halving],
                                c[active_and_do_halving])

                        logger.debug('New Average Loss: %f', np.mean(loss))
                        logger.debug('New Average L2Dist: %f', np.mean(l2dist))
                        logger.debug('New Average Margin Loss: %f',
                                     np.mean(loss - l2dist))

                        best_lr[loss < best_loss] = lr[loss < best_loss]
                        best_loss[loss < best_loss] = loss[loss < best_loss]
                        lr[active_and_do_halving] /= 2
                        halving[active_and_do_halving] += 1
                    lr[active] *= 2

                    # if no halving was actually required, double the learning rate as long as this
                    # decreases the loss:
                    for d in range(self.max_doubling):
                        logger.debug('Perform doubling iteration %i out of %i',
                                     d, self.max_doubling)
                        do_doubling = (halving[active] == 1) & (
                            loss[active] <= best_loss[active])
                        logger.debug('Doubling to be performed on %i samples',
                                     int(np.sum(do_doubling)))
                        if np.sum(do_doubling) == 0:
                            break
                        active_and_do_doubling = active.copy()
                        active_and_do_doubling[active] = do_doubling
                        lr[active_and_do_doubling] *= 2

                        lr_mult = lr[active_and_do_doubling]
                        for _ in range(len(x.shape) - 1):
                            lr_mult = lr_mult[:, np.newaxis]

                        new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_doubling] + \
                            lr_mult * perturbation_tanh[do_doubling]
                        new_x_adv_batch = tanh_to_original(
                            new_x_adv_batch_tanh, clip_min, clip_max,
                            self._tanh_smoother)
                        _, l2dist[active_and_do_doubling], loss[
                            active_and_do_doubling] = self._loss(
                                x_batch[active_and_do_doubling],
                                new_x_adv_batch,
                                y_batch[active_and_do_doubling],
                                c[active_and_do_doubling])
                        logger.debug('New Average Loss: %f', np.mean(loss))
                        logger.debug('New Average L2Dist: %f', np.mean(l2dist))
                        logger.debug('New Average Margin Loss: %f',
                                     np.mean(loss - l2dist))
                        best_lr[loss < best_loss] = lr[loss < best_loss]
                        best_loss[loss < best_loss] = loss[loss < best_loss]

                    lr[halving == 1] /= 2

                    update_adv = (best_lr[active] > 0)
                    logger.debug(
                        'Number of adversarial samples to be finally updated: %i',
                        int(np.sum(update_adv)))

                    if np.sum(update_adv) > 0:
                        active_and_update_adv = active.copy()
                        active_and_update_adv[active] = update_adv
                        best_lr_mult = best_lr[active_and_update_adv]
                        for _ in range(len(x.shape) - 1):
                            best_lr_mult = best_lr_mult[:, np.newaxis]
                        x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + \
                            best_lr_mult * perturbation_tanh[update_adv]
                        x_adv_batch[active_and_update_adv] = tanh_to_original(
                            x_adv_batch_tanh[active_and_update_adv], clip_min,
                            clip_max, self._tanh_smoother)
                        z[active_and_update_adv], l2dist[active_and_update_adv], loss[active_and_update_adv] = \
                            self._loss(x_batch[active_and_update_adv], x_adv_batch[active_and_update_adv],
                                       y_batch[active_and_update_adv], c[active_and_update_adv])
                        attack_success = (loss - l2dist <= 0)
                        overall_attack_success = overall_attack_success | attack_success

                # Update depending on attack success:
                improved_adv = attack_success & (l2dist < best_l2dist)
                logger.debug('Number of improved L2 distances: %i',
                             int(np.sum(improved_adv)))

                if np.sum(improved_adv) > 0:
                    best_l2dist[improved_adv] = l2dist[improved_adv]
                    best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv]

                c_double[overall_attack_success] = False
                c[overall_attack_success] = (c_lower_bound +
                                             c)[overall_attack_success] / 2

                c_old = c
                c[~overall_attack_success & c_double] *= 2
                c[~overall_attack_success
                  & ~c_double] += (c - c_lower_bound)[~overall_attack_success
                                                      & ~c_double] / 2
                c_lower_bound[~overall_attack_success] = c_old[
                    ~overall_attack_success]

            x_adv[batch_index_1:batch_index_2] = best_x_adv_batch
            rate = 100 * compute_success(self.classifier, x, y, x_adv,
                                         self.targeted)
            TrackedCW.tracked_x.append(
                (x_adv, rate, batch_id, best_l2dist.mean()))

        logger.info(
            'Success rate of C&W L_2 attack: %.2f%%',
            100 * compute_success(self.classifier, x, y, x_adv, self.targeted))

        return x_adv
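The binary search over the constant c above works per sample: on success c is halved toward c_lower_bound, and on failure it is doubled until the first success has been seen and bisected upward afterwards, with c_lower_bound moved up to the failed value. A scalar sketch of one update step of this rule, for illustration only:

def update_c_sketch(c, c_lower_bound, c_double, attack_success):
    """Return the updated (c, c_lower_bound, c_double) after one binary search step."""
    if attack_success:
        return (c_lower_bound + c) / 2.0, c_lower_bound, False
    c_old = c
    c = 2.0 * c if c_double else c + (c - c_lower_bound) / 2.0
    return c, c_old, c_double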
    def setUpClass(cls):
        k.set_learning_phase(1)

        # Get MNIST
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
        cls.mnist = (x_train, y_train), (x_test, y_test)

        # Keras classifier
        cls.classifier_k = cls._cnn_mnist_k([28, 28, 1])
        cls.classifier_k.fit(x_train,
                             y_train,
                             batch_size=BATCH_SIZE,
                             nb_epochs=2)

        scores = cls.classifier_k._model.evaluate(x_train, y_train)
        logger.info('[Keras, MNIST] Accuracy on training set: %.2f%%',
                    (scores[1] * 100))
        scores = cls.classifier_k._model.evaluate(x_test, y_test)
        logger.info('[Keras, MNIST] Accuracy on test set: %.2f%%',
                    (scores[1] * 100))

        # Create basic CNN on MNIST using TensorFlow
        cls.classifier_tf = cls._cnn_mnist_tf([28, 28, 1])
        cls.classifier_tf.fit(x_train,
                              y_train,
                              nb_epochs=2,
                              batch_size=BATCH_SIZE)

        scores = get_labels_np_array(cls.classifier_tf.predict(x_train))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_train, axis=1)) / y_train.shape[0]
        logger.info('[TF, MNIST] Accuracy on training set: %.2f%%',
                    (acc * 100))

        scores = get_labels_np_array(cls.classifier_tf.predict(x_test))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_test, axis=1)) / y_test.shape[0]
        logger.info('[TF, MNIST] Accuracy on test set: %.2f%%', (acc * 100))

        # Create basic PyTorch model
        cls.classifier_py = cls._cnn_mnist_py()
        x_train, x_test = np.swapaxes(x_train, 1, 3), np.swapaxes(x_test, 1, 3)
        cls.classifier_py.fit(x_train,
                              y_train,
                              nb_epochs=2,
                              batch_size=BATCH_SIZE)

        scores = get_labels_np_array(cls.classifier_py.predict(x_train))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_train, axis=1)) / y_train.shape[0]
        logger.info('[PyTorch, MNIST] Accuracy on training set: %.2f%%',
                    (acc * 100))

        scores = get_labels_np_array(cls.classifier_py.predict(x_test))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_test, axis=1)) / y_test.shape[0]
        logger.info('[PyTorch, MNIST] Accuracy on test set: %.2f%%',
                    (acc * 100))
    def generate(self, x, **kwargs):
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :type x: `np.ndarray`
        :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are
                the original class labels.
        :type y: `np.ndarray`
        :return: An array holding the adversarial examples.
        :rtype: `np.ndarray`
        """
        x_adv = x.copy()
        (clip_min, clip_max) = self.classifier.clip_values

        # Parse and save attack-specific parameters
        params_cpy = dict(kwargs)
        y = params_cpy.pop('y', None)
        self.set_params(**params_cpy)

        # Throw error if attack is targeted, but no targets are provided:
        if self.targeted and y is None:
            raise ValueError(
                'Target labels `y` need to be provided for a targeted attack.')

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(self.classifier.predict(x, logits=False))

        for j, (ex, target) in enumerate(zip(x_adv, y)):
            image = ex.copy()

            # The optimization is performed in tanh space to keep the adversarial images bounded
            # between clip_min and clip_max. To avoid division by zero (which occurs if the
            # arguments of arctanh are +1 or -1), we multiply the arguments by _tanh_smoother.
            # It appears this is what Carlini and Wagner (2016) are alluding to in their footnote 8.
            # However, it is not clear how their proposed trick ("instead of scaling by 1/2 we
            # scale by 1/2 + eps") would actually work.
            image_tanh = np.clip(image, clip_min, clip_max)
            image_tanh = (image_tanh - clip_min) / (clip_max - clip_min)
            image_tanh = np.arctanh(
                ((image_tanh * 2) - 1) * self._tanh_smoother)

            # Initialize binary search:
            c = self.initial_const
            c_lower_bound = 0
            c_double = True

            # Initialize placeholders for best l2 distance and attack found so far
            best_l2dist = sys.float_info.max
            best_adv_image = image

            for _ in range(self.binary_search_steps):
                attack_success = False
                loss_prev = sys.float_info.max
                lr = self.learning_rate

                # Initialize perturbation in tanh space:
                perturbation_tanh = np.zeros(image_tanh.shape)

                for it in range(self.max_iter):
                    # First transform current adversarial sample from tanh to original space:
                    adv_image = image_tanh + perturbation_tanh
                    adv_image = (np.tanh(adv_image) / self._tanh_smoother +
                                 1) / 2
                    adv_image = adv_image * (clip_max - clip_min) + clip_min

                    # Collect current logits, loss and l2 distance.
                    z, l2dist, loss = self.loss(image, adv_image, target, c)
                    last_attack_success = loss - l2dist <= 0
                    attack_success = attack_success or last_attack_success

                    if last_attack_success:
                        if l2dist < best_l2dist:
                            best_l2dist = l2dist
                            best_adv_image = adv_image
                        break
                    #elif loss >= loss_prev:
                    #    break
                    else:
                        if self.targeted:
                            i_sub, i_add = np.argmax(target), np.argmax(
                                z * (1 - target))
                        else:
                            i_add, i_sub = np.argmax(target), np.argmax(
                                z * (1 - target))

                        grad_l2p = self.classifier.class_gradient(
                            np.array([adv_image]), label=i_add, logits=True)[0]
                        grad_l2p -= self.classifier.class_gradient(
                            np.array([adv_image]), label=i_sub, logits=True)[0]
                        grad_l2p *= c
                        grad_l2p += 2 * (adv_image - image)
                        grad_l2p *= (clip_max - clip_min)
                        grad_l2p *= (1 - np.square(
                            np.tanh(image_tanh + perturbation_tanh))) / (
                                2 * self._tanh_smoother)

                        # Update the perturbation with decayed learning rate
                        lr *= (1. / (1. + self.decay * it))
                        perturbation_tanh -= lr * grad_l2p[0]
                        loss_prev = loss

                # Update binary search:
                if attack_success:
                    c_double = False
                    c = (c_lower_bound + c) / 2
                else:
                    c_old = c
                    if c_double:
                        c = 2 * c
                    else:
                        c = c + (c - c_lower_bound) / 2
                    c_lower_bound = c_old

                # Abort binary search if c exceeds upper bound:
                if c > self._c_upper_bound:
                    break

            x_adv[j] = best_adv_image

        return x_adv
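
The loop above optimizes in tanh space so that the adversarial image stays inside [clip_min, clip_max] by construction. A minimal standalone sketch of the forward and inverse change of variables (the smoothing constant is an assumed value for illustration):

import numpy as np

TANH_SMOOTHER = 0.999999  # assumed factor that keeps arctanh away from +/-1

def to_tanh_space(x, clip_min, clip_max):
    # Rescale to [0, 1], map to (-1, 1), shrink slightly, then apply arctanh.
    x01 = (np.clip(x, clip_min, clip_max) - clip_min) / (clip_max - clip_min)
    return np.arctanh((x01 * 2 - 1) * TANH_SMOOTHER)

def from_tanh_space(x_tanh, clip_min, clip_max):
    # Inverse mapping: the tanh output always lands back inside the clip range.
    x01 = (np.tanh(x_tanh) / TANH_SMOOTHER + 1) / 2
    return x01 * (clip_max - clip_min) + clip_min

x = np.array([0.0, 0.25, 1.0])
assert np.allclose(from_tanh_space(to_tanh_space(x, 0.0, 1.0), 0.0, 1.0), x, atol=1e-4)
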
Example No. 14
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples.

        :param x: An array with the original inputs to be attacked.
        :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
                  (nb_samples,). If `self.targeted` is true, then `y` represents the target labels.
        :return: The adversarial examples.
        """
        if y is not None:
            y = check_and_transform_label_format(y,
                                                 self.estimator.nb_classes,
                                                 return_one_hot=True)

        if y is not None and self.estimator.nb_classes == 2 and y.shape[
                1] == 1:  # pragma: no cover
            raise ValueError(
                "This attack has not yet been tested for binary classification with a single output classifier."
            )

        x_adv = x.copy()

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:  # pragma: no cover
                raise ValueError(
                    "Target labels `y` need to be provided for a targeted attack."
                )

            # Use model predictions as correct outputs
            y = get_labels_np_array(
                self.estimator.predict(
                    x, batch_size=self.batch_size))  # type: ignore

        # Get clip_min and clip_max from the classifier or infer them from data
        if self.estimator.clip_values is not None:
            self.clip_min, self.clip_max = self.estimator.clip_values
        else:
            self.clip_min, self.clip_max = np.min(x), np.max(x)

        # Check for square input images
        if (self.estimator.channels_first
                and x.shape[2] != x.shape[3]) or (  # pragma: no cover
                    not self.estimator.channels_first
                    and x.shape[1] != x.shape[2]):
            raise ValueError("Input images `x` have to be square.")

        # Create or load DCT basis
        image_size = x.shape[2]
        logger.info("Create or load DCT basis.")
        path = f"2d_dct_basis_{self.sub_dim}_{image_size}.npy"
        if os.path.exists(path):
            self.sub_basis = np.load(path).astype(ART_NUMPY_DTYPE)
        else:
            self.sub_basis = self._generate_2d_dct_basis(
                sub_dim=self.sub_dim, res=image_size).astype(ART_NUMPY_DTYPE)
            np.save(path, self.sub_basis)

        for i in trange(x.shape[0],
                        desc="GeoDA - samples",
                        disable=not self.verbose,
                        position=0):
            x_i = x[[i]]
            y_i = y[[i]]

            # Reset number of calls
            self.nb_calls = 0

            # Random search
            x_random = self._find_random_adversarial(x=x_i, y=y_i)
            logger.info("Random search adversarial example is adversarial: %r",
                        self._is_adversarial(x_random, y_i))

            # Binary search
            x_boundary = self._binary_search(x_i,
                                             y_i,
                                             x_random,
                                             tol=self.bin_search_tol)
            logger.info("Binary search example at boundary is adversarial: %r",
                        self._is_adversarial(x_boundary, y_i))

            grad = np.zeros_like(x_i)
            x_adv_i = x_i

            for k in trange(self.iterate,
                            desc="GeoDA - steps",
                            disable=not self.verbose,
                            position=1):
                grad_oi, _ = self._black_grad_batch(x_boundary,
                                                    self.q_opt_iter[k],
                                                    self.batch_size, y_i)
                grad = grad_oi + grad
                x_adv_i = self._go_to_boundary(x_i, y_i, grad)
                x_adv_i = self._binary_search(x_i,
                                              y_i,
                                              x_adv_i,
                                              tol=self.bin_search_tol)
                x_boundary = x_adv_i

            x_adv_i = np.clip(x_adv_i,
                              a_min=self.clip_min,
                              a_max=self.clip_max)

            x_adv[i] = x_adv_i

        return x_adv
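
GeoDA repeatedly bisects between the clean sample and a known adversarial sample so that the gradient is estimated close to the decision boundary. A minimal sketch of such a bisection, with `is_adversarial` standing in for an assumed black-box oracle:

import numpy as np

def bisect_to_boundary(x_clean, x_adv, is_adversarial, tol=1e-4):
    # Shrink the segment between the clean and adversarial points until it is
    # shorter than tol, keeping the adversarial end adversarial throughout.
    low, high = x_clean.astype(float), x_adv.astype(float)
    while np.linalg.norm(high - low) > tol:
        mid = (low + high) / 2.0
        if is_adversarial(mid):
            high = mid  # midpoint still fools the model: move towards x_clean
        else:
            low = mid   # midpoint is classified correctly: move towards x_adv
    return high

# Toy oracle: "adversarial" means the first feature exceeds 0.5
x_boundary = bisect_to_boundary(np.zeros(3), np.ones(3), lambda z: z[0] > 0.5)
# x_boundary[0] ends up just above 0.5
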
    def generate(self, x, y=None, **kwargs):
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :type x: `np.ndarray`
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :type y: `np.ndarray`
        :return: An array holding the adversarial examples.
        :rtype: `np.ndarray`
        """
        y = check_and_transform_label_format(y, self.classifier.nb_classes())

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:
                raise ValueError(
                    'Target labels `y` need to be provided for a targeted attack.'
                )

            # Use model predictions as correct outputs
            targets = get_labels_np_array(
                self.classifier.predict(x, batch_size=self.batch_size))
        else:
            targets = y

        adv_x_best = None
        rate_best = None

        if self.random_eps:
            ratio = self.eps_step / self.eps
            self.eps = np.round(self.norm_dist.rvs(1)[0], 10)
            self.eps_step = ratio * self.eps

        for _ in range(max(1, self.num_random_init)):
            adv_x = x.astype(ART_NUMPY_DTYPE)

            for i_max_iter in range(self.max_iter):
                adv_x = self._compute(
                    adv_x, x, targets, self.eps, self.eps_step, self._project,
                    self.num_random_init > 0 and i_max_iter == 0)

            if self.num_random_init > 1:
                rate = 100 * compute_success(self.classifier,
                                             x,
                                             targets,
                                             adv_x,
                                             self.targeted,
                                             batch_size=self.batch_size)
                if rate_best is None or rate > rate_best or adv_x_best is None:
                    rate_best = rate
                    adv_x_best = adv_x
            else:
                adv_x_best = adv_x

        logger.info(
            'Success rate of attack: %.2f%%',
            rate_best if rate_best is not None else 100 *
            compute_success(self.classifier,
                            x,
                            targets,
                            adv_x_best,
                            self.targeted,
                            batch_size=self.batch_size))

        return adv_x_best
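
Each call to `_compute` above applies one FGSM-style step and projects the result back into the eps-ball around the original input. A minimal sketch of a single L-infinity step under those assumptions (the loss gradient is assumed to be precomputed; this is illustration, not the library's `_compute`):

import numpy as np

def linf_pgd_step(x_adv, x_orig, grad, eps, eps_step, clip_min=0.0, clip_max=1.0):
    # Ascend along the sign of the loss gradient ...
    x_adv = x_adv + eps_step * np.sign(grad)
    # ... then project back into the eps-ball around x_orig and the data range.
    x_adv = np.clip(x_adv, x_orig - eps, x_orig + eps)
    return np.clip(x_adv, clip_min, clip_max)

x_orig = np.array([0.2, 0.8])
grad = np.array([1.0, -1.0])
x_new = linf_pgd_step(x_orig, x_orig, grad, eps=0.1, eps_step=0.3)
# x_new == [0.3, 0.7]: the 0.3 step is cut back to the 0.1 ball around x_orig
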
Example No. 16
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
                  (nb_samples,).
        :return: An array holding the adversarial examples.
        """
        if y is not None:
            y = check_and_transform_label_format(y, self.estimator.nb_classes)

        # Check that `y` is provided for targeted attacks
        if self.targeted and y is None:  # pragma: no cover
            raise ValueError(
                "Target labels `y` need to be provided for a targeted attack.")

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(
                self.estimator.predict(x, batch_size=self.batch_size))

        if self.estimator.nb_classes == 2 and y.shape[
                1] == 1:  # pragma: no cover
            raise ValueError(
                "This attack has not yet been tested for binary classification with a single output classifier."
            )

        # Compute adversarial examples with implicit batching
        nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
        x_adv_list = []
        for batch_id in trange(nb_batches,
                               desc="ZOO",
                               disable=not self.verbose):
            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            x_batch = x[batch_index_1:batch_index_2]
            y_batch = y[batch_index_1:batch_index_2]
            res = self._generate_batch(x_batch, y_batch)
            x_adv_list.append(res)
        x_adv = np.vstack(x_adv_list)

        # Apply clip
        if self.estimator.clip_values is not None:
            clip_min, clip_max = self.estimator.clip_values
            np.clip(x_adv, clip_min, clip_max, out=x_adv)

        # Log success rate of the ZOO attack
        logger.info(
            "Success rate of ZOO attack: %.2f%%",
            100 * compute_success(self.estimator,
                                  x,
                                  y,
                                  x_adv,
                                  self.targeted,
                                  batch_size=self.batch_size),
        )

        return x_adv
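
The implicit batching above slices `x` and `y` into windows of `batch_size` samples and stacks the per-batch results back together. The same pattern in isolation (the per-batch worker is a placeholder):

import numpy as np

def run_in_batches(x, y, batch_size, process_batch):
    nb_batches = int(np.ceil(x.shape[0] / float(batch_size)))
    results = []
    for batch_id in range(nb_batches):
        begin, end = batch_id * batch_size, (batch_id + 1) * batch_size
        results.append(process_batch(x[begin:end], y[begin:end]))
    return np.vstack(results)

x = np.arange(10, dtype=float).reshape(5, 2)
y = np.eye(5)
out = run_in_batches(x, y, batch_size=2, process_batch=lambda xb, yb: xb + 1.0)
assert out.shape == x.shape  # the last, shorter batch is handled by slicing
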
Example No. 17
    def _test_attack(self, classifier, x_test, y_test, targeted):
        """
        Test with SimBA
        :return:
        """
        x_test_original = x_test.copy()

        # set the targeted label
        if targeted:
            y_target = np.zeros(10)
            y_target[8] = 1.0

        #######
        # dct #
        #######

        df = SimBA(classifier, attack="dct", targeted=targeted)

        x_i = x_test_original[0][None, ...]
        if targeted:
            x_test_adv = df.generate(x_i, y=y_target.reshape(1, 10))
        else:
            x_test_adv = df.generate(x_i)

        for i in range(1, len(x_test_original)):
            x_i = x_test_original[i][None, ...]
            if targeted:
                tmp_x_test_adv = df.generate(x_i, y=y_target.reshape(1, 10))
                x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])
            else:
                tmp_x_test_adv = df.generate(x_i)
                x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])

        self.assertFalse((x_test == x_test_adv).all())
        self.assertFalse((0.0 == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == y_pred).all())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        ######
        # px #
        ######
        df_px = SimBA(classifier, attack="px", targeted=targeted)

        x_i = x_test_original[0][None, ...]
        if targeted:
            x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
        else:
            x_test_adv = df_px.generate(x_i)

        for i in range(1, len(x_test_original)):
            x_i = x_test_original[i][None, ...]
            if targeted:
                tmp_x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
                x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])
            else:
                tmp_x_test_adv = df_px.generate(x_i)
                x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])

        self.assertFalse((x_test == x_test_adv).all())
        self.assertFalse((0.0 == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == y_pred).all())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        #############
        # px - diag #
        #############
        df_px = SimBA(classifier, attack="px", targeted=targeted, order="diag")

        x_i = x_test_original[0][None, ...]
        if targeted:
            x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
        else:
            x_test_adv = df_px.generate(x_i)

        for i in range(1, len(x_test_original)):
            x_i = x_test_original[i][None, ...]
            if targeted:
                tmp_x_test_adv = df_px.generate(x_i, y=y_target.reshape(1, 10))
                x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])
            else:
                tmp_x_test_adv = df_px.generate(x_i)
                x_test_adv = np.concatenate([x_test_adv, tmp_x_test_adv])

        self.assertFalse((x_test == x_test_adv).all())
        self.assertFalse((0.0 == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == y_pred).all())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)
    def _test_backend_mnist(self, classifier, x_train, y_train, x_test,
                            y_test):
        x_test_original = x_test.copy()

        # Test BIM with np.inf norm
        attack = BasicIterativeMethod(classifier,
                                      eps=1.0,
                                      eps_step=0.1,
                                      batch_size=128,
                                      verbose=False)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info("Accuracy on adversarial train examples: %.2f%%",
                    (acc * 100))

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info("Accuracy on adversarial test examples: %.2f%%",
                    (acc * 100))

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        # Test eps of array type 1
        eps = np.ones(shape=x_test.shape) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps of array type 2
        eps = np.ones(shape=x_test.shape[1:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps of array type 3
        eps = np.ones(shape=x_test.shape[2:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps of array type 4
        eps = np.ones(shape=x_test.shape[3:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())
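
The four array-valued `eps` variants above lean on NumPy broadcasting: the bound may be given per sample and feature, per feature map, per row, or per column, as long as its shape broadcasts against `x`. A short sketch of the shapes involved, assuming NHWC inputs of MNIST size:

import numpy as np

x = np.zeros((16, 28, 28, 1))  # (nb_samples, height, width, channels)
for eps_shape in [x.shape, x.shape[1:], x.shape[2:], x.shape[3:]]:
    eps = np.ones(shape=eps_shape) * 1.0
    # Clipping a perturbation against the bound works for every variant.
    perturbation = np.clip(np.random.uniform(-2, 2, size=x.shape), -eps, eps)
    assert perturbation.shape == x.shape
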
    def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                     Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                     features for which the mask is zero will not be adversarially perturbed.
        :type mask: `np.ndarray`
        :return: An array holding the adversarial examples.
        """
        mask = kwargs.get("mask")

        y = check_and_transform_label_format(y, self.estimator.nb_classes)

        if y is None:
            if self.targeted:
                raise ValueError("Target labels `y` need to be provided for a targeted attack.")
            y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size)).astype(np.int32)

        x_adv = x.astype(ART_NUMPY_DTYPE)

        for _ in trange(max(1, self.nb_random_init), desc="AutoPGD - restart", disable=not self.verbose):
            # Determine correctly predicted samples
            y_pred = self.estimator.predict(x_adv)
            if self.targeted:
                sample_is_robust = np.argmax(y_pred, axis=1) != np.argmax(y, axis=1)
            else:
                sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

            if np.sum(sample_is_robust) == 0:
                break

            x_robust = x_adv[sample_is_robust]
            y_robust = y[sample_is_robust]
            x_init = x[sample_is_robust]

            n = x_robust.shape[0]
            m = np.prod(x_robust.shape[1:]).item()
            random_perturbation = (
                random_sphere(n, m, self.eps, self.norm).reshape(x_robust.shape).astype(ART_NUMPY_DTYPE)
            )

            x_robust = x_robust + random_perturbation

            if self.estimator.clip_values is not None:
                clip_min, clip_max = self.estimator.clip_values
                x_robust = np.clip(x_robust, clip_min, clip_max)

            perturbation = projection(x_robust - x_init, self.eps, self.norm)
            x_robust = x_init + perturbation

            # Compute perturbation with implicit batching
            for batch_id in trange(
                int(np.ceil(x_robust.shape[0] / float(self.batch_size))),
                desc="AutoPGD - batch",
                leave=False,
                disable=not self.verbose,
            ):
                self.eta = 2 * self.eps_step
                batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
                x_k = x_robust[batch_index_1:batch_index_2].astype(ART_NUMPY_DTYPE)
                x_init_batch = x_init[batch_index_1:batch_index_2].astype(ART_NUMPY_DTYPE)
                y_batch = y_robust[batch_index_1:batch_index_2]

                p_0 = 0
                p_1 = 0.22
                W = [p_0, p_1]

                while True:
                    p_j_p_1 = W[-1] + max(W[-1] - W[-2] - 0.03, 0.06)
                    if p_j_p_1 > 1:
                        break
                    W.append(p_j_p_1)

                W = [math.ceil(p * self.max_iter) for p in W]

                eta = self.eps_step
                self.count_condition_1 = 0

                for k_iter in trange(self.max_iter, desc="AutoPGD - iteration", leave=False, disable=not self.verbose):

                    # Get perturbation, use small scalar to avoid division by 0
                    tol = 10e-8

                    # Get gradient wrt loss; invert it if attack is targeted
                    grad = self.estimator.loss_gradient(x_k, y_batch) * (1 - 2 * int(self.targeted))

                    # Apply norm bound
                    if self.norm in [np.inf, "inf"]:
                        grad = np.sign(grad)
                    elif self.norm == 1:
                        ind = tuple(range(1, len(x_k.shape)))
                        grad = grad / (np.sum(np.abs(grad), axis=ind, keepdims=True) + tol)
                    elif self.norm == 2:
                        ind = tuple(range(1, len(x_k.shape)))
                        grad = grad / (np.sqrt(np.sum(np.square(grad), axis=ind, keepdims=True)) + tol)
                    assert x_k.shape == grad.shape

                    perturbation = grad

                    if mask is not None:
                        perturbation = perturbation * (mask.astype(ART_NUMPY_DTYPE))

                    # Apply perturbation and clip
                    z_k_p_1 = x_k + eta * perturbation

                    if self.estimator.clip_values is not None:
                        clip_min, clip_max = self.estimator.clip_values
                        z_k_p_1 = np.clip(z_k_p_1, clip_min, clip_max)

                    if k_iter == 0:
                        x_1 = z_k_p_1
                        perturbation = projection(x_1 - x_init_batch, self.eps, self.norm)
                        x_1 = x_init_batch + perturbation

                        f_0 = self.estimator.compute_loss(x=x_k, y=y_batch, reduction="mean")
                        f_1 = self.estimator.compute_loss(x=x_1, y=y_batch, reduction="mean")

                        self.eta_w_j_m_1 = eta
                        self.f_max_w_j_m_1 = f_0

                        if f_1 >= f_0:
                            self.f_max = f_1
                            self.x_max = x_1
                            self.x_max_m_1 = x_init_batch
                            self.count_condition_1 += 1
                        else:
                            self.f_max = f_0
                            self.x_max = x_k.copy()
                            self.x_max_m_1 = x_init_batch

                        # Settings for next iteration k
                        x_k_m_1 = x_k.copy()
                        x_k = x_1

                    else:
                        perturbation = projection(z_k_p_1 - x_init_batch, self.eps, self.norm)
                        z_k_p_1 = x_init_batch + perturbation

                        alpha = 0.75

                        x_k_p_1 = x_k + alpha * (z_k_p_1 - x_k) + (1 - alpha) * (x_k - x_k_m_1)

                        if self.estimator.clip_values is not None:
                            clip_min, clip_max = self.estimator.clip_values
                            x_k_p_1 = np.clip(x_k_p_1, clip_min, clip_max)

                        perturbation = projection(x_k_p_1 - x_init_batch, self.eps, self.norm)
                        x_k_p_1 = x_init_batch + perturbation

                        f_k_p_1 = self.estimator.compute_loss(x=x_k_p_1, y=y_batch, reduction="mean")

                        if f_k_p_1 == 0.0:
                            x_k = x_k_p_1.copy()
                            break

                        if (not self.targeted and f_k_p_1 > self.f_max) or (self.targeted and f_k_p_1 < self.f_max):
                            self.count_condition_1 += 1
                            self.x_max = x_k_p_1
                            self.x_max_m_1 = x_k
                            self.f_max = f_k_p_1

                        if k_iter in W:

                            rho = 0.75

                            condition_1 = self.count_condition_1 < rho * (k_iter - W[W.index(k_iter) - 1])
                            condition_2 = self.eta_w_j_m_1 == eta and self.f_max_w_j_m_1 == self.f_max

                            if condition_1 or condition_2:
                                eta = eta / 2
                                x_k_m_1 = self.x_max_m_1
                                x_k = self.x_max
                            else:
                                x_k_m_1 = x_k
                                x_k = x_k_p_1.copy()

                            self.count_condition_1 = 0
                            self.eta_w_j_m_1 = eta
                            self.f_max_w_j_m_1 = self.f_max

                        else:
                            x_k_m_1 = x_k
                            x_k = x_k_p_1.copy()

                y_pred_adv_k = self.estimator.predict(x_k)
                if self.targeted:
                    sample_is_not_robust_k = np.invert(np.argmax(y_pred_adv_k, axis=1) != np.argmax(y_batch, axis=1))
                else:
                    sample_is_not_robust_k = np.invert(np.argmax(y_pred_adv_k, axis=1) == np.argmax(y_batch, axis=1))

                x_robust[batch_index_1:batch_index_2][sample_is_not_robust_k] = x_k[sample_is_not_robust_k]

            x_adv[sample_is_robust] = x_robust

        return x_adv
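
The checkpoint list `W` above implements the Auto-PGD step-size schedule: starting from fractions p_0 = 0 and p_1 = 0.22, each new fraction adds max(p_j - p_{j-1} - 0.03, 0.06), and the fractions are then scaled up to iteration indices. The same computation as a standalone helper:

import math

def autopgd_checkpoints(max_iter):
    fractions = [0, 0.22]
    while True:
        nxt = fractions[-1] + max(fractions[-1] - fractions[-2] - 0.03, 0.06)
        if nxt > 1:
            break
        fractions.append(nxt)
    return [math.ceil(p * max_iter) for p in fractions]

checkpoints = autopgd_checkpoints(100)
# Checkpoints are increasing iteration indices that get denser towards the end,
# which is where the step size eta is halved if progress stalls.
assert checkpoints[0] == 0 and checkpoints == sorted(checkpoints)
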
Example No. 20
    def generate(self, x, y=None, **kwargs):
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :type x: `np.ndarray`
        :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are the
                  original class labels.
        :type y: `np.ndarray`
        :return: An array holding the adversarial examples.
        :rtype: `np.ndarray`
        """
        # ZOO can probably be extended to feature vectors if no zooming or resizing is applied
        if len(x.shape) == 2:
            raise ValueError(
                'Feature vectors detected. The ZOO attack can only be applied to data with spatial '
                'dimensions.')

        # Check that `y` is provided for targeted attacks
        if self.targeted and y is None:
            raise ValueError(
                'Target labels `y` need to be provided for a targeted attack.')

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(
                self.classifier.predict(x,
                                        logits=False,
                                        batch_size=self.batch_size))

        # Compute adversarial examples with implicit batching
        nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
        x_adv = []
        for batch_id in range(nb_batches):
            logger.debug('Processing batch %i out of %i', batch_id, nb_batches)

            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            x_batch = x[batch_index_1:batch_index_2]
            y_batch = y[batch_index_1:batch_index_2]
            res = self._generate_batch(x_batch, y_batch)
            x_adv.append(res)
        x_adv = np.vstack(x_adv)

        # Apply clip
        if hasattr(self.classifier,
                   'clip_values') and self.classifier.clip_values is not None:
            clip_min, clip_max = self.classifier.clip_values
            np.clip(x_adv, clip_min, clip_max, out=x_adv)

        # Log success rate of the ZOO attack
        logger.info(
            'Success rate of ZOO attack: %.2f%%',
            100 * compute_success(self.classifier,
                                  x,
                                  y,
                                  x_adv,
                                  self.targeted,
                                  batch_size=self.batch_size))

        return x_adv
    def _test_backend_mnist(self, classifier):
        # Get MNIST
        (x_train, y_train), (x_test, y_test) = self.mnist

        # Test FGSM with np.inf norm
        attack = FastGradientMethod(classifier, eps=1)
        x_test_adv = attack.generate(x_test, **{'batch_size': 2})
        x_train_adv = attack.generate(x_train, **{'batch_size': 4})

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info('Accuracy on adversarial train examples: %.2f%%',
                    (acc * 100))

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info('Accuracy on adversarial test examples: %.2f%%',
                    (acc * 100))

        # Test minimal perturbations
        attack_params = {"minimal": True, "eps_step": .1, "eps_max": 1.}

        x_train_adv_min = attack.generate(x_train, **attack_params)
        x_test_adv_min = attack.generate(x_test, **attack_params)

        self.assertFalse((x_train_adv_min == x_train_adv).all())
        self.assertFalse((x_test_adv_min == x_test_adv).all())

        self.assertFalse((x_train == x_train_adv_min).all())
        self.assertFalse((x_test == x_test_adv_min).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv_min))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv_min))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info(
            'Accuracy on adversarial train examples with minimal perturbation: %.2f%%',
            (acc * 100))

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info(
            'Accuracy on adversarial test examples with minimal perturbation: %.2f%%',
            (acc * 100))

        # L_1 norm
        attack = FastGradientMethod(classifier, eps=1, norm=1)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())
        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info(
            'Accuracy on adversarial test examples with L1 norm: %.2f%%',
            (acc * 100))

        # L_2 norm
        attack = FastGradientMethod(classifier, eps=1, norm=2)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())
        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info(
            'Accuracy on adversarial test examples with L2 norm: %.2f%%',
            (acc * 100))
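
With `minimal=True`, the attack above is expected to grow the perturbation budget in increments of `eps_step` and keep the first perturbation that changes the prediction. A per-sample sketch of that idea (`predict_label` is an assumed helper for illustration, not part of the library):

import numpy as np

def minimal_fgsm_eps(x, grad_sign, predict_label, eps_step=0.1, eps_max=1.0):
    # Grow eps in steps of eps_step until the predicted label flips.
    original_label = predict_label(x)
    eps = eps_step
    x_adv = x.copy()
    while eps <= eps_max:
        x_adv = np.clip(x + eps * grad_sign, 0.0, 1.0)
        if predict_label(x_adv) != original_label:
            return x_adv, eps
        eps += eps_step
    return x_adv, eps_max  # no flip found within the budget

# Toy "model": label is 1 when the mean pixel exceeds 0.5
predict = lambda sample: int(np.mean(sample) > 0.5)
x_adv, eps_used = minimal_fgsm_eps(np.full(4, 0.4), np.ones(4), predict)
# eps_used is about 0.2, the first budget that pushes the mean above 0.5
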
    def test_9_keras_mnist(self):
        x_test_original = self.x_test_mnist.copy()

        # Keras classifier
        classifier = get_image_classifier_kr()

        scores = classifier._model.evaluate(self.x_train_mnist,
                                            self.y_train_mnist)
        logger.info("[Keras, MNIST] Accuracy on training set: %.2f%%",
                    (scores[1] * 100))

        scores = classifier._model.evaluate(self.x_test_mnist,
                                            self.y_test_mnist)
        logger.info("[Keras, MNIST] Accuracy on test set: %.2f%%",
                    (scores[1] * 100))

        # targeted

        # Generate random target classes
        nb_classes = np.unique(np.argmax(self.y_test_mnist, axis=1)).shape[0]
        targets = np.random.randint(nb_classes, size=self.n_test)
        while (targets == np.argmax(self.y_test_mnist, axis=1)).any():
            targets = np.random.randint(nb_classes, size=self.n_test)

        # Perform attack
        df = SaliencyMapMethod(classifier,
                               theta=1,
                               batch_size=100,
                               verbose=False)
        x_test_adv = df.generate(self.x_test_mnist,
                                 y=to_categorical(targets, nb_classes))

        self.assertFalse((self.x_test_mnist == x_test_adv).all())
        self.assertFalse((0.0 == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((self.y_test_mnist == y_pred).all())

        accuracy = np.sum(
            np.argmax(y_pred, axis=1) == np.argmax(self.y_test_mnist,
                                                   axis=1)) / self.n_test
        logger.info("Accuracy on adversarial examples: %.2f%%",
                    (accuracy * 100))

        # untargeted
        df = SaliencyMapMethod(classifier,
                               theta=1,
                               batch_size=100,
                               verbose=False)
        x_test_adv = df.generate(self.x_test_mnist)

        self.assertFalse((self.x_test_mnist == x_test_adv).all())
        self.assertFalse((0.0 == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((self.y_test_mnist == y_pred).all())

        accuracy = np.sum(
            np.argmax(y_pred, axis=1) == np.argmax(self.y_test_mnist,
                                                   axis=1)) / self.n_test
        logger.info("Accuracy on adversarial examples: %.2f%%",
                    (accuracy * 100))

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(
            np.max(np.abs(x_test_original - self.x_test_mnist))),
                               0.0,
                               delta=0.00001)
Example No. 23
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                     Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                     features for which the mask is zero will not be adversarially perturbed.
        :type mask: `np.ndarray`
        :return: An array holding the adversarial examples.
        """
        mask = self._get_mask(x, **kwargs)

        # Ensure eps is broadcastable
        self._check_compatibility_input_and_eps(x=x)

        if isinstance(self.estimator, ClassifierMixin):
            if y is not None:
                y = check_and_transform_label_format(y,
                                                     self.estimator.nb_classes)

            if y is None:
                # Throw error if attack is targeted, but no targets are provided
                if self.targeted:  # pragma: no cover
                    raise ValueError(
                        "Target labels `y` need to be provided for a targeted attack."
                    )

                # Use model predictions as correct outputs
                logger.info(
                    "Using model predictions as correct labels for FGM.")
                y_array = get_labels_np_array(
                    self.estimator.predict(
                        x, batch_size=self.batch_size))  # type: ignore
            else:
                y_array = y

            if self.estimator.nb_classes > 2:
                y_array = y_array / np.sum(y_array, axis=1, keepdims=True)

            # Return adversarial examples computed with minimal perturbation if option is active
            adv_x_best = x
            if self.minimal:
                logger.info("Performing minimal perturbation FGM.")
                adv_x_best = self._minimal_perturbation(x, y_array, mask)
                rate_best = 100 * compute_success(
                    self.estimator,  # type: ignore
                    x,
                    y_array,
                    adv_x_best,
                    self.targeted,
                    batch_size=self.batch_size,  # type: ignore
                )
            else:
                rate_best = 0.0
                for _ in range(max(1, self.num_random_init)):
                    adv_x = self._compute(
                        x,
                        x,
                        y_array,
                        mask,
                        self.eps,
                        self.eps,
                        self._project,
                        self.num_random_init > 0,
                    )

                    if self.num_random_init > 1:
                        rate = 100 * compute_success(
                            self.estimator,  # type: ignore
                            x,
                            y_array,
                            adv_x,
                            self.targeted,
                            batch_size=self.batch_size,  # type: ignore
                        )
                        if rate > rate_best:
                            rate_best = rate
                            adv_x_best = adv_x
                    else:
                        adv_x_best = adv_x

            logger.info(
                "Success rate of FGM attack: %.2f%%",
                rate_best if rate_best is not None else 100 * compute_success(
                    self.estimator,  # type: ignore
                    x,
                    y_array,
                    adv_x_best,
                    self.targeted,
                    batch_size=self.batch_size,
                ),
            )

        else:
            if self.minimal:  # pragma: no cover
                raise ValueError(
                    "Minimal perturbation is only supported for classification."
                )

            if y is None:
                # Throw error if attack is targeted, but no targets are provided
                if self.targeted:  # pragma: no cover
                    raise ValueError(
                        "Target labels `y` need to be provided for a targeted attack."
                    )

                # Use model predictions as correct outputs
                logger.info(
                    "Using model predictions as correct labels for FGM.")
                y_array = self.estimator.predict(x, batch_size=self.batch_size)
            else:
                y_array = y

            adv_x_best = self._compute(
                x,
                x,
                y_array,
                None,
                self.eps,
                self.eps,
                self._project,
                self.num_random_init > 0,
            )

        if self.summary_writer is not None:
            self.summary_writer.reset()

        return adv_x_best
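
The success rate logged above compares predictions on the adversarial examples against the labels: an untargeted attack succeeds on a sample when the prediction moves away from the label, a targeted attack when it hits the target. A numpy-only sketch of that bookkeeping (this mirrors the assumed semantics of `compute_success`, not its exact implementation):

import numpy as np

def success_rate(y_labels, y_pred_adv, targeted=False):
    same = np.argmax(y_pred_adv, axis=1) == np.argmax(y_labels, axis=1)
    # Targeted: prediction must equal the target; untargeted: it must differ.
    successes = same if targeted else ~same
    return 100.0 * np.mean(successes)

y = np.array([[1, 0], [0, 1], [1, 0]])
y_adv_pred = np.array([[0.2, 0.8], [0.1, 0.9], [0.7, 0.3]])
# Untargeted: two of the three samples are still predicted correctly,
# so only one sample counts as a success (about 33.33%).
print(success_rate(y, y_adv_pred))
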
Example No. 24
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
                  (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the
                  targets are the original class labels.
        :return: An array holding the adversarial examples.
        """
        y = check_and_transform_label_format(y, self.estimator.nb_classes)
        x_adv = x.astype(ART_NUMPY_DTYPE)

        if self.estimator.clip_values is not None:
            clip_min, clip_max = self.estimator.clip_values
        else:
            clip_min, clip_max = np.amin(x), np.amax(x)

        # Assert that, if attack is targeted, y_val is provided:
        if self.targeted and y is None:
            raise ValueError(
                "Target labels `y` need to be provided for a targeted attack.")

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(
                self.estimator.predict(x, batch_size=self.batch_size))

        # Compute perturbation with implicit batching
        nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
        for batch_id in trange(nb_batches,
                               desc="C&W L_2",
                               disable=not self.verbose):
            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            x_batch = x_adv[batch_index_1:batch_index_2]
            y_batch = y[batch_index_1:batch_index_2]

            # The optimization is performed in tanh space to keep the adversarial images bounded in correct range
            x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max,
                                            self._tanh_smoother)

            # Initialize binary search:
            c_current = self.initial_const * np.ones(x_batch.shape[0])
            c_lower_bound = np.zeros(x_batch.shape[0])
            c_double = np.ones(x_batch.shape[0]) > 0

            # Initialize placeholders for best l2 distance and attack found so far
            best_l2dist = np.inf * np.ones(x_batch.shape[0])
            best_x_adv_batch = x_batch.copy()

            for bss in range(self.binary_search_steps):
                logger.debug(
                    "Binary search step %i out of %i (c_mean==%f)",
                    bss,
                    self.binary_search_steps,
                    np.mean(c_current),
                )
                nb_active = int(np.sum(c_current < self._c_upper_bound))
                logger.debug(
                    "Number of samples with c_current < _c_upper_bound: %i out of %i",
                    nb_active,
                    x_batch.shape[0],
                )
                if nb_active == 0:
                    break
                learning_rate = self.learning_rate * np.ones(x_batch.shape[0])

                # Initialize perturbation in tanh space:
                x_adv_batch = x_batch.copy()
                x_adv_batch_tanh = x_batch_tanh.copy()

                z_logits, l2dist, loss = self._loss(x_batch, x_adv_batch,
                                                    y_batch, c_current)
                attack_success = loss - l2dist <= 0
                overall_attack_success = attack_success

                for i_iter in range(self.max_iter):
                    logger.debug("Iteration step %i out of %i", i_iter,
                                 self.max_iter)
                    logger.debug("Average Loss: %f", np.mean(loss))
                    logger.debug("Average L2Dist: %f", np.mean(l2dist))
                    logger.debug("Average Margin Loss: %f",
                                 np.mean(loss - l2dist))
                    logger.debug(
                        "Current number of succeeded attacks: %i out of %i",
                        int(np.sum(attack_success)),
                        len(attack_success),
                    )

                    improved_adv = attack_success & (l2dist < best_l2dist)
                    logger.debug("Number of improved L2 distances: %i",
                                 int(np.sum(improved_adv)))
                    if np.sum(improved_adv) > 0:
                        best_l2dist[improved_adv] = l2dist[improved_adv]
                        best_x_adv_batch[improved_adv] = x_adv_batch[
                            improved_adv]

                    active = (c_current < self._c_upper_bound) & (learning_rate
                                                                  > 0)
                    nb_active = int(np.sum(active))
                    logger.debug(
                        "Number of samples with c_current < _c_upper_bound and learning_rate > 0: %i out of %i",
                        nb_active,
                        x_batch.shape[0],
                    )
                    if nb_active == 0:
                        break

                    # compute gradient:
                    logger.debug("Compute loss gradient")
                    perturbation_tanh = -self._loss_gradient(
                        z_logits[active],
                        y_batch[active],
                        x_batch[active],
                        x_adv_batch[active],
                        x_adv_batch_tanh[active],
                        c_current[active],
                        clip_min,
                        clip_max,
                    )

                    # perform line search to optimize perturbation
                    # first, halve the learning rate until perturbation actually decreases the loss:
                    prev_loss = loss.copy()
                    best_loss = loss.copy()
                    best_lr = np.zeros(x_batch.shape[0])
                    halving = np.zeros(x_batch.shape[0])

                    for i_halve in range(self.max_halving):
                        logger.debug(
                            "Perform halving iteration %i out of %i",
                            i_halve,
                            self.max_halving,
                        )
                        do_halving = loss[active] >= prev_loss[active]
                        logger.debug(
                            "Halving to be performed on %i samples",
                            int(np.sum(do_halving)),
                        )
                        if np.sum(do_halving) == 0:
                            break
                        active_and_do_halving = active.copy()
                        active_and_do_halving[active] = do_halving

                        lr_mult = learning_rate[active_and_do_halving]
                        for _ in range(len(x.shape) - 1):
                            lr_mult = lr_mult[:, np.newaxis]

                        x_adv1 = x_adv_batch_tanh[active_and_do_halving]
                        new_x_adv_batch_tanh = x_adv1 + lr_mult * perturbation_tanh[
                            do_halving]
                        new_x_adv_batch = tanh_to_original(
                            new_x_adv_batch_tanh, clip_min, clip_max)
                        _, l2dist[active_and_do_halving], loss[
                            active_and_do_halving] = self._loss(
                                x_batch[active_and_do_halving],
                                new_x_adv_batch,
                                y_batch[active_and_do_halving],
                                c_current[active_and_do_halving],
                            )

                        logger.debug("New Average Loss: %f", np.mean(loss))
                        logger.debug("New Average L2Dist: %f", np.mean(l2dist))
                        logger.debug("New Average Margin Loss: %f",
                                     np.mean(loss - l2dist))

                        best_lr[loss < best_loss] = learning_rate[
                            loss < best_loss]
                        best_loss[loss < best_loss] = loss[loss < best_loss]
                        learning_rate[active_and_do_halving] /= 2
                        halving[active_and_do_halving] += 1
                    learning_rate[active] *= 2

                    # if no halving was actually required, double the learning rate as long as this
                    # decreases the loss:
                    for i_double in range(self.max_doubling):
                        logger.debug(
                            "Perform doubling iteration %i out of %i",
                            i_double,
                            self.max_doubling,
                        )
                        do_doubling = (halving[active] == 1) & (
                            loss[active] <= best_loss[active])
                        logger.debug(
                            "Doubling to be performed on %i samples",
                            int(np.sum(do_doubling)),
                        )
                        if np.sum(do_doubling) == 0:
                            break
                        active_and_do_doubling = active.copy()
                        active_and_do_doubling[active] = do_doubling
                        learning_rate[active_and_do_doubling] *= 2

                        lr_mult = learning_rate[active_and_do_doubling]
                        for _ in range(len(x.shape) - 1):
                            lr_mult = lr_mult[:, np.newaxis]

                        x_adv2 = x_adv_batch_tanh[active_and_do_doubling]
                        new_x_adv_batch_tanh = x_adv2 + lr_mult * perturbation_tanh[
                            do_doubling]
                        new_x_adv_batch = tanh_to_original(
                            new_x_adv_batch_tanh, clip_min, clip_max)
                        _, l2dist[active_and_do_doubling], loss[
                            active_and_do_doubling] = self._loss(
                                x_batch[active_and_do_doubling],
                                new_x_adv_batch,
                                y_batch[active_and_do_doubling],
                                c_current[active_and_do_doubling],
                            )
                        logger.debug("New Average Loss: %f", np.mean(loss))
                        logger.debug("New Average L2Dist: %f", np.mean(l2dist))
                        logger.debug("New Average Margin Loss: %f",
                                     np.mean(loss - l2dist))
                        best_lr[loss < best_loss] = learning_rate[
                            loss < best_loss]
                        best_loss[loss < best_loss] = loss[loss < best_loss]

                    learning_rate[halving == 1] /= 2

                    update_adv = best_lr[active] > 0
                    logger.debug(
                        "Number of adversarial samples to be finally updated: %i",
                        int(np.sum(update_adv)),
                    )

                    if np.sum(update_adv) > 0:
                        active_and_update_adv = active.copy()
                        active_and_update_adv[active] = update_adv
                        best_lr_mult = best_lr[active_and_update_adv]
                        for _ in range(len(x.shape) - 1):
                            best_lr_mult = best_lr_mult[:, np.newaxis]

                        x_adv_batch_tanh[active_and_update_adv] = (
                            x_adv_batch_tanh[active_and_update_adv]
                            + best_lr_mult * perturbation_tanh[update_adv])

                        x_adv_batch[active_and_update_adv] = tanh_to_original(
                            x_adv_batch_tanh[active_and_update_adv],
                            clip_min, clip_max)
                        (
                            z_logits[active_and_update_adv],
                            l2dist[active_and_update_adv],
                            loss[active_and_update_adv],
                        ) = self._loss(
                            x_batch[active_and_update_adv],
                            x_adv_batch[active_and_update_adv],
                            y_batch[active_and_update_adv],
                            c_current[active_and_update_adv],
                        )
                        attack_success = loss - l2dist <= 0
                        overall_attack_success = overall_attack_success | attack_success

                # Update depending on attack success:
                improved_adv = attack_success & (l2dist < best_l2dist)
                logger.debug("Number of improved L2 distances: %i",
                             int(np.sum(improved_adv)))

                if np.sum(improved_adv) > 0:
                    best_l2dist[improved_adv] = l2dist[improved_adv]
                    best_x_adv_batch[improved_adv] = x_adv_batch[improved_adv]

                c_double[overall_attack_success] = False
                c_current[overall_attack_success] = (
                    c_lower_bound + c_current)[overall_attack_success] / 2

                c_old = c_current.copy()  # snapshot before c_current is updated in place below
                c_current[~overall_attack_success & c_double] *= 2

                c_current1 = (c_current -
                              c_lower_bound)[~overall_attack_success
                                             & ~c_double]
                c_current[~overall_attack_success
                          & ~c_double] += c_current1 / 2
                c_lower_bound[~overall_attack_success] = c_old[
                    ~overall_attack_success]

            x_adv[batch_index_1:batch_index_2] = best_x_adv_batch

        logger.info(
            "Success rate of C&W L_2 attack: %.2f%%",
            100 * compute_success(self.estimator,
                                  x,
                                  y,
                                  x_adv,
                                  self.targeted,
                                  batch_size=self.batch_size),
        )

        return x_adv
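A minimal usage sketch for the C&W L_2 generate method above, via ART's public attack class. `classifier` and `x_test` are placeholders for a fitted ART classifier and a test array, and the constructor arguments are illustrative assumptions rather than recommended settings.

import numpy as np
from art.attacks.evasion import CarliniL2Method

# `classifier` and `x_test` are assumed to exist; parameter values are illustrative.
attack = CarliniL2Method(classifier, confidence=0.0, targeted=False,
                         max_iter=10, binary_search_steps=10, batch_size=32)
x_test_adv = attack.generate(x=x_test)

# Report the mean L_2 norm of the perturbations actually applied
perturbation = np.linalg.norm(
    (x_test_adv - x_test).reshape(x_test.shape[0], -1), axis=1)
print("Mean L2 perturbation:", perturbation.mean())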
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :param cost_matrix: A non-negative cost matrix.
        :type cost_matrix: `np.ndarray`
        :return: An array holding the adversarial examples.
        """
        y = check_and_transform_label_format(y, self.estimator.nb_classes)
        x_adv = x.copy().astype(ART_NUMPY_DTYPE)

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:
                raise ValueError(
                    "Target labels `y` need to be provided for a targeted attack."
                )

            # Use model predictions as correct outputs
            targets = get_labels_np_array(
                self.estimator.predict(x, batch_size=self.batch_size))
        else:
            targets = y

        # Compute the cost matrix if needed
        cost_matrix = kwargs.get("cost_matrix")
        if cost_matrix is None:
            cost_matrix = self._compute_cost_matrix(self.p, self.kernel_size)

        # Compute perturbation with implicit batching
        nb_batches = int(np.ceil(x.shape[0] / float(self.batch_size)))
        for batch_id in trange(nb_batches,
                               desc="Wasserstein",
                               disable=not self.verbose):
            logger.debug("Processing batch %i out of %i", batch_id, nb_batches)

            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            batch = x_adv[batch_index_1:batch_index_2]
            batch_labels = targets[batch_index_1:batch_index_2]

            x_adv[batch_index_1:batch_index_2] = self._generate_batch(
                batch, batch_labels, cost_matrix)

        logger.info(
            "Success rate of attack: %.2f%%",
            100 * compute_success(self.estimator,
                                  x,
                                  y,
                                  x_adv,
                                  self.targeted,
                                  batch_size=self.batch_size),
        )

        return x_adv
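A minimal usage sketch of the Wasserstein generate method shown above. The estimator, data arrays, and constructor values are assumptions for illustration; when no `cost_matrix` is passed through kwargs, generate() computes one internally from `p` and `kernel_size`.

from art.attacks.evasion import Wasserstein

# `classifier`, `x_test`, `y_test` are placeholders for a fitted ART classifier
# and test data; parameter values are illustrative assumptions.
attack = Wasserstein(classifier, regularization=3000.0, kernel_size=5,
                     max_iter=10, batch_size=16, verbose=True)

# Without an explicit `cost_matrix` kwarg, the default cost matrix is used.
x_test_adv = attack.generate(x=x_test, y=y_test)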
Example No. 26
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :param y: Target values (class labels) one-hot-encoded of shape (nb_samples, nb_classes) or indices of shape
                  (nb_samples,). If `self.targeted` is true, then `y` represents the target labels. Otherwise, the
                  targets are the original class labels.
        :return: An array holding the adversarial examples.
        """
        y = check_and_transform_label_format(y, self.estimator.nb_classes)
        x_adv = x.astype(ART_NUMPY_DTYPE)

        if self.estimator.clip_values is not None:
            clip_min_per_pixel, clip_max_per_pixel = self.estimator.clip_values
        else:
            clip_min_per_pixel, clip_max_per_pixel = np.amin(x), np.amax(x)

        # Assert that, if the attack is targeted, `y` is provided:
        if self.targeted and y is None:
            raise ValueError(
                "Target labels `y` need to be provided for a targeted attack.")

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(
                self.estimator.predict(x, batch_size=self.batch_size))

        # Compute perturbation with implicit batching
        nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
        for batch_id in trange(nb_batches,
                               desc="C&W L_inf",
                               disable=not self.verbose):
            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            x_batch = x_adv[batch_index_1:batch_index_2]
            y_batch = y[batch_index_1:batch_index_2]

            # Determine values for later clipping
            clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel,
                               clip_max_per_pixel)
            clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel,
                               clip_max_per_pixel)

            # The optimization is performed in tanh space to keep the
            # adversarial images bounded from clip_min and clip_max.
            x_batch_tanh = original_to_tanh(x_batch, clip_min, clip_max,
                                            self._tanh_smoother)

            # Initialize perturbation in tanh space:
            x_adv_batch = x_batch.copy()
            x_adv_batch_tanh = x_batch_tanh.copy()

            # Initialize optimization:
            z_logits, loss = self._loss(x_adv_batch, y_batch)
            attack_success = loss <= 0
            learning_rate = self.learning_rate * np.ones(x_batch.shape[0])

            for i_iter in range(self.max_iter):
                logger.debug("Iteration step %i out of %i", i_iter,
                             self.max_iter)
                logger.debug("Average Loss: %f", np.mean(loss))

                logger.debug(
                    "Successful attack samples: %i out of %i",
                    int(np.sum(attack_success)),
                    x_batch.shape[0],
                )

                # only continue optimization for those samples where attack hasn't succeeded yet:
                active = ~attack_success
                if np.sum(active) == 0:
                    break

                # compute gradient:
                logger.debug("Compute loss gradient")
                perturbation_tanh = -self._loss_gradient(
                    z_logits[active],
                    y_batch[active],
                    x_adv_batch[active],
                    x_adv_batch_tanh[active],
                    clip_min[active],
                    clip_max[active],
                )

                # perform line search to optimize perturbation
                # first, halve the learning rate until perturbation actually decreases the loss:
                prev_loss = loss.copy()
                best_loss = loss.copy()
                best_lr = np.zeros(x_batch.shape[0])
                halving = np.zeros(x_batch.shape[0])

                for i_halve in range(self.max_halving):
                    logger.debug(
                        "Perform halving iteration %i out of %i",
                        i_halve,
                        self.max_halving,
                    )
                    do_halving = loss[active] >= prev_loss[active]
                    logger.debug("Halving to be performed on %i samples",
                                 int(np.sum(do_halving)))
                    if np.sum(do_halving) == 0:
                        break
                    active_and_do_halving = active.copy()
                    active_and_do_halving[active] = do_halving

                    lr_mult = learning_rate[active_and_do_halving]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    new_x_adv_batch_tanh = (
                        x_adv_batch_tanh[active_and_do_halving]
                        + lr_mult * perturbation_tanh[do_halving])

                    new_x_adv_batch = tanh_to_original(
                        new_x_adv_batch_tanh,
                        clip_min[active_and_do_halving],
                        clip_max[active_and_do_halving],
                    )
                    _, loss[active_and_do_halving] = self._loss(
                        new_x_adv_batch, y_batch[active_and_do_halving])
                    logger.debug("New Average Loss: %f", np.mean(loss))
                    logger.debug("Loss: %s", str(loss))
                    logger.debug("Prev_loss: %s", str(prev_loss))
                    logger.debug("Best_loss: %s", str(best_loss))

                    best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]
                    learning_rate[active_and_do_halving] /= 2
                    halving[active_and_do_halving] += 1
                learning_rate[active] *= 2

                # if no halving was actually required, double the learning rate as long as this
                # decreases the loss:
                for i_double in range(self.max_doubling):
                    logger.debug(
                        "Perform doubling iteration %i out of %i",
                        i_double,
                        self.max_doubling,
                    )
                    do_doubling = (halving[active]
                                   == 1) & (loss[active] <= best_loss[active])
                    logger.debug(
                        "Doubling to be performed on %i samples",
                        int(np.sum(do_doubling)),
                    )
                    if np.sum(do_doubling) == 0:
                        break
                    active_and_do_doubling = active.copy()
                    active_and_do_doubling[active] = do_doubling
                    learning_rate[active_and_do_doubling] *= 2

                    lr_mult = learning_rate[active_and_do_doubling]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    new_x_adv_batch_tanh = (
                        x_adv_batch_tanh[active_and_do_doubling]
                        + lr_mult * perturbation_tanh[do_doubling])
                    new_x_adv_batch = tanh_to_original(
                        new_x_adv_batch_tanh,
                        clip_min[active_and_do_doubling],
                        clip_max[active_and_do_doubling],
                    )
                    _, loss[active_and_do_doubling] = self._loss(
                        new_x_adv_batch, y_batch[active_and_do_doubling])
                    logger.debug("New Average Loss: %f", np.mean(loss))
                    best_lr[loss < best_loss] = learning_rate[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]

                learning_rate[halving == 1] /= 2

                update_adv = best_lr[active] > 0
                logger.debug(
                    "Number of adversarial samples to be finally updated: %i",
                    int(np.sum(update_adv)),
                )

                if np.sum(update_adv) > 0:
                    active_and_update_adv = active.copy()
                    active_and_update_adv[active] = update_adv
                    best_lr_mult = best_lr[active_and_update_adv]
                    for _ in range(len(x.shape) - 1):
                        best_lr_mult = best_lr_mult[:, np.newaxis]

                    x_adv_batch_tanh[active_and_update_adv] = (
                        x_adv_batch_tanh[active_and_update_adv]
                        + best_lr_mult * perturbation_tanh[update_adv])
                    x_adv_batch[active_and_update_adv] = tanh_to_original(
                        x_adv_batch_tanh[active_and_update_adv],
                        clip_min[active_and_update_adv],
                        clip_max[active_and_update_adv],
                    )
                    (
                        z_logits[active_and_update_adv],
                        loss[active_and_update_adv],
                    ) = self._loss(
                        x_adv_batch[active_and_update_adv],
                        y_batch[active_and_update_adv],
                    )
                    attack_success = loss <= 0

            # Update depending on attack success:
            x_adv_batch[~attack_success] = x_batch[~attack_success]
            x_adv[batch_index_1:batch_index_2] = x_adv_batch

        logger.info(
            "Success rate of C&W L_inf attack: %.2f%%",
            100 * compute_success(self.estimator,
                                  x,
                                  y,
                                  x_adv,
                                  self.targeted,
                                  batch_size=self.batch_size),
        )

        return x_adv
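A minimal usage sketch for the C&W L_inf generate method above. `classifier` and `x_test` are placeholders for a fitted ART classifier and test data; `eps` bounds the per-pixel change and the remaining arguments are illustrative assumptions.

from art.attacks.evasion import CarliniLInfMethod

# `classifier` and `x_test` are assumed to exist; parameter values are illustrative.
attack = CarliniLInfMethod(classifier, eps=0.3, max_iter=10, batch_size=32)
x_test_adv = attack.generate(x=x_test)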
    def generate(self, x, **kwargs):
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :type x: `np.ndarray`
        :param y: If `self.targeted` is true, then `y` represents the target labels. Otherwise, the targets are
                  the original class labels.
        :type y: `np.ndarray`
        :return: An array holding the adversarial examples.
        :rtype: `np.ndarray`
        """
        x_adv = x.astype(NUMPY_DTYPE)

        # Parse and save attack-specific parameters
        params_cpy = dict(kwargs)
        y = params_cpy.pop(str('y'), None)
        self.set_params(**params_cpy)

        # Assert that, if the attack is targeted, `y` is provided:
        if self.targeted and y is None:
            raise ValueError(
                'Target labels `y` need to be provided for a targeted attack.')

        # No labels provided, use model prediction as correct class
        if y is None:
            y = get_labels_np_array(self._predict(x, logits=False))

        # Compute perturbation with implicit batching
        nb_batches = int(np.ceil(x_adv.shape[0] / float(self.batch_size)))
        for batch_id in range(nb_batches):
            logger.debug('Processing batch %i out of %i', batch_id, nb_batches)

            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            x_batch = x_adv[batch_index_1:batch_index_2]
            y_batch = y[batch_index_1:batch_index_2]

            (clip_min_per_pixel,
             clip_max_per_pixel) = self.classifier.clip_values
            clip_min = np.clip(x_batch - self.eps, clip_min_per_pixel,
                               clip_max_per_pixel)
            clip_max = np.clip(x_batch + self.eps, clip_min_per_pixel,
                               clip_max_per_pixel)

            # The optimization is performed in tanh space to keep the
            # adversarial images bounded from clip_min and clip_max.
            x_batch_tanh = self._original_to_tanh(x_batch, clip_min, clip_max)

            # Initialize perturbation in tanh space:
            x_adv_batch = x_batch.copy()
            x_adv_batch_tanh = x_batch_tanh.copy()

            # Initialize optimization:
            z, loss = self._loss(x_adv_batch, y_batch)
            attack_success = (loss <= 0)
            lr = self.learning_rate * np.ones(x_batch.shape[0])

            for it in range(self.max_iter):
                logger.debug('Iteration step %i out of %i', it, self.max_iter)
                logger.debug('Average Loss: %f', np.mean(loss))

                logger.debug('Successful attack samples: %i out of %i',
                             int(np.sum(attack_success)), x_batch.shape[0])

                # only continue optimization for those samples where attack hasn't succeeded yet:
                active = ~attack_success
                if np.sum(active) == 0:
                    break

                # compute gradient:
                logger.debug('Compute loss gradient')
                perturbation_tanh = -self._gradient_of_loss(
                    z[active], y_batch[active], x_adv_batch[active],
                    x_adv_batch_tanh[active], clip_min[active],
                    clip_max[active])

                # perform line search to optimize perturbation
                # first, halve the learning rate until perturbation actually decreases the loss:
                prev_loss = loss.copy()
                best_loss = loss.copy()
                best_lr = np.zeros(x_batch.shape[0])
                halving = np.zeros(x_batch.shape[0])

                for h in range(self.max_halving):
                    logger.debug('Perform halving iteration %i out of %i', h,
                                 self.max_halving)
                    do_halving = (loss[active] >= prev_loss[active])
                    logger.debug('Halving to be performed on %i samples',
                                 int(np.sum(do_halving)))
                    if np.sum(do_halving) == 0:
                        break
                    active_and_do_halving = active.copy()
                    active_and_do_halving[active] = do_halving

                    lr_mult = lr[active_and_do_halving]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_halving] + \
                        lr_mult * perturbation_tanh[do_halving]
                    new_x_adv_batch = self._tanh_to_original(
                        new_x_adv_batch_tanh, clip_min[active_and_do_halving],
                        clip_max[active_and_do_halving])
                    _, loss[active_and_do_halving] = self._loss(
                        new_x_adv_batch, y_batch[active_and_do_halving])
                    logger.debug('New Average Loss: %f', np.mean(loss))
                    logger.debug('Loss: %s', str(loss))
                    logger.debug('Prev_loss: %s', str(prev_loss))
                    logger.debug('Best_loss: %s', str(best_loss))

                    best_lr[loss < best_loss] = lr[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]
                    lr[active_and_do_halving] /= 2
                    halving[active_and_do_halving] += 1
                lr[active] *= 2

                # if no halving was actually required, double the learning rate as long as this
                # decreases the loss:
                for d in range(self.max_doubling):
                    logger.debug('Perform doubling iteration %i out of %i', d,
                                 self.max_doubling)
                    do_doubling = (halving[active]
                                   == 1) & (loss[active] <= best_loss[active])
                    logger.debug('Doubling to be performed on %i samples',
                                 int(np.sum(do_doubling)))
                    if np.sum(do_doubling) == 0:
                        break
                    active_and_do_doubling = active.copy()
                    active_and_do_doubling[active] = do_doubling
                    lr[active_and_do_doubling] *= 2

                    lr_mult = lr[active_and_do_doubling]
                    for _ in range(len(x.shape) - 1):
                        lr_mult = lr_mult[:, np.newaxis]

                    new_x_adv_batch_tanh = x_adv_batch_tanh[active_and_do_doubling] + \
                        lr_mult * perturbation_tanh[do_doubling]
                    new_x_adv_batch = self._tanh_to_original(
                        new_x_adv_batch_tanh, clip_min[active_and_do_doubling],
                        clip_max[active_and_do_doubling])
                    _, loss[active_and_do_doubling] = self._loss(
                        new_x_adv_batch, y_batch[active_and_do_doubling])
                    logger.debug('New Average Loss: %f', np.mean(loss))
                    best_lr[loss < best_loss] = lr[loss < best_loss]
                    best_loss[loss < best_loss] = loss[loss < best_loss]

                lr[halving == 1] /= 2

                update_adv = (best_lr[active] > 0)
                logger.debug(
                    'Number of adversarial samples to be finally updated: %i',
                    int(np.sum(update_adv)))

                if np.sum(update_adv) > 0:
                    active_and_update_adv = active.copy()
                    active_and_update_adv[active] = update_adv
                    best_lr_mult = best_lr[active_and_update_adv]
                    for _ in range(len(x.shape) - 1):
                        best_lr_mult = best_lr_mult[:, np.newaxis]

                    x_adv_batch_tanh[active_and_update_adv] = x_adv_batch_tanh[active_and_update_adv] + \
                        best_lr_mult * perturbation_tanh[update_adv]
                    x_adv_batch[
                        active_and_update_adv] = self._tanh_to_original(
                            x_adv_batch_tanh[active_and_update_adv],
                            clip_min[active_and_update_adv],
                            clip_max[active_and_update_adv])
                    z[active_and_update_adv], loss[
                        active_and_update_adv] = self._loss(
                            x_adv_batch[active_and_update_adv],
                            y_batch[active_and_update_adv])
                    attack_success = (loss <= 0)

            # Update depending on attack success:
            x_adv_batch[~attack_success] = x_batch[~attack_success]
            x_adv[batch_index_1:batch_index_2] = x_adv_batch

        adv_preds = np.argmax(self._predict(x_adv), axis=1)
        if self.targeted:
            rate = np.sum(adv_preds == np.argmax(y, axis=1)) / x_adv.shape[0]
        else:
            preds = np.argmax(self._predict(x), axis=1)
            rate = np.sum(adv_preds != preds) / x_adv.shape[0]
        logger.info('Success rate of C&W attack: %.2f%%', 100 * rate)

        return x_adv
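Both C&W variants above rely on the same halving/doubling line search over the learning rate. A standalone toy sketch of that logic, applied to a one-dimensional quadratic loss instead of the attack objective, might look as follows; all names and constants are illustrative.

def toy_loss(x):
    # Stand-in for the attack objective; minimised at x = 3
    return (x - 3.0) ** 2

x = 0.0
grad = 2.0 * (x - 3.0)
step = -grad                 # descent direction
lr = 4.0                     # deliberately too large
prev_loss = toy_loss(x)

# First, halve the learning rate until the step actually decreases the loss
halvings = 0
for _ in range(10):
    if toy_loss(x + lr * step) < prev_loss:
        break
    lr /= 2.0
    halvings += 1

# If no halving was needed, keep doubling while the loss keeps improving
if halvings == 0:
    for _ in range(10):
        if toy_loss(x + 2.0 * lr * step) >= toy_loss(x + lr * step):
            break
        lr *= 2.0

x = x + lr * step
print("chosen learning rate:", lr, "new loss:", toy_loss(x))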
    def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :param y: Target values (class labels) one-hot-encoded of shape `(nb_samples, nb_classes)` or indices of shape
                  (nb_samples,). Only provide this parameter if you'd like to use true labels when crafting adversarial
                  samples. Otherwise, model predictions are used as labels to avoid the "label leaking" effect
                  (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
        :param mask: An array with a mask broadcastable to input `x` defining where to apply adversarial perturbations.
                     Shape needs to be broadcastable to the shape of x and can also be of the same shape as `x`. Any
                     features for which the mask is zero will not be adversarially perturbed.
        :type mask: `np.ndarray`
        :return: An array holding the adversarial examples.
        """
        x_adv = x.astype(ART_NUMPY_DTYPE)
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

        if y is None:
            y = get_labels_np_array(self.estimator.predict(x, batch_size=self.batch_size))

        # Determine correctly predicted samples
        y_pred = self.estimator_orig.predict(x.astype(ART_NUMPY_DTYPE))
        sample_is_robust = np.argmax(y_pred, axis=1) == np.argmax(y, axis=1)

        # Untargeted attacks
        for attack in self.attacks:

            # Stop if all samples are misclassified
            if np.sum(sample_is_robust) == 0:
                break

            if attack.targeted:
                attack.set_params(targeted=False)

            x_adv, sample_is_robust = self._run_attack(
                x=x_adv, y=y, sample_is_robust=sample_is_robust, attack=attack, **kwargs,
            )

        # Targeted attacks
        if self.targeted:
            # Labels for targeted attacks
            y_t = np.array([range(y.shape[1])] * y.shape[0])
            y_idx = np.argmax(y, axis=1)
            y_idx = np.expand_dims(y_idx, 1)
            y_t = y_t[y_t != y_idx]
            targeted_labels = np.reshape(y_t, (y.shape[0], -1))

            for attack in self.attacks:

                if attack.targeted is not None:

                    if not attack.targeted:
                        attack.set_params(targeted=True)

                    for i in range(self.estimator.nb_classes - 1):
                        # Stop if all samples are misclassified
                        if np.sum(sample_is_robust) == 0:
                            break

                        target = check_and_transform_label_format(targeted_labels[:, i], self.estimator.nb_classes)

                        x_adv, sample_is_robust = self._run_attack(
                            x=x_adv, y=target, sample_is_robust=sample_is_robust, attack=attack, **kwargs,
                        )

        return x_adv
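The generate method above first runs every configured attack in untargeted mode and then, if `self.targeted` is set, cycles through the remaining classes as targets. A minimal usage sketch with placeholder classifier and data and assumed constructor values:

import numpy as np
from art.attacks.evasion import AutoAttack

# `classifier`, `x_test`, `y_test` are placeholders for a fitted ART classifier
# and test data; eps/eps_step values are illustrative assumptions.
attack = AutoAttack(estimator=classifier, norm=np.inf, eps=0.3, eps_step=0.1,
                    batch_size=32, targeted=False)
x_test_adv = attack.generate(x=x_test, y=y_test)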
Example No. 29
    def generate(self, x, y=None):
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs.
        :type x: `np.ndarray`
        :param y: The labels for the data `x`. Only provide this parameter if you'd like to use true
                  labels when crafting adversarial samples. Otherwise, model predictions are used as labels to avoid the
                  "label leaking" effect (explained in this paper: https://arxiv.org/abs/1611.01236). Default is `None`.
                  Labels should be one-hot-encoded.
        :type y: `np.ndarray`
        :return: An array holding the adversarial examples.
        :rtype: `np.ndarray`
        """

        from art.utils import compute_success, get_labels_np_array, projection

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:
                raise ValueError(
                    'Target labels `y` need to be provided for a targeted attack.'
                )

            # Use model predictions as correct outputs
            targets = get_labels_np_array(self.classifier.predict(x))
        else:
            targets = y

        adv_x_best = None
        rate_best = 0.0

        for i_random_init in range(max(1, self.num_random_init)):
            adv_x = x  #.astype(NUMPY_DTYPE)
            noise = np.zeros_like(x)
            for i_max_iter in range(self.max_iter):
                # x, x_init, y, eps, eps_step, project, random_init
                adv_x = self._compute(
                    adv_x, x, targets, self.eps, self.eps_step, self._project,
                    self.num_random_init > 0 and i_max_iter == 0)
                # if self._project:
                #     noise = projection(adv_x - x, self.eps, self.norm)
                #     adv_x = x + noise

                rate = 100 * compute_success(self.classifier, x, targets,
                                             adv_x, self.targeted)
                #logger.info('Success rate of attack step: %.2f%%', rate)

                # Track the perturbation accumulated so far and its norm
                noise = adv_x - x
                noise_norm = 0
                if self.norm == np.inf:
                    noise_norm = np.sign(noise)
                elif self.norm == 1:
                    ind = tuple(range(1, len(noise.shape)))
                    noise_norm = np.sum(np.abs(noise), axis=ind, keepdims=True)
                elif self.norm == 2:
                    ind = tuple(range(1, len(noise.shape)))
                    noise_norm = np.sqrt(
                        np.sum(np.square(noise), axis=ind, keepdims=True))

                TrackedPGD.tracked_x.append(
                    (adv_x, rate, i_max_iter, noise_norm))
                if rate >= 100:
                    break

            rate = 100 * compute_success(self.classifier, x, targets, adv_x,
                                         self.targeted)
            if rate > rate_best or adv_x_best is None:
                rate_best = rate
                adv_x_best = adv_x
            if rate >= 100:
                break

        logger.info('Success rate of attack: %.2f%%', rate_best)

        return adv_x_best
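TrackedPGD is not part of ART itself; judging by the code above, it extends a PGD-style attack and records (adversarial batch, success rate, iteration, noise norm) tuples in the class attribute tracked_x. A sketch of how those iterates might be inspected afterwards, assuming the constructor mirrors the PGD attack it wraps and that `classifier` and `x_test` exist:

# Constructor arguments and names below are assumptions for illustration only.
TrackedPGD.tracked_x = []                      # reset the class-level trace
attack = TrackedPGD(classifier, eps=0.3, eps_step=0.05, max_iter=20)
x_test_adv = attack.generate(x_test)

for adv_x, rate, i_iter, noise_norm in TrackedPGD.tracked_x:
    print("iteration %i: success rate %.2f%%" % (i_iter, rate))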
Example No. 30
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array. This attack requires a large amount of memory and
        therefore accepts only a single sample as input, i.e. a batch of size 1.

        :param x: An array of a single original input sample.
        :param y: An array of a single target label.
        :return: An array with the adversarial examples.
        """
        y = check_and_transform_label_format(y, self.estimator.nb_classes)

        if y is None:
            # Throw error if attack is targeted, but no targets are provided
            if self.targeted:
                raise ValueError(
                    "Target labels `y` need to be provided for a targeted attack."
                )

            logger.info("Using model predictions as correct labels for FGM.")
            y = get_labels_np_array(
                self.estimator.predict(x, batch_size=self.batch_size))
        else:
            self.targeted = True

        if x.shape[0] > 1 or y.shape[0] > 1:
            raise ValueError(
                "This attack only accepts a single sample as input.")

        if x.ndim != 4:
            raise ValueError(
                "Unrecognized input dimension. Shadow Attack can only be applied to image data."
            )

        x = x.astype(ART_NUMPY_DTYPE)
        x_batch = np.repeat(x, repeats=self.batch_size,
                            axis=0).astype(ART_NUMPY_DTYPE)
        x_batch = x_batch + np.random.normal(
            scale=self.sigma, size=x_batch.shape).astype(ART_NUMPY_DTYPE)
        y_batch = np.repeat(y, repeats=self.batch_size, axis=0)

        perturbation = (
            np.random.uniform(low=self.estimator.clip_values[0],
                              high=self.estimator.clip_values[1],
                              size=x.shape).astype(ART_NUMPY_DTYPE) -
            (self.estimator.clip_values[1] - self.estimator.clip_values[0]) /
            2)

        for _ in trange(self.nb_steps,
                        desc="Shadow attack",
                        disable=not self.verbose):
            gradients_ce = np.mean(
                self.estimator.loss_gradient(
                    x=x_batch + perturbation, y=y_batch, sampling=False) *
                (1 - 2 * int(self.targeted)),
                axis=0,
                keepdims=True,
            )
            gradients = gradients_ce - self._get_regularisation_loss_gradients(
                perturbation)
            perturbation += self.learning_rate * gradients

        x_p = x + perturbation
        x_adv = np.clip(
            x_p,
            a_min=self.estimator.clip_values[0],
            a_max=self.estimator.clip_values[1]).astype(ART_NUMPY_DTYPE)

        return x_adv
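The generate method above operates on one image at a time, replicating it batch_size times with Gaussian noise before optimising the perturbation. A minimal usage sketch with placeholder names and assumed constructor values and target class:

import numpy as np
from art.attacks.evasion import ShadowAttack

# `classifier` is a placeholder for a fitted ART image classifier and `x_test`
# for image data; constructor values and the target class are assumptions.
attack = ShadowAttack(classifier, sigma=0.5, nb_steps=300, learning_rate=0.1,
                      batch_size=400, targeted=True)

# generate() above accepts exactly one sample and one target label at a time.
x_single = x_test[0:1]
y_target = np.zeros((1, classifier.nb_classes))
y_target[0, 3] = 1.0
x_adv_single = attack.generate(x=x_single, y=y_target)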