Example #1
    def test_scikitlearn(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, LinearSVC

        from art.classifiers.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnSVC

        scikitlearn_test_cases = {
            LogisticRegression: ScikitlearnLogisticRegression,
            SVC: ScikitlearnSVC,
            LinearSVC: ScikitlearnSVC
        }

        (_, _), (x_test, y_test) = self.iris
        x_test_original = x_test.copy()

        for (model_class, classifier_class) in scikitlearn_test_cases.items():
            model = model_class()
            classifier = classifier_class(model=model, clip_values=(0, 1))
            classifier.fit(x=x_test, y=y_test)

            # Test untargeted attack
            attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1)
            x_test_adv = attack.generate(x_test)
            self.assertFalse((x_test == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertFalse((np.argmax(y_test, axis=1) == preds_adv).all())
            acc = np.sum(
                preds_adv == np.argmax(y_test, axis=1)) / y_test.shape[0]
            logger.info(
                'Accuracy of ' + classifier.__class__.__name__ +
                ' on Iris with PGD adversarial examples: '
                '%.2f%%', (acc * 100))

            # Test targeted attack
            targets = random_targets(y_test, nb_classes=3)
            attack = ProjectedGradientDescent(classifier,
                                              targeted=True,
                                              eps=1,
                                              eps_step=0.1)
            x_test_adv = attack.generate(x_test, **{'y': targets})
            self.assertFalse((x_test == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
            acc = np.sum(
                preds_adv == np.argmax(targets, axis=1)) / y_test.shape[0]
            logger.info(
                'Success rate of ' + classifier.__class__.__name__ +
                ' on targeted PGD on Iris: %.2f%%', (acc * 100))

            # Check that x_test has not been modified by attack and classifier
            self.assertAlmostEqual(float(
                np.max(np.abs(x_test_original - x_test))),
                                   0.0,
                                   delta=0.00001)
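# Standalone check (sketch): random_targets from art.utils, used by the
# targeted tests in these examples, draws for each sample a class different
# from the true label and returns it one-hot encoded.
import numpy as np
from art.utils import random_targets

y_demo = np.eye(3)[np.array([0, 1, 2, 0])]  # one-hot true labels
targets_demo = random_targets(y_demo, nb_classes=3)
assert not np.any(np.argmax(targets_demo, axis=1) == np.argmax(y_demo, axis=1))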
Example #2
    def test_iris_pt(self):
        (_, _), (x_test, y_test) = self.iris
        classifier = get_iris_classifier_pt()

        # Test untargeted attack
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(y_test, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(y_test, axis=1)) / y_test.shape[0]
        logger.info('Accuracy on Iris with PGD adversarial examples: %.2f%%',
                    (acc * 100))

        # Test targeted attack
        targets = random_targets(y_test, nb_classes=3)
        attack = ProjectedGradientDescent(classifier,
                                          targeted=True,
                                          eps=1,
                                          eps_step=0.1)
        x_test_adv = attack.generate(x_test, **{'y': targets})
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
        acc = np.sum(preds_adv == np.argmax(targets, axis=1)) / y_test.shape[0]
        logger.info('Success rate of targeted PGD on Iris: %.2f%%',
                    (acc * 100))
Example #3
    def _test_backend_mnist(self, classifier):
        # Get MNIST
        (x_train, y_train), (x_test, y_test) = self.mnist

        # Test PGD with np.inf norm
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info('Accuracy on adversarial train examples: %.2f%%',
                    acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info('Accuracy on adversarial test examples: %.2f%%', acc * 100)

        # Test PGD with 3 random initialisations
        attack = ProjectedGradientDescent(classifier, num_random_init=3)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info(
            'Accuracy on adversarial train examples with 3 random initialisations: %.2f%%',
            acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info(
            'Accuracy on adversarial test examples with 3 random initialisations: %.2f%%',
            acc * 100)
Example #4
class PGDAttack(AdversarialAttack):
    def __init__(self,
                 model,
                 targeted=False,
                 step_size_iter=.1,
                 max_perturbation=.3,
                 norm_order=np.inf,
                 max_iterations=100,
                 num_random_init=0,
                 batch_size=16):
        super().__init__(model=model)
        self._targeted = targeted
        self._step_size_iter = step_size_iter
        self._max_perturbation = max_perturbation
        self._norm_order = norm_order
        self._max_iterations = max_iterations
        self._num_random_init = num_random_init
        self._method = ProjectedGradientDescent(
            classifier=self.model,
            targeted=self._targeted,
            norm=self._norm_order,
            eps=self._max_perturbation,
            eps_step=self._step_size_iter,
            max_iter=self._max_iterations,
            num_random_init=self._num_random_init,
            batch_size=batch_size)

    def attack_method(self, x, y=None):
        params = {}
        if y is not None:
            params['y'] = y
        return self._method.generate(x=x, **params)
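# Illustrative use of the PGDAttack wrapper above (assumptions: the
# AdversarialAttack base class is importable from the same project, and
# art_classifier is any fitted ART classifier built against the ART version
# this wrapper targets, i.e. one whose PGD accepts classifier=...).
attack = PGDAttack(model=art_classifier,
                   max_perturbation=0.3,
                   step_size_iter=0.05,
                   max_iterations=40)
x_adv = attack.attack_method(x_test)  # untargeted; construct with
                                      # targeted=True and pass y=... to target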
Example #5
    def test_pytorch_iris_pt(self):
        classifier = get_tabular_classifier_pt()

        # Test untargeted attack
        attack = ProjectedGradientDescent(classifier,
                                          eps=1,
                                          eps_step=0.1,
                                          max_iter=5)
        x_test_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris,
                                    axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(
            self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%",
                    (acc * 100))

        # Test targeted attack
        targets = random_targets(self.y_test_iris, nb_classes=3)
        attack = ProjectedGradientDescent(classifier,
                                          targeted=True,
                                          eps=1,
                                          eps_step=0.1,
                                          max_iter=5)
        x_test_adv = attack.generate(self.x_test_iris, **{"y": targets})
        self.assertFalse((self.x_test_iris == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
        acc = np.sum(preds_adv == np.argmax(
            targets, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Success rate of targeted PGD on Iris: %.2f%%",
                    (acc * 100))
Example #6
def pgd_linf(model, X, y, optimizer, epsilon=0.1):
    """Construct PGD (L-inf) adversarial examples for the inputs X."""
    # Wrap the PyTorch model for ART; a standard cross-entropy loss is
    # assumed here.
    classifier = PyTorchClassifier(
        model=model,
        loss=torch.nn.CrossEntropyLoss(),
        optimizer=optimizer,
        input_shape=(1, 28, 28),
        nb_classes=10,
        device_type='gpu',
    )
    attack = ProjectedGradientDescent(classifier=classifier, eps=epsilon)
    X_adv = attack.generate(x=X.numpy(), y=y.numpy())
    return torch.Tensor(X_adv)
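# Illustrative call of pgd_linf on a dummy MNIST-shaped batch; the toy
# model, optimizer and random data below are stand-ins (assumptions).
import torch
import torch.nn as nn

toy_model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
toy_optimizer = torch.optim.SGD(toy_model.parameters(), lr=0.1)
X_batch = torch.rand(8, 1, 28, 28)
y_batch = torch.randint(0, 10, (8,))

X_batch_adv = pgd_linf(toy_model, X_batch, y_batch, toy_optimizer,
                       epsilon=0.1)
robust_acc = (toy_model(X_batch_adv).argmax(dim=1) == y_batch).float().mean()
print('robust accuracy on batch:', robust_acc.item())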
Example #7
    def test_iris_k_unbounded(self):
        (_, _), (x_test, y_test) = self.iris
        classifier, _ = get_iris_classifier_kr()

        # Recreate a classifier without clip values
        classifier = KerasClassifier(model=classifier._model,
                                     use_logits=False,
                                     channel_index=1)
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.2)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv > 1).any())
        self.assertTrue((x_test_adv < 0).any())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(y_test, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(y_test, axis=1)) / y_test.shape[0]
        logger.info('Accuracy on Iris with PGD adversarial examples: %.2f%%',
                    (acc * 100))
Example #8
    def evaluate_pgd(self, data_loader, num_iter=40):
        """Adversarial evaluation by PGD"""
        norm, eps = np.inf, attack_configs['PGD'][self.dataset]['epsilon']
        eps_step = 2 * eps / num_iter
        adv_crafter = ProjectedGradientDescent(self.classifier,
                                               norm=norm,
                                               eps=eps,
                                               eps_step=eps_step,
                                               max_iter=num_iter,
                                               random_init=True)

        data_iter = iter(data_loader)
        examples, labels = next(data_iter)
        examples, labels = examples.cpu().numpy(), labels.cpu().numpy()
        labels_one_hot = np.eye(self.nb_classes)[labels]
        examples_adv = adv_crafter.generate(examples, y=labels_one_hot)

        preds = np.argmax(self.classifier.predict(examples_adv), axis=1)
        acc = np.sum(preds == labels) / labels.shape[0]
        return acc
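# attack_configs is defined outside this snippet; a plausible shape,
# purely as an assumption, is:
attack_configs = {
    'PGD': {
        'mnist': {'epsilon': 0.3},
        'cifar10': {'epsilon': 8 / 255},
    },
}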
Example #9
    def test_scikitlearn(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, LinearSVC

        from art.classifiers.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnSVC

        scikitlearn_test_cases = {
            LogisticRegression: ScikitlearnLogisticRegression,
            SVC: ScikitlearnSVC,
            LinearSVC: ScikitlearnSVC,
        }

        (_, _), (x_test, y_test) = self.iris
        x_test_original = x_test.copy()

        for (model_class, classifier_class) in scikitlearn_test_cases.items():
            model = model_class()
            classifier = classifier_class(model=model, clip_values=(0, 1))
            classifier.fit(x=x_test, y=y_test)

            # Test untargeted attack
            attack = ProjectedGradientDescent(classifier,
                                              eps=1,
                                              eps_step=0.1,
                                              max_iter=5)
            x_test_adv = attack.generate(x_test)
            self.assertFalse((np.array(x_test) == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertFalse((np.argmax(np.array(y_test),
                                        axis=1) == preds_adv).all())
            acc = np.sum(
                preds_adv == np.argmax(np.array(y_test), axis=1)) / len(y_test)
            logger.info(
                "Accuracy of " + classifier.__class__.__name__ +
                " on Iris with PGD adversarial examples: "
                "%.2f%%",
                (acc * 100),
            )
Example #10
    def test_pytorch_iris_pt(self):
        (_, _), (x_test, y_test) = self.iris
        classifier = get_tabular_classifier_pt()

        # Test untargeted attack
        attack = ProjectedGradientDescent(classifier,
                                          eps=1,
                                          eps_step=0.1,
                                          max_iter=5)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((np.array(x_test) == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(np.array(y_test),
                                    axis=1) == preds_adv).all())
        acc = np.sum(
            preds_adv == np.argmax(np.array(y_test), axis=1)) / len(y_test)
        logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%",
                    (acc * 100))
Example #11
class PGD:
    """
    Class for adversarial attacks based on projected gradient descent (PGD).
    The implementation of PGD in ART projects onto a feasible region after
    each iteration. However, random restarting is not used in this
    implementation; not using random restarts is the difference between the
    PGD implemented in ART and the one described by Madry et al.

    This adversarial attack subsumes the iterative FGSM.
    """
    def __init__(self, model, loss_criterion, norm=np.inf, batch_size=128):
        self.wrapped_pytorch_model = wrapModel(model, loss_criterion)
        self.norm = norm
        self.batch_size = batch_size
        self.attack = ProjectedGradientDescent(self.wrapped_pytorch_model,
                                               norm=norm,
                                               random_init=False,
                                               batch_size=batch_size)

        # Use GPU for computation if it is available
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def generatePerturbation(self, data, budget, max_iter=15):
        images, _ = data

        # eps_step is not allowed to be larger than budget according to the
        # documentation of ART.
        eps_step = budget / 5
        images_adv = self.attack.generate(x=images.cpu().numpy(),
                                          norm=self.norm,
                                          eps=budget,
                                          eps_step=eps_step,
                                          max_iter=max_iter,
                                          batch_size=self.batch_size)
        images_adv = torch.from_numpy(images_adv)

        # The output to be returned should be loaded on an appropriate device.
        return images_adv.to(self.device)
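# Illustrative use of the PGD class above (assumptions: wrapModel from the
# same script returns an ART-compatible classifier, and the installed ART
# version exposes the random_init constructor argument used above; the toy
# model and random data are stand-ins).
import numpy as np
import torch
import torch.nn as nn

toy_model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
attacker = PGD(toy_model, nn.CrossEntropyLoss(), norm=np.inf, batch_size=64)

images = torch.rand(16, 1, 28, 28)
labels = torch.randint(0, 10, (16,))
images_adv = attacker.generatePerturbation((images, labels), budget=0.3)
print(images_adv.shape)  # torch.Size([16, 1, 28, 28])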
Example #12
    def test_keras_iris_unbounded(self):
        classifier = get_tabular_classifier_kr()

        # Recreate a classifier without clip values
        classifier = KerasClassifier(model=classifier._model,
                                     use_logits=False,
                                     channel_index=1)
        attack = ProjectedGradientDescent(classifier,
                                          eps=1,
                                          eps_step=0.2,
                                          max_iter=5)
        x_test_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_adv).all())
        self.assertTrue((x_test_adv > 1).any())
        self.assertTrue((x_test_adv < 0).any())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris,
                                    axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(
            self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%",
                    (acc * 100))
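# Note: because the classifier above is rebuilt without clip_values, ART only
# projects the PGD iterates onto the eps-ball and does not clip them to a
# data range, so adversarial samples may leave [0, 1]; this is exactly what
# the (x_test_adv > 1).any() and (x_test_adv < 0).any() assertions verify.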
Example #13
    # ## training for CIFAR
    # classifier = KerasClassifier(model=model, use_logits=False)
    # attack = ProjectedGradientDescent(classifier, eps=8/255, eps_step=2/255, max_iter=10, batch_size=512)

    ## training for SVHN
    classifier = KerasClassifier(clip_values=(-0.5, 0.5),
                                 model=model,
                                 use_logits=False)
    attack = ProjectedGradientDescent(classifier,
                                      eps=8 / 255,
                                      eps_step=1 / 255,
                                      max_iter=20,
                                      batch_size=512)

    x_test_pgd = attack.generate(x_test, y_test)
    # np.save('./data/' + dataset + '_data/model/' + model_name + '_y_' + attack_name + '.npy', x_test_pgd)

    # Evaluate the benign trained model on the adversarial test set
    labels_true = np.argmax(y_test, axis=1)
    labels_pgd = np.argmax(classifier.predict(x_test_pgd), axis=1)
    print('Accuracy on original PGD adversarial samples: %.2f%%' %
          (np.sum(labels_pgd == labels_true) / x_test.shape[0] * 100))

    trainer = AdversarialTrainer(classifier, attack, ratio=1.0)
    trainer.fit(x_train, y_train, nb_epochs=60, batch_size=1024)

    classifier.save(filename='adv_' + model_name + '.h5',
                    path='../data/' + dataset + '_data/model/')

    # Evaluate the adversarially trained model on clean test set
    labels_true = np.argmax(y_test, axis=1)
Example #14
        adv_crafter_deepfool = DeepFool(classifier,
                                        batch_size=batch_size,
                                        epsilon=epsilon)
        x_test_adv = adv_crafter_deepfool.generate(x=x_test / 255.0)
        predictions = classifier.predict(x_test_adv * 255.0)
        print(np.argmax(predictions, axis=1))
        accuracy = np.sum(
            np.argmax(predictions, axis=1) == y_test) / len(y_test)
        print('Accuracy on adversarial test examples: {}%'.format(accuracy *
                                                                  100))
        # pgd 20
        adv_crafter_pgd_20 = ProjectedGradientDescent(classifier,
                                                      eps=epsilon,
                                                      eps_step=0.00775,
                                                      max_iter=20,
                                                      batch_size=batch_size)
        x_test_adv = adv_crafter_pgd_20.generate(x=x_test / 255.0)
        # print(x_test_adv)
        predictions = classifier.predict(x_test_adv * 255.0)
        accuracy = np.sum(
            np.argmax(predictions, axis=1) == y_test) / len(y_test)
        print('Accuracy on adversarial test examples: {}%'.format(accuracy *
                                                                  100))

        # C&W 20
        # adv_crafter_cwinf = CarliniLInfMethod(classifier, eps=epsilon, learning_rate=epsilon/10, max_iter=20, batch_size=batch_size)
        # x_test_adv = adv_crafter_cwinf.generate(x=x_test/255.0)

        # predictions = classifier.predict(x_test_adv*255.0)
        # accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
        # print('Accuracy after C&W attack: {}%'.format(accuracy * 100))
Example #15
    def _test_backend_mnist(self, classifier, x_train, y_train, x_test,
                            y_test):
        x_test_original = x_test.copy()

        # Test PGD with np.inf norm
        attack = ProjectedGradientDescent(classifier,
                                          eps=1,
                                          eps_step=0.1,
                                          max_iter=5)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / len(y_train)
        logger.info("Accuracy on adversarial train examples: %.2f%%",
                    acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test),
                                                        axis=1)) / len(y_test)
        logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

        # Test PGD with 3 random initialisations
        attack = ProjectedGradientDescent(classifier,
                                          num_random_init=3,
                                          max_iter=5)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / len(y_train)
        logger.info(
            "Accuracy on adversarial train examples with 3 random initialisations: %.2f%%",
            acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test),
                                                        axis=1)) / len(y_test)
        logger.info(
            "Accuracy on adversarial test examples with 3 random initialisations: %.2f%%",
            acc * 100)

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(
            np.max(np.abs(np.array(x_test_original) - np.array(x_test)))),
                               0.0,
                               delta=0.00001)
Example #16
)

# Create a toy Keras CNN architecture & wrap it under ART interface
classifier = KerasClassifier(build_model(),
                             clip_values=(0, 1),
                             use_logits=False)

# Create the attack for the adversarial trainer, crafting adversarial examples on the target model
pgd = ProjectedGradientDescent(classifier,
                               eps=8,
                               eps_step=2,
                               max_iter=10,
                               num_random_init=20)

# Create some adversarial samples for evaluation
x_test_pgd = pgd.generate(x_test)
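# art_datagen is created before this excerpt begins (the lone ')' at the top
# is the tail of that definition). A plausible construction, purely as an
# assumption, wraps a Keras ImageDataGenerator:
from keras.preprocessing.image import ImageDataGenerator
from art.data_generators import KerasDataGenerator

datagen = ImageDataGenerator(horizontal_flip=True)
art_datagen = KerasDataGenerator(
    datagen.flow(x_train, y_train, batch_size=128),
    size=x_train.shape[0],
    batch_size=128,
)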

# Create adversarial trainer and perform adversarial training
adv_trainer = AdversarialTrainer(classifier, attacks=pgd, ratio=1.0)
adv_trainer.fit_generator(art_datagen, nb_epochs=83)

# Evaluate the adversarially trained model on clean test set
labels_true = np.argmax(y_test, axis=1)
labels_test = np.argmax(classifier.predict(x_test), axis=1)
print("Accuracy test set: %.2f%%" %
      (np.sum(labels_test == labels_true) / x_test.shape[0] * 100))

# Evaluate the adversarially trained model on original adversarial samples
labels_pgd = np.argmax(classifier.predict(x_test_pgd), axis=1)
print("Accuracy on original PGD adversarial samples: %.2f%%" %
      (np.sum(labels_pgd == labels_true) / x_test.shape[0] * 100))
Example #17
    def test_scikitlearn(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, LinearSVC

        from art.classifiers.scikitlearn import SklearnClassifier

        scikitlearn_test_cases = [
            LogisticRegression(solver="lbfgs", multi_class="auto"),
            SVC(gamma="auto"),
            LinearSVC(),
        ]

        x_test_original = self.x_test_iris.copy()

        for model in scikitlearn_test_cases:
            classifier = SklearnClassifier(model=model, clip_values=(0, 1))
            classifier.fit(x=self.x_test_iris, y=self.y_test_iris)

            # Test untargeted attack
            attack = ProjectedGradientDescent(classifier,
                                              eps=1,
                                              eps_step=0.1,
                                              max_iter=5)
            x_test_adv = attack.generate(self.x_test_iris)
            self.assertFalse((self.x_test_iris == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertFalse((np.argmax(self.y_test_iris,
                                        axis=1) == preds_adv).all())
            acc = np.sum(preds_adv == np.argmax(
                self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
            logger.info(
                "Accuracy of " + classifier.__class__.__name__ +
                " on Iris with PGD adversarial examples: "
                "%.2f%%",
                (acc * 100),
            )

            # Test targeted attack
            targets = random_targets(self.y_test_iris, nb_classes=3)
            attack = ProjectedGradientDescent(classifier,
                                              targeted=True,
                                              eps=1,
                                              eps_step=0.1,
                                              max_iter=5)
            x_test_adv = attack.generate(self.x_test_iris, **{"y": targets})
            self.assertFalse((self.x_test_iris == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
            acc = np.sum(preds_adv == np.argmax(
                targets, axis=1)) / self.y_test_iris.shape[0]
            logger.info(
                "Success rate of " + classifier.__class__.__name__ +
                " on targeted PGD on Iris: %.2f%%", (acc * 100))

            # Check that x_test has not been modified by attack and classifier
            self.assertAlmostEqual(float(
                np.max(np.abs(x_test_original - self.x_test_iris))),
                                   0.0,
                                   delta=0.00001)
Example #18
adv_inputs = adv_inputs[:nb_elements]

out1 = model2.predict(inputs)

#adv1 = atk.generate(x_train[:5])
#out_adv1 = model2.predict(adv1)
#success = compute_success(classifier, x_train[:5],y_train2[:5],adv1)
#print(success)

#plt.imshow(x_train[0].squeeze(), cmap='gray')
#plt.show()
#plt.imshow(adv1[0].squeeze(), cmap='gray')
#plt.show()

out2 = model2.predict(adv_inputs)
adv2 = atk.generate(inputs, out2)
out_adv2 = model2.predict(adv2)

success = compute_success(classifier, inputs, out1, adv2)
success_targeted = compute_success(classifier,
                                   inputs,
                                   out2,
                                   adv2,
                                   targeted=True)

print(success, success_targeted)

good_targeted = np.argmax(out_adv2, axis=1) == np.argmax(out2, axis=1)
good_adv, good_clean = adv2[good_targeted], inputs[good_targeted]

plt.imshow(good_adv[0].squeeze(), cmap='gray')