def test_with_preprocessing(self):

        session = tf.Session()
        k.set_session(session)

        comp_params = {"loss": 'categorical_crossentropy',
                       "optimizer": 'adam',
                       "metrics": ['accuracy']}

        # get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 100
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train = X_train[:nb_train], Y_train[:nb_train]
        X_test, Y_test = X_test[:nb_test], Y_test[:nb_test]
        im_shape = X_train[0].shape

        # get classifier
        classifier = CNN(im_shape, act="relu", defences=["featsqueeze1"])
        classifier.compile(comp_params)
        classifier.fit(X_train, Y_train, epochs=1, batch_size=batch_size, verbose=0)
        scores = classifier.evaluate(X_train, Y_train)
        print("\naccuracy on training set: %.2f%%" % (scores[1] * 100))
        scores = classifier.evaluate(X_test, Y_test)
        print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

        attack_params = {"verbose": 0,
                         "clip_min": 0.,
                         "clip_max": 1.,
                         "eps": 1.}

        attack = FastGradientMethod(classifier, session)
        X_train_adv = attack.generate(X_train, **attack_params)
        X_test_adv = attack.generate(X_test, **attack_params)

        self.assertFalse((X_train == X_train_adv).all())
        self.assertFalse((X_test == X_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(X_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(X_test_adv))

        self.assertFalse((Y_train == train_y_pred).all())
        self.assertFalse((Y_test == test_y_pred).all())

        scores = classifier.evaluate(X_train_adv, Y_train)
        print('\naccuracy on adversarial train examples: %.2f%%' % (scores[1] * 100))

        scores = classifier.evaluate(X_test_adv, Y_test)
        print('\naccuracy on adversarial test examples: %.2f%%' % (scores[1] * 100))
    def test_mnist_untargeted(self):
        session = tf.Session()
        k.set_session(session)

        comp_params = {"loss": 'categorical_crossentropy',
                       "optimizer": 'adam',
                       "metrics": ['accuracy']}

        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train = X_train[:nb_train], Y_train[:nb_train]
        X_test, Y_test = X_test[:nb_test], Y_test[:nb_test]
        im_shape = X_train[0].shape

        # Get classifier
        classifier = CNN(im_shape, act="relu")
        classifier.compile(comp_params)
        classifier.fit(X_train, Y_train, epochs=1, batch_size=batch_size, verbose=0)
        scores = classifier.evaluate(X_test, Y_test)
        print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

        # Perform attack
        df = SaliencyMapMethod(classifier, sess=session)
        df.set_params(clip_min=0, clip_max=1, theta=1)
        x_test_adv = df.generate(X_test)
        self.assertFalse((X_test == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((Y_test == y_pred).all())

        scores = classifier.evaluate(x_test_adv, Y_test)
        print('\naccuracy on adversarial examples: %.2f%%' % (scores[1] * 100))
Example #3
0
    def test_mnist(self):
        session = tf.Session()
        k.set_session(session)

        comp_params = {"loss": 'categorical_crossentropy',
                       "optimizer": 'adam',
                       "metrics": ['accuracy']}

        # get MNIST
        batch_size, nb_train, nb_test = 10, 10, 10
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train = X_train[:nb_train], Y_train[:nb_train]
        X_test, Y_test = X_test[:nb_test], Y_test[:nb_test]
        im_shape = X_train[0].shape

        # get classifier
        classifier = CNN(im_shape, act="relu")
        classifier.compile(comp_params)
        classifier.fit(X_train, Y_train, epochs=1, batch_size=batch_size, verbose=0)
        scores = classifier.evaluate(X_test, Y_test)
        print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

        attack_params = {"verbose": 2,
                         "clip_min": 0.,
                         "clip_max": 1,
                         "attacker": "deepfool"}

        attack = UniversalPerturbation(classifier, session)
        x_train_adv = attack.generate(X_train, **attack_params)
        self.assertTrue((attack.fooling_rate >= 0.2) or not attack.converged)

        x_test_adv = X_test + attack.v
        self.assertFalse((X_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((Y_test == test_y_pred).all())
        self.assertFalse((Y_train == train_y_pred).all())

        scores = classifier.evaluate(x_train_adv, Y_train)
        print('\naccuracy on adversarial train examples: %.2f%%' % (scores[1] * 100))

        scores = classifier.evaluate(x_test_adv, Y_test)
        print('\naccuracy on adversarial test examples: %.2f%%' % (scores[1] * 100))
Example #4
0
    def test_mnist(self):
        session = tf.Session()
        k.set_session(session)

        comp_params = {
            "loss": 'categorical_crossentropy',
            "optimizer": 'adam',
            "metrics": ['accuracy']
        }

        # get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train = X_train[:nb_train], Y_train[:nb_train]
        X_test, Y_test = X_test[:nb_test], Y_test[:nb_test]
        im_shape = X_train[0].shape

        # get classifier
        classifier = CNN(im_shape, act="relu")
        classifier.compile(comp_params)
        classifier.fit(X_train,
                       Y_train,
                       epochs=1,
                       batch_size=batch_size,
                       verbose=0)
        scores = classifier.evaluate(X_test, Y_test)
        print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

        df = CarliniL2Method(classifier,
                             sess=session,
                             targeted=False,
                             max_iterations=100,
                             binary_search_steps=2,
                             learning_rate=1e-2,
                             initial_const=1)
        params = {
            'y_val':
            random_targets(Y_test,
                           classifier.model.get_output_shape_at(-1)[-1])
        }
        x_test_adv = df.generate(X_test, **params)
        self.assertFalse((X_test == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((Y_test == y_pred).all())

        scores = classifier.evaluate(x_test_adv, Y_test)
        print('\naccuracy on adversarial examples: %.2f%%' % (scores[1] * 100))
Example #5
0
    def fit(self, x_val, y_val, **kwargs):
        """
        Train a model adversarially. Each attack specified when creating the AdversarialTrainer is applied to all
        samples in the dataset, and only the successful ones (on the source model) are kept for data augmentation.

        :param x_val: Training set
        :type x_val: `np.ndarray`
        :param y_val: Labels
        :type y_val: `np.ndarray`
        :param kwargs: Dictionary of parameters to be passed on to the `fit` method of the classifier
        :type kwargs: `dict`
        :return: `None`
        """
        x_augmented = list(x_val.copy())
        y_augmented = list(y_val.copy())

        # Generate adversarial samples for each attack
        for i, attack in enumerate(self.attacks):
            # Fit the classifier to be used for the attack if needed
            if hasattr(attack.classifier, 'is_fitted'):
                if not attack.classifier.is_fitted:
                    attack.classifier.fit(x_val, y_val, **kwargs)
            else:
                attack.classifier.fit(x_val, y_val, **kwargs)

            # Predict new labels for the adversarial samples generated
            x_adv = attack.generate(x_val, **self.attacks[attack])
            y_pred = get_labels_np_array(attack.classifier.predict(x_adv))
            x_adv = x_adv[
                np.argmax(y_pred, axis=1) != np.argmax(y_val, axis=1)]
            y_adv = y_pred[
                np.argmax(y_pred, axis=1) != np.argmax(y_val, axis=1)]

            # Only add successful attacks to augmented dataset
            x_augmented.extend(list(x_adv))
            y_augmented.extend(list(y_adv))

        # Fit the model with the extended dataset
        self.classifier.fit(np.array(x_augmented), np.array(y_augmented),
                            **kwargs)
        self.x = x_augmented
        self.y = y_augmented
    def test_mnist_targeted(self):
        session = tf.Session()
        k.set_session(session)

        comp_params = {"loss": 'categorical_crossentropy',
                       "optimizer": 'adam',
                       "metrics": ['accuracy']}

        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]
        im_shape = x_train[0].shape

        # Get classifier
        classifier = CNN(im_shape, act="relu")
        classifier.compile(comp_params)
        classifier.fit(x_train, y_train, epochs=1, batch_size=batch_size, verbose=0)
        scores = classifier.evaluate(x_test, y_test)
        print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

        # Generate random target classes
        import numpy as np
        nb_classes = np.unique(np.argmax(y_test, axis=1)).shape[0]
        targets = np.random.randint(nb_classes, size=nb_test)
        while (targets == np.argmax(y_test, axis=1)).any():
            targets = np.random.randint(nb_classes, size=nb_test)

        # Perform attack
        df = SaliencyMapMethod(classifier, sess=session, clip_min=0, clip_max=1, theta=1)
        x_test_adv = df.generate(x_test, y_val=targets)
        self.assertFalse((x_test == x_test_adv).all())

        y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == y_pred).all())

        scores = classifier.evaluate(x_test_adv, y_test)
        print('\naccuracy on adversarial examples: %.2f%%' % (scores[1] * 100))