Example #1
    def test_krclassifier(self):
        """
        Second test with the KerasClassifier.
        :return:
        """
        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]

        # Create simple CNN
        model = Sequential()
        model.add(Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=(28, 28, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(10, activation='softmax'))

        model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                      metrics=['accuracy'])

        # Get classifier
        krc = KerasClassifier((0, 1), model, use_logits=False)
        krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=2)

        # Attack
        nf = NewtonFool(krc)
        nf.set_params(max_iter=5)
        x_test_adv = nf.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        y_pred = krc.predict(x_test)
        y_pred_adv = krc.predict(x_test_adv)
        y_pred_bool = y_pred.max(axis=1, keepdims=True) == y_pred
        y_pred_max = y_pred.max(axis=1)
        y_pred_adv_max = y_pred_adv[y_pred_bool]
        self.assertTrue((y_pred_max >= y_pred_adv_max).all())
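
Note on the final assertions above: NewtonFool is expected to lower the confidence the model assigns to each sample's originally predicted class. An equivalent, more explicit check (a sketch in plain NumPy, assuming a unique maximum per row; not part of the original test):

    pred_classes = np.argmax(y_pred, axis=1)
    orig_conf = y_pred[np.arange(len(y_pred)), pred_classes]         # confidence before the attack
    adv_conf = y_pred_adv[np.arange(len(y_pred_adv)), pred_classes]  # confidence after the attack
    assert (orig_conf >= adv_conf).all()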
Example #2
    def test_emp_robustness_mnist(self):
        # Get MNIST
        (x_train, y_train), (_, _), _, _ = load_mnist()
        x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]

        # Get classifier
        classifier = self._cnn_mnist_k([28, 28, 1])
        classifier.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epochs=2)

        # Compute minimal perturbations
        params = {"eps_step": 1.1}
        emp_robust = empirical_robustness(classifier, x_train, 'fgsm', params)
        self.assertEqual(emp_robust, 0.)

        params = {"eps_step": 1., "eps": 1.}
        emp_robust = empirical_robustness(classifier, x_train, 'fgsm', params)
        self.assertAlmostEqual(emp_robust, 1., 3)

        params = {"eps_step": 0.1, "eps": 0.2}
        emp_robust = empirical_robustness(classifier, x_train, 'fgsm', params)
        self.assertLessEqual(emp_robust, 0.21)
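
    # Background note (our reading of ART's metrics module, stated as an
    # assumption rather than documentation): empirical_robustness reports the
    # average relative perturbation ||x_adv - x|| / ||x|| over samples the
    # attack successfully flips, which matches the assertions above: a step
    # size larger than the budget leaves inputs effectively unchanged
    # (score 0), while eps_step = eps = 1 perturbs maximally (score close to 1).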
    def test_clever_l2_same_target(self):
        batch_size = 100
        (x_train, y_train), (x_test, _), _, _ = load_mnist()

        # Get the classifier
        krc = self._create_krclassifier()
        krc.fit(x_train,
                y_train,
                batch_size=batch_size,
                nb_epochs=2,
                verbose=0)

        scores = clever(krc,
                        x_test[0],
                        5,
                        5,
                        3,
                        2,
                        target=np.argmax(krc.predict(x_test[:1])),
                        c_init=1,
                        pool_factor=10)
        self.assertIsNone(
            scores[0],
            msg="Clever scores for the predicted class should be `None`.")
    def test_save_load_mlp(self):
        NB_TRAIN = 100
        NB_TEST = 10

        comp_params = {'loss': 'categorical_crossentropy',
                       'optimizer': 'adam',
                       'metrics': ['accuracy']}

        session = tf.Session()
        keras.backend.set_session(session)

        # get MNIST
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train, X_test, Y_test = X_train[:NB_TRAIN], Y_train[:NB_TRAIN], X_test[:NB_TEST], Y_test[:NB_TEST]
        im_shape = X_train[0].shape

        classifier = MLP(im_shape, act="brelu")
        classifier.compile(comp_params)

        # Fit the classifier
        classifier.fit(X_train, Y_train, epochs=1, batch_size=BATCH_SIZE)
        path = "./tests/save/mlp/"

        # Test saving
        save_classifier(classifier, path)

        self.assertTrue(os.path.isfile(path + "model.json"))
        self.assertTrue(os.path.getsize(path + "model.json") > 0)
        self.assertTrue(os.path.isfile(path + "weights.h5"))
        self.assertTrue(os.path.getsize(path + "weights.h5") > 0)

        # Test loading
        loaded_classifier = load_classifier(path)
        scores = classifier.evaluate(X_test, Y_test)
        scores_loaded = loaded_classifier.evaluate(X_test, Y_test)
        # evaluate() returns a list of metrics here; compare them elementwise
        for score, score_loaded in zip(scores, scores_loaded):
            self.assertAlmostEqual(score, score_loaded)
    def test_clever_l2_no_target(self):
        batch_size = 100
        (x_train, y_train), (x_test, _), _, _ = load_mnist()

        # Get the classifier
        krc = self._create_krclassifier()
        krc.fit(x_train,
                y_train,
                batch_size=batch_size,
                nb_epochs=2,
                verbose=0)

        scores = clever(krc,
                        x_test[0],
                        5,
                        5,
                        3,
                        2,
                        target=None,
                        c_init=1,
                        pool_factor=10)
        logger.info("Clever scores for n-1 classes: %s %s", str(scores),
                    str(scores.shape))
        self.assertEqual(scores.shape, (krc.nb_classes - 1, ))
    def test_mnist(self):
        session = tf.Session()
        keras.backend.set_session(session)

        # get MNIST
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train, X_test, Y_test = X_train[:NB_TRAIN], Y_train[:NB_TRAIN], X_test[:NB_TEST], Y_test[:NB_TEST]
        im_shape = X_train[0].shape

        classifier = CNN(im_shape, act="relu")
        classifier.compile({
            'loss': 'categorical_crossentropy',
            'optimizer': 'adam',
            'metrics': ['accuracy']
        })

        # Fit the classifier
        classifier.fit(X_train, Y_train, epochs=1, batch_size=BATCH_SIZE)
        scores = classifier.evaluate(X_test, Y_test)
        print("\naccuracy: %.2f%%" % (scores[1] * 100))
    def setUpClass(cls):
        k.set_learning_phase(1)

        # Get MNIST
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train, x_test, y_test = x_train[:NB_TRAIN], y_train[:NB_TRAIN], x_test[:NB_TEST], y_test[:NB_TEST]
        cls.mnist = (x_train, y_train), (x_test, y_test)

        # Keras classifier
        cls.classifier_k = cls._cnn_mnist_k([28, 28, 1])
        cls.classifier_k.fit(x_train,
                             y_train,
                             batch_size=BATCH_SIZE,
                             nb_epochs=2)

        scores = cls.classifier_k._model.evaluate(x_train, y_train)
        logger.info('[Keras, MNIST] Accuracy on training set: %.2f%%',
                    (scores[1] * 100))
        scores = cls.classifier_k._model.evaluate(x_test, y_test)
        logger.info('[Keras, MNIST] Accuracy on test set: %.2f%%',
                    (scores[1] * 100))

        # Create basic CNN on MNIST using TensorFlow
        cls.classifier_tf = cls._cnn_mnist_tf([28, 28, 1])
        cls.classifier_tf.fit(x_train,
                              y_train,
                              nb_epochs=2,
                              batch_size=BATCH_SIZE)

        scores = get_labels_np_array(cls.classifier_tf.predict(x_train))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_train, axis=1)) / y_train.shape[0]
        logger.info('[TF, MNIST] Accuracy on training set: %.2f%%',
                    (acc * 100))

        scores = get_labels_np_array(cls.classifier_tf.predict(x_test))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_test, axis=1)) / y_test.shape[0]
        logger.info('[TF, MNIST] Accuracy on test set: %.2f%%', (acc * 100))

        # Create basic PyTorch model
        cls.classifier_py = cls._cnn_mnist_py()
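        # PyTorch expects channels-first (NCHW) input, while load_mnist returns
        # channels-last (NHWC), hence the axis swap below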
        x_train, x_test = np.swapaxes(x_train, 1, 3), np.swapaxes(x_test, 1, 3)
        cls.classifier_py.fit(x_train,
                              y_train,
                              nb_epochs=2,
                              batch_size=BATCH_SIZE)

        scores = get_labels_np_array(cls.classifier_py.predict(x_train))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_train, axis=1)) / y_train.shape[0]
        logger.info('[PyTorch, MNIST] Accuracy on training set: %.2f%%',
                    (acc * 100))

        scores = get_labels_np_array(cls.classifier_py.predict(x_test))
        acc = np.sum(np.argmax(scores, axis=1) == np.argmax(
            y_test, axis=1)) / y_test.shape[0]
        logger.info('[PyTorch, MNIST] Accuracy on test set: %.2f%%',
                    (acc * 100))
"""
The script demonstrates a simple example of using ART with TensorFlow v1.x. The example train a small model on the MNIST
dataset and creates adversarial examples using the Fast Gradient Sign Method. Here we use the ART classifier to train
the model, it would also be possible to provide a pretrained model to the ART classifier.
The parameters are chosen for reduced computational requirements of the script and not optimised for accuracy.
"""
import numpy as np

from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import TensorFlowV2Classifier
from art.utils import load_mnist

# Step 1: Load the MNIST dataset

(x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()

# Step 2: Create the model

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D


class TensorFlowModel(Model):
    """
    Standard TensorFlow model for unit testing.
    """
    def __init__(self):
        super(TensorFlowModel, self).__init__()
        self.conv1 = Conv2D(filters=4, kernel_size=5, activation="relu")
        self.conv2 = Conv2D(filters=10, kernel_size=5, activation="relu")
        self.maxpool = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="valid", data_format=None)
        self.flatten = Flatten()
        self.dense1 = Dense(100, activation="relu")
        self.logits = Dense(10, activation="linear")

    def call(self, x):
        """
        Call function to evaluate the model.
        :param x: Input to the model
        :return: Prediction of the model
        """
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.conv2(x)
        x = self.maxpool(x)
        x = self.flatten(x)
        x = self.dense1(x)
        x = self.logits(x)
        return x
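
The original snippet breaks off at this point. A minimal sketch of the remaining steps, assuming the ART v2 API from the imports above and mirroring the natual() example further down this page (batch size, epochs and eps are illustrative):

optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

def train_step(model, images, labels):
    # Single gradient-descent step on a batch
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

model = TensorFlowModel()

# Step 3: Create the ART classifier
classifier = TensorFlowV2Classifier(model=model, loss_object=loss_object, train_step=train_step,
                                    nb_classes=10, input_shape=(28, 28, 1),
                                    clip_values=(min_pixel_value, max_pixel_value))

# Step 4: Train the ART classifier
classifier.fit(x_train, y_train, batch_size=64, nb_epochs=3)

# Step 5: Generate adversarial test examples with FGSM and evaluate
attack = FastGradientMethod(estimator=classifier, eps=0.2)
x_test_adv = attack.generate(x=x_test)
predictions = classifier.predict(x_test_adv)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))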
Example #9
    def test_binary_activation_detector(self):
        """
        Test the binary activation detector end-to-end.
        :return:
        """
        # Get MNIST
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

        # Keras classifier
        classifier = get_image_classifier_kr()

        # Generate adversarial samples:
        attacker = FastGradientMethod(classifier, eps=0.1)
        x_train_adv = attacker.generate(x_train)
        x_test_adv = attacker.generate(x_test)

        # Compile training data for detector:
        x_train_detector = np.concatenate((x_train, x_train_adv), axis=0)
        y_train_detector = np.concatenate(
            (np.array([[1, 0]] * NB_TRAIN), np.array([[0, 1]] * NB_TRAIN)),
            axis=0)

        # Create a simple CNN for the detector
        activation_shape = classifier.get_activations(x_test[:1],
                                                      0,
                                                      batch_size=128).shape[1:]
        number_outputs = 2
        try:
            from keras.optimizers import Adam

            optimizer = Adam(lr=0.01)
        except ImportError:
            from keras.optimizers import adam_v2

            optimizer = adam_v2.Adam(lr=0.01)
        model = Sequential()
        model.add(MaxPooling2D(pool_size=(2, 2), input_shape=activation_shape))
        model.add(Flatten())
        model.add(Dense(number_outputs, activation="softmax"))
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=optimizer,
                      metrics=["accuracy"])

        # Create detector and train it.
        # Detector consider activations at layer=0:
        detector = BinaryActivationDetector(classifier=classifier,
                                            detector=KerasClassifier(
                                                model=model,
                                                clip_values=(0, 1),
                                                use_logits=False),
                                            layer=0)
        detector.fit(x_train_detector,
                     y_train_detector,
                     nb_epochs=2,
                     batch_size=128)

        # Apply detector on clean and adversarial test data:
        test_detection = np.argmax(detector.predict(x_test), axis=1)
        test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

        # Assert there is at least one true positive and negative
        nb_true_positives = len(np.where(test_adv_detection == 1)[0])
        nb_true_negatives = len(np.where(test_detection == 0)[0])
        logger.debug("Number of true positives detected: %i",
                     nb_true_positives)
        logger.debug("Number of true negatives detected: %i",
                     nb_true_negatives)
        self.assertGreater(nb_true_positives, 0)
        self.assertGreater(nb_true_negatives, 0)
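
        # Sketch (not part of the original test): an overall detection accuracy
        # could summarize the detector, counting true positives on adversarial
        # inputs and true negatives on clean inputs over all test inputs
        detection_acc = (nb_true_positives + nb_true_negatives) / (len(x_test) + len(x_test_adv))
        logger.debug("Overall detection accuracy: %.2f", detection_acc)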
"""
The script demonstrates a simple example of using ART with TensorFlow v1.x. The example train a small model on the MNIST
dataset and creates adversarial examples using the Fast Gradient Sign Method. Here we use the ART classifier to train
the model, it would also be possible to provide a pretrained model to the ART classifier.
The parameters are chosen for reduced computational requirements of the script and not optimised for accuracy.
"""
import tensorflow as tf
import numpy as np
from art.attacks import FastGradientMethod
from art.classifiers import TFClassifier
from art.utils import load_mnist

# Step 1: Load the MNIST dataset

(x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()
# Step 2: Create the model

input_ph = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
labels_ph = tf.placeholder(tf.int32, shape=[None, 10])

x = tf.layers.conv2d(input_ph, filters=4, kernel_size=5, activation=tf.nn.relu)
x = tf.layers.max_pooling2d(x, 2, 2)
x = tf.layers.conv2d(x, filters=10, kernel_size=5, activation=tf.nn.relu)
x = tf.layers.max_pooling2d(x, 2, 2)
x = tf.layers.flatten(x)
x = tf.layers.dense(x, 100, activation=tf.nn.relu)
logits = tf.layers.dense(x, 10)

loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels_ph))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
train = optimizer.minimize(loss)
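
The snippet stops at the train op. A minimal sketch of the remaining steps, assuming the legacy positional TFClassifier signature used in the test_tfclassifier example further down this page (batch size, epochs and eps are illustrative):

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Step 3: Create the ART classifier and train it
classifier = TFClassifier((min_pixel_value, max_pixel_value), input_ph, logits,
                          labels_ph, train, loss, None, sess)
classifier.fit(x_train, y_train, batch_size=64, nb_epochs=3)

# Step 4: Generate adversarial test examples with FGSM and evaluate
attack = FastGradientMethod(classifier, eps=0.2)
x_test_adv = attack.generate(x_test)
predictions = classifier.predict(x_test_adv)
accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))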
Example #11
    def setUpClass(cls):
        master_seed(seed=1234)
        (x_train, y_train), (x_test, y_test), min_, max_ = load_mnist()
        y_train = np.argmax(y_train, axis=1)
        y_test = np.argmax(y_test, axis=1)
        zero_or_four = np.logical_or(y_train == 4, y_train == 0)
        x_train = x_train[zero_or_four]
        y_train = y_train[zero_or_four]
        tr_labels = np.zeros((y_train.shape[0], 2))
        tr_labels[y_train == 0] = np.array([1, 0])
        tr_labels[y_train == 4] = np.array([0, 1])
        y_train = tr_labels

        zero_or_four = np.logical_or(y_test == 4, y_test == 0)
        x_test = x_test[zero_or_four]
        y_test = y_test[zero_or_four]
        te_labels = np.zeros((y_test.shape[0], 2))
        te_labels[y_test == 0] = np.array([1, 0])
        te_labels[y_test == 4] = np.array([0, 1])
        y_test = te_labels

        n_samples_train = x_train.shape[0]
        n_features_train = x_train.shape[1] * x_train.shape[2] * x_train.shape[3]
        n_samples_test = x_test.shape[0]
        n_features_test = x_test.shape[1] * x_test.shape[2] * x_test.shape[3]

        x_train = x_train.reshape(n_samples_train, n_features_train)
        x_test = x_test.reshape(n_samples_test, n_features_test)
        x_train = x_train[:NB_TRAIN]
        y_train = y_train[:NB_TRAIN]

        trusted_data = x_test[:NB_TRUSTED]
        trusted_labels = y_test[:NB_TRUSTED]
        x_test = x_test[NB_TRUSTED:]
        y_test = y_test[NB_TRUSTED:]
        valid_data = x_test[:NB_VALID]
        valid_labels = y_test[:NB_VALID]
        x_test = x_test[NB_VALID:]
        y_test = y_test[NB_VALID:]

        no_defense = ScikitlearnSVC(model=SVC(kernel=kernel, gamma="auto"),
                                    clip_values=(min_, max_))
        no_defense.fit(x=x_train, y=y_train)
        poison_points = np.random.randint(
            no_defense._model.support_vectors_.shape[0], size=NB_POISON)
        all_poison_init = np.copy(
            no_defense._model.support_vectors_[poison_points])
        poison_labels = np.array([1, 1]) - no_defense.predict(all_poison_init)

        svm_attack = PoisoningAttackSVM(
            classifier=no_defense,
            x_train=x_train,
            y_train=y_train,
            step=0.1,
            eps=1.0,
            x_val=valid_data,
            y_val=valid_labels,
            max_iter=200,
        )

        poisoned_data, _ = svm_attack.poison(all_poison_init, y=poison_labels)

        # Stack on poison to data and add provenance of bad actor
        all_data = np.vstack([x_train, poisoned_data])
        all_labels = np.vstack([y_train, poison_labels])

        model = SVC(kernel=kernel, gamma="auto")
        cls.mnist = (
            (all_data, all_labels),
            (x_test, y_test),
            (trusted_data, trusted_labels),
            (valid_data, valid_labels),
            (min_, max_),
        )
        cls.classifier = SklearnClassifier(model=model,
                                           clip_values=(min_, max_))

        cls.classifier.fit(all_data, all_labels)
        cls.defense_cal = RONIDefense(
            cls.classifier,
            all_data,
            all_labels,
            trusted_data,
            trusted_labels,
            eps=0.1,
            calibrated=True,
        )
        cls.defence_no_cal = RONIDefense(
            cls.classifier,
            all_data,
            all_labels,
            trusted_data,
            trusted_labels,
            eps=0.1,
            calibrated=False,
        )
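
        # Sketch (assumption, not part of the original setup): both defences
        # implement ART's PoisonFilteringDefence interface, so suspected poison
        # could subsequently be flagged along the lines of:
        #     report, is_clean_lst = cls.defense_cal.detect_poison()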
Example #12
    def setUpClass(cls):
        # Get MNIST
        (_, _), (x_test, y_test), _, _ = load_mnist()
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
        cls.mnist = x_test, y_test
    def test_mnist(self):
        session = tf.Session()
        k.set_session(session)

        comp_params = {
            "loss": 'categorical_crossentropy',
            "optimizer": 'adam',
            "metrics": ['accuracy']
        }

        # get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 100
        (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
        X_train, Y_train = X_train[:nb_train], Y_train[:nb_train]
        X_test, Y_test = X_test[:nb_test], Y_test[:nb_test]
        im_shape = X_train[0].shape

        # get classifier
        classifier = CNN(im_shape, act="relu")
        classifier.compile(comp_params)
        classifier.fit(X_train,
                       Y_train,
                       epochs=1,
                       batch_size=batch_size,
                       verbose=0)
        scores = classifier.evaluate(X_train, Y_train)
        print("\naccuracy on training set: %.2f%%" % (scores[1] * 100))
        scores = classifier.evaluate(X_test, Y_test)
        print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

        attack_params = {
            "verbose": 0,
            "clip_min": 0.,
            "clip_max": 1.,
            "eps": 1.
        }

        attack = FastGradientMethod(classifier, session)
        X_train_adv = attack.generate(X_train, **attack_params)
        X_test_adv = attack.generate(X_test, **attack_params)

        self.assertFalse((X_train == X_train_adv).all())
        self.assertFalse((X_test == X_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(X_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(X_test_adv))

        self.assertFalse((Y_train == train_y_pred).all())
        self.assertFalse((Y_test == test_y_pred).all())

        scores = classifier.evaluate(X_train_adv, Y_train)
        print('\naccuracy on adversarial train examples: %.2f%%' %
              (scores[1] * 100))

        scores = classifier.evaluate(X_test_adv, Y_test)
        print('\naccuracy on adversarial test examples: %.2f%%' %
              (scores[1] * 100))

        # test minimal perturbations
        attack_params = {
            "verbose": 0,
            "clip_min": 0.,
            "clip_max": 1.,
            "minimal": True,
            "eps_step": .1,
            "eps_max": 1.
        }

        X_train_adv_min = attack.generate(X_train, **attack_params)
        X_test_adv_min = attack.generate(X_test, **attack_params)

        self.assertFalse((X_train_adv_min == X_train_adv).all())
        self.assertFalse((X_test_adv_min == X_test_adv).all())

        self.assertFalse((X_train == X_train_adv_min).all())
        self.assertFalse((X_test == X_test_adv_min).all())

        train_y_pred = get_labels_np_array(classifier.predict(X_train_adv_min))
        test_y_pred = get_labels_np_array(classifier.predict(X_test_adv_min))

        self.assertFalse((Y_train == train_y_pred).all())
        self.assertFalse((Y_test == test_y_pred).all())

        scores = classifier.evaluate(X_train_adv_min, Y_train)
        print(
            '\naccuracy on adversarial train examples with minimal perturbation: %.2f%%'
            % (scores[1] * 100))

        scores = classifier.evaluate(X_test_adv_min, Y_test)
        print(
            '\naccuracy on adversarial test examples with minimal perturbation: %.2f%%'
            % (scores[1] * 100))
def main():
    # Read MNIST dataset (x_raw contains the original images):
    (x_raw, y_raw), (x_raw_test, y_raw_test), min_, max_ = load_mnist(raw=True)

    n_train = np.shape(x_raw)[0]
    num_selection = 5000
    random_selection_indices = np.random.choice(n_train, num_selection)
    x_raw = x_raw[random_selection_indices]
    y_raw = y_raw[random_selection_indices]

    # Poison training data
    perc_poison = 0.33
    (is_poison_train, x_poisoned_raw,
     y_poisoned_raw) = generate_backdoor(x_raw, y_raw, perc_poison)
    x_train, y_train = preprocess(x_poisoned_raw, y_poisoned_raw)
    # Add channel axis:
    x_train = np.expand_dims(x_train, axis=3)

    # Poison test data
    (is_poison_test, x_poisoned_raw_test,
     y_poisoned_raw_test) = generate_backdoor(x_raw_test, y_raw_test,
                                              perc_poison)
    x_test, y_test = preprocess(x_poisoned_raw_test, y_poisoned_raw_test)
    # Add channel axis:
    x_test = np.expand_dims(x_test, axis=3)

    # Shuffle training data so poison is not together
    n_train = np.shape(y_train)[0]
    shuffled_indices = np.arange(n_train)
    np.random.shuffle(shuffled_indices)
    x_train = x_train[shuffled_indices]
    y_train = y_train[shuffled_indices]
    is_poison_train = is_poison_train[shuffled_indices]

    # Create Keras convolutional neural network - basic architecture from Keras examples
    # Source here: https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
    model = Sequential()
    model.add(
        Conv2D(32,
               kernel_size=(3, 3),
               activation="relu",
               input_shape=x_train.shape[1:]))
    model.add(Conv2D(64, (3, 3), activation="relu"))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation="softmax"))

    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    classifier = KerasClassifier(model=model, clip_values=(min_, max_))

    classifier.fit(x_train, y_train, nb_epochs=30, batch_size=128)

    # Evaluate the classifier on the test set
    preds = np.argmax(classifier.predict(x_test), axis=1)
    acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
    print("\nTest accuracy: %.2f%%" % (acc * 100))

    # Evaluate the classifier on poisonous data
    preds = np.argmax(classifier.predict(x_test[is_poison_test]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test],
                                    axis=1)) / y_test[is_poison_test].shape[0]
    print(
        "\nPoisonous test set accuracy (i.e. effectiveness of poison): %.2f%%"
        % (acc * 100))

    # Evaluate the classifier on clean data
    preds = np.argmax(classifier.predict(x_test[is_poison_test == 0]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test == 0], axis=1)) / y_test[is_poison_test == 0].shape[0]
    print("\nClean test set accuracy: %.2f%%" % (acc * 100))

    # Calling poisoning defence:
    defence = ActivationDefence(classifier, x_train, y_train)

    # End-to-end method:
    print("------------------- Results using size metric -------------------")
    print(defence.get_params())
    defence.detect_poison(nb_clusters=2, nb_dims=10, reduce="PCA")

    # Evaluate method when ground truth is known:
    is_clean = is_poison_train == 0
    confusion_matrix = defence.evaluate_defence(is_clean)
    print("Evaluation defence results for size-based metric: ")
    jsonObject = json.loads(confusion_matrix)
    for label in jsonObject:
        print(label)
        pprint.pprint(jsonObject[label])

    # Visualize clusters:
    print("Visualize clusters")
    sprites_by_class = defence.visualize_clusters(x_train, "mnist_poison_demo")
    # Show plots for clusters of class 5
    n_class = 5
    try:
        import matplotlib.pyplot as plt

        plt.imshow(sprites_by_class[n_class][0])
        plt.title("Class " + str(n_class) + " cluster: 0")
        plt.show()
        plt.imshow(sprites_by_class[n_class][1])
        plt.title("Class " + str(n_class) + " cluster: 1")
        plt.show()
    except ImportError:
        print(
            "matplotlib not installed. For this reason, cluster visualization was not displayed"
        )

    # Try again using distance analysis this time:
    print(
        "------------------- Results using distance metric -------------------"
    )
    print(defence.get_params())
    defence.detect_poison(nb_clusters=2,
                          nb_dims=10,
                          reduce="PCA",
                          cluster_analysis="distance")
    confusion_matrix = defence.evaluate_defence(is_clean)
    print("Evaluation defence results for distance-based metric: ")
    jsonObject = json.loads(confusion_matrix)
    for label in jsonObject:
        print(label)
        pprint.pprint(jsonObject[label])

    # Other ways to invoke the defence:
    kwargs = {"nb_clusters": 2, "nb_dims": 10, "reduce": "PCA"}
    defence.cluster_activations(**kwargs)

    kwargs = {"cluster_analysis": "distance"}
    defence.analyze_clusters(**kwargs)
    defence.evaluate_defence(is_clean)

    kwargs = {"cluster_analysis": "smaller"}
    defence.analyze_clusters(**kwargs)
    defence.evaluate_defence(is_clean)

    print("done :) ")
    def test_3_clever_pt(self):
        """
        Test with pytorch.
        :return:
        """
        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]
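        # Swap to channels-first (NCHW), as expected by the PyTorch classifier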
        x_train = np.swapaxes(x_train, 1, 3).astype(np.float32)
        x_test = np.swapaxes(x_test, 1, 3).astype(np.float32)

        # Get the classifier
        ptc = self._create_ptclassifier()
        ptc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=1)

        # Test targeted clever
        res0 = clever_t(ptc, x_test[-1], 2, 10, 5, R_L1, norm=1, pool_factor=3)
        res1 = clever_t(ptc, x_test[-1], 2, 10, 5, R_L2, norm=2, pool_factor=3)
        res2 = clever_t(ptc,
                        x_test[-1],
                        2,
                        10,
                        5,
                        R_LI,
                        norm=np.inf,
                        pool_factor=3)
        logger.info("Targeted PyTorch: %f %f %f", res0, res1, res2)
        self.assertNotEqual(res0, res1)
        self.assertNotEqual(res1, res2)
        self.assertNotEqual(res2, res0)

        # Test untargeted clever
        res0 = clever_u(ptc,
                        x_test[-1],
                        10,
                        5,
                        R_L1,
                        norm=1,
                        pool_factor=3,
                        verbose=False)
        res1 = clever_u(ptc,
                        x_test[-1],
                        10,
                        5,
                        R_L2,
                        norm=2,
                        pool_factor=3,
                        verbose=False)
        res2 = clever_u(ptc,
                        x_test[-1],
                        10,
                        5,
                        R_LI,
                        norm=np.inf,
                        pool_factor=3,
                        verbose=False)
        logger.info("Untargeted PyTorch: %f %f %f", res0, res1, res2)
        self.assertNotEqual(res0, res1)
        self.assertNotEqual(res1, res2)
        self.assertNotEqual(res2, res0)
    def test_clever_kr(self):
        """
        Test with keras.
        :return:
        """
        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]

        # Get the classifier
        krc = self._create_krclassifier()
        krc.fit(x_train,
                y_train,
                batch_size=batch_size,
                nb_epochs=1,
                verbose=0)

        # Test targeted clever
        res0 = clever_t(krc, x_test[-1], 2, 10, 5, R_L1, norm=1, pool_factor=3)
        res1 = clever_t(krc, x_test[-1], 2, 10, 5, R_L2, norm=2, pool_factor=3)
        res2 = clever_t(krc,
                        x_test[-1],
                        2,
                        10,
                        5,
                        R_LI,
                        norm=np.inf,
                        pool_factor=3)
        logger.info("Targeted Keras: %f %f %f", res0, res1, res2)
        self.assertNotEqual(res0, res1)
        self.assertNotEqual(res1, res2)
        self.assertNotEqual(res2, res0)

        # Test untargeted clever
        res0 = clever_u(krc,
                        x_test[-1],
                        10,
                        5,
                        R_L1,
                        norm=1,
                        pool_factor=3,
                        verbose=False)
        res1 = clever_u(krc,
                        x_test[-1],
                        10,
                        5,
                        R_L2,
                        norm=2,
                        pool_factor=3,
                        verbose=False)
        res2 = clever_u(krc,
                        x_test[-1],
                        10,
                        5,
                        R_LI,
                        norm=np.inf,
                        pool_factor=3,
                        verbose=False)
        logger.info("Untargeted Keras: %f %f %f", res0, res1, res2)
        self.assertNotEqual(res0, res1)
        self.assertNotEqual(res1, res2)
        self.assertNotEqual(res2, res0)
    def test_2_clever_tf(self):
        """
        Test with TensorFlow.
        :return:
        """
        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]

        # Get the classifier
        tfc = self._create_tfclassifier()
        tfc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=1)

        # TODO Need to configure r
        # Test targeted clever
        res0 = clever_t(tfc, x_test[-1], 2, 10, 5, R_L1, norm=1, pool_factor=3)
        res1 = clever_t(tfc, x_test[-1], 2, 10, 5, R_L2, norm=2, pool_factor=3)
        res2 = clever_t(tfc,
                        x_test[-1],
                        2,
                        10,
                        5,
                        R_LI,
                        norm=np.inf,
                        pool_factor=3)
        logger.info("Targeted TensorFlow: %f %f %f", res0, res1, res2)
        self.assertNotEqual(res0, res1)
        self.assertNotEqual(res1, res2)
        self.assertNotEqual(res2, res0)

        # Test untargeted clever
        res0 = clever_u(tfc,
                        x_test[-1],
                        10,
                        5,
                        R_L1,
                        norm=1,
                        pool_factor=3,
                        verbose=False)
        res1 = clever_u(tfc,
                        x_test[-1],
                        10,
                        5,
                        R_L2,
                        norm=2,
                        pool_factor=3,
                        verbose=False)
        res2 = clever_u(tfc,
                        x_test[-1],
                        10,
                        5,
                        R_LI,
                        norm=np.inf,
                        pool_factor=3,
                        verbose=False)
        logger.info("Untargeted TensorFlow: %f %f %f", res0, res1, res2)
        self.assertNotEqual(res0, res1)
        self.assertNotEqual(res1, res2)
        self.assertNotEqual(res2, res0)
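
For background on the quantity these three tests exercise: the targeted CLEVER score of Weng et al. (2018) estimates a lower bound on the minimal perturbation needed to move an input $x_0$ from its predicted class $c$ to a target class $j$. As we understand the method (stated here as background, not as ART documentation), for an $L_p$ norm with $1/p + 1/q = 1$ it is

$$\mathrm{CLEVER}_t(x_0, j) = \min\left(\frac{f_c(x_0) - f_j(x_0)}{\hat{L}_q^{j}},\; R\right),$$

where $\hat{L}_q^{j}$ is a local cross-Lipschitz constant estimated via extreme value theory from gradient norms sampled in a ball of radius $R$ around $x_0$, and the untargeted score (clever_u) is the minimum of the targeted scores over all $j \neq c$. This is why each call above passes a radius constant (R_L1, R_L2, R_LI) matched to its norm.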
Example #18
def main():
    args = parse_option()
    print(args)

    # check args
    if args.loss not in LOSS_NAMES:
        raise ValueError('Unsupported loss function type {}'.format(args.loss))

    if args.optimizer == 'adam':
        optimizer1 = tf.keras.optimizers.Adam(lr=args.lr_1)
    elif args.optimizer == 'lars':
        from lars_optimizer import LARSOptimizer
        # not compatible with tf2
        optimizer1 = LARSOptimizer(
            args.lr_1,
            exclude_from_weight_decay=['batch_normalization', 'bias'])
    elif args.optimizer == 'sgd':
        optimizer1 = tfa.optimizers.SGDW(learning_rate=args.lr_1,
                                         momentum=0.9,
                                         weight_decay=1e-4)
    optimizer2 = tf.keras.optimizers.Adam(lr=args.lr_2)

    model_name = '{}_model-bs_{}-lr_{}'.format(args.loss, args.batch_size_1,
                                               args.lr_1)

    # 0. Load data
    if args.data == 'mnist':
        mnist = tf.keras.datasets.mnist
    elif args.data == 'fashion_mnist':
        mnist = tf.keras.datasets.fashion_mnist
    print('Loading {} data...'.format(args.data))
    (_, y_train), (_, y_test) = mnist.load_data()
    # x_train, x_test = x_train / 255.0, x_test / 255.0
    # x_train = x_train.reshape(-1, 28*28).astype(np.float32)
    # x_test = x_test.reshape(-1, 28*28).astype(np.float32)
    (x_train, _), (x_test, _), _, _ = load_mnist()
    # print(x_train[0][0])
    print(x_train.shape, x_test.shape)

    # simulate low data regime for training
    # n_train = x_train.shape[0]
    # shuffle_idx = np.arange(n_train)
    # np.random.shuffle(shuffle_idx)

    # x_train = x_train[shuffle_idx][:args.n_data_train]
    # y_train = y_train[shuffle_idx][:args.n_data_train]
    # print('Training dataset shapes after slicing:')
    print(x_train.shape, y_train.shape)

    train_ds = tf.data.Dataset.from_tensor_slices(
        (x_train, y_train)).shuffle(5000).batch(args.batch_size_1)

    train_ds2 = tf.data.Dataset.from_tensor_slices(
        (x_train, y_train)).shuffle(5000).batch(args.batch_size_2)

    test_ds = tf.data.Dataset.from_tensor_slices(
        (x_test, y_test)).batch(args.batch_size_1)

    # 1. Stage 1: train encoder with multiclass N-pair loss
    encoder = Encoder(normalize=True, activation=args.activation)
    projector = Projector(args.projection_dim,
                          normalize=True,
                          activation=args.activation)

    if args.loss == 'max_margin':

        def loss_func(z, y):
            return losses.max_margin_contrastive_loss(z,
                                                      y,
                                                      margin=args.margin,
                                                      metric=args.metric)
    elif args.loss == 'npairs':
        loss_func = losses.multiclass_npairs_loss
    elif args.loss == 'sup_nt_xent':

        def loss_func(z, y):
            return losses.supervised_nt_xent_loss(
                z,
                y,
                temperature=args.temperature,
                base_temperature=args.base_temperature)
    elif args.loss.startswith('triplet'):
        triplet_kind = args.loss.split('-')[1]

        def loss_func(z, y):
            return losses.triplet_loss(z,
                                       y,
                                       kind=triplet_kind,
                                       margin=args.margin)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    test_loss = tf.keras.metrics.Mean(name='test_loss')

    # tf.config.experimental_run_functions_eagerly(True)
    # train step for the contrastive loss
    @tf.function
    def train_step_stage1(x, y):
        '''
        x: data tensor, shape: (batch_size, data_dim)
        y: data labels, shape: (batch_size, )
        '''
        with tf.GradientTape() as tape:
            r = encoder(x, training=True)
            z = projector(r, training=True)
            # print("z", z, "y", y)
            loss = loss_func(z, y)

        gradients = tape.gradient(
            loss, encoder.trainable_variables + projector.trainable_variables)
        optimizer1.apply_gradients(
            zip(gradients,
                encoder.trainable_variables + projector.trainable_variables))
        train_loss(loss)

    @tf.function
    def test_step_stage1(x, y):
        r = encoder(x, training=False)
        z = projector(r, training=False)
        t_loss = loss_func(z, y)
        test_loss(t_loss)

    print('Stage 1 training ...')
    for epoch in range(args.epoch):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        test_loss.reset_states()

        for x, y in train_ds:
            train_step_stage1(x, y)

        for x_te, y_te in test_ds:
            test_step_stage1(x_te, y_te)

        template = 'Epoch {}, Loss: {}, Test Loss: {}'
        # print(template.format(epoch + 1,
        #                       train_loss.result(),
        #                       test_loss.result()))

    if args.draw_figures:
        # projecting data with the trained encoder, projector
        x_tr_proj = projector(encoder(x_train))
        x_te_proj = projector(encoder(x_test))
        # convert tensor to np.array
        x_tr_proj = x_tr_proj.numpy()
        x_te_proj = x_te_proj.numpy()
        print(x_tr_proj.shape, x_te_proj.shape)

        # check learned embedding using PCA
        pca = PCA(n_components=2)
        pca.fit(x_tr_proj)
        x_te_proj_pca = pca.transform(x_te_proj)

        x_te_proj_pca_df = pd.DataFrame(x_te_proj_pca, columns=['PC1', 'PC2'])
        x_te_proj_pca_df['label'] = y_test
        # PCA scatter plot
        fig, ax = plt.subplots()
        ax = sns.scatterplot('PC1',
                             'PC2',
                             data=x_te_proj_pca_df,
                             palette='tab10',
                             hue='label',
                             linewidth=0,
                             alpha=0.6,
                             ax=ax)

        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        title = 'Data: {}\nEmbedding: {}\nbatch size: {}; LR: {}'.format(
            args.data, LOSS_NAMES[args.loss], args.batch_size_1, args.lr_1)
        ax.set_title(title)
        fig.savefig('figs/PCA_plot_{}_{}_embed.png'.format(
            args.data, model_name))

        # density plot for PCA
        g = sns.jointplot('PC1', 'PC2', data=x_te_proj_pca_df, kind="hex")
        plt.subplots_adjust(top=0.95)
        g.fig.suptitle(title)

        g.savefig('figs/Joint_PCA_plot_{}_{}_embed.png'.format(
            args.data, model_name))

    # Stage 2: freeze the learned representations and then learn a classifier
    # on a linear layer using a softmax loss
    softmax = SoftmaxPred()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_ACC')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='test_ACC')

    cce_loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)

    # train step for the 2nd stage
    @tf.function
    def train_step(model, x, y):
        '''
        x: data tensor, shape: (batch_size, data_dim)
        y: data labels, shape: (batch_size, )
        '''
        with tf.GradientTape() as tape:
            r = model.layers[0](x, training=False)
            y_preds = model.layers[1](r, training=True)
            loss = cce_loss_obj(y, y_preds)

        # freeze the encoder, only train the softmax layer
        gradients = tape.gradient(loss, model.layers[1].trainable_variables)
        optimizer2.apply_gradients(
            zip(gradients, model.layers[1].trainable_variables))
        train_loss(loss)
        train_acc(y, y_preds)

    @tf.function
    def test_step(x, y):
        r = encoder(x, training=False)
        y_preds = softmax(r, training=False)
        t_loss = cce_loss_obj(y, y_preds)
        test_loss(t_loss)
        test_acc(y, y_preds)

    if args.write_summary:
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/{}/{}/{}/train'.format(model_name, args.data,
                                                     current_time)
        test_log_dir = 'logs/{}/{}/{}/test'.format(model_name, args.data,
                                                   current_time)
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    print('Stage 2 training ...')
    model = tf.keras.Sequential([encoder, softmax])
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)

    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )

    # classifier.fit(x_train, y_train, batch_size=256, nb_epochs=20)

    for epoch in range(args.epoch):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        train_acc.reset_states()
        test_loss.reset_states()
        test_acc.reset_states()

        for x, y in train_ds2:
            train_step(model, x, y)

        if args.write_summary:
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', train_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', train_acc.result(), step=epoch)

        for x_te, y_te in test_ds:
            test_step(x_te, y_te)

        if args.write_summary:
            with test_summary_writer.as_default():
                tf.summary.scalar('loss', test_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', test_acc.result(), step=epoch)

        template = 'Epoch {}, Loss: {}, Acc: {}, Test Loss: {}, Test Acc: {}'
        print(
            template.format(epoch + 1, train_loss.result(),
                            train_acc.result() * 100, test_loss.result(),
                            test_acc.result() * 100))

    predictions = classifier.predict(x_test)
    print(predictions.shape, y_test.shape)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    print('Stage 3 attacking ...')

    attack = ProjectedGradientDescent(estimator=classifier,
                                      eps=args.eps,
                                      eps_step=args.eps / 3,
                                      max_iter=20)
    x_test_adv = attack.generate(x=x_test)

    print('Stage 4 evaluating the attack ...')

    predictions = classifier.predict(x_test_adv)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))

    natual(args.eps)
Example #19
def natual(eps):
    # Step 1: Load the MNIST dataset

    (x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()

    # Step 2: Create the model

    import tensorflow as tf
    from tensorflow.keras import Model
    from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D

    class TensorFlowModel(Model):
        """
        Standard TensorFlow model for unit testing.
        """
        def __init__(self):
            super(TensorFlowModel, self).__init__()
            self.conv1 = Conv2D(filters=4, kernel_size=5, activation="relu")
            self.conv2 = Conv2D(filters=10, kernel_size=5, activation="relu")
            self.maxpool = MaxPool2D(pool_size=(2, 2),
                                     strides=(2, 2),
                                     padding="valid",
                                     data_format=None)
            self.flatten = Flatten()
            self.dense1 = Dense(100, activation="relu")
            self.logits = Dense(10, activation="linear")

        def call(self, x):
            """
            Call function to evaluate the model.
            :param x: Input to the model
            :return: Prediction of the model
            """
            x = self.conv1(x)
            x = self.maxpool(x)
            x = self.conv2(x)
            x = self.maxpool(x)
            x = self.flatten(x)
            x = self.dense1(x)
            x = self.logits(x)
            return x

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    def train_step(model, images, labels):
        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_object(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    model = TensorFlowModel()
    loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

    # Step 3: Create the ART classifier

    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )

    # Step 4: Train the ART classifier

    classifier.fit(x_train, y_train, batch_size=64, nb_epochs=10)

    # Step 5: Evaluate the ART classifier on benign test examples

    predictions = classifier.predict(x_test)
    accuracy = np.sum(
        np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(
            y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    # Step 6: Generate adversarial test examples
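    # eps bounds the total perturbation; eps_step = eps / 3 is the per-iteration
    # step size applied over the max_iter PGD iterations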
    attack = ProjectedGradientDescent(estimator=classifier,
                                      eps=eps,
                                      eps_step=eps / 3,
                                      max_iter=20)
    x_test_adv = attack.generate(x=x_test)

    # Step 7: Evaluate the ART classifier on adversarial test examples

    predictions = classifier.predict(x_test_adv)
    accuracy = np.sum(
        np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(
            y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))
    def test_tfclassifier(self):
        """
        First test with the TFClassifier.
        :return:
        """
        # Build a TFClassifier
        # Define input and output placeholders
        self._input_ph = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
        self._output_ph = tf.placeholder(tf.int32, shape=[None, 10])

        # Define the tensorflow graph
        conv = tf.layers.conv2d(self._input_ph, 4, 5, activation=tf.nn.relu)
        conv = tf.layers.max_pooling2d(conv, 2, 2)
        fc = tf.contrib.layers.flatten(conv)

        # Logits layer
        self._logits = tf.layers.dense(fc, 10)

        # Train operator
        self._loss = tf.reduce_mean(
            tf.losses.softmax_cross_entropy(logits=self._logits,
                                            onehot_labels=self._output_ph))
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
        self._train = optimizer.minimize(self._loss)

        # Tensorflow session and initialization
        self._sess = tf.Session()
        self._sess.run(tf.global_variables_initializer())

        # Get MNIST
        batch_size, nb_train, nb_test = 100, 5000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]

        # Train the classifier
        tfc = TFClassifier((0, 1), self._input_ph, self._logits,
                           self._output_ph, self._train, self._loss, None,
                           self._sess)
        tfc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=10)

        # First attack
        cl2m = CarliniL2Method(classifier=tfc,
                               targeted=True,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {'y': random_targets(y_test, tfc.nb_classes)}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        #print(x_test_adv)
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        print("CW2 Target: %s" % target)
        print("CW2 Actual: %s" % y_pred_adv)
        print("CW2 Success Rate: %f" %
              (sum(target == y_pred_adv) / float(len(target))))
        self.assertTrue((target == y_pred_adv).any())

        # Second attack
        cl2m = CarliniL2Method(classifier=tfc,
                               targeted=False,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {'y': random_targets(y_test, tfc.nb_classes)}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        print("CW2 Target: %s" % target)
        print("CW2 Actual: %s" % y_pred_adv)
        print("CW2 Success Rate: %f" %
              (sum(target != y_pred_adv) / float(len(target))))
        self.assertTrue((target != y_pred_adv).any())

        # Third attack
        cl2m = CarliniL2Method(classifier=tfc,
                               targeted=False,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        y_pred = np.argmax(tfc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        print("CW2 Target: %s" % y_pred)
        print("CW2 Actual: %s" % y_pred_adv)
        print("CW2 Success Rate: %f" %
              (sum(y_pred != y_pred_adv) / float(len(y_pred))))
        self.assertTrue((y_pred != y_pred_adv).any())
Example #21
               targets=targets
               ) #this works
# model = GenerateModel(backdoor_type="pattern", model_name=model_name, train=train)

print(model_name)

# model_layer_dict1 = init_coverage_tables(model)
model_layer_times1 = init_coverage_times(model)  # times of each neuron covered
model_layer_times2 = init_coverage_times(model)  # update when new image and adversarial images found
model_layer_value1 = init_coverage_value(model)

img_dir = './seeds'
img_paths = os.listdir(img_dir)
img_num = len(img_paths)

(x_raw, y_raw), (x_raw_test, y_raw_test), min_, max_ = load_mnist(raw=True)
# Poison training data
perc_poison = .33
(is_poison_train, x_poisoned_raw, y_poisoned_raw) = generate_backdoor(x_raw, y_raw, perc_poison, backdoor_type=backdoor_type)

difference_activation_block2_conv1 = []
difference_activation_block1_conv1 = []

if neuron_select_strategy == '[4]':
    x_raw_test = x_raw_test.reshape(x_raw_test.shape[0], 28, 28, 1)
    x_poisoned_raw = x_poisoned_raw.reshape(x_poisoned_raw.shape[0], 28, 28, 1)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    activations_clean_test = get_activations(model, x_raw_test, "block2_conv1")
    activation_rate_clean_test = np.mean(activations_clean_test['block2_conv1/Relu:0'] != 0, axis=(0, 3))
    activations_poisoned_train = get_activations(model, x_poisoned_raw, "block2_conv1")
    def test_krclassifier(self):
        """
        Second test with the KerasClassifier.
        :return:
        """
        # Initialize a tf session
        session = tf.Session()
        k.set_session(session)

        # Get MNIST
        batch_size, nb_train, nb_test = 100, 5000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]

        # Create simple CNN
        model = Sequential()
        model.add(
            Conv2D(4,
                   kernel_size=(5, 5),
                   activation='relu',
                   input_shape=(28, 28, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(10, activation='softmax'))

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=keras.optimizers.Adam(lr=0.01),
                      metrics=['accuracy'])

        # Get classifier
        krc = KerasClassifier((0, 1), model, use_logits=False)
        krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=10)

        # First attack
        cl2m = CarliniL2Method(classifier=krc,
                               targeted=True,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {'y': random_targets(y_test, krc.nb_classes)}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        print("CW2 Target: %s" % target)
        print("CW2 Actual: %s" % y_pred_adv)
        print("CW2 Success Rate: %f" %
              (sum(target == y_pred_adv) / float(len(target))))
        self.assertTrue((target == y_pred_adv).any())

        # Second attack
        cl2m = CarliniL2Method(classifier=krc,
                               targeted=False,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {'y': random_targets(y_test, krc.nb_classes)}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        print("CW2 Target: %s" % target)
        print("CW2 Actual: %s" % y_pred_adv)
        print("CW2 Success Rate: %f" %
              (sum(target != y_pred_adv) / float(len(target))))
        self.assertTrue((target != y_pred_adv).any())

        # Third attack
        cl2m = CarliniL2Method(classifier=krc,
                               targeted=False,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        y_pred = np.argmax(krc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        print("CW2 Target: %s" % y_pred)
        print("CW2 Actual: %s" % y_pred_adv)
        print("CW2 Success Rate: %f" %
              (sum(y_pred != y_pred_adv) / float(len(y_pred))))
        self.assertTrue((y_pred != y_pred_adv).any())
Example #23
    model.add(Dense(10, activation="softmax"))

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=["accuracy"])

    classifier = KerasClassifier(model=model, clip_values=(0, 1))
    return classifier


# Get session
session = tf.Session()
k.set_session(session)

# Read MNIST dataset
(x_train, y_train), (x_test, y_test), min_, max_ = load_mnist()

# Construct and train a convolutional neural network on MNIST using Keras
source = cnn_mnist_k(x_train.shape[1:])
source.fit(x_train, y_train, nb_epochs=5, batch_size=128)

# Craft adversarial samples with DeepFool
adv_crafter = DeepFool(source)
x_train_adv = adv_crafter.generate(x_train)
x_test_adv = adv_crafter.generate(x_test)

# Construct and train a second convolutional neural network, this time with TensorFlow
target = cnn_mnist_tf(x_train.shape[1:])
target.fit(x_train, y_train, nb_epochs=5, batch_size=128)

# Evaluate the CNN on the adversarial samples
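# The example is truncated at this point; a minimal completion of the evaluation,
# following the accuracy pattern used by the other MNIST examples in this collection:
preds = np.argmax(target.predict(x_test_adv), axis=1)
acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
print("Accuracy of the target model on adversarial samples: %.2f%%" % (acc * 100))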
Example #24
    def test_ptclassifier(self):
        """
        Third test with the PyTorchClassifier.
        :return:
        """
        # Get MNIST
        batch_size, nb_train, nb_test = 100, 5000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:nb_train], y_train[:nb_train]
        x_test, y_test = x_test[:nb_test], y_test[:nb_test]
        x_train = np.swapaxes(x_train, 1, 3)
        x_test = np.swapaxes(x_test, 1, 3)

        # Create simple CNN
        # Define the network
        model = Model()

        # Define a loss function and optimizer
        loss_fn = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.01)

        # Get classifier
        ptc = PyTorchClassifier((0, 1), model, loss_fn, optimizer, (1, 28, 28),
                                10)
        ptc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=10)

        # First attack
        cl2m = CarliniL2Method(classifier=ptc,
                               targeted=True,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {'y': random_targets(y_test, ptc.nb_classes)}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((target == y_pred_adv).any())

        # Second attack
        cl2m = CarliniL2Method(classifier=ptc,
                               targeted=False,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {'y': random_targets(y_test, ptc.nb_classes)}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((target != y_pred_adv).any())

        # Third attack
        cl2m = CarliniL2Method(classifier=ptc,
                               targeted=False,
                               max_iter=100,
                               binary_search_steps=1,
                               learning_rate=1,
                               initial_const=10,
                               decay=0)
        params = {}
        x_test_adv = cl2m.generate(x_test, **params)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())
        y_pred = np.argmax(ptc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())
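The Model class instantiated above is defined elsewhere in the test module. A minimal sketch consistent with its usage here (a (1, 28, 28) input and ten logit outputs for nn.CrossEntropyLoss) could look as follows; the architecture itself is an assumption:

import torch
import torch.nn as nn

class Model(nn.Module):
    # Hypothetical minimal CNN matching the shapes used in test_ptclassifier
    def __init__(self):
        super(Model, self).__init__()
        self.conv = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc = nn.Linear(4 * 12 * 12, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv(x)))
        x = x.view(x.size(0), -1)
        return self.fc(x)  # logits, as expected by nn.CrossEntropyLoss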
Example #25
    def setUpClass(cls):
        master_seed(301)
        (x_train, y_train), (x_test, y_test), min_, max_ = load_mnist()
        y_train = np.argmax(y_train, axis=1)
        y_test = np.argmax(y_test, axis=1)
        zero_or_four = np.logical_or(y_train == 4, y_train == 0)
        x_train = x_train[zero_or_four]
        y_train = y_train[zero_or_four]
        tr_labels = np.zeros((y_train.shape[0], 2))
        tr_labels[y_train == 0] = np.array([1, 0])
        tr_labels[y_train == 4] = np.array([0, 1])
        y_train = tr_labels

        zero_or_four = np.logical_or(y_test == 4, y_test == 0)
        x_test = x_test[zero_or_four]
        y_test = y_test[zero_or_four]
        te_labels = np.zeros((y_test.shape[0], 2))
        te_labels[y_test == 0] = np.array([1, 0])
        te_labels[y_test == 4] = np.array([0, 1])
        y_test = te_labels

        n_samples_train = x_train.shape[0]
        n_features_train = x_train.shape[1] * x_train.shape[2] * x_train.shape[3]
        n_samples_test = x_test.shape[0]
        n_features_test = x_test.shape[1] * x_test.shape[2] * x_test.shape[3]

        x_train = x_train.reshape(n_samples_train, n_features_train)
        x_test = x_test.reshape(n_samples_test, n_features_test)
        x_train = x_train[:NB_TRAIN]
        y_train = y_train[:NB_TRAIN]

        trusted_data = x_test[:NB_TRUSTED]
        trusted_labels = y_test[:NB_TRUSTED]
        x_test = x_test[NB_TRUSTED:]
        y_test = y_test[NB_TRUSTED:]
        valid_data = x_test[:NB_VALID]
        valid_labels = y_test[:NB_VALID]
        x_test = x_test[NB_VALID:]
        y_test = y_test[NB_VALID:]

        clean_prov = np.random.randint(NB_DEVICES - 1, size=x_train.shape[0])
        p_train = np.eye(NB_DEVICES)[clean_prov]

        no_defense = ScikitlearnSVC(model=SVC(kernel=kernel), clip_values=(min_, max_))
        no_defense.fit(x=x_train, y=y_train)
        poison_points = np.random.randint(no_defense._model.support_vectors_.shape[0], size=NB_POISON)
        all_poison_init = np.copy(no_defense._model.support_vectors_[poison_points])
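        # With one-hot predictions over two classes, subtracting from [1, 1] labels
        # each poison point as the opposite of the model's predicted class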
        poison_labels = np.array([1, 1]) - no_defense.predict(all_poison_init)

        svm_attack = PoisoningAttackSVM(classifier=no_defense, x_train=x_train, y_train=y_train,
                                        step=0.1, eps=1.0, x_val=valid_data, y_val=valid_labels, max_iters=200)

        poisoned_data = svm_attack.generate(all_poison_init, y=poison_labels)

        # Stack on poison to data and add provenance of bad actor
        all_data = np.vstack([x_train, poisoned_data])
        all_labels = np.vstack([y_train, poison_labels])
        poison_prov = np.zeros((NB_POISON, NB_DEVICES))
        poison_prov[:, NB_DEVICES - 1] = 1
        all_p = np.vstack([p_train, poison_prov])

        model = SVC(kernel=kernel)
        cls.mnist = (all_data, all_labels, all_p), (x_test, y_test), (trusted_data, trusted_labels), \
                    (valid_data, valid_labels), (min_, max_)
        cls.classifier = SklearnClassifier(model=model, clip_values=(min_, max_))

        cls.classifier.fit(all_data, all_labels)
        cls.defence_trust = ProvenanceDefense(cls.classifier, all_data, all_labels, all_p,
                                              x_val=trusted_data, y_val=trusted_labels, eps=0.1)
        cls.defence_no_trust = ProvenanceDefense(cls.classifier, all_data, all_labels, all_p, eps=0.1)
Example #26
def main():
    try:
        print('Checking whether a poisoned model has been trained previously')
        import pickle
        classifier = pickle.load(open('my_poison_classifier.p', 'rb'))
        print('Loaded model from pickle.')

        data_train = np.load('data_training.npz')
        x_train = data_train['x_train']
        y_train = data_train['y_train']
        is_poison_train = data_train['is_poison_train']

        data_test = np.load('data_testing.npz')
        x_test = data_test['x_test']
        y_test = data_test['y_test']
        is_poison_test = data_test['is_poison_test']

    except Exception:
        # Read MNIST dataset (x_raw contains the original images):
        (x_raw, y_raw), (x_raw_test, y_raw_test), min_, max_ = load_mnist(raw=True)

        n_train = np.shape(x_raw)[0]
        num_selection = n_train
        random_selection_indices = np.random.choice(n_train, num_selection, replace=False)
        x_raw = x_raw[random_selection_indices]
        y_raw = y_raw[random_selection_indices]

        # Poison training data
        perc_poison = .33
        (is_poison_train, x_poisoned_raw, y_poisoned_raw) = generate_backdoor(x_raw, y_raw, perc_poison)
        x_train, y_train = preprocess(x_poisoned_raw, y_poisoned_raw)
        # Add channel axis:
        x_train = np.expand_dims(x_train, axis=3)

        # Poison test data
        (is_poison_test, x_poisoned_raw_test, y_poisoned_raw_test) = generate_backdoor(x_raw_test, y_raw_test,
                                                                                       perc_poison)
        x_test, y_test = preprocess(x_poisoned_raw_test, y_poisoned_raw_test)
        # Add channel axis:
        x_test = np.expand_dims(x_test, axis=3)

        # Shuffle training data so poison is not together
        n_train = np.shape(y_train)[0]
        shuffled_indices = np.arange(n_train)
        np.random.shuffle(shuffled_indices)
        x_train = x_train[shuffled_indices]
        y_train = y_train[shuffled_indices]
        is_poison_train = is_poison_train[shuffled_indices]

        # Save data used for training and testing split:
        np.savez('data_training.npz', x_train=x_train, y_train=y_train, is_poison_train=is_poison_train,
                 x_raw=x_poisoned_raw)
        np.savez('data_testing.npz', x_test=x_test, y_test=y_test, is_poison_test=is_poison_test,
                 x_raw_test=x_poisoned_raw_test)

        # Create Keras convolutional neural network - basic architecture from Keras examples
        # Source here: https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
        k.set_learning_phase(1)
        model = Sequential()
        model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=x_train.shape[1:]))
        model.add(Conv2D(64, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(10, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        classifier = KerasClassifier((min_, max_), model=model)

        classifier.fit(x_train, y_train, nb_epochs=50, batch_size=128)

        print('Saving poisoned model: ')
        pickle.dump(classifier, open('my_poison_classifier.p', 'wb'))

        # Also saving for Anu:
        file_name = 'anu_poison_mnist'
        model.save(file_name + '.hdf5')
        model_json = model.to_json()
        with open(file_name + '.json', "w") as json_file:
            json_file.write(model_json)

    # Evaluate the classifier on the test set
    preds = np.argmax(classifier.predict(x_test), axis=1)
    acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
    print("\nTest accuracy: %.2f%%" % (acc * 100))

    # Evaluate the classifier on poisonous data
    preds = np.argmax(classifier.predict(x_test[is_poison_test]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test], axis=1)) / y_test[is_poison_test].shape[0]
    print("\nPoisonous test set accuracy (i.e. effectiveness of poison): %.2f%%" % (acc * 100))

    # Evaluate the classifier on clean data
    preds = np.argmax(classifier.predict(x_test[is_poison_test == 0]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test == 0], axis=1)) / y_test[is_poison_test == 0].shape[0]
    print("\nClean test set accuracy: %.2f%%" % (acc * 100))

    # Calling poisoning defence:
    defence = ActivationDefence(classifier, x_train, y_train)

    # End-to-end method:
    print("------------------- Results using size metric -------------------")
    print(defence.get_params())
    defence.detect_poison(n_clusters=2, ndims=10, reduce="PCA")

    # Now fix the model
    x_new, y_fix = correct_poisoned_labels(x_train, y_train, is_poison_train)

    improvement = defence.relabel_poison_ground_truth(x_new, y_fix, test_set_split=0.7, tolerable_backdoor=0.001,
                                                      max_epochs=5, batch_epochs=10)

    # Evaluate the classifier on poisonous data after backdoor fix:
    preds = np.argmax(classifier.predict(x_test[is_poison_test]), axis=1)
    acc_after = np.sum(preds == np.argmax(y_test[is_poison_test], axis=1)) / y_test[is_poison_test].shape[0]
    print("\nPoisonous test set accuracy (i.e. effectiveness of poison) after backdoor fix: %.2f%%" % (acc_after * 100))

    print("\n Improvement after training: ", improvement)
    print('before: ', acc, ' after: ', acc_after)

    print("done :) ")
Example #27
    def test_binary_input_detector(self):
        """
        Test the binary input detector end-to-end.
        :return:
        """
        # Get MNIST
        nb_train, nb_test = 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

        # Keras classifier
        classifier = get_classifier_kr()

        # Generate adversarial samples:
        attacker = FastGradientMethod(classifier, eps=0.1)
        x_train_adv = attacker.generate(x_train[:nb_train])
        x_test_adv = attacker.generate(x_test[:nb_test])

        # Compile training data for detector: label clean samples [1, 0] and adversarial ones [0, 1]
        x_train_detector = np.concatenate((x_train[:nb_train], x_train_adv),
                                          axis=0)
        y_train_detector = np.concatenate(
            (np.array([[1, 0]] * nb_train), np.array([[0, 1]] * nb_train)),
            axis=0)

        # Create a simple CNN for the detector
        input_shape = x_train.shape[1:]
        model = Sequential()
        model.add(
            Conv2D(4,
                   kernel_size=(5, 5),
                   activation='relu',
                   input_shape=input_shape))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(2, activation='softmax'))
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=keras.optimizers.Adam(lr=0.01),
                      metrics=['accuracy'])

        # Create detector and train it:
        detector = BinaryInputDetector(
            KerasClassifier(model=model, clip_values=(0, 1), use_logits=False))
        detector.fit(x_train_detector,
                     y_train_detector,
                     nb_epochs=2,
                     batch_size=128)

        # Apply detector on clean and adversarial test data:
        test_detection = np.argmax(detector.predict(x_test), axis=1)
        test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

        # Assert there is at least one true positive and negative:
        nb_true_positives = len(np.where(test_adv_detection == 1)[0])
        nb_true_negatives = len(np.where(test_detection == 0)[0])
        logger.debug('Number of true positives detected: %i',
                     nb_true_positives)
        logger.debug('Number of true negatives detected: %i',
                     nb_true_negatives)
        self.assertGreater(nb_true_positives, 0)
        self.assertGreater(nb_true_negatives, 0)
Example #28
def main():
    # SETTING UP DEFENCE GAN TRAINED MODELS
    # * Clone the Defence GAN git repo https://github.com/yogeshbalaji/InvGAN
    # * Follow the setup instructions and copy the following:
    #   * data/ to adversarial-robustness-toolbox/defence_gan/data/
    #   * output/gans/mnist to adversarial-robustness-toolbox/defence_gan/output/gans/mnist
    #   * output/gans_inv_nottrain/mnist to adversarial-robustness-toolbox/defence_gan/output/gans_inv_nottrain/mnist

    # STEP 0
    logging.info("Loading a Dataset")
    (_, _), (x_test_original,
             y_test_original), min_pixel_value, max_pixel_value = load_mnist()

    # TODO: remove before opening the PR
    # batch_size = x_test_original.shape[0]
    batch_size = 1000

    (x_test, y_test) = (x_test_original[:batch_size],
                        y_test_original[:batch_size])

    # STEP 1
    logging.info("Creating a TS1 Mnist Classifier")
    classifier = create_ts1_art_mnist_classifier(min_pixel_value,
                                                 max_pixel_value)
    classifier.fit(x_test, y_test, batch_size=batch_size, nb_epochs=3)

    # Code to load the original defense_gan paper mnist classifier to reproduce paper results
    # classifier_paper = create_defense_gan_paper_mnist_art_classifier()

    # STEP 2
    logging.info("Evaluate the ART classifier on non adversarial examples")
    predictions = classifier.predict(x_test)
    accuracy_non_adv = get_accuracy(predictions, y_test)

    # STEP 3
    logging.info("Generate adversarial examples")
    attack = FastGradientMethod(classifier, eps=0.2)
    x_test_adv = attack.generate(x=x_test)

    # STEP 4
    logging.info("Evaluate the classifier on the adversarial examples")
    predictions = classifier.predict(x_test_adv)
    accuracy_adv = get_accuracy(predictions, y_test)

    # STEP 5
    logging.info("Create DefenceGAN")
    encoder = create_ts1_encoder_model(batch_size)
    generator = create_ts1_generator_model(batch_size)

    inverse_gan = InverseGAN(sess=generator._sess,
                             gan=generator,
                             inverse_gan=encoder)
    # defense_gan = DefenseGAN(sess=generator.sess,
    #                          generator=generator)

    logging.info("Generating Defended Samples")
    x_test_defended = inverse_gan(x_test_adv, maxiter=1)

    # STEP 6
    logging.info("Evaluate the classifier on the defended examples")
    predictions = classifier.predict(x_test_defended)
    accuracy_defended = get_accuracy(predictions, y_test)

    logger.info(
        "Accuracy on non adversarial examples: {}%".format(accuracy_non_adv))
    logger.info("Accuracy on adversarial examples: {}%".format(accuracy_adv))
    logger.info("Accuracy on defended examples: {}%".format(accuracy_defended))
Example #29
    def setUpClass(cls):
        # Get MNIST
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
        cls.mnist = (x_train, y_train), (x_test, y_test)
Example #30
    def test_binary_input_detector(self):
        """
        Test the binary input detector end-to-end.
        :return:
        """
        # Initialize a tf session
        session = tf.Session()
        k.set_session(session)

        # Get MNIST
        batch_size, nb_train, nb_test = 100, 1000, 10
        (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
        x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
        x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

        input_shape = x_train.shape[1:]
        nb_classes = 10

        # Create simple CNN
        model = Sequential()
        model.add(
            Conv2D(4,
                   kernel_size=(5, 5),
                   activation='relu',
                   input_shape=input_shape))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(nb_classes, activation='softmax'))
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=keras.optimizers.Adam(lr=0.01),
                      metrics=['accuracy'])

        # Create classifier and train it:
        classifier = KerasClassifier((0, 1), model, use_logits=False)
        classifier.fit(x_train, y_train, nb_epochs=5, batch_size=128)

        # Generate adversarial samples:
        attacker = FastGradientMethod(classifier, eps=0.1)
        x_train_adv = attacker.generate(x_train[:nb_train])
        x_test_adv = attacker.generate(x_test[:nb_test])

        # Compile training data for detector: label clean samples [1, 0] and adversarial ones [0, 1]
        x_train_detector = np.concatenate((x_train[:nb_train], x_train_adv),
                                          axis=0)
        y_train_detector = np.concatenate(
            (np.array([[1, 0]] * nb_train), np.array([[0, 1]] * nb_train)),
            axis=0)

        # Create a simple CNN for the detector.
        # Note: we use the same architecture as for the classifier, except for the number of outputs (=2)
        model = Sequential()
        model.add(
            Conv2D(4,
                   kernel_size=(5, 5),
                   activation='relu',
                   input_shape=input_shape))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(2, activation='softmax'))
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=keras.optimizers.Adam(lr=0.01),
                      metrics=['accuracy'])

        # Create detector and train it:
        detector = BinaryInputDetector(
            KerasClassifier((0, 1), model, use_logits=False))
        detector.fit(x_train_detector,
                     y_train_detector,
                     nb_epochs=2,
                     batch_size=128)

        # Apply detector on clean and adversarial test data:
        test_detection = np.argmax(detector.predict(x_test), axis=1)
        test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

        # Assert there is at least one true positive and negative:
        nb_true_positives = len(np.where(test_adv_detection == 1)[0])
        nb_true_negatives = len(np.where(test_detection == 0)[0])
        self.assertGreater(nb_true_positives, 0)
        self.assertGreater(nb_true_negatives, 0)