def test_krclassifier(self):
    """
    Second test with the KerasClassifier.
    :return:
    """
    # Get MNIST
    batch_size, nb_train, nb_test = 100, 1000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]

    # Create simple CNN
    model = Sequential()
    model.add(Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    # Get classifier
    krc = KerasClassifier((0, 1), model, use_logits=False)
    krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=2)

    # Attack
    nf = NewtonFool(krc)
    nf.set_params(max_iter=5)
    x_test_adv = nf.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())

    # The attack should not increase the confidence in the originally predicted class
    y_pred = krc.predict(x_test)
    y_pred_adv = krc.predict(x_test_adv)
    y_pred_bool = y_pred.max(axis=1, keepdims=True) == y_pred
    y_pred_max = y_pred.max(axis=1)
    y_pred_adv_max = y_pred_adv[y_pred_bool]
    self.assertTrue((y_pred_max >= y_pred_adv_max).all())
def test_emp_robustness_mnist(self):
    # Get MNIST
    (x_train, y_train), (_, _), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]

    # Get classifier
    classifier = self._cnn_mnist_k([28, 28, 1])
    classifier.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epochs=2)

    # Compute minimal perturbations
    params = {"eps_step": 1.1}
    emp_robust = empirical_robustness(classifier, x_train, 'fgsm', params)
    self.assertEqual(emp_robust, 0.)

    params = {"eps_step": 1., "eps": 1.}
    emp_robust = empirical_robustness(classifier, x_train, 'fgsm', params)
    self.assertAlmostEqual(emp_robust, 1., 3)

    params = {"eps_step": 0.1, "eps": 0.2}
    emp_robust = empirical_robustness(classifier, x_train, 'fgsm', params)
    self.assertLessEqual(emp_robust, 0.21)
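# `empirical_robustness` is exercised above with the 'fgsm' attack name. A hand-rolled
# illustration of its assumed semantics follows: craft adversarial examples, then average
# the relative perturbation norm over the samples whose prediction actually changed,
# returning 0. when no prediction changes. This is a sketch for intuition, not ART's
# implementation; `classifier` and `attack` stand for any ART classifier and attack.
import numpy as np


def empirical_robustness_sketch(classifier, attack, x):
    x_adv = attack.generate(x)
    y = np.argmax(classifier.predict(x), axis=1)
    y_adv = np.argmax(classifier.predict(x_adv), axis=1)
    changed = y != y_adv
    if not np.any(changed):
        return 0.
    # Relative perturbation size, averaged over the successfully attacked samples
    perts = np.linalg.norm((x_adv - x).reshape(x.shape[0], -1), axis=1)
    norms = np.linalg.norm(x.reshape(x.shape[0], -1), axis=1)
    return float(np.mean(perts[changed] / norms[changed]))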
def test_clever_l2_same_target(self):
    batch_size = 100
    (x_train, y_train), (x_test, _), _, _ = load_mnist()

    # Get the classifier
    krc = self._create_krclassifier()
    krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=2, verbose=0)

    scores = clever(krc, x_test[0], 5, 5, 3, 2, target=np.argmax(krc.predict(x_test[:1])), c_init=1, pool_factor=10)
    self.assertIsNone(scores[0], msg="Clever scores for the predicted class should be `None`.")
def test_save_load_mlp(self):
    NB_TRAIN = 100
    NB_TEST = 10
    comp_params = {'loss': 'categorical_crossentropy', 'optimizer': 'adam', 'metrics': ['accuracy']}
    session = tf.Session()
    keras.backend.set_session(session)

    # Get MNIST
    (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
    X_train, Y_train = X_train[:NB_TRAIN], Y_train[:NB_TRAIN]
    X_test, Y_test = X_test[:NB_TEST], Y_test[:NB_TEST]
    im_shape = X_train[0].shape

    classifier = MLP(im_shape, act="brelu")
    classifier.compile(comp_params)

    # Fit the classifier
    classifier.fit(X_train, Y_train, epochs=1, batch_size=BATCH_SIZE)
    path = "./tests/save/mlp/"

    # Test saving
    save_classifier(classifier, path)
    self.assertTrue(os.path.isfile(path + "model.json"))
    self.assertTrue(os.path.getsize(path + "model.json") > 0)
    self.assertTrue(os.path.isfile(path + "weights.h5"))
    self.assertTrue(os.path.getsize(path + "weights.h5") > 0)

    # Test loading: the loaded classifier should score (almost) identically;
    # evaluate returns [loss, accuracy], so compare the entries pairwise
    loaded_classifier = load_classifier(path)
    scores = classifier.evaluate(X_test, Y_test)
    scores_loaded = loaded_classifier.evaluate(X_test, Y_test)
    for score, score_loaded in zip(scores, scores_loaded):
        self.assertAlmostEqual(score, score_loaded)
def test_clever_l2_no_target(self):
    batch_size = 100
    (x_train, y_train), (x_test, _), _, _ = load_mnist()

    # Get the classifier
    krc = self._create_krclassifier()
    krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=2, verbose=0)

    scores = clever(krc, x_test[0], 5, 5, 3, 2, target=None, c_init=1, pool_factor=10)
    logger.info("Clever scores for n-1 classes: %s %s", str(scores), str(scores.shape))
    self.assertEqual(scores.shape, (krc.nb_classes - 1,))
def test_mnist(self):
    session = tf.Session()
    keras.backend.set_session(session)

    # Get MNIST
    (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
    X_train, Y_train = X_train[:NB_TRAIN], Y_train[:NB_TRAIN]
    X_test, Y_test = X_test[:NB_TEST], Y_test[:NB_TEST]
    im_shape = X_train[0].shape

    classifier = CNN(im_shape, act="relu")
    classifier.compile({'loss': 'categorical_crossentropy', 'optimizer': 'adam', 'metrics': ['accuracy']})

    # Fit the classifier
    classifier.fit(X_train, Y_train, epochs=1, batch_size=BATCH_SIZE)
    scores = classifier.evaluate(X_test, Y_test)
    print("\naccuracy: %.2f%%" % (scores[1] * 100))
def setUpClass(cls):
    k.set_learning_phase(1)

    # Get MNIST
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
    cls.mnist = (x_train, y_train), (x_test, y_test)

    # Keras classifier
    cls.classifier_k = cls._cnn_mnist_k([28, 28, 1])
    cls.classifier_k.fit(x_train, y_train, batch_size=BATCH_SIZE, nb_epochs=2)

    scores = cls.classifier_k._model.evaluate(x_train, y_train)
    logger.info('[Keras, MNIST] Accuracy on training set: %.2f%%', (scores[1] * 100))
    scores = cls.classifier_k._model.evaluate(x_test, y_test)
    logger.info('[Keras, MNIST] Accuracy on test set: %.2f%%', (scores[1] * 100))

    # Create basic CNN on MNIST using TensorFlow
    cls.classifier_tf = cls._cnn_mnist_tf([28, 28, 1])
    cls.classifier_tf.fit(x_train, y_train, nb_epochs=2, batch_size=BATCH_SIZE)

    scores = get_labels_np_array(cls.classifier_tf.predict(x_train))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info('[TF, MNIST] Accuracy on training set: %.2f%%', (acc * 100))

    scores = get_labels_np_array(cls.classifier_tf.predict(x_test))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('[TF, MNIST] Accuracy on test set: %.2f%%', (acc * 100))

    # Create basic PyTorch model (channels first)
    cls.classifier_py = cls._cnn_mnist_py()
    x_train, x_test = np.swapaxes(x_train, 1, 3), np.swapaxes(x_test, 1, 3)
    cls.classifier_py.fit(x_train, y_train, nb_epochs=2, batch_size=BATCH_SIZE)

    scores = get_labels_np_array(cls.classifier_py.predict(x_train))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_train, axis=1)) / y_train.shape[0]
    logger.info('[PyTorch, MNIST] Accuracy on training set: %.2f%%', (acc * 100))

    scores = get_labels_np_array(cls.classifier_py.predict(x_test))
    acc = np.sum(np.argmax(scores, axis=1) == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('[PyTorch, MNIST] Accuracy on test set: %.2f%%', (acc * 100))
""" The script demonstrates a simple example of using ART with TensorFlow v1.x. The example train a small model on the MNIST dataset and creates adversarial examples using the Fast Gradient Sign Method. Here we use the ART classifier to train the model, it would also be possible to provide a pretrained model to the ART classifier. The parameters are chosen for reduced computational requirements of the script and not optimised for accuracy. """ import numpy as np from art.attacks.evasion import FastGradientMethod from art.estimators.classification import TensorFlowV2Classifier from art.utils import load_mnist # Step 1: Load the MNIST dataset (x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist() # Step 2: Create the model import tensorflow as tf from tensorflow.keras import Model from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D class TensorFlowModel(Model): """ Standard TensorFlow model for unit testing. """ def __init__(self): super(TensorFlowModel, self).__init__() self.conv1 = Conv2D(filters=4, kernel_size=5, activation="relu")
def test_binary_activation_detector(self):
    """
    Test the binary activation detector end-to-end.
    :return:
    """
    # Get MNIST
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Keras classifier
    classifier = get_image_classifier_kr()

    # Generate adversarial samples:
    attacker = FastGradientMethod(classifier, eps=0.1)
    x_train_adv = attacker.generate(x_train[:NB_TRAIN])
    x_test_adv = attacker.generate(x_test[:NB_TEST])

    # Compile training data for detector:
    x_train_detector = np.concatenate((x_train[:NB_TRAIN], x_train_adv), axis=0)
    y_train_detector = np.concatenate((np.array([[1, 0]] * NB_TRAIN), np.array([[0, 1]] * NB_TRAIN)), axis=0)

    # Create a simple CNN for the detector
    activation_shape = classifier.get_activations(x_test[:1], 0, batch_size=128).shape[1:]
    number_outputs = 2
    try:
        from keras.optimizers import Adam

        optimizer = Adam(lr=0.01)
    except ImportError:
        from keras.optimizers import adam_v2

        optimizer = adam_v2.Adam(lr=0.01)
    model = Sequential()
    model.add(MaxPooling2D(pool_size=(2, 2), input_shape=activation_shape))
    model.add(Flatten())
    model.add(Dense(number_outputs, activation="softmax"))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=optimizer, metrics=["accuracy"])

    # Create detector and train it.
    # The detector considers activations at layer=0:
    detector = BinaryActivationDetector(
        classifier=classifier,
        detector=KerasClassifier(model=model, clip_values=(0, 1), use_logits=False),
        layer=0,
    )
    detector.fit(x_train_detector, y_train_detector, nb_epochs=2, batch_size=128)

    # Apply detector on clean and adversarial test data:
    test_detection = np.argmax(detector.predict(x_test), axis=1)
    test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

    # Assert there is at least one true positive and negative
    nb_true_positives = len(np.where(test_adv_detection == 1)[0])
    nb_true_negatives = len(np.where(test_detection == 0)[0])
    logger.debug("Number of true positives detected: %i", nb_true_positives)
    logger.debug("Number of true negatives detected: %i", nb_true_negatives)
    self.assertGreater(nb_true_positives, 0)
    self.assertGreater(nb_true_negatives, 0)
""" The script demonstrates a simple example of using ART with TensorFlow v1.x. The example train a small model on the MNIST dataset and creates adversarial examples using the Fast Gradient Sign Method. Here we use the ART classifier to train the model, it would also be possible to provide a pretrained model to the ART classifier. The parameters are chosen for reduced computational requirements of the script and not optimised for accuracy. """ import tensorflow as tf import numpy as np from art.attacks import FastGradientMethod from art.classifiers import TFClassifier from art.utils import load_mnist # Step 1: Load the MNIST dataset (x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist() # Step 2: Create the model input_ph = tf.placeholder(tf.float32, shape=[None, 28, 28, 1]) labels_ph = tf.placeholder(tf.int32, shape=[None, 10]) x = tf.layers.conv2d(input_ph, filters=4, kernel_size=5, activation=tf.nn.relu) x = tf.layers.max_pooling2d(x, 2, 2) x = tf.layers.conv2d(x, filters=10, kernel_size=5, activation=tf.nn.relu) x = tf.layers.max_pooling2d(x, 2, 2) x = tf.layers.flatten(x) x = tf.layers.dense(x, 100, activation=tf.nn.relu) logits = tf.layers.dense(x, 10) loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels_ph)) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01) train = optimizer.minimize(loss)
def setUpClass(cls):
    master_seed(seed=1234)
    (x_train, y_train), (x_test, y_test), min_, max_ = load_mnist()
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    # Keep only the digits 0 and 4 and relabel them as a binary problem
    zero_or_four = np.logical_or(y_train == 4, y_train == 0)
    x_train = x_train[zero_or_four]
    y_train = y_train[zero_or_four]
    tr_labels = np.zeros((y_train.shape[0], 2))
    tr_labels[y_train == 0] = np.array([1, 0])
    tr_labels[y_train == 4] = np.array([0, 1])
    y_train = tr_labels

    zero_or_four = np.logical_or(y_test == 4, y_test == 0)
    x_test = x_test[zero_or_four]
    y_test = y_test[zero_or_four]
    te_labels = np.zeros((y_test.shape[0], 2))
    te_labels[y_test == 0] = np.array([1, 0])
    te_labels[y_test == 4] = np.array([0, 1])
    y_test = te_labels

    # Flatten the images into feature vectors
    n_samples_train = x_train.shape[0]
    n_features_train = x_train.shape[1] * x_train.shape[2] * x_train.shape[3]
    n_samples_test = x_test.shape[0]
    n_features_test = x_test.shape[1] * x_test.shape[2] * x_test.shape[3]
    x_train = x_train.reshape(n_samples_train, n_features_train)
    x_test = x_test.reshape(n_samples_test, n_features_test)

    # Split off trusted and validation sets
    x_train = x_train[:NB_TRAIN]
    y_train = y_train[:NB_TRAIN]
    trusted_data = x_test[:NB_TRUSTED]
    trusted_labels = y_test[:NB_TRUSTED]
    x_test = x_test[NB_TRUSTED:]
    y_test = y_test[NB_TRUSTED:]
    valid_data = x_test[:NB_VALID]
    valid_labels = y_test[:NB_VALID]
    x_test = x_test[NB_VALID:]
    y_test = y_test[NB_VALID:]

    no_defense = ScikitlearnSVC(model=SVC(kernel=kernel, gamma="auto"), clip_values=(min_, max_))
    no_defense.fit(x=x_train, y=y_train)

    # Initialise poison points from the support vectors and flip their labels
    poison_points = np.random.randint(no_defense._model.support_vectors_.shape[0], size=NB_POISON)
    all_poison_init = np.copy(no_defense._model.support_vectors_[poison_points])
    poison_labels = np.array([1, 1]) - no_defense.predict(all_poison_init)

    svm_attack = PoisoningAttackSVM(
        classifier=no_defense,
        x_train=x_train,
        y_train=y_train,
        step=0.1,
        eps=1.0,
        x_val=valid_data,
        y_val=valid_labels,
        max_iter=200,
    )
    poisoned_data, _ = svm_attack.poison(all_poison_init, y=poison_labels)

    # Stack the poison onto the data
    all_data = np.vstack([x_train, poisoned_data])
    all_labels = np.vstack([y_train, poison_labels])

    model = SVC(kernel=kernel, gamma="auto")
    cls.mnist = (
        (all_data, all_labels),
        (x_test, y_test),
        (trusted_data, trusted_labels),
        (valid_data, valid_labels),
        (min_, max_),
    )
    cls.classifier = SklearnClassifier(model=model, clip_values=(min_, max_))
    cls.classifier.fit(all_data, all_labels)
    cls.defense_cal = RONIDefense(
        cls.classifier, all_data, all_labels, trusted_data, trusted_labels, eps=0.1, calibrated=True
    )
    cls.defence_no_cal = RONIDefense(
        cls.classifier, all_data, all_labels, trusted_data, trusted_labels, eps=0.1, calibrated=False
    )
def setUpClass(cls):
    # Get MNIST
    (_, _), (x_test, y_test), _, _ = load_mnist()
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
    cls.mnist = x_test, y_test
def test_mnist(self):
    session = tf.Session()
    k.set_session(session)

    comp_params = {"loss": 'categorical_crossentropy', "optimizer": 'adam', "metrics": ['accuracy']}

    # Get MNIST
    batch_size, nb_train, nb_test = 100, 1000, 100
    (X_train, Y_train), (X_test, Y_test), _, _ = load_mnist()
    X_train, Y_train = X_train[:nb_train], Y_train[:nb_train]
    X_test, Y_test = X_test[:nb_test], Y_test[:nb_test]
    im_shape = X_train[0].shape

    # Get classifier
    classifier = CNN(im_shape, act="relu")
    classifier.compile(comp_params)
    classifier.fit(X_train, Y_train, epochs=1, batch_size=batch_size, verbose=0)
    scores = classifier.evaluate(X_train, Y_train)
    print("\naccuracy on training set: %.2f%%" % (scores[1] * 100))
    scores = classifier.evaluate(X_test, Y_test)
    print("\naccuracy on test set: %.2f%%" % (scores[1] * 100))

    attack_params = {"verbose": 0, "clip_min": 0., "clip_max": 1., "eps": 1.}
    attack = FastGradientMethod(classifier, session)
    X_train_adv = attack.generate(X_train, **attack_params)
    X_test_adv = attack.generate(X_test, **attack_params)

    self.assertFalse((X_train == X_train_adv).all())
    self.assertFalse((X_test == X_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(X_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(X_test_adv))

    self.assertFalse((Y_train == train_y_pred).all())
    self.assertFalse((Y_test == test_y_pred).all())

    scores = classifier.evaluate(X_train_adv, Y_train)
    print('\naccuracy on adversarial train examples: %.2f%%' % (scores[1] * 100))
    scores = classifier.evaluate(X_test_adv, Y_test)
    print('\naccuracy on adversarial test examples: %.2f%%' % (scores[1] * 100))

    # Test minimal perturbations
    attack_params = {"verbose": 0, "clip_min": 0., "clip_max": 1., "minimal": True, "eps_step": .1, "eps_max": 1.}
    X_train_adv_min = attack.generate(X_train, **attack_params)
    X_test_adv_min = attack.generate(X_test, **attack_params)

    self.assertFalse((X_train_adv_min == X_train_adv).all())
    self.assertFalse((X_test_adv_min == X_test_adv).all())
    self.assertFalse((X_train == X_train_adv_min).all())
    self.assertFalse((X_test == X_test_adv_min).all())

    train_y_pred = get_labels_np_array(classifier.predict(X_train_adv_min))
    test_y_pred = get_labels_np_array(classifier.predict(X_test_adv_min))

    self.assertFalse((Y_train == train_y_pred).all())
    self.assertFalse((Y_test == test_y_pred).all())

    scores = classifier.evaluate(X_train_adv_min, Y_train)
    print('\naccuracy on adversarial train examples with minimal perturbation: %.2f%%' % (scores[1] * 100))
    scores = classifier.evaluate(X_test_adv_min, Y_test)
    print('\naccuracy on adversarial test examples with minimal perturbation: %.2f%%' % (scores[1] * 100))
def main():
    # Read MNIST dataset (x_raw contains the original images):
    (x_raw, y_raw), (x_raw_test, y_raw_test), min_, max_ = load_mnist(raw=True)

    n_train = np.shape(x_raw)[0]
    num_selection = 5000
    random_selection_indices = np.random.choice(n_train, num_selection)
    x_raw = x_raw[random_selection_indices]
    y_raw = y_raw[random_selection_indices]

    # Poison training data
    perc_poison = 0.33
    (is_poison_train, x_poisoned_raw, y_poisoned_raw) = generate_backdoor(x_raw, y_raw, perc_poison)
    x_train, y_train = preprocess(x_poisoned_raw, y_poisoned_raw)
    # Add channel axis:
    x_train = np.expand_dims(x_train, axis=3)

    # Poison test data
    (is_poison_test, x_poisoned_raw_test, y_poisoned_raw_test) = generate_backdoor(x_raw_test, y_raw_test,
                                                                                   perc_poison)
    x_test, y_test = preprocess(x_poisoned_raw_test, y_poisoned_raw_test)
    # Add channel axis:
    x_test = np.expand_dims(x_test, axis=3)

    # Shuffle training data so poison is not together
    n_train = np.shape(y_train)[0]
    shuffled_indices = np.arange(n_train)
    np.random.shuffle(shuffled_indices)
    x_train = x_train[shuffled_indices]
    y_train = y_train[shuffled_indices]
    is_poison_train = is_poison_train[shuffled_indices]

    # Create Keras convolutional neural network - basic architecture from Keras examples
    # Source here: https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=x_train.shape[1:]))
    model.add(Conv2D(64, (3, 3), activation="relu"))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(10, activation="softmax"))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    classifier = KerasClassifier(model=model, clip_values=(min_, max_))
    classifier.fit(x_train, y_train, nb_epochs=30, batch_size=128)

    # Evaluate the classifier on the test set
    preds = np.argmax(classifier.predict(x_test), axis=1)
    acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
    print("\nTest accuracy: %.2f%%" % (acc * 100))

    # Evaluate the classifier on poisonous data
    preds = np.argmax(classifier.predict(x_test[is_poison_test]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test], axis=1)) / y_test[is_poison_test].shape[0]
    print("\nPoisonous test set accuracy (i.e. effectiveness of poison): %.2f%%" % (acc * 100))

    # Evaluate the classifier on clean data
    preds = np.argmax(classifier.predict(x_test[is_poison_test == 0]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test == 0], axis=1)) / y_test[is_poison_test == 0].shape[0]
    print("\nClean test set accuracy: %.2f%%" % (acc * 100))

    # Calling poisoning defence:
    defence = ActivationDefence(classifier, x_train, y_train)

    # End-to-end method:
    print("------------------- Results using size metric -------------------")
    print(defence.get_params())
    defence.detect_poison(nb_clusters=2, nb_dims=10, reduce="PCA")

    # Evaluate method when ground truth is known:
    is_clean = is_poison_train == 0
    confusion_matrix = defence.evaluate_defence(is_clean)
    print("Evaluation defence results for size-based metric: ")
    jsonObject = json.loads(confusion_matrix)
    for label in jsonObject:
        print(label)
        pprint.pprint(jsonObject[label])

    # Visualize clusters:
    print("Visualize clusters")
    sprites_by_class = defence.visualize_clusters(x_train, "mnist_poison_demo")

    # Show plots for clusters of class 5
    n_class = 5
    try:
        import matplotlib.pyplot as plt

        plt.imshow(sprites_by_class[n_class][0])
        plt.title("Class " + str(n_class) + " cluster: 0")
        plt.show()
        plt.imshow(sprites_by_class[n_class][1])
        plt.title("Class " + str(n_class) + " cluster: 1")
        plt.show()
    except ImportError:
        print("matplotlib not installed. For this reason, cluster visualization was not displayed")

    # Try again using distance analysis this time:
    print("------------------- Results using distance metric -------------------")
    print(defence.get_params())
    defence.detect_poison(nb_clusters=2, nb_dims=10, reduce="PCA", cluster_analysis="distance")
    confusion_matrix = defence.evaluate_defence(is_clean)
    print("Evaluation defence results for distance-based metric: ")
    jsonObject = json.loads(confusion_matrix)
    for label in jsonObject:
        print(label)
        pprint.pprint(jsonObject[label])

    # Other ways to invoke the defence:
    kwargs = {"nb_clusters": 2, "nb_dims": 10, "reduce": "PCA"}
    defence.cluster_activations(**kwargs)

    kwargs = {"cluster_analysis": "distance"}
    defence.analyze_clusters(**kwargs)
    defence.evaluate_defence(is_clean)

    kwargs = {"cluster_analysis": "smaller"}
    defence.analyze_clusters(**kwargs)
    defence.evaluate_defence(is_clean)

    print("done :) ")
def test_3_clever_pt(self):
    """
    Test with PyTorch.
    :return:
    """
    # Get MNIST
    batch_size, nb_train, nb_test = 100, 1000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]
    x_train = np.swapaxes(x_train, 1, 3).astype(np.float32)
    x_test = np.swapaxes(x_test, 1, 3).astype(np.float32)

    # Get the classifier
    ptc = self._create_ptclassifier()
    ptc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=1)

    # Test targeted clever
    res0 = clever_t(ptc, x_test[-1], 2, 10, 5, R_L1, norm=1, pool_factor=3)
    res1 = clever_t(ptc, x_test[-1], 2, 10, 5, R_L2, norm=2, pool_factor=3)
    res2 = clever_t(ptc, x_test[-1], 2, 10, 5, R_LI, norm=np.inf, pool_factor=3)
    logger.info("Targeted PyTorch: %f %f %f", res0, res1, res2)
    self.assertNotEqual(res0, res1)
    self.assertNotEqual(res1, res2)
    self.assertNotEqual(res2, res0)

    # Test untargeted clever
    res0 = clever_u(ptc, x_test[-1], 10, 5, R_L1, norm=1, pool_factor=3, verbose=False)
    res1 = clever_u(ptc, x_test[-1], 10, 5, R_L2, norm=2, pool_factor=3, verbose=False)
    res2 = clever_u(ptc, x_test[-1], 10, 5, R_LI, norm=np.inf, pool_factor=3, verbose=False)
    logger.info("Untargeted PyTorch: %f %f %f", res0, res1, res2)
    self.assertNotEqual(res0, res1)
    self.assertNotEqual(res1, res2)
    self.assertNotEqual(res2, res0)
def test_clever_kr(self):
    """
    Test with Keras.
    :return:
    """
    # Get MNIST
    batch_size, nb_train, nb_test = 100, 1000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]

    # Get the classifier
    krc = self._create_krclassifier()
    krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=1, verbose=0)

    # Test targeted clever
    res0 = clever_t(krc, x_test[-1], 2, 10, 5, R_L1, norm=1, pool_factor=3)
    res1 = clever_t(krc, x_test[-1], 2, 10, 5, R_L2, norm=2, pool_factor=3)
    res2 = clever_t(krc, x_test[-1], 2, 10, 5, R_LI, norm=np.inf, pool_factor=3)
    logger.info("Targeted Keras: %f %f %f", res0, res1, res2)
    self.assertNotEqual(res0, res1)
    self.assertNotEqual(res1, res2)
    self.assertNotEqual(res2, res0)

    # Test untargeted clever
    res0 = clever_u(krc, x_test[-1], 10, 5, R_L1, norm=1, pool_factor=3, verbose=False)
    res1 = clever_u(krc, x_test[-1], 10, 5, R_L2, norm=2, pool_factor=3, verbose=False)
    res2 = clever_u(krc, x_test[-1], 10, 5, R_LI, norm=np.inf, pool_factor=3, verbose=False)
    logger.info("Untargeted Keras: %f %f %f", res0, res1, res2)
    self.assertNotEqual(res0, res1)
    self.assertNotEqual(res1, res2)
    self.assertNotEqual(res2, res0)
def test_2_clever_tf(self):
    """
    Test with TensorFlow.
    :return:
    """
    # Get MNIST
    batch_size, nb_train, nb_test = 100, 1000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]

    # Get the classifier
    tfc = self._create_tfclassifier()
    tfc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=1)

    # TODO Need to configure r
    # Test targeted clever
    res0 = clever_t(tfc, x_test[-1], 2, 10, 5, R_L1, norm=1, pool_factor=3)
    res1 = clever_t(tfc, x_test[-1], 2, 10, 5, R_L2, norm=2, pool_factor=3)
    res2 = clever_t(tfc, x_test[-1], 2, 10, 5, R_LI, norm=np.inf, pool_factor=3)
    logger.info("Targeted TensorFlow: %f %f %f", res0, res1, res2)
    self.assertNotEqual(res0, res1)
    self.assertNotEqual(res1, res2)
    self.assertNotEqual(res2, res0)

    # Test untargeted clever
    res0 = clever_u(tfc, x_test[-1], 10, 5, R_L1, norm=1, pool_factor=3, verbose=False)
    res1 = clever_u(tfc, x_test[-1], 10, 5, R_L2, norm=2, pool_factor=3, verbose=False)
    res2 = clever_u(tfc, x_test[-1], 10, 5, R_LI, norm=np.inf, pool_factor=3, verbose=False)
    logger.info("Untargeted TensorFlow: %f %f %f", res0, res1, res2)
    self.assertNotEqual(res0, res1)
    self.assertNotEqual(res1, res2)
    self.assertNotEqual(res2, res0)
def main():
    args = parse_option()
    print(args)

    # Check args
    if args.loss not in LOSS_NAMES:
        raise ValueError('Unsupported loss function type {}'.format(args.loss))

    if args.optimizer == 'adam':
        optimizer1 = tf.keras.optimizers.Adam(lr=args.lr_1)
    elif args.optimizer == 'lars':
        from lars_optimizer import LARSOptimizer  # not compatible with tf2
        optimizer1 = LARSOptimizer(args.lr_1, exclude_from_weight_decay=['batch_normalization', 'bias'])
    elif args.optimizer == 'sgd':
        optimizer1 = tfa.optimizers.SGDW(learning_rate=args.lr_1, momentum=0.9, weight_decay=1e-4)
    optimizer2 = tf.keras.optimizers.Adam(lr=args.lr_2)

    model_name = '{}_model-bs_{}-lr_{}'.format(args.loss, args.batch_size_1, args.lr_1)

    # 0. Load data
    if args.data == 'mnist':
        mnist = tf.keras.datasets.mnist
    elif args.data == 'fashion_mnist':
        mnist = tf.keras.datasets.fashion_mnist
    print('Loading {} data...'.format(args.data))
    (_, y_train), (_, y_test) = mnist.load_data()
    # x_train, x_test = x_train / 255.0, x_test / 255.0
    # x_train = x_train.reshape(-1, 28*28).astype(np.float32)
    # x_test = x_test.reshape(-1, 28*28).astype(np.float32)
    (x_train, _), (x_test, _), _, _ = load_mnist()
    # print(x_train[0][0])
    print(x_train.shape, x_test.shape)

    # Simulate low data regime for training
    # n_train = x_train.shape[0]
    # shuffle_idx = np.arange(n_train)
    # np.random.shuffle(shuffle_idx)
    # x_train = x_train[shuffle_idx][:args.n_data_train]
    # y_train = y_train[shuffle_idx][:args.n_data_train]
    print(x_train.shape, y_train.shape)

    train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(5000).batch(args.batch_size_1)
    train_ds2 = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(5000).batch(args.batch_size_2)
    test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(args.batch_size_1)

    # 1. Stage 1: train the encoder with the chosen contrastive loss
    encoder = Encoder(normalize=True, activation=args.activation)
    projector = Projector(args.projection_dim, normalize=True, activation=args.activation)

    if args.loss == 'max_margin':
        def loss_func(z, y):
            return losses.max_margin_contrastive_loss(z, y, margin=args.margin, metric=args.metric)
    elif args.loss == 'npairs':
        loss_func = losses.multiclass_npairs_loss
    elif args.loss == 'sup_nt_xent':
        def loss_func(z, y):
            return losses.supervised_nt_xent_loss(z, y, temperature=args.temperature,
                                                  base_temperature=args.base_temperature)
    elif args.loss.startswith('triplet'):
        triplet_kind = args.loss.split('-')[1]

        def loss_func(z, y):
            return losses.triplet_loss(z, y, kind=triplet_kind, margin=args.margin)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    test_loss = tf.keras.metrics.Mean(name='test_loss')

    # tf.config.experimental_run_functions_eagerly(True)

    # Train step for the contrastive loss
    @tf.function
    def train_step_stage1(x, y):
        '''
        x: data tensor, shape: (batch_size, data_dim)
        y: data labels, shape: (batch_size, )
        '''
        with tf.GradientTape() as tape:
            r = encoder(x, training=True)
            z = projector(r, training=True)
            # print("z", z, "y", y)
            loss = loss_func(z, y)
        gradients = tape.gradient(loss, encoder.trainable_variables + projector.trainable_variables)
        optimizer1.apply_gradients(zip(gradients, encoder.trainable_variables + projector.trainable_variables))
        train_loss(loss)

    @tf.function
    def test_step_stage1(x, y):
        r = encoder(x, training=False)
        z = projector(r, training=False)
        t_loss = loss_func(z, y)
        test_loss(t_loss)

    print('Stage 1 training ...')
    for epoch in range(args.epoch):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        test_loss.reset_states()

        for x, y in train_ds:
            train_step_stage1(x, y)

        for x_te, y_te in test_ds:
            test_step_stage1(x_te, y_te)

        template = 'Epoch {}, Loss: {}, Test Loss: {}'
        # print(template.format(epoch + 1,
        #                       train_loss.result(),
        #                       test_loss.result()))

    if args.draw_figures:
        # Project data with the trained encoder and projector
        x_tr_proj = projector(encoder(x_train))
        x_te_proj = projector(encoder(x_test))
        # Convert tensor to np.array
        x_tr_proj = x_tr_proj.numpy()
        x_te_proj = x_te_proj.numpy()
        print(x_tr_proj.shape, x_te_proj.shape)

        # Check the learned embedding using PCA
        pca = PCA(n_components=2)
        pca.fit(x_tr_proj)
        x_te_proj_pca = pca.transform(x_te_proj)

        x_te_proj_pca_df = pd.DataFrame(x_te_proj_pca, columns=['PC1', 'PC2'])
        x_te_proj_pca_df['label'] = y_test

        # PCA scatter plot
        fig, ax = plt.subplots()
        ax = sns.scatterplot('PC1', 'PC2', data=x_te_proj_pca_df, palette='tab10', hue='label',
                             linewidth=0, alpha=0.6, ax=ax)

        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        title = 'Data: {}\nEmbedding: {}\nbatch size: {}; LR: {}'.format(
            args.data, LOSS_NAMES[args.loss], args.batch_size_1, args.lr_1)
        ax.set_title(title)
        fig.savefig('figs/PCA_plot_{}_{}_embed.png'.format(args.data, model_name))

        # Density plot for PCA
        g = sns.jointplot('PC1', 'PC2', data=x_te_proj_pca_df, kind="hex")
        plt.subplots_adjust(top=0.95)
        g.fig.suptitle(title)
        g.savefig('figs/Joint_PCA_plot_{}_{}_embed.png'.format(args.data, model_name))

    # Stage 2: freeze the learned representations and then learn a classifier
    # on a linear layer using a softmax loss
    softmax = SoftmaxPred()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_ACC')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='test_ACC')

    cce_loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    # Train step for the 2nd stage
    @tf.function
    def train_step(model, x, y):
        '''
        x: data tensor, shape: (batch_size, data_dim)
        y: data labels, shape: (batch_size, )
        '''
        with tf.GradientTape() as tape:
            r = model.layers[0](x, training=False)
            y_preds = model.layers[1](r, training=True)
            loss = cce_loss_obj(y, y_preds)
        # Freeze the encoder, only train the softmax layer
        gradients = tape.gradient(loss, model.layers[1].trainable_variables)
        optimizer2.apply_gradients(zip(gradients, model.layers[1].trainable_variables))
        train_loss(loss)
        train_acc(y, y_preds)

    @tf.function
    def test_step(x, y):
        r = encoder(x, training=False)
        y_preds = softmax(r, training=False)
        t_loss = cce_loss_obj(y, y_preds)
        test_loss(t_loss)
        test_acc(y, y_preds)

    if args.write_summary:
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/{}/{}/{}/train'.format(model_name, args.data, current_time)
        test_log_dir = 'logs/{}/{}/{}/test'.format(model_name, args.data, current_time)
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    print('Stage 2 training ...')
    model = tf.keras.Sequential([encoder, softmax])
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )
    # classifier.fit(x_train, y_train, batch_size=256, nb_epochs=20)

    for epoch in range(args.epoch):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        train_acc.reset_states()
        test_loss.reset_states()
        test_acc.reset_states()

        for x, y in train_ds2:
            train_step(model, x, y)
        if args.write_summary:
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', train_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', train_acc.result(), step=epoch)

        for x_te, y_te in test_ds:
            test_step(x_te, y_te)
        if args.write_summary:
            with test_summary_writer.as_default():
                tf.summary.scalar('loss', test_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', test_acc.result(), step=epoch)

        template = 'Epoch {}, Loss: {}, Acc: {}, Test Loss: {}, Test Acc: {}'
        print(template.format(epoch + 1,
                              train_loss.result(),
                              train_acc.result() * 100,
                              test_loss.result(),
                              test_acc.result() * 100))

    predictions = classifier.predict(x_test)
    print(predictions.shape, y_test.shape)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    print('Stage 3 attacking ...')
    attack = ProjectedGradientDescent(estimator=classifier, eps=args.eps, eps_step=args.eps / 3, max_iter=20)
    x_test_adv = attack.generate(x=x_test)

    print('Stage 4 evaluating ...')
    predictions = classifier.predict(x_test_adv)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))

    natual(args.eps)
def natual(eps):
    # Step 1: Load the MNIST dataset
    (x_train, y_train), (x_test, y_test), min_pixel_value, max_pixel_value = load_mnist()

    # Step 2: Create the model
    import tensorflow as tf
    from tensorflow.keras import Model
    from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D

    class TensorFlowModel(Model):
        """
        Standard TensorFlow model for unit testing.
        """

        def __init__(self):
            super(TensorFlowModel, self).__init__()
            self.conv1 = Conv2D(filters=4, kernel_size=5, activation="relu")
            self.conv2 = Conv2D(filters=10, kernel_size=5, activation="relu")
            self.maxpool = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="valid", data_format=None)
            self.flatten = Flatten()
            self.dense1 = Dense(100, activation="relu")
            self.logits = Dense(10, activation="linear")

        def call(self, x):
            """
            Call function to evaluate the model.

            :param x: Input to the model
            :return: Prediction of the model
            """
            x = self.conv1(x)
            x = self.maxpool(x)
            x = self.conv2(x)
            x = self.maxpool(x)
            x = self.flatten(x)
            x = self.dense1(x)
            x = self.logits(x)
            return x

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    def train_step(model, images, labels):
        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_object(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    model = TensorFlowModel()
    loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

    # Step 3: Create the ART classifier
    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )

    # Step 4: Train the ART classifier
    classifier.fit(x_train, y_train, batch_size=64, nb_epochs=10)

    # Step 5: Evaluate the ART classifier on benign test examples
    predictions = classifier.predict(x_test)
    accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    # Step 6: Generate adversarial test examples
    attack = ProjectedGradientDescent(estimator=classifier, eps=eps, eps_step=eps / 3, max_iter=20)
    x_test_adv = attack.generate(x=x_test)

    # Step 7: Evaluate the ART classifier on adversarial test examples
    predictions = classifier.predict(x_test_adv)
    accuracy = np.sum(np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))
def test_tfclassifier(self):
    """
    First test with the TFClassifier.
    :return:
    """
    # Build a TFClassifier
    # Define input and output placeholders
    self._input_ph = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    self._output_ph = tf.placeholder(tf.int32, shape=[None, 10])

    # Define the TensorFlow graph
    conv = tf.layers.conv2d(self._input_ph, 4, 5, activation=tf.nn.relu)
    conv = tf.layers.max_pooling2d(conv, 2, 2)
    fc = tf.contrib.layers.flatten(conv)

    # Logits layer
    self._logits = tf.layers.dense(fc, 10)

    # Train operator
    self._loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(logits=self._logits,
                                                                onehot_labels=self._output_ph))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
    self._train = optimizer.minimize(self._loss)

    # TensorFlow session and initialization
    self._sess = tf.Session()
    self._sess.run(tf.global_variables_initializer())

    # Get MNIST
    batch_size, nb_train, nb_test = 100, 5000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]

    # Train the classifier
    tfc = TFClassifier((0, 1), self._input_ph, self._logits, self._output_ph, self._train, self._loss, None,
                       self._sess)
    tfc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=10)

    # First attack (targeted)
    cl2m = CarliniL2Method(classifier=tfc, targeted=True, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {'y': random_targets(y_test, tfc.nb_classes)}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    # print(x_test_adv)
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    print("CW2 Target: %s" % target)
    print("CW2 Actual: %s" % y_pred_adv)
    print("CW2 Success Rate: %f" % (sum(target == y_pred_adv) / float(len(target))))
    self.assertTrue((target == y_pred_adv).any())

    # Second attack (untargeted, with labels)
    cl2m = CarliniL2Method(classifier=tfc, targeted=False, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {'y': random_targets(y_test, tfc.nb_classes)}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    print("CW2 Target: %s" % target)
    print("CW2 Actual: %s" % y_pred_adv)
    print("CW2 Success Rate: %f" % (sum(target != y_pred_adv) / float(len(target))))
    self.assertTrue((target != y_pred_adv).any())

    # Third attack (untargeted, without labels)
    cl2m = CarliniL2Method(classifier=tfc, targeted=False, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    y_pred = np.argmax(tfc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    print("CW2 Target: %s" % y_pred)
    print("CW2 Actual: %s" % y_pred_adv)
    print("CW2 Success Rate: %f" % (sum(y_pred != y_pred_adv) / float(len(y_pred))))
    self.assertTrue((y_pred != y_pred_adv).any())
    targets=targets
)  # this works
# model = GenerateModel(backdoor_type="pattern", model_name=model_name, train=train)
print(model_name)

# model_layer_dict1 = init_coverage_tables(model)
model_layer_times1 = init_coverage_times(model)  # times of each neuron covered
model_layer_times2 = init_coverage_times(model)  # update when new image and adversarial images found
model_layer_value1 = init_coverage_value(model)

img_dir = './seeds'
img_paths = os.listdir(img_dir)
img_num = len(img_paths)

(x_raw, y_raw), (x_raw_test, y_raw_test), min_, max_ = load_mnist(raw=True)

# Poison training data
perc_poison = .33
(is_poison_train, x_poisoned_raw, y_poisoned_raw) = generate_backdoor(x_raw, y_raw, perc_poison,
                                                                      backdoor_type=backdoor_type)

difference_activation_block2_conv1 = []
difference_activation_block1_conv1 = []

if neuron_select_strategy == '[4]':
    x_raw_test = x_raw_test.reshape(x_raw_test.shape[0], 28, 28, 1)
    x_poisoned_raw = x_poisoned_raw.reshape(x_poisoned_raw.shape[0], 28, 28, 1)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    activations_clean_test = get_activations(model, x_raw_test, "block2_conv1")
    activation_rate_clean_test = np.mean(activations_clean_test['block2_conv1/Relu:0'] != 0, axis=(0, 3))
    activations_poisoned_train = get_activations(model, x_poisoned_raw, "block2_conv1")
def test_krclassifier(self):
    """
    Second test with the KerasClassifier.
    :return:
    """
    # Initialize a tf session
    session = tf.Session()
    k.set_session(session)

    # Get MNIST
    batch_size, nb_train, nb_test = 100, 5000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]

    # Create simple CNN
    model = Sequential()
    model.add(Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=(28, 28, 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    # Get classifier
    krc = KerasClassifier((0, 1), model, use_logits=False)
    krc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=10)

    # First attack (targeted)
    cl2m = CarliniL2Method(classifier=krc, targeted=True, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {'y': random_targets(y_test, krc.nb_classes)}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    print("CW2 Target: %s" % target)
    print("CW2 Actual: %s" % y_pred_adv)
    print("CW2 Success Rate: %f" % (sum(target == y_pred_adv) / float(len(target))))
    self.assertTrue((target == y_pred_adv).any())

    # Second attack (untargeted, with labels)
    cl2m = CarliniL2Method(classifier=krc, targeted=False, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {'y': random_targets(y_test, krc.nb_classes)}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    print("CW2 Target: %s" % target)
    print("CW2 Actual: %s" % y_pred_adv)
    print("CW2 Success Rate: %f" % (sum(target != y_pred_adv) / float(len(target))))
    self.assertTrue((target != y_pred_adv).any())

    # Third attack (untargeted, without labels)
    cl2m = CarliniL2Method(classifier=krc, targeted=False, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    y_pred = np.argmax(krc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    print("CW2 Target: %s" % y_pred)
    print("CW2 Actual: %s" % y_pred_adv)
    print("CW2 Success Rate: %f" % (sum(y_pred != y_pred_adv) / float(len(y_pred))))
    self.assertTrue((y_pred != y_pred_adv).any())
model.add(Dense(10, activation="softmax")) model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01), metrics=["accuracy"]) classifier = KerasClassifier(model=model, clip_values=(0, 1)) return classifier # Get session session = tf.Session() k.set_session(session) # Read MNIST dataset (x_train, y_train), (x_test, y_test), min_, max_ = load_mnist() # Construct and train a convolutional neural network on MNIST using Keras source = cnn_mnist_k(x_train.shape[1:]) source.fit(x_train, y_train, nb_epochs=5, batch_size=128) # Craft adversarial samples with DeepFool adv_crafter = DeepFool(source) x_train_adv = adv_crafter.generate(x_train) x_test_adv = adv_crafter.generate(x_test) # Construct and train a convolutional neural network target = cnn_mnist_tf(x_train.shape[1:]) target.fit(x_train, y_train, nb_epochs=5, batch_size=128) # Evaluate the CNN on the adversarial samples
def test_ptclassifier(self):
    """
    Third test with the PyTorchClassifier.
    :return:
    """
    # Get MNIST
    batch_size, nb_train, nb_test = 100, 5000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:nb_train], y_train[:nb_train]
    x_test, y_test = x_test[:nb_test], y_test[:nb_test]
    x_train = np.swapaxes(x_train, 1, 3)
    x_test = np.swapaxes(x_test, 1, 3)

    # Create simple CNN
    # Define the network
    model = Model()

    # Define a loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    # Get classifier
    ptc = PyTorchClassifier((0, 1), model, loss_fn, optimizer, (1, 28, 28), 10)
    ptc.fit(x_train, y_train, batch_size=batch_size, nb_epochs=10)

    # First attack (targeted)
    cl2m = CarliniL2Method(classifier=ptc, targeted=True, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {'y': random_targets(y_test, ptc.nb_classes)}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((target == y_pred_adv).any())

    # Second attack (untargeted, with labels)
    cl2m = CarliniL2Method(classifier=ptc, targeted=False, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {'y': random_targets(y_test, ptc.nb_classes)}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((target != y_pred_adv).any())

    # Third attack (untargeted, without labels)
    cl2m = CarliniL2Method(classifier=ptc, targeted=False, max_iter=100, binary_search_steps=1, learning_rate=1,
                           initial_const=10, decay=0)
    params = {}
    x_test_adv = cl2m.generate(x_test, **params)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())
    y_pred = np.argmax(ptc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())
def setUpClass(cls):
    master_seed(301)
    (x_train, y_train), (x_test, y_test), min_, max_ = load_mnist()
    y_train = np.argmax(y_train, axis=1)
    y_test = np.argmax(y_test, axis=1)

    # Keep only the digits 0 and 4 and relabel them as a binary problem
    zero_or_four = np.logical_or(y_train == 4, y_train == 0)
    x_train = x_train[zero_or_four]
    y_train = y_train[zero_or_four]
    tr_labels = np.zeros((y_train.shape[0], 2))
    tr_labels[y_train == 0] = np.array([1, 0])
    tr_labels[y_train == 4] = np.array([0, 1])
    y_train = tr_labels

    zero_or_four = np.logical_or(y_test == 4, y_test == 0)
    x_test = x_test[zero_or_four]
    y_test = y_test[zero_or_four]
    te_labels = np.zeros((y_test.shape[0], 2))
    te_labels[y_test == 0] = np.array([1, 0])
    te_labels[y_test == 4] = np.array([0, 1])
    y_test = te_labels

    # Flatten the images into feature vectors
    n_samples_train = x_train.shape[0]
    n_features_train = x_train.shape[1] * x_train.shape[2] * x_train.shape[3]
    n_samples_test = x_test.shape[0]
    n_features_test = x_test.shape[1] * x_test.shape[2] * x_test.shape[3]
    x_train = x_train.reshape(n_samples_train, n_features_train)
    x_test = x_test.reshape(n_samples_test, n_features_test)

    # Split off trusted and validation sets
    x_train = x_train[:NB_TRAIN]
    y_train = y_train[:NB_TRAIN]
    trusted_data = x_test[:NB_TRUSTED]
    trusted_labels = y_test[:NB_TRUSTED]
    x_test = x_test[NB_TRUSTED:]
    y_test = y_test[NB_TRUSTED:]
    valid_data = x_test[:NB_VALID]
    valid_labels = y_test[:NB_VALID]
    x_test = x_test[NB_VALID:]
    y_test = y_test[NB_VALID:]

    # Assign clean samples to random devices (the last device is the bad actor)
    clean_prov = np.random.randint(NB_DEVICES - 1, size=x_train.shape[0])
    p_train = np.eye(NB_DEVICES)[clean_prov]

    no_defense = ScikitlearnSVC(model=SVC(kernel=kernel), clip_values=(min_, max_))
    no_defense.fit(x=x_train, y=y_train)

    # Initialise poison points from the support vectors and flip their labels
    poison_points = np.random.randint(no_defense._model.support_vectors_.shape[0], size=NB_POISON)
    all_poison_init = np.copy(no_defense._model.support_vectors_[poison_points])
    poison_labels = np.array([1, 1]) - no_defense.predict(all_poison_init)

    svm_attack = PoisoningAttackSVM(classifier=no_defense, x_train=x_train, y_train=y_train, step=0.1, eps=1.0,
                                    x_val=valid_data, y_val=valid_labels, max_iters=200)
    poisoned_data = svm_attack.generate(all_poison_init, y=poison_labels)

    # Stack the poison onto the data and add provenance marking the bad actor
    all_data = np.vstack([x_train, poisoned_data])
    all_labels = np.vstack([y_train, poison_labels])
    poison_prov = np.zeros((NB_POISON, NB_DEVICES))
    poison_prov[:, NB_DEVICES - 1] = 1
    all_p = np.vstack([p_train, poison_prov])

    model = SVC(kernel=kernel)
    cls.mnist = (all_data, all_labels, all_p), (x_test, y_test), (trusted_data, trusted_labels), \
                (valid_data, valid_labels), (min_, max_)
    cls.classifier = SklearnClassifier(model=model, clip_values=(min_, max_))
    cls.classifier.fit(all_data, all_labels)
    cls.defence_trust = ProvenanceDefense(cls.classifier, all_data, all_labels, all_p,
                                          x_val=trusted_data, y_val=trusted_labels, eps=0.1)
    cls.defence_no_trust = ProvenanceDefense(cls.classifier, all_data, all_labels, all_p, eps=0.1)
def main():
    try:
        print('See if poison model has been previously trained ')
        import pickle
        classifier = pickle.load(open('my_poison_classifier.p', 'rb'))
        print('Loaded model from pickle.... ')

        data_train = np.load('data_training.npz')
        x_train = data_train['x_train']
        y_train = data_train['y_train']
        is_poison_train = data_train['is_poison_train']

        data_test = np.load('data_testing.npz')
        x_test = data_test['x_test']
        y_test = data_test['y_test']
        is_poison_test = data_test['is_poison_test']
    except:
        # Read MNIST dataset (x_raw contains the original images):
        (x_raw, y_raw), (x_raw_test, y_raw_test), min_, max_ = load_mnist(raw=True)

        n_train = np.shape(x_raw)[0]
        num_selection = n_train
        random_selection_indices = np.random.choice(n_train, num_selection)
        x_raw = x_raw[random_selection_indices]
        y_raw = y_raw[random_selection_indices]

        # Poison training data
        perc_poison = .33
        (is_poison_train, x_poisoned_raw, y_poisoned_raw) = generate_backdoor(x_raw, y_raw, perc_poison)
        x_train, y_train = preprocess(x_poisoned_raw, y_poisoned_raw)
        # Add channel axis:
        x_train = np.expand_dims(x_train, axis=3)

        # Poison test data
        (is_poison_test, x_poisoned_raw_test, y_poisoned_raw_test) = generate_backdoor(x_raw_test, y_raw_test,
                                                                                       perc_poison)
        x_test, y_test = preprocess(x_poisoned_raw_test, y_poisoned_raw_test)
        # Add channel axis:
        x_test = np.expand_dims(x_test, axis=3)

        # Shuffle training data so poison is not together
        n_train = np.shape(y_train)[0]
        shuffled_indices = np.arange(n_train)
        np.random.shuffle(shuffled_indices)
        x_train = x_train[shuffled_indices]
        y_train = y_train[shuffled_indices]
        is_poison_train = is_poison_train[shuffled_indices]

        # Save data used for training and testing split:
        np.savez('data_training.npz', x_train=x_train, y_train=y_train, is_poison_train=is_poison_train,
                 x_raw=x_poisoned_raw)
        np.savez('data_testing.npz', x_test=x_test, y_test=y_test, is_poison_test=is_poison_test,
                 x_raw_test=x_poisoned_raw_test)

        # Create Keras convolutional neural network - basic architecture from Keras examples
        # Source here: https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
        k.set_learning_phase(1)
        model = Sequential()
        model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=x_train.shape[1:]))
        model.add(Conv2D(64, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(10, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        classifier = KerasClassifier((min_, max_), model=model)
        classifier.fit(x_train, y_train, nb_epochs=50, batch_size=128)

        print('Saving poisoned model: ')
        pickle.dump(classifier, open('my_poison_classifier.p', 'wb'))

        # Also saving for Anu:
        file_name = 'anu_poison_mnist'
        model.save(file_name + '.hdf5')
        model_json = model.to_json()
        with open(file_name + '.json', "w") as json_file:
            json_file.write(model_json)

    # Evaluate the classifier on the test set
    preds = np.argmax(classifier.predict(x_test), axis=1)
    acc = np.sum(preds == np.argmax(y_test, axis=1)) / y_test.shape[0]
    print("\nTest accuracy: %.2f%%" % (acc * 100))

    # Evaluate the classifier on poisonous data
    preds = np.argmax(classifier.predict(x_test[is_poison_test]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test], axis=1)) / y_test[is_poison_test].shape[0]
    print("\nPoisonous test set accuracy (i.e. effectiveness of poison): %.2f%%" % (acc * 100))

    # Evaluate the classifier on clean data
    preds = np.argmax(classifier.predict(x_test[is_poison_test == 0]), axis=1)
    acc = np.sum(preds == np.argmax(y_test[is_poison_test == 0], axis=1)) / y_test[is_poison_test == 0].shape[0]
    print("\nClean test set accuracy: %.2f%%" % (acc * 100))

    # Calling poisoning defence:
    defence = ActivationDefence(classifier, x_train, y_train)

    # End-to-end method:
    print("------------------- Results using size metric -------------------")
    print(defence.get_params())
    defence.detect_poison(n_clusters=2, ndims=10, reduce="PCA")

    # Now fix the model
    x_new, y_fix = correct_poisoned_labels(x_train, y_train, is_poison_train)

    improvement = defence.relabel_poison_ground_truth(x_new, y_fix, test_set_split=0.7, tolerable_backdoor=0.001,
                                                      max_epochs=5, batch_epochs=10)

    # Evaluate the classifier on poisonous data after backdoor fix:
    preds = np.argmax(classifier.predict(x_test[is_poison_test]), axis=1)
    acc_after = np.sum(preds == np.argmax(y_test[is_poison_test], axis=1)) / y_test[is_poison_test].shape[0]
    print("\nPoisonous test set accuracy (i.e. effectiveness of poison) after backdoor fix: %.2f%%"
          % (acc_after * 100))

    print("\nImprovement after training: ", improvement)
    print('before: ', acc, ' after: ', acc_after)

    print("done :) ")
def test_binary_input_detector(self):
    """
    Test the binary input detector end-to-end.
    :return:
    """
    # Get MNIST
    nb_train, nb_test = 1000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Keras classifier
    classifier = get_classifier_kr()

    # Generate adversarial samples:
    attacker = FastGradientMethod(classifier, eps=0.1)
    x_train_adv = attacker.generate(x_train[:nb_train])
    x_test_adv = attacker.generate(x_test[:nb_test])

    # Compile training data for detector:
    x_train_detector = np.concatenate((x_train[:nb_train], x_train_adv), axis=0)
    y_train_detector = np.concatenate((np.array([[1, 0]] * nb_train), np.array([[0, 1]] * nb_train)), axis=0)

    # Create a simple CNN for the detector
    input_shape = x_train.shape[1:]
    model = Sequential()
    model.add(Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    # Create detector and train it:
    detector = BinaryInputDetector(KerasClassifier(model=model, clip_values=(0, 1), use_logits=False))
    detector.fit(x_train_detector, y_train_detector, nb_epochs=2, batch_size=128)

    # Apply detector on clean and adversarial test data:
    test_detection = np.argmax(detector.predict(x_test), axis=1)
    test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

    # Assert there is at least one true positive and negative:
    nb_true_positives = len(np.where(test_adv_detection == 1)[0])
    nb_true_negatives = len(np.where(test_detection == 0)[0])
    logger.debug('Number of true positives detected: %i', nb_true_positives)
    logger.debug('Number of true negatives detected: %i', nb_true_negatives)
    self.assertGreater(nb_true_positives, 0)
    self.assertGreater(nb_true_negatives, 0)
def main():
    # SETTING UP DEFENCE GAN TRAINED MODELS
    # * Clone the DefenceGAN git repo https://github.com/yogeshbalaji/InvGAN
    # * Follow the setup instructions and copy the following:
    #   * data/ to adversarial-robustness-toolbox/defence_gan/data/
    #   * output/gans/mnist to adversarial-robustness-toolbox/defence_gan/output/gans/mnist
    #   * output/gans_inv_nottrain/mnist to adversarial-robustness-toolbox/defence_gan/output/gans_inv_nottrain/mnist

    # STEP 0
    logging.info("Loading a Dataset")
    (_, _), (x_test_original, y_test_original), min_pixel_value, max_pixel_value = load_mnist()

    # TODO remove before PR request
    # batch_size = x_test_original.shape[0]
    batch_size = 1000
    (x_test, y_test) = (x_test_original[:batch_size], y_test_original[:batch_size])

    # STEP 1
    logging.info("Creating a TS1 Mnist Classifier")
    classifier = create_ts1_art_mnist_classifier(min_pixel_value, max_pixel_value)
    classifier.fit(x_test, y_test, batch_size=batch_size, nb_epochs=3)

    # Code to load the original defense_gan paper MNIST classifier to reproduce paper results
    # classifier_paper = create_defense_gan_paper_mnist_art_classifier()

    # STEP 2
    logging.info("Evaluate the ART classifier on non-adversarial examples")
    predictions = classifier.predict(x_test)
    accuracy_non_adv = get_accuracy(predictions, y_test)

    # STEP 3
    logging.info("Generate adversarial examples")
    attack = FastGradientMethod(classifier, eps=0.2)
    x_test_adv = attack.generate(x=x_test)

    # STEP 4
    logging.info("Evaluate the classifier on the adversarial examples")
    predictions = classifier.predict(x_test_adv)
    accuracy_adv = get_accuracy(predictions, y_test)

    # STEP 5
    logging.info("Create DefenceGAN")
    encoder = create_ts1_encoder_model(batch_size)
    generator = create_ts1_generator_model(batch_size)

    inverse_gan = InverseGAN(sess=generator._sess, gan=generator, inverse_gan=encoder)
    # defense_gan = DefenseGAN(sess=generator.sess, generator=generator)

    logging.info("Generating Defended Samples")
    x_test_defended = inverse_gan(x_test_adv, maxiter=1)

    # STEP 6
    logging.info("Evaluate the classifier on the defended examples")
    predictions = classifier.predict(x_test_defended)
    accuracy_defended = get_accuracy(predictions, y_test)

    logger.info("Accuracy on non-adversarial examples: {}%".format(accuracy_non_adv))
    logger.info("Accuracy on adversarial examples: {}%".format(accuracy_adv))
    logger.info("Accuracy on defended examples: {}%".format(accuracy_defended))
def setUpClass(cls):
    # Get MNIST
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
    cls.mnist = (x_train, y_train), (x_test, y_test)
def test_binary_input_detector(self):
    """
    Test the binary input detector end-to-end.
    :return:
    """
    # Initialize a tf session
    session = tf.Session()
    k.set_session(session)

    # Get MNIST
    batch_size, nb_train, nb_test = 100, 1000, 10
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]
    input_shape = x_train.shape[1:]
    nb_classes = 10

    # Create simple CNN
    model = Sequential()
    model.add(Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(nb_classes, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    # Create classifier and train it:
    classifier = KerasClassifier((0, 1), model, use_logits=False)
    classifier.fit(x_train, y_train, nb_epochs=5, batch_size=128)

    # Generate adversarial samples:
    attacker = FastGradientMethod(classifier, eps=0.1)
    x_train_adv = attacker.generate(x_train[:nb_train])
    x_test_adv = attacker.generate(x_test[:nb_test])

    # Compile training data for detector:
    x_train_detector = np.concatenate((x_train[:nb_train], x_train_adv), axis=0)
    y_train_detector = np.concatenate((np.array([[1, 0]] * nb_train), np.array([[0, 1]] * nb_train)), axis=0)

    # Create a simple CNN for the detector.
    # Note: we use the same architecture as for the classifier, except for the number of outputs (=2)
    model = Sequential()
    model.add(Conv2D(4, kernel_size=(5, 5), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(2, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    # Create detector and train it:
    detector = BinaryInputDetector(KerasClassifier((0, 1), model, use_logits=False))
    detector.fit(x_train_detector, y_train_detector, nb_epochs=2, batch_size=128)

    # Apply detector on clean and adversarial test data:
    test_detection = np.argmax(detector.predict(x_test), axis=1)
    test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

    # Assert there is at least one true positive and negative:
    nb_true_positives = len(np.where(test_adv_detection == 1)[0])
    nb_true_negatives = len(np.where(test_detection == 0)[0])
    self.assertTrue(nb_true_positives > 0)
    self.assertTrue(nb_true_negatives > 0)