import sys
import time

import numpy as np


def createAttack(model, sess, x, y, X_test, y_test, eps=0.02):
    from cleverhans.attacks import MadryEtAl
    print("Beginning PGD attack")
    pgd = MadryEtAl(model, back='tf', sess=sess)
    preds = model(x)
    t0 = time.time()
    batch_size = 64

    # Attack X_test one fixed-size batch at a time.
    X_test_adv_pgd = np.zeros(X_test.shape)
    num_batches = X_test.shape[0] // batch_size
    for i in range(num_batches):
        batch_start = batch_size * i
        batch_end = batch_size * (i + 1)
        batch = X_test[batch_start:batch_end]
        if not (i % 20):
            print("attacking batch", i, "from", batch_start, "to", batch_end,
                  file=sys.stderr)
        # Binary labels: target the opposite class for each example.
        attack_target = 1 - y_test[batch_start:batch_end]
        pgd_params = {'eps': eps,
                      'eps_iter': 0.01,
                      'clip_min': -1.,
                      'clip_max': 1.,
                      'nb_iter': 20,
                      'y_target': attack_target}
        X_test_adv_pgd[batch_start:batch_end] = pgd.generate_np(batch,
                                                                **pgd_params)

    # Attack the final partial batch, if any.
    if X_test.shape[0] % batch_size:
        batch_start = num_batches * batch_size
        batch_end = X_test.shape[0]
        batch = X_test[batch_start:batch_end].reshape((-1, 224, 224, 3))
        print("attacking residual batch from", batch_start, "to", batch_end,
              file=sys.stderr)
        attack_target = 1 - y_test[batch_start:batch_end].reshape((-1, 2))
        pgd_params = {'eps': eps,
                      'eps_iter': 0.01,
                      'clip_min': -1.,
                      'clip_max': 1.,
                      'nb_iter': 20,
                      'y_target': attack_target}
        X_test_adv_pgd[batch_start:batch_end] = pgd.generate_np(batch,
                                                                **pgd_params)

    # Report on timing
    t1 = time.time()
    total = t1 - t0
    m, s = divmod(total, 60)
    h, m = divmod(m, 60)
    print("Completed attack in %d:%02d:%02d" % (h, m, s))
    return X_test_adv_pgd

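# A hedged usage sketch for createAttack. The Keras wrapper, the 'model.h5'
# path, and the random stand-in data below are illustrative assumptions; the
# original file does not show how the function is called.
if __name__ == '__main__':
    import keras
    import tensorflow as tf
    from cleverhans.utils_keras import KerasModelWrapper

    sess = tf.Session()
    keras.backend.set_session(sess)
    # Hypothetical model path; any binary 224x224x3 Keras classifier works.
    wrapped = KerasModelWrapper(keras.models.load_model('model.h5'))
    x = tf.placeholder(tf.float32, (None, 224, 224, 3))
    y = tf.placeholder(tf.float32, (None, 2))
    X_test = np.random.rand(128, 224, 224, 3).astype(np.float32)  # stand-in data
    y_test = np.eye(2)[np.random.randint(0, 2, 128)].astype(np.float32)
    X_test_adv = createAttack(wrapped, sess, x, y, X_test, y_test, eps=0.02)
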
class TestMadryEtAl(CleverHansTest):
    def setUp(self):
        super(TestMadryEtAl, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = MadryEtAl(self.model, sess=self.sess)

    def test_attack_strength(self):
        """
        If clipping is not done at each iteration (not using clip_min and
        clip_max), this attack fails by
        np.mean(orig_labels == new_labels) == .5
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.05,
                                        clip_min=0.5, clip_max=0.7,
                                        nb_iter=5, sanity_checks=False)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertLess(np.mean(orig_labs == new_labs), 0.1)

    def test_clip_eta(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertTrue(np.all(delta <= 1.))

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5,
                                        clip_min=-0.2, clip_max=0.3,
                                        sanity_checks=False)

        self.assertLess(-0.201, np.min(x_adv))
        self.assertLess(np.max(x_adv), .301)

    def test_multiple_initial_random_step(self):
        """
        This test generates multiple adversarial examples until an adversarial
        example is generated with a different label compared to the original
        label. This is the procedure suggested in Madry et al. (2017).

        This test will fail if an initial random step is not taken (error > 0.5).
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs_multi = orig_labs.copy()

        # Generate multiple adversarial examples
        for i in range(10):
            x_adv = self.attack.generate_np(x_val, eps=.5, eps_iter=0.05,
                                            clip_min=0.5, clip_max=0.7,
                                            nb_iter=2, sanity_checks=False)
            new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

            # Examples for which we have not found adversarial examples
            I = (orig_labs == new_labs_multi)
            new_labs_multi[I] = new_labs[I]

        self.assertLess(np.mean(orig_labs == new_labs_multi), 0.5)

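# SimpleModel is defined elsewhere in the test module. A minimal sketch
# consistent with the inline my_model used in a later variant of this test
# (the real cleverhans definition may differ in details such as scoping):
from cleverhans.model import Model


class SimpleModel(Model):
    """A tiny fixed-weight two-layer network used only for testing."""

    def fprop(self, x, **kwargs):
        W1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
        h1 = tf.nn.sigmoid(tf.matmul(x, W1))
        W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
        logits = tf.matmul(h1, W2)
        return {self.O_LOGITS: logits,
                self.O_PROBS: tf.nn.softmax(logits)}
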
# #                  'max_iterations': attack_iterations,
# #                  'learning_rate': 0.1,
# #                  'batch_size': n_attack,
# #                  'initial_const': 10}
# cw = CarliniWagnerL2(wrap_clf, back='tf', sess=sess)
# adv = cw.generate_np(X_atk, **cw_params)

from cleverhans.attacks import MadryEtAl

pgd_params = {'eps': 0.3,
              'eps_iter': 0.01,
              'nb_iter': 40,
              'clip_min': 0.,
              'clip_max': 1.,
              'rand_init': True}
pgd = MadryEtAl(wrap_clf, sess=sess)
adv = pgd.generate_np(X_atk, **pgd_params)

# adv_x = cw.generate(x, **cw_params)
# preds_adv = clf(adv_x)
# acc = model_eval(sess, x, y, preds_adv, X_test[:n_attack],
#                  y_test[:n_attack], args={'batch_size': n_attack})
# print('Test accuracy on CW adversarial examples: %0.4f\n' % acc)

pred = clf.predict(adv)
# print(np.sum(np.argmax(pred, axis=1) != np.argmax(y_test[:n_attack], axis=1)))
# pred_orig = clf.predict(X_atk)
# print(np.sum(np.argmax(pred, axis=1) != np.argmax(pred_orig, axis=1)))

# Count how many adversarial examples were classified as their target class.
print(np.sum(np.argmax(pred, axis=1) == np.argmax(y_target, axis=1)))

# Save some images
import scipy.misc
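# A hedged sketch of the saving step the comment above announces; the
# filenames and the squeeze-to-grayscale assumption are the editor's guesses,
# not part of the original script (scipy.misc.imsave is the old SciPy API
# this era of code used):
for idx in range(min(5, adv.shape[0])):
    scipy.misc.imsave('adv_%d.png' % idx, np.squeeze(adv[idx]))
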
class TestMadryEtAl(CleverHansTest):
    def setUp(self):
        super(TestMadryEtAl, self).setUp()
        import tensorflow as tf

        # The world's simplest neural network
        def my_model(x):
            W1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
            h1 = tf.nn.sigmoid(tf.matmul(x, W1))
            W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
            res = tf.matmul(h1, W2)
            return res

        self.sess = tf.Session()
        self.model = my_model
        self.attack = MadryEtAl(self.model, sess=self.sess)

    def test_attack_strength(self):
        """
        If clipping is not done at each iteration (not using clip_min and
        clip_max), this attack fails by
        np.mean(orig_labels == new_labels) == .5
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.05,
                                        clip_min=0.5, clip_max=0.7,
                                        nb_iter=5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        print(np.mean(orig_labs == new_labs))
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_clip_eta(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertTrue(np.all(delta <= 1.))

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5,
                                        clip_min=-0.2, clip_max=0.3)

        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)

    def test_multiple_initial_random_step(self):
        """
        This test generates multiple adversarial examples until an adversarial
        example is generated with a different label compared to the original
        label. This is the procedure suggested in Madry et al. (2017).

        This test will fail if an initial random step is not taken (error > 0.5).
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs_multi = orig_labs.copy()

        # Generate multiple adversarial examples
        for i in range(10):
            x_adv = self.attack.generate_np(x_val, eps=.5, eps_iter=0.05,
                                            clip_min=0.5, clip_max=0.7,
                                            nb_iter=2)
            new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

            # Examples for which we have not found adversarial examples
            I = (orig_labs == new_labs_multi)
            new_labs_multi[I] = new_labs[I]

        self.assertTrue(np.mean(orig_labs == new_labs_multi) < 0.1)

def prepare_image(image, target=IMAGE_DIMS):
    # resize the input image and preprocess it
    image = image.resize(target)
    image = keras.preprocessing.image.img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = keras.applications.mobilenet.preprocess_input(image)

    # return the processed image
    return image


model = keras.models.load_model("./model.h5")
# model.summary()

src = Image.open("./trixi.png").resize(IMAGE_DIMS)
src = prepare_image(src)

score = model.predict(src)
print(score[0][TREE_FROG_IDX])
print(np.argmax(score))

attack = MadryEtAl(model, sess=sess)
attack_params = {
    'eps': 0.2,
    'nb_iter': 10,
    'eps_iter': 0.2,
    'y_target': np.expand_dims(np.eye(num_labels)[TREE_FROG_IDX], axis=0)
}
adv_x = attack.generate_np(src, **attack_params)

adversarial = adv_x.reshape((224, 224, 3))
img = keras.preprocessing.image.array_to_img(adversarial)

score = model.predict(adv_x)
print(score[0][TREE_FROG_IDX])
print(np.argmax(score))

img.save("./solution.png")
assert is_similar_img("./trixi.png", "./solution.png")

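# is_similar_img comes from the challenge harness and is not shown here.
# A plausible sketch of such a checker; the per-pixel threshold and the
# comparison rule are assumptions, not the harness's actual logic:
def is_similar_img(path_a, path_b, max_diff=25):
    a = np.asarray(Image.open(path_a), dtype=np.int32)
    b = np.asarray(Image.open(path_b), dtype=np.int32)
    # "Similar" if shapes match and no pixel moved more than max_diff (0-255).
    return a.shape == b.shape and np.max(np.abs(a - b)) <= max_diff
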
def train_mnist(model_dir, next_batch_fn, total_batches, train_mode,
                save_every=1000, print_every=100):
    x_input = tf.placeholder(tf.float32, (None, 28, 28, 1))
    y_input = tf.placeholder(tf.float32, [None, 10])

    model = mnist_convnet.Model()
    logits = model(x_input)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_input,
                                                   logits=logits)
    loss = tf.reduce_mean(loss)

    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(logits, axis=1),
                         tf.argmax(y_input, axis=1)),
                dtype=tf.float32))

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss,
                                                       global_step=global_step)

    saver = tf.train.Saver(max_to_keep=3)

    a = tf.summary.scalar('accuracy adv train', accuracy)
    b = tf.summary.scalar('xent adv train', loss)
    c = tf.summary.image('images adv train', x_input)
    adv_summaries = tf.summary.merge([a, b, c])

    a = tf.summary.scalar('accuracy nat train', accuracy)
    b = tf.summary.scalar('xent nat train', loss)
    c = tf.summary.image('images nat train', x_input)
    nat_summaries = tf.summary.merge([a, b, c])

    with tf.Session() as sess:
        attack = MadryEtAl(model, sess=sess)

        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
        sess.run(tf.global_variables_initializer())

        for batch_num in range(total_batches):
            x_batch, y_batch = next_batch_fn()
            x_batch = np.reshape(x_batch, (-1, 28, 28, 1))

            # Warm up on clean data before switching to adversarial batches.
            if train_mode == "adversarial" and batch_num > 1000:
                x_batch_adv = attack.generate_np(x_batch, y=y_batch,
                                                 eps=.3, nb_iter=40,
                                                 eps_iter=.01, rand_init=True,
                                                 clip_min=0, clip_max=1)
            else:
                x_batch_adv = x_batch

            nat_dict = {x_input: x_batch, y_input: y_batch}
            adv_dict = {x_input: x_batch_adv, y_input: y_batch}

            if batch_num % print_every == 0:
                a, l, s = sess.run((accuracy, loss, nat_summaries), nat_dict)
                summary_writer.add_summary(s, sess.run(global_step))
                print(batch_num, "Clean accuracy", a, "loss", l)
                if train_mode == "adversarial":
                    a, l, s = sess.run((accuracy, loss, adv_summaries),
                                       adv_dict)
                    summary_writer.add_summary(s, sess.run(global_step))
                    print(batch_num, "Adv accuracy", a, "loss", l)

            if batch_num % save_every == 0:
                saver.save(sess,
                           os.path.join(model_dir, "checkpoint"),
                           global_step=global_step)

            sess.run(train_step, nat_dict)
            sess.run(train_step, adv_dict)

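# A hedged usage sketch for train_mnist. The MNIST loader below is an
# assumption (the TF 1.x tutorial reader); any next_batch_fn returning
# (images, one_hot_labels) tuples would work:
if __name__ == '__main__':
    from tensorflow.examples.tutorials.mnist import input_data

    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    train_mnist('models/adv_trained',
                lambda: mnist.train.next_batch(64),
                total_batches=100000,
                train_mode='adversarial')
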
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Sanity-check the one-hot label shape
    assert Y_train.shape[1] == 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = FLAGS.model_path
    nb_samples = FLAGS.nb_samples

    from cnn_models import make_basic_cnn
    model = make_basic_cnn('fp_',
                           input_shape=(None, img_rows, img_cols, channels),
                           nb_filters=FLAGS.nb_filters)

    preds = model(x)
    print("Defined TensorFlow model graph with %d parameters" % model.n_params)

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate(eval_params):
        # Evaluate the model on legitimate test examples
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        return acc

    model_load(sess, model_path)
    print('Restored model from %s' % model_path)

    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = evaluate(eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Build dataset to perturb
    ###########################################################################
    if FLAGS.targeted:
        from utils import build_targeted_dataset
        adv_inputs, true_labels, adv_ys = build_targeted_dataset(
            X_test, Y_test, np.arange(nb_samples), nb_classes,
            img_rows, img_cols, channels)
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE, a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"
    else:
        adv_inputs = X_test[:nb_samples]
        true_labels = Y_test[:nb_samples]
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    if FLAGS.attack == 'pgd':
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, sess=sess)
        attack_params = {'eps': FLAGS.eps / 255.,
                         'eps_iter': EPS_ITER / 255.,
                         'nb_iter': FLAGS.nb_iter,
                         'ord': np.inf,
                         'rand_init': True,
                         'batch_size': att_batch_size}
    elif FLAGS.attack == 'cwl2':
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, sess=sess)
        learning_rate = 0.1
        attack_params = {'binary_search_steps': 1,
                         'max_iterations': FLAGS.nb_iter,
                         'learning_rate': learning_rate,
                         'initial_const': 10,
                         'batch_size': att_batch_size}

    attack_params.update({'clip_min': 0.,
                          'clip_max': 1.,
                          # yname: adv_ys
                          })

    X_test_adv = attacker.generate_np(adv_inputs, **attack_params)

    if FLAGS.targeted:
        assert X_test_adv.shape[0] == nb_samples * \
            (nb_classes - 1), X_test_adv.shape
        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, X_test_adv, true_labels,
                                  args=eval_params)
    else:
        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        print("Evaluating un-targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, X_test_adv, Y_test,
                                  args=eval_params)

    print('Test accuracy on adversarial examples %.4f' % adv_accuracy)
    # Compute the avg. distortion introduced by the attack
    diff = np.abs(X_test_adv - adv_inputs)

    percent_perturbed = np.mean(np.sum(diff, axis=(1, 2, 3)))
    print('Avg. L_1 norm of perturbations {0:.4f}'.format(percent_perturbed))

    norm = np.mean(np.sqrt(np.sum(np.square(diff), axis=(1, 2, 3))))
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(norm))

    sess.close()
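    # PGD constrains the L_inf norm of the perturbation, so reporting it as
    # well can be useful. This extra check is an editor's suggestion, not
    # part of the original script:
    linf = np.max(np.abs(X_test_adv - adv_inputs))
    print('Max L_inf norm of perturbations {0:.4f}'.format(linf))
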
class TestMadryEtAl(CleverHansTest):
    def setUp(self):
        super(TestMadryEtAl, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = MadryEtAl(self.model, sess=self.sess)

    def test_attack_strength(self):
        """
        If clipping is not done at each iteration (not using clip_min and
        clip_max), this attack fails by
        np.mean(orig_labels == new_labels) == .5
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.05,
                                        clip_min=0.5, clip_max=0.7,
                                        nb_iter=5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_clip_eta(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertTrue(np.all(delta <= 1.))

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5,
                                        clip_min=-0.2, clip_max=0.3)

        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)

    def test_multiple_initial_random_step(self):
        """
        This test generates multiple adversarial examples until an adversarial
        example is generated with a different label compared to the original
        label. This is the procedure suggested in Madry et al. (2017).

        This test will fail if an initial random step is not taken (error > 0.5).
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs_multi = orig_labs.copy()

        # Generate multiple adversarial examples
        for i in range(10):
            x_adv = self.attack.generate_np(x_val, eps=.5, eps_iter=0.05,
                                            clip_min=0.5, clip_max=0.7,
                                            nb_iter=2)
            new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

            # Examples for which we have not found adversarial examples
            I = (orig_labs == new_labs_multi)
            new_labs_multi[I] = new_labs[I]

        self.assertTrue(np.mean(orig_labs == new_labs_multi) < 0.1)
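
# The tests above can be run with the standard unittest runner; how this
# file is actually invoked in its repository is an assumption:
if __name__ == '__main__':
    import unittest
    unittest.main()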