def setUp(self):
    """Build a fixed two-layer toy network and an ElasticNetMethod attack on it."""
    super(TestElasticNetMethod, self).setUp()
    import tensorflow as tf

    # The world's simplest neural network: hard-wired weights, a sigmoid
    # hidden layer, and a linear output layer.
    def tiny_net(x):
        hidden_w = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
        hidden = tf.nn.sigmoid(tf.matmul(x, hidden_w))
        out_w = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
        return tf.matmul(hidden, out_w)

    self.sess = tf.Session()
    self.model = tiny_net
    self.attack = ElasticNetMethod(self.model, sess=self.sess)
def main(_):
    """Generate Elastic-Net (EAD) adversarial examples for an Inception
    classifier and write the perturbed images to FLAGS.output_dir.

    Images for the inception classifier are normalized to be in the [-1, 1]
    interval, so eps (a difference between pixels) lives in [0, 2];
    renormalize epsilon from [0, 255] to [0, 2] accordingly.
    """
    batch_size = FLAGS.batch_size
    batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3]
    num_classes = 1001

    tf.logging.set_verbosity(tf.logging.DEBUG)

    with tf.Graph().as_default():
        # Prepare graph
        x_input = tf.placeholder(tf.float32, shape=batch_shape)
        model = InceptionModel(num_classes)

        with tf.Session() as sess:
            enm = ENM(model, back='tf', sess=None)
            enm_params = {
                'beta': 0,  # beta == 0 reduces EAD to a pure L2 (C&W-style) attack
                'batch_size': batch_size,
                'learning_rate': 0.1,
                'max_iterations': 1000,
                'binary_search_steps': 9
            }
            # BUG FIX: the parameters were previously passed as **mim_params,
            # a name that does not exist in this scope (NameError at runtime).
            x_adv = enm.generate(x_input, **enm_params)

            saver = tf.train.Saver(slim.get_model_variables())
            # NOTE(review): session_creator is built but unused; restoring
            # directly into `sess` below instead of a MonitoredSession.
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)
            saver.restore(sess, FLAGS.checkpoint_path)
            sess.run(tf.global_variables_initializer())

            i = 0
            for filenames, images in load_images(FLAGS.input_dir, batch_shape):
                adv_images = sess.run(x_adv, feed_dict={x_input: images})
                # Converted Python-2 print statements to print() calls for
                # consistency with the rest of the file.
                print("input images: ", images.shape)
                i += 16  # NOTE(review): hard-codes a batch of 16; presumably should be batch_size
                print(i)
                save_images(adv_images, filenames, FLAGS.output_dir)
def setUp(self):
    """Create the session, toy model, and ElasticNetMethod attack under test."""
    super(TestElasticNetMethod, self).setUp()
    session = tf.Session()
    self.sess = session
    self.model = SimpleModel()
    self.attack = ElasticNetMethod(self.model, sess=session)
# Craft FGSM adversarial examples on the test set, measure the drop in model
# accuracy, then set up an ElasticNet (EAD) attack on the same data.
fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
# x_test[:, None, :, :] — presumably inserts a channel axis for the model;
# TODO confirm expected input layout against the model definition.
adv_x = fgsm.generate_np(x_test[:, None, :, :], **fgsm_params)

# In[32]:

# checking the accuracy of the generated adversarial examples
adv_pred = np.argmax(model.predict(adv_x), axis=1)
#advpred_ohe = np_utils.to_categorical(adv_pred)
# NOTE(review): this comparison assumes y_test holds integer class labels
# (not one-hot rows) — verify against how y_test is loaded.
adv_acc = np.mean(np.equal(adv_pred, y_test))
print("After attack, the accuracy is: {}".format(adv_acc * 100))

# In[35]:

# ElasticNet (EAD) attack via the cleverhans Keras wrapper.
wrap = KerasModelWrapper(model)
en = ElasticNetMethod(wrap, sess=sess)
en_params = {
    "beta": 0.01,            # weight of the L1 term in the EAD objective
    "decision_rule": 'L1',   # keep the successful example with smallest L1 distortion
    "batch_size": 1000,
    "confidence": 0,
    "learning_rate": 0.1,
    "binary_search_steps": 9,
    "max_iterations": 10,
    "abort_early": True,
    "initial_const": 0.01,
    "clip_min": 0,
    "clip_max": 1
}
adv_x = en.generate_np(x_test[:, None, :, :], **en_params)
def attack_lisa_cnn(sess, cnn_weight_file, y_target=None, standardize=True):
    """ Generates AE for the LISA-CNN.

    Assumes you have already run train_lisa_cnn() to train the network.

    :param sess: TF session to run the attacks in
    :param cnn_weight_file: checkpoint path restored into the LISA-CNN
    :param y_target: if not None, run targeted attacks toward this class
    :param standardize: if True, per-image-standardize inputs before the CNN
    """
    # Perturbation budgets to sweep, keyed by norm (inf / 1 / 2).
    epsilon_map = {np.inf : [.02, .05, .075, .1, .15, .2],  # assumes values in [0,1]
                   1 : [.1, 1, 10],
                   2 : [.1, 1, 10]}

    #--------------------------------------------------
    # data set prep
    #--------------------------------------------------
    # Note: we load the version of the data *without* extra context
    X_train, Y_train, X_test, Y_test = data_lisa(with_context=False)

    # Create one-hot target labels (needed for targeted attacks only).
    # Y_target_OB is batch-sized (fed to the attack graph); Y_target covers
    # the whole test set (for batched execution).
    if y_target is not None:
        Y_target_OB = categorical_matrix(y_target, FLAGS.batch_size, Y_test.shape[1])
        Y_target = categorical_matrix(y_target, Y_test.shape[0], Y_test.shape[1])
    else:
        Y_target_OB = None
        Y_target = None

    # bound the perturbation
    c_max = np.max(X_test)
    assert(c_max <= 1.0)  # assuming this for now

    #--------------------------------------------------
    # Initialize model that we will attack
    #--------------------------------------------------
    model, x_tf, y_tf = make_lisa_cnn(sess, FLAGS.batch_size, X_train.shape[1])
    model_CH = KerasModelWrapper(model)  # to make CH happy

    # the input may or may not require some additional transformation
    if standardize:
        x_input = tf.map_fn(lambda z: per_image_standardization(z), x_tf)
    else:
        x_input = x_tf

    model_output = model(x_input)

    saver = tf.train.Saver()
    saver.restore(sess, cnn_weight_file)

    #--------------------------------------------------
    # Performance on clean data
    # (try this before attacking)
    #--------------------------------------------------
    predictions = run_in_batches(sess, x_tf, y_tf, model_output,
                                 X_test, Y_test, FLAGS.batch_size)
    acc_clean = calc_acc(Y_test, predictions)
    print('[info]: accuracy on clean test data: %0.2f' % acc_clean)
    print(confusion_matrix(np.argmax(Y_test, axis=1), np.argmax(predictions, axis=1)))
    save_images_and_estimates(X_test, Y_test, predictions, 'output/Images/Original', CLASSES)

    #--------------------------------------------------
    # Fast Gradient Attack
    #--------------------------------------------------
    # symbolic representation of attack
    attack = FastGradientMethod(model_CH, sess=sess)
    acc_fgm = {}      # model accuracy on AE, per norm then per epsilon
    acc_tgt_fgm = {}  # targeted-attack success rate, same layout
    for ord in [np.inf, 1, 2]:
        epsilon_values = epsilon_map[ord]
        acc_fgm[ord] = []
        acc_tgt_fgm[ord] = []
        for idx, epsilon in enumerate(epsilon_values):
            desc = 'FGM-ell%s-%0.3f' % (ord, epsilon)
            x_adv_tf = attack.generate(x_tf, eps=epsilon, y_target=Y_target_OB,
                                       clip_min=0.0, clip_max=c_max, ord=ord)
            # Run the attack (targeted or untargeted) on the test data.
            if Y_target is not None:
                X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                       X_test, Y_target, FLAGS.batch_size)
            else:
                X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                       X_test, Y_test, FLAGS.batch_size)
            #
            # Evaluate the AE.
            # Currently using the same model we originally attacked.
            #
            model_eval = model
            #preds_tf = model_eval(x_tf)
            preds_tf = model_eval(x_input)
            preds = run_in_batches(sess, x_tf, y_tf, preds_tf,
                                   X_adv, Y_test, FLAGS.batch_size)
            acc, acc_tgt = analyze_ae(X_test, X_adv, Y_test, preds, desc, y_target)
            save_images_and_estimates(X_adv, Y_test, preds, 'output/Images/%s' % desc, CLASSES)
            save_images_and_estimates(X_test - X_adv, Y_test, preds, 'output/Deltas/%s' % desc, CLASSES)
            acc_fgm[ord].append(acc)
            acc_tgt_fgm[ord].append(acc_tgt)

    #--------------------------------------------------
    # Iterative attack
    #--------------------------------------------------
    attack = BasicIterativeMethod(model_CH, sess=sess)
    acc_ifgm = {}
    acc_tgt_ifgm = {}
    for ord in [np.inf, 1, 2]:
        epsilon_values = epsilon_map[ord]
        acc_ifgm[ord] = []
        acc_tgt_ifgm[ord] = []
        for idx, epsilon in enumerate(epsilon_values):
            desc = 'I-FGM-ell%s-%0.3f' % (ord, epsilon)
            # NOTE(review): unlike the FGM call above, no ord= is passed here,
            # so the attack presumably always uses its default norm even
            # though this loop varies `ord` — confirm intended.
            x_adv_tf = attack.generate(x_tf, eps=epsilon, eps_iter=epsilon/4.,
                                       nb_iter=100, y_target=Y_target_OB,
                                       clip_min=0.0, clip_max=c_max)
            #
            # Run the attack (targeted or untargeted)
            # on the test data.
            #
            if Y_target is not None:
                X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                       X_test, Y_target, FLAGS.batch_size)
            else:
                X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                       X_test, Y_test, FLAGS.batch_size)
            #
            # Evaluate the AE.
            # Currently using the same model we originally attacked.
            #
            model_eval = model
            #preds_tf = model_eval(x_tf)
            preds_tf = model_eval(x_input)
            preds = run_in_batches(sess, x_tf, y_tf, preds_tf,
                                   X_adv, Y_test, FLAGS.batch_size)
            acc, acc_tgt = analyze_ae(X_test, X_adv, Y_test, preds, desc, y_target)
            save_images_and_estimates(X_adv, Y_test, preds, 'output/Images/%s' % desc, CLASSES)
            save_images_and_estimates(X_test - X_adv, Y_test, preds, 'output/Deltas/%s' % desc, CLASSES)
            acc_ifgm[ord].append(acc)
            acc_tgt_ifgm[ord].append(acc_tgt)

    #--------------------------------------------------
    # Post-attack Analysis for *FGM
    #--------------------------------------------------
    # One accuracy plot and one targeted-success plot per norm.
    for ord in [np.inf, 1, 2]:
        plt.plot(epsilon_map[ord], acc_fgm[ord], 'o-', label='FGM')
        plt.plot(epsilon_map[ord], acc_ifgm[ord], 'o-', label='I-FGM')
        plt.legend()
        plt.xlabel('epsilon')
        plt.ylabel('CNN accuracy')
        plt.title('ell_%s' % ord)
        plt.grid('on')
        plt.savefig('./output/attack_accuracy_%s.png' % ord, bbox_inches='tight')
        plt.close()

        plt.figure()
        plt.plot(epsilon_map[ord], acc_tgt_fgm[ord], 'o-', label='FGM')
        plt.plot(epsilon_map[ord], acc_tgt_ifgm[ord], 'o-', label='I-FGM')
        plt.legend()
        plt.xlabel('epsilon')
        plt.ylabel('Targeted AE Success Rate')
        plt.title('ell_%s' % ord)
        plt.grid('on')
        plt.savefig('./output/targeted_attack_accuracy_%s.png' % ord, bbox_inches='tight')
        plt.close()

    #--------------------------------------------------
    # Elastic Net
    # Note: this attack takes awhile to compute...(compared to *FGSM)
    #--------------------------------------------------
    attack = ElasticNetMethod(model_CH, sess=sess)
    c_vals = [1e-2, 1e-1, 1, 1e2, 1e4]
    acc_all_elastic = np.zeros((len(c_vals),))

    if 0:  # turn off for now, is slow
    #for idx, c in enumerate(c_vals):
        # NOTE(review): with the loop above commented out, `idx` and `c`
        # below are unbound — re-enable the loop before re-enabling this
        # block, or it will raise NameError.
        x_adv_tf = attack.generate(x_tf,
                                   batch_size=FLAGS.batch_size,
                                   y_target=Y_target_OB,
                                   beta=1e-3,        # ell_1 coeff
                                   confidence=1e-2,  # \kappa value from equation (4)
                                   initial_const=c,  # (an initial value for) c from eq. (7) - note this value increases as binary search progresses...
                                   clip_min=0.0,
                                   clip_max=c_max)
        #
        # Run the attack (targeted or untargeted)
        # on the test data.
        #
        if Y_target is not None:
            X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                   X_test, Y_target, FLAGS.batch_size)
        else:
            X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                   X_test, Y_test, FLAGS.batch_size)
        #
        # Evaluate the AE.
        # Currently using the same model we originally attacked.
        #
        model_eval = model
        # NOTE(review): the *FGM sections evaluate via x_input (standardized);
        # this section uses raw x_tf — inconsistent when standardize=True.
        preds_tf = model_eval(x_tf)
        preds = run_in_batches(sess, x_tf, y_tf, preds_tf,
                               X_adv, Y_test, FLAGS.batch_size)
        print('Test accuracy after E-Net attack: %0.2f' % calc_acc(Y_test, preds))
        print('Maximum per-pixel delta: %0.3f' % np.max(np.abs(X_test - X_adv)))
        print('Mean per-pixel delta: %0.3f' % np.mean(np.abs(X_test - X_adv)))
        print('l2: ', np.sqrt(np.sum((X_test - X_adv)**2)))
        print('l1: ', np.sum(np.abs(X_test - X_adv)))
        print(confusion_matrix(np.argmax(Y_test, axis=1), np.argmax(preds, axis=1)))
        save_images_and_estimates(X_adv, Y_test, preds, 'output/Images/Elastic_c%03d' % c, CLASSES)
        acc_all_elastic[idx] = calc_acc(Y_test, preds)

    #--------------------------------------------------
    # Saliency Map Attack
    # Note: this is *extremely* slow; will require overnight runs
    #--------------------------------------------------
    attack = SaliencyMapMethod(model_CH, sess=sess)
    # NOTE(review): epsilon_values here is whatever value the ell_2 loop left
    # behind — confirm intended sizing.
    acc_all_saliency = np.zeros((len(epsilon_values),))
    #for idx, epsilon in enumerate(epsilon_values):
    if False:
        # NOTE(review): as with the E-Net block, `idx`/`epsilon` are leftover
        # loop variables from the sections above, not fresh bindings.
        x_adv_tf = attack.generate(x_tf, theta=epsilon/255., y_target=y_tf,
                                   clip_min=0.0, clip_max=255.0)
        #
        # Run the attack (targeted or untargeted)
        # on the test data.
        #
        if Y_target is not None:
            X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                   X_test, Y_target, FLAGS.batch_size)
        else:
            X_adv = run_in_batches(sess, x_tf, y_tf, x_adv_tf,
                                   X_test, Y_test, FLAGS.batch_size)
        #
        # Evaluate the AE.
        # Currently using the same model we originally attacked.
        #
        model_eval = model
        preds_tf = model_eval(x_tf)
        preds = run_in_batches(sess, x_tf, y_tf, preds_tf,
                               X_adv, Y_test, FLAGS.batch_size)
        print('Test accuracy after SMM attack: %0.3f' % calc_acc(Y_test, preds))
        print('Maximum per-pixel delta: %0.1f' % np.max(np.abs(X_test - X_adv)))
        print(confusion_matrix(np.argmax(Y_test, axis=1), np.argmax(preds, axis=1)))
        save_images_and_estimates(X_adv, Y_test, preds, 'output/Images/Saliency_%02d' % epsilon, CLASSES)
        acc_all_saliency[idx] = calc_acc(Y_test, preds)

    #--------------------------------------------------
    # C&W ell-2
    #--------------------------------------------------
    if 0:
        attack = CarliniWagnerL2(model, sess=sess)
        x_adv_tf = attack.generate(x_tf, confidence=.1, y_target=Y_target_OB)
class TestElasticNetMethod(CleverHansTest):
    """Unit tests for the ElasticNetMethod (EAD) attack."""

    def setUp(self):
        super(TestElasticNetMethod, self).setUp()
        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = ElasticNetMethod(self.model, sess=self.sess)

    def test_generate_np_untargeted_gives_adversarial_example(self):
        # An untargeted attack should flip (almost) every prediction.
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, max_iterations=100,
                                        binary_search_steps=3,
                                        initial_const=1,
                                        clip_min=-5, clip_max=5,
                                        batch_size=10)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_generate_np_targeted_gives_adversarial_example(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        feed_labs = np.zeros((100, 2))
        # BUG FIX: np.random.randint(0, 1, 100) always returns 0 (the upper
        # bound is exclusive), so every example previously targeted class 0.
        # Draw targets from both classes instead.
        feed_labs[np.arange(100), np.random.randint(0, 2, 100)] = 1
        x_adv = self.attack.generate_np(x_val, max_iterations=100,
                                        binary_search_steps=3,
                                        initial_const=1,
                                        clip_min=-5, clip_max=5,
                                        batch_size=100, y_target=feed_labs)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(
            np.mean(np.argmax(feed_labs, axis=1) == new_labs) > 0.9)

    def test_generate_gives_adversarial_example(self):
        # Same as the untargeted test, but through the symbolic generate().
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        feed_labs = np.zeros((100, 2))
        feed_labs[np.arange(100), orig_labs] = 1
        x = tf.placeholder(tf.float32, x_val.shape)
        y = tf.placeholder(tf.float32, feed_labs.shape)

        x_adv_p = self.attack.generate(x, max_iterations=100,
                                       binary_search_steps=3,
                                       initial_const=1,
                                       clip_min=-5, clip_max=5,
                                       batch_size=100, y=y)
        self.assertEqual(x_val.shape, x_adv_p.shape)
        x_adv = self.sess.run(x_adv_p, {x: x_val, y: feed_labs})

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, max_iterations=10,
                                        binary_search_steps=1,
                                        learning_rate=1e-3,
                                        initial_const=1,
                                        clip_min=-0.2, clip_max=0.3,
                                        batch_size=100)

        # Allow a small numerical tolerance around the clip bounds.
        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)

    def test_generate_np_high_confidence_targeted_examples(self):
        # NOTE(review): these confidence tests instantiate CarliniWagnerL2
        # rather than ElasticNetMethod — confirm this is intentional.
        trivial_model = TrivialModel()

        for CONFIDENCE in [0, 2.3]:
            x_val = np.random.rand(10, 1) - .5
            x_val = np.array(x_val, dtype=np.float32)

            feed_labs = np.zeros((10, 2))
            feed_labs[np.arange(10), np.random.randint(0, 2, 10)] = 1
            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            x_adv = attack.generate_np(x_val, max_iterations=100,
                                       binary_search_steps=2,
                                       learning_rate=1e-2,
                                       initial_const=1,
                                       clip_min=-10, clip_max=10,
                                       confidence=CONFIDENCE,
                                       y_target=feed_labs,
                                       batch_size=10)

            new_labs = self.sess.run(trivial_model.get_logits(x_adv))
            good_labs = new_labs[np.arange(10), np.argmax(feed_labs, axis=1)]
            bad_labs = new_labs[np.arange(10), 1 - np.argmax(feed_labs, axis=1)]

            # The target logit should beat the other logit by ~CONFIDENCE.
            self.assertTrue(
                np.isclose(0, np.min(good_labs - (bad_labs + CONFIDENCE)),
                           atol=1e-1))
            self.assertTrue(
                np.mean(np.argmax(new_labs, axis=1) ==
                        np.argmax(feed_labs, axis=1)) > .9)

    def test_generate_np_high_confidence_untargeted_examples(self):
        trivial_model = TrivialModel()

        for CONFIDENCE in [0, 2.3]:
            x_val = np.random.rand(10, 1) - .5
            x_val = np.array(x_val, dtype=np.float32)

            orig_labs = np.argmax(self.sess.run(
                trivial_model.get_logits(x_val)), axis=1)
            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            x_adv = attack.generate_np(x_val, max_iterations=100,
                                       binary_search_steps=2,
                                       learning_rate=1e-2,
                                       initial_const=1,
                                       clip_min=-10, clip_max=10,
                                       confidence=CONFIDENCE,
                                       batch_size=10)

            new_labs = self.sess.run(trivial_model.get_logits(x_adv))
            good_labs = new_labs[np.arange(10), 1 - orig_labs]
            bad_labs = new_labs[np.arange(10), orig_labs]

            self.assertTrue(
                np.mean(np.argmax(new_labs, axis=1) == orig_labs) == 0)
            self.assertTrue(
                np.isclose(0, np.min(good_labs - (bad_labs + CONFIDENCE)),
                           atol=1e-1))
def get_adv_examples(sess, wrap, attack_type, X, Y):
    """ detect adversarial examples
    :param sess: target model session
    :param wrap: wrap model
    :param attack_type: attack for generating adversarial examples
    :param X: examples to be attacked
    :param Y: correct label of the examples
    :return: x_adv: adversarial examples
    """
    x = tf.placeholder(tf.float32, shape=(None, X.shape[1], X.shape[2], X.shape[3]))
    y = tf.placeholder(tf.float32, shape=(None, Y.shape[1]))
    # adv_label is only rewritten for the targeted 'lbfgs' branch below;
    # for every other attack the true labels Y are used unchanged.
    adv_label = np.copy(Y)
    batch_size = 128  # default; some branches lower it for slower attacks

    # Define attack method parameters
    if (attack_type == 'fgsm'):
        attack_params = {
            'eps': 0.1,
            'clip_min': 0.,
            'clip_max': 1.
        }
        attack_object = FastGradientMethod(wrap, sess=sess)
    elif (attack_type == 'jsma'):
        attack_params = {
            'theta': 1., 'gamma': 0.1,
            'clip_min': 0., 'clip_max': 1.,
            'y_target': None
        }
        attack_object = SaliencyMapMethod(wrap, sess=sess)
        batch_size = 32  # JSMA is slow; use smaller batches
    elif (attack_type == 'cw'):
        attack_params = {
            'binary_search_steps': 1,
            'y': y,
            'max_iterations': 100,
            'learning_rate': .2,
            'batch_size': 128,
            'initial_const': 10
        }
        attack_object = CarliniWagnerL2(wrap, sess=sess)
    elif (attack_type == 'mim'):
        attack_object = MomentumIterativeMethod(wrap, back='tf', sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1., 'eps': 0.1}
    elif (attack_type == 'df'):
        attack_params = {
            'max_iterations': 50,
            'clip_min': 0., 'clip_max': 1.,
            'overshoot': 0.02
        }
        attack_object = DeepFool(wrap, sess=sess)
        batch_size = 64
    elif (attack_type == 'bim'):
        attack_object = BasicIterativeMethod(wrap, back='tf', sess=sess)
        attack_params = {'eps': 0.1, 'eps_iter': 0.05, 'nb_iter': 10,
                         'clip_min': 0., 'clip_max': 1.
                         }
    elif (attack_type == 'vam'):
        attack_object = VirtualAdversarialMethod(wrap, back='tf', sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1., 'nb_iter': 100,
                         'eps': 2, 'xi': 1e-6}
    elif (attack_type == 'enm'):
        # ElasticNet (EAD) attack.
        attack_object = ElasticNetMethod(wrap, back='tf', sess=sess)
        attack_params = {'y': y, 'max_iterations': 10, 'batch_size': 128}
    elif (attack_type == 'spsa'):
        # SPSA builds its graph here directly (no attack_params dict);
        # the shared generate() call below is skipped for this branch.
        attack_object = SPSA(wrap, sess=sess)
        adv_x = attack_object.generate(x=x, y=y, eps=0.1, clip_min=0.,
                                       clip_max=1., nb_iter=100,
                                       early_stop_loss_threshold=-5.)
        batch_size = 1  # SPSA runs one example at a time
    elif (attack_type == 'lbfgs'):
        attack_object = LBFGS(wrap, sess=sess)
        attack_params = {'clip_min': 0, 'clip_max': 1., 'batch_size': 128,
                         'max_iterations': 10, "y_target": y}
        # LBFGS is targeted: aim each example at the next class (mod nb_classes).
        true_label = np.argmax(Y, axis=-1)
        for i in range(len(Y)):
            ind = (true_label[i] + 1) % FLAGS.nb_classes
            adv_label[i] = np.zeros([FLAGS.nb_classes])
            adv_label[i, ind] = 1

    # Build the attack graph for every attack except SPSA (already built).
    if (attack_type != 'spsa'):
        adv_x = attack_object.generate(x, **attack_params)

    # Get adversarial examples
    if (attack_type == 'lbfgs'):
        # Feed the crafted target labels instead of the true ones.
        x_adv = get_adv(sess, x, y, adv_x, X, adv_label, batch_size=batch_size)
    else:
        x_adv = get_adv(sess, x, y, adv_x, X, Y, batch_size=batch_size)
    return x_adv
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, perform normal training on clean examples
        only before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
        to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
        example construction process during adversarial training
    :param nb_filters: number of convolutional filters in the base CNN
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # EN attack batch size must match its graph; reuse the training batch size.
    source_samples = batch_size

    # Use label smoothing
    # Hopefully this doesn't screw up JSMA...
    # assert Y_train.shape[1] == 10
    # label_smooth = .1
    # Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)
        print("evaluate 1")

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc)

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train,
                             Y_train, args=eval_par)
            report.train_clean_train_adv_eval = acc

        ################################################################
        # Init the Elastic Network Method attack object and graph
        en = ElasticNetMethod(model, back='tf', sess=sess)
        en_params = {'binary_search_steps': 1,
                     #'y': None,
                     'max_iterations': 100,
                     'learning_rate': 0.1,
                     'batch_size': source_samples,
                     'initial_const': 10}
        adv_x_2 = en.generate(x, **en_params)
        preds_adv_2 = model.get_probs(adv_x_2)

        # Evaluate the accuracy of the MNIST model on EN adversarial examples
        en_eval_params = {'batch_size': source_samples}
        acc = model_eval(sess, x, y, preds_adv_2, X_test, Y_test,
                         args=en_eval_params)
        print('Test accuracy on EN adversarial examples: %0.4f\n' % acc)
        ###############################################################

        # Calculate training error
        # NOTE(review): this re-runs the FGSM (not EN) training-set eval and
        # overwrites report.train_clean_train_adv_eval set above — confirm
        # whether the EN predictions were intended here.
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train,
                             Y_train, args=eval_par)
            report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    adv_x_fgsm = fgsm2.generate(x, **fgsm_params)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        # BUG FIX: the stop_gradient result was previously assigned to an
        # unrelated variable (adv_x_2) and never consumed, so the
        # adversarial-training graph always backpropagated through the
        # attack regardless of this flag (and clobbered the EN adv_x_2).
        adv_x_fgsm = tf.stop_gradient(adv_x_fgsm)
    preds_2_adv_fgsm = model_2(adv_x_fgsm)

    ##########################################
    # Elastic Net attack against the adversarially trained model.
    en2 = ElasticNetMethod(model_2, back='tf', sess=sess)
    en_params = {'binary_search_steps': 1,
                 #'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': source_samples,
                 'initial_const': 10}
    adv_x_en = en2.generate(x, **en_params)
    preds_2_adv_en = model_2(adv_x_en)

    print("evaluate 2")

    def evaluate_2():
        # evaluate the final result of the model
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)

        # Accuracy of the adversarially trained model on FGSM adversarial
        # examples
        accuracy = model_eval(sess, x, y, preds_2_adv_fgsm, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on FGSM adversarial examples: %0.4f' % accuracy)

        # Accuracy of the adversarially trained model on EN Method adversarial
        # examples
        en_eval_params = {'batch_size': source_samples}
        accuracy = model_eval(sess, x, y, preds_2_adv_en, X_test, Y_test,
                              args=en_eval_params)
        print('Test accuracy on EN adversarial examples: %0.4f' % accuracy)

    # Perform and evaluate adversarial training.
    # Training currently uses only the EN adversarial predictions; combining
    # FGSM and EN predictions in one model_train call was attempted but not
    # wired up.
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': source_samples,
        'learning_rate': learning_rate
    }
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=[preds_2_adv_en], evaluate=evaluate_2,
                args=train_params, rng=rng)

    return report
def main(type="Resnet", dataset="CIFAR10", attack_type="FGM"):
    """Attack a trained classifier with a chosen cleverhans attack and report
    accuracy on clean vs. adversarial test examples.

    :param type: model family, 'Resnet' or 'End2End' (NOTE: shadows the
        builtin `type`; kept for interface compatibility)
    :param dataset: 'CIFAR10', 'CIFAR100', or anything else for SVHN
    :param attack_type: key selecting the attack and its parameters below
    """
    size = 256  # number of test examples to attack
    eval_params = {'batch_size': 128}

    ############################################# Prepare the Data #####################################################
    if dataset == 'CIFAR10':
        (_, _), (x_test, y_test) = prepare_CIFAR10()
        num_classes = 10
        input_dim = 32
    elif dataset == 'CIFAR100':
        (_, _), (x_test, y_test) = prepare_CIFAR100()
        num_classes = 100
        input_dim = 32
    else:
        (_, _), (x_test, y_test) = prepare_SVHN("./Data/")
        num_classes = 10
        input_dim = 32

    # scale pixels to [0, 1]; one-hot encode the labels
    x_test = x_test / 255.
    y_test = keras.utils.to_categorical(y_test, num_classes)
    ############################################# Prepare the Data #####################################################

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        # prepare the placeholders
        x = tf.placeholder(tf.float32, [None, input_dim, input_dim, 3])
        y = tf.placeholder(tf.float32, [None, num_classes])

        # Populated by modelBuilder on first call: [input tensor, logits].
        input_output = []

        def modelBuilder(x, num_classes, dataset, type, sess, input_output):
            # Build the graph on the first call (restoring from checkpoint);
            # reuse variables on later calls so cleverhans can re-invoke it.
            if len(input_output) == 0:
                reuse = False

                # Model/Graph
                if type == 'End2End':
                    _, tf_model = \
                        prepare_GBP_End2End(num_classes,
                                            inputT=x, sess=sess,
                                            checkpoint_dir='./{}_{}/'.format(dataset, type),
                                            reuse=reuse)
                else:
                    _, tf_model = \
                        prepare_Resnet(num_classes,
                                       inputT=x, sess=sess,
                                       checkpoint_dir='./{}_{}/'.format(dataset, type),
                                       reuse=reuse)

                input_output.append(x)
                input_output.append(tf_model.logits)
            else:
                reuse = True

                # Model/Graph
                if type == 'End2End':
                    _, tf_model = \
                        prepare_GBP_End2End(num_classes, inputT=x, reuse=reuse)
                else:
                    _, tf_model = \
                        prepare_Resnet(num_classes, inputT=x, reuse=reuse)

                # NOTE(review): input_output keeps growing on every reuse
                # call; only the first two entries are read below.
                input_output.append(x)
                input_output.append(tf_model.logits)

            return tf_model.logits

        # create an attackable model for the cleverhans
        model = CallableModelWrapper(lambda placeholder: modelBuilder(placeholder, num_classes, dataset, type, sess, input_output), 'logits')

        # TODO: check the configurations
        if attack_type == "FGM":  # pass
            attack = FastGradientMethod(model, back='tf', sess=sess)
            params = {
                'eps' : 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "CWL2":  # pass
            attack = CarliniWagnerL2(model, back='tf', sess=sess)
            params = {
                'confidence': 0.9,
                'batch_size': 128,
                'learning_rate': 0.005,
            }
        elif attack_type == "DF":  # pass
            attack = DeepFool(model, back='tf', sess=sess)
            params = {
            }
        elif attack_type == "ENM":  # configurations checked, quickly tested
            attack = ElasticNetMethod(model, back='tf', sess=sess)
            params = {
                'confidence': 0.9,
                'batch_size': 128,
                'learning_rate': 0.005,
            }
        elif attack_type == "FFA":  # configuration checked
            attack = FastFeatureAdversaries(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'eps_iter': 0.005,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "LBFGS":
            attack = LBFGS(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "MEA":
            attack = MadryEtAl(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "MIM":
            attack = MomentumIterativeMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "SMM":
            attack = SaliencyMapMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "SPSA":
            attack = SPSA(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "VATM":
            attack = vatm(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "VAM":
            attack = VirtualAdversarialMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        else:
            # NOTE(review): the message promises a fallback to FGM but this
            # raises instead — no fallback actually happens.
            raise Exception("I don't recognize {} this attack type. I will use FGM instead.".format(attack_type))

        # tf operation
        adv_x = attack.generate(x, **params)

        # generate the adversarial examples
        adv_vals = sess.run(adv_x, feed_dict={x: x_test[:size]})

        # notice that "adv_vals" may contain NANs because of the failure of the attack
        # also the input may not be perturbed at all because of the failure of the attack
        to_delete = []
        for idx, adv in enumerate(adv_vals):
            # for nan
            if np.isnan(adv).any():
                to_delete.append(idx)
            # for no perturbation
            # (an index can appear twice here; np.delete tolerates duplicates)
            if np.array_equiv(adv, x_test[idx]):
                to_delete.append(idx)

        # cleanings
        adv_vals_cleaned = np.delete(adv_vals, to_delete, axis=0)
        ori_cleaned = np.delete(x_test[:size], to_delete, axis=0)
        y_cleaned = np.delete(y_test[:size], to_delete, axis=0)

        if len(adv_vals_cleaned) == 0:
            print("No adversarial example is generated!")
            return

        print("{} out of {} adversarial examples are generated.".format(len(adv_vals_cleaned), size))

        print("The average L_inf distortion is {}".format(
            np.mean([np.max(np.abs(adv - ori_cleaned[idx])) for idx, adv in enumerate(adv_vals_cleaned)])))

        # TODO: visualize the adv_vals

        # Clean accuracy vs. adversarial accuracy on the surviving examples.
        accuracy = model_eval(sess, input_output[0], y, tf.nn.softmax(input_output[1]), x_test[:size],
                              y_test[:size], args=eval_params)
        print('Test accuracy on normal examples: %0.4f' % accuracy)

        accuracy = model_eval(sess, input_output[0], y, tf.nn.softmax(input_output[1]), adv_vals_cleaned,
                              y_cleaned, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
def test_attacks(batch_size=128, source_samples=10,
                 model_path=os.path.join("models", "mnist"), targeted=True):
    """
    Test many attacks on MNIST with deep Bayes classifier.
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # NOTE(review): `config` is built but not passed to tf.Session(), so
    # allow_growth has no effect here — confirm whether Session(config=config)
    # was intended.
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    from cleverhans.utils_mnist import data_mnist
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    img_rows, img_cols, channels = X_train[0].shape
    nb_classes = Y_train.shape[1]

    # Define input TF placeholder.
    # batch_size is clamped so a single batch never exceeds the number of
    # samples being attacked.
    batch_size = min(batch_size, source_samples)
    x = tf.placeholder(tf.float32, shape=(batch_size, img_rows, img_cols,
                                          channels))
    y = tf.placeholder(tf.float32, shape=(batch_size, nb_classes))

    # Define TF model graph; the classifier variant is chosen via sys.argv[1].
    model_name = str(sys.argv[1])
    if model_name == 'bayes':
        from load_bayes_classifier import BayesModel
        conv = True
        checkpoint = 0  # int(sys.argv[1])
        K = int(sys.argv[3])
        use_mean = True
        model = BayesModel(sess, 'mnist', conv, K, checkpoint=checkpoint,
                           attack_snapshot=False, use_mean=use_mean)
        if use_mean:
            model_name = 'bayes_mean_mlp'
        else:
            model_name = 'bayes_K%d' % K
    if model_name == 'cnn':
        from load_cnn_classifier import CNNModel
        model = CNNModel(sess, 'mnist')
    if model_name == 'wgan':
        from load_wgan_classifier import WGANModel
        conv = True
        checkpoint = 0  # int(sys.argv[1])
        K = int(sys.argv[3])
        T = int(sys.argv[4])
        model = WGANModel(sess, 'mnist', conv, K, T, checkpoint=checkpoint)
        model_name = 'wgan_K%d_T%d' % (K, T)
    preds = model.predict(x, softmax=True)  # output probabilities
    print("Defined TensorFlow model graph.")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Craft adversarial examples
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # make adv inputs and labels for the attack if targeted:
    # each source image is replicated once per target class, with the
    # corresponding one-hot target labels.
    if targeted:
        adv_inputs = np.array([[instance] * nb_classes
                               for instance in X_test[:source_samples]],
                              dtype=np.float32)
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, 1))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
            (source_samples * nb_classes, nb_classes))
    else:
        adv_inputs = X_test[:source_samples]
        adv_ys = Y_test[:source_samples]

    # Instantiate an attack object; the attack is chosen via sys.argv[2].
    # Gradient-based attacks get the probability output, optimization-based
    # attacks (CW, EAD, DeepFool, VAT) get the logits.
    attack_method = str(sys.argv[2])
    if attack_method == 'fgsm':
        from cleverhans.attacks import FastGradientMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = FastGradientMethod(model_prob, sess=sess)
        from attack_config import config_fgsm
        attack_params = config_fgsm(targeted, adv_ys)
    if attack_method == 'bim':
        from cleverhans.attacks import BasicIterativeMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = BasicIterativeMethod(model_prob, sess=sess)
        from attack_config import config_bim
        attack_params = config_bim(targeted, adv_ys)
    if attack_method == 'mim':
        from cleverhans.attacks import MomentumIterativeMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = MomentumIterativeMethod(model_prob, sess=sess)
        from attack_config import config_mim
        attack_params = config_mim(targeted, adv_ys)
    if attack_method == 'jsma':
        from cleverhans.attacks import SaliencyMapMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = SaliencyMapMethod(model_prob, sess=sess)
        from attack_config import config_jsma
        attack_params = config_jsma(targeted, adv_ys)
    if attack_method == 'vat':
        from cleverhans.attacks import VirtualAdversarialMethod
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = VirtualAdversarialMethod(model_logit, sess=sess)
        from attack_config import config_vat
        attack_params = config_vat(targeted, adv_ys)
    if attack_method == 'cw':
        from cleverhans.attacks import CarliniWagnerL2
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = CarliniWagnerL2(model_logit, sess=sess)
        from attack_config import config_cw
        attack_params = config_cw(targeted, adv_ys)
    if attack_method == 'elastic':
        from cleverhans.attacks import ElasticNetMethod
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = ElasticNetMethod(model_logit, sess=sess)
        from attack_config import config_elastic
        attack_params = config_elastic(targeted, adv_ys)
    if attack_method == 'deepfool':
        from cleverhans.attacks import DeepFool
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = DeepFool(model_logit, sess=sess)
        from attack_config import config_deepfool
        attack_params = config_deepfool(targeted, adv_ys)
    if attack_method == 'madry':
        from cleverhans.attacks import MadryEtAl
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = MadryEtAl(model_prob, sess=sess)
        from attack_config import config_madry
        attack_params = config_madry(targeted, adv_ys)

    attack_params['batch_size'] = batch_size
    print('batchsize', batch_size)

    # perform the attack!
    # NOTE(review): `xrange` is Python 2 only; under Python 3 this raises
    # NameError.
    adv = []
    n_batch = int(adv_inputs.shape[0] / batch_size)
    for i in xrange(n_batch):
        adv_batch = adv_inputs[i * batch_size:(i + 1) * batch_size]
        adv.append(attack.generate_np(adv_batch, **attack_params))
    adv = np.concatenate(adv, axis=0)

    # Re-run the predictions several times (presumably because the Bayes
    # classifier is stochastic — TODO confirm) and print the first 100 labels.
    for _ in xrange(5):
        y_adv = []
        for i in xrange(n_batch):
            adv_batch = adv[i * batch_size:(i + 1) * batch_size]
            y_adv.append(sess.run(preds, {x: adv_batch}))
        y_adv = np.concatenate(y_adv, axis=0)
        print('--------------------------------------')
        for i in xrange(10):
            print(np.argmax(y_adv[i * 10:(i + 1) * 10], 1))

    correct_pred = np.asarray(np.argmax(y_adv, 1) == np.argmax(adv_ys, 1),
                              dtype='f')
    adv_accuracy = np.mean(correct_pred)
    if not targeted:
        # adv_accuracy, y_adv = model_eval(sess, x, y, preds, adv,
        #                                  adv_ys, args=eval_params,
        #                                  return_pred=True)
        # else:
        #     adv_accuracy, y_adv = model_eval(sess, x, y, preds, adv,
        #                                      Y_test[:source_samples],
        #                                      args=eval_params,
        #                                      return_pred=True)
        # For an untargeted attack, "success" means the label changed.
        adv_accuracy = 1. - adv_accuracy

    print('--------------------------------------')
    print(np.argmax(adv_ys[:10], 1))
    print(np.argmax(y_adv[:10], 1))
    for i in xrange(5):
        tmp = sess.run(preds, {x: adv[:100]})
        print(np.argmax(tmp[:10], 1))

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # visualisation
    vis_adv = True
    if vis_adv:
        N_vis = 100
        sys.path.append('../../utils')
        from visualisation import plot_images
        if channels == 1:
            shape = (img_rows, img_cols)
        else:
            shape = (img_rows, img_cols, channels)
        path = 'figs/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
        else:
            filename = filename + '_untargeted'
        plot_images(adv_inputs[:N_vis], shape, path, filename + '_data')
        plot_images(adv[:N_vis], shape, path, filename + '_adv')

    save_result = True
    if save_result:
        path = 'results/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
            y_input = adv_ys
        else:
            filename = filename + '_untargeted'
            y_input = Y_test[:source_samples]
        results = [adv_inputs, y_input, adv, y_adv]
        import pickle
        # NOTE(review): pickle to a text-mode handle ('w') only works on
        # Python 2; Python 3 requires 'wb'. The handle is also never closed.
        pickle.dump(results, open(path + filename + '.pkl', 'w'))
        print("results saved at %s.pkl" % filename)

    return report
def evaluate_ch(model, config, sess, norm='l1', bound=None, verbose=True):
    """Evaluate a classifier's robustness with a CleverHans attack.

    Runs ElasticNetMethod (EAD) for ``norm='l1'`` or CarliniWagnerL2 for
    ``norm='l2'`` over the first ``num_eval_examples`` test points and
    reports natural accuracy, adversarial accuracy and per-example
    perturbation sizes.

    :param model: classifier exposing ``get_predicted_class``; must be
        compatible with the CleverHans attack constructors.
    :param config: dict with keys ``data``, ``num_eval_examples``,
        ``eval_batch_size`` (and ``data_path`` for cifar10).
    :param sess: TensorFlow session used to run the attack graph.
    :param norm: 'l1' (EAD) or 'l2' (Carlini-Wagner).
    :param bound: perturbation budget; when given, accuracy is also
        reported at bound, bound/2, bound/4, bound/8.
    :param verbose: if True, enable DEBUG logging for cleverhans.
    :return: tuple ``(all_corr_nat, all_corr_adv, lps)`` — boolean arrays of
        natural/adversarial correctness plus the per-example lp distances.
    """
    dataset = config['data']
    num_eval_examples = config['num_eval_examples']
    eval_batch_size = config['eval_batch_size']
    if dataset == "mnist":
        from tensorflow.examples.tutorials.mnist import input_data
        mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
        X = mnist.test.images[0:num_eval_examples, :].reshape(-1, 28, 28, 1)
        Y = mnist.test.labels[0:num_eval_examples]
        x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    else:
        import cifar10_input
        data_path = config["data_path"]
        cifar = cifar10_input.CIFAR10Data(data_path)
        # CIFAR pixels are stored as 0-255; attacks operate in [0, 1].
        X = cifar.eval_data.xs[0:num_eval_examples, :].astype(np.float32) / 255.0
        Y = cifar.eval_data.ys[0:num_eval_examples]
        x_image = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])

    # BUG FIX: a leftover `assert norm == 'l1'` previously made the fully
    # implemented l2 branch below unreachable; it has been removed so the
    # documented norm parameter works as written.
    if norm == 'l2':
        attack = CarliniWagnerL2(model, sess)
        params = {'batch_size': eval_batch_size, 'binary_search_steps': 9}
    else:
        attack = ElasticNetMethod(model, sess, clip_min=0.0, clip_max=1.0)
        params = {'beta': 1e-2,
                  'decision_rule': 'L1',
                  'batch_size': eval_batch_size,
                  'learning_rate': 1e-2,
                  'max_iterations': 1000}
    if verbose:
        set_log_level(logging.DEBUG, name="cleverhans")

    # Feed the true labels so the attack targets the correct class.
    y = tf.placeholder(tf.int64, shape=[None, 10])
    params['y'] = y
    adv_x = attack.generate(x_image, **params)
    preds_adv = model.get_predicted_class(adv_x)
    preds_nat = model.get_predicted_class(x_image)
    all_preds, all_preds_adv, all_adv_x = batch_eval(
        sess, [x_image, y], [preds_nat, preds_adv, adv_x],
        [X, one_hot(Y, 10)], batch_size=eval_batch_size)

    print('acc nat', np.mean(all_preds == Y))
    print('acc adv', np.mean(all_preds_adv == Y))

    # Report distances on the original 0-255 scale for cifar10.
    if dataset == "cifar10":
        X *= 255.0
        all_adv_x *= 255.0

    if norm == 'l2':
        lps = np.sqrt(np.sum(np.square(all_adv_x - X), axis=(1, 2, 3)))
    else:
        lps = np.sum(np.abs(all_adv_x - X), axis=(1, 2, 3))
    print('mean lp: ', np.mean(lps))

    # BUG FIX: guard against the default bound=None, which previously
    # crashed with a TypeError on `bound / 2.0`.
    if bound is not None:
        for b in [bound, bound / 2.0, bound / 4.0, bound / 8.0]:
            # An example counts as "robust at b" if it is still classified
            # correctly or its perturbation exceeds the budget b.
            print('lp={}, acc={}'.format(
                b, np.mean((all_preds_adv == Y) | (lps > b))))

    all_corr_adv = (all_preds_adv == Y)
    all_corr_nat = (all_preds == Y)
    return all_corr_nat, all_corr_adv, lps
def attack_classifier(sess, x, model, x_test, attack_method="fgsm",
                      target=None, batch_size=128):
    """Generate adversarial examples for x_test with a chosen CleverHans attack.

    :param sess: TensorFlow session used to run the attack graph.
    :param x: input placeholder matching x_test's shape.
    :param model: CleverHans-compatible model to attack.
    :param x_test: numpy array of clean inputs (values assumed in [0, 1] —
        TODO confirm, given the clip_min/clip_max settings below).
    :param attack_method: one of the string keys dispatched below.
    :param target: target class index for targeted attacks, or None for
        untargeted. Some attacks assert it must (not) be given.
    :param batch_size: number of examples attacked per sess.run; trailing
        examples that don't fill a batch are silently dropped (floor division
        below).
    :return: numpy array of adversarial examples, concatenated over batches.
    """
    if attack_method == "fgsm":
        from cleverhans.attacks import FastGradientMethod
        params = {'eps': 8/255,
                  'clip_min': 0.,
                  'clip_max': 1.
                  }
        if target is not None:
            # One-hot target label repeated for every example in the batch.
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = FastGradientMethod(model, sess=sess)
    elif attack_method == "basic_iterative":
        from cleverhans.attacks import BasicIterativeMethod
        params = {'eps': 8./255,
                  'eps_iter': 1./255,
                  'nb_iter': 10,
                  'clip_min': 0.,
                  'clip_max': 1.,
                  'ord': np.inf
                  }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = BasicIterativeMethod(model, sess=sess)
    elif attack_method == "momentum_iterative":
        from cleverhans.attacks import MomentumIterativeMethod
        params = {'eps': 8/255,
                  'eps_iter': 1/255,
                  'nb_iter': 10,
                  'clip_min': 0.,
                  'clip_max': 1.
                  }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = MomentumIterativeMethod(model, sess=sess)
    elif attack_method == "saliency":
        from cleverhans.attacks import SaliencyMapMethod
        params = {'theta': 8/255,
                  'gamma': 0.1,
                  'clip_min': 0.,
                  'clip_max': 1.
                  }
        # JSMA is only used untargeted here.
        assert target is None
        method = SaliencyMapMethod(model, sess=sess)
    elif attack_method == "virtual":
        from cleverhans.attacks import VirtualAdversarialMethod
        params = {'eps': 8/255,
                  'num_iterations': 10,
                  'xi': 1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.
                  }
        assert target is None
        method = VirtualAdversarialMethod(model, sess=sess)
    elif attack_method == "cw":
        from cleverhans.attacks import CarliniWagnerL2
        params = {
            "confidence": 0,
            "batch_size": 128,
            "learning_rate": 1e-4,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "abort_early": True,
            "initial_const": 1e-2,
            "clip_min": 0,
            "clip_max": 1
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = CarliniWagnerL2(model, sess=sess)
    elif attack_method == "elastic_net":
        from cleverhans.attacks import ElasticNetMethod
        params = {
            "fista": "FISTA",
            "beta": 0.1,
            "decision_rule": "EN",
            "confidence": 0,
            "batch_size": 128,
            "learning_rate": 1e-4,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "abort_early": True,
            "initial_const": 1e-2,
            "clip_min": 0,
            "clip_max": 1
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = ElasticNetMethod(model, sess=sess)
    elif attack_method == "deepfool":
        from cleverhans.attacks import DeepFool
        params = {
            "nb_candidate": 10,
            "overshoot": 1e-3,
            "max_iter": 100,
            "nb_classes": 10,
            "clip_min": 0,
            "clip_max": 1
        }
        assert target is None
        method = DeepFool(model, sess=sess)
    elif attack_method == "lbfgs":
        from cleverhans.attacks import LBFGS
        params = {
            'batch_size': 128,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "initial_const": 1e-2,
            'clip_min': 0.,
            'clip_max': 1.
        }
        # LBFGS in cleverhans is targeted-only.
        assert target is not None
        params["y_target"] = tf.constant(
            np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = LBFGS(model, sess=sess)
    elif attack_method == "madry":
        from cleverhans.attacks import MadryEtAl
        params = {'eps': 8/255,
                  'eps_iter': 1/255,
                  'nb_iter': 10,
                  'ord': np.inf,
                  'clip_min': 0.,
                  'clip_max': 1.
                  }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
        method = MadryEtAl(model, sess=sess)
    elif attack_method == "SPSA":
        from cleverhans.attacks import SPSA
        params = {
            'epsilon': 1/255,
            'num_steps': 10,
            'is_targeted': False,
            'early_stop_loss_threshold': None,
            'learning_rate': 0.01,
            'delta': 0.01,
            'batch_size': 128,
            'spsa_iters': 1,
            'is_debug': False
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target+1], batch_size, axis=0))
            params["is_targeted"] = True
        method = SPSA(model, sess=sess)
    else:
        raise ValueError("Can not recognize this attack method: %s"
                         % attack_method)

    # Build the attack graph once, then run it batch by batch.
    adv_x = method.generate(x, **params)
    num_batch = x_test.shape[0] // batch_size
    adv_imgs = []
    for i in range(num_batch):
        x_feed = x_test[i*batch_size:(i+1)*batch_size]
        #y_feed = y_test[i*batch_size:(i+1)*batch_size]
        adv_img = sess.run(adv_x, feed_dict={x: x_feed})
        adv_imgs.append(adv_img)
    adv_imgs = np.concatenate(adv_imgs, axis=0)
    return adv_imgs
'max_iterations':100, 'binary_search_steps':3, 'initial_const':1, 'clip_min':0, 'clip_max':1, 'batch_size':100, 'rnd': orders, 'y_target':labels, } from cleverhans.utils_keras import KerasModelWrapper keras.backend.set_learning_phase(0) sess = keras.backend.get_session() models = [KerasModelWrapper(model) for model in models] attack = ElasticNetMethod(models, sess=sess) x_adv = attack.generate_np(x_val,**bapp_params) # orig_labs = np.argmax(model.predict(x_val), axis=1) # new_labs = np.argmax(model.predict(x_adv), axis=1) l1dist = np.linalg.norm(x_val-x_adv, ord=1, axis=-1) # l1dist = np.sum(np.absolute(x_adv-x_val, axis=-1)) print(np.mean(l1dist), np.max(l1dist), np.min(l1dist)) # print('normal mnist model acc:', np.mean(orig_labs==labels)) # print('advs mnist model acc:', np.mean(new_labs==labels)) # print('advs acc:', new_labs[orig_labs==labels] != labels[orig_labs==labels]) np.save('advs/'+conf[:-5].split('/')[-1]+'_'+str(target)+'_ead_show.npy', x_adv) # x_adv = self.attack.generate_np(x_val, max_iterations=100, # binary_search_steps=3, # initial_const=1, # clip_min=-5, clip_max=5,
def get_appropriate_attack(dataset, clip_range, attack_name, model, session,
                           harden, attack_type):
    """Build a CleverHans attack object with per-dataset tuned parameters.

    :param dataset: one of "mnist", "svhn", "cifar10".
    :param clip_range: (clip_min, clip_max) tuple for valid pixel values.
    :param attack_name: attack key dispatched below.
    :param model: CleverHans-compatible model to attack.
    :param session: TensorFlow session passed to the attack constructor.
    :param harden: accepted but unused in this function — TODO confirm caller
        contract.
    :param attack_type: "black" selects stronger settings for black-box use.
    :return: tuple (attack_object, attack_params dict).
    :raises ValueError: for unknown datasets or attacks.
    """
    # Check if valid dataset specified
    if dataset not in ["mnist", "svhn", "cifar10"]:
        raise ValueError('Mentioned dataset not implemented')
    attack_object = None
    attack_params = {'clip_min': clip_range[0], 'clip_max': clip_range[1]}

    if attack_name == "momentum":
        attack_object = MomentumIterativeMethod(model, sess=session)
        attack_params['eps'], attack_params['eps_iter'], attack_params[
            'nb_iter'] = 0.3, 0.06, 3
    elif attack_name == "fgsm":
        attack_object = FastGradientMethod(model, sess=session)
        # NOTE(review): nesting reconstructed from a flattened source — the
        # black-box adjustment below is redundant (same eps); verify intent.
        if dataset == "mnist":
            attack_params['eps'] = 0.3
            if attack_type == "black":
                attack_params['eps'] = 0.3
        else:
            attack_params['eps'] = 0.1
    elif attack_name == "elastic":
        attack_object = ElasticNetMethod(model, sess=session)
        # Defaults, then dataset- and attack-type-specific overrides.
        attack_params['binary_search_steps'], attack_params[
            'max_iterations'], attack_params['beta'] = 1, 5, 1e-2
        attack_params['initial_const'], attack_params[
            'learning_rate'] = 1e-1, 1e-1
        if dataset == "svhn":
            attack_params['initial_const'], attack_params[
                'learning_rate'] = 3e-1, 2e-1
            # NOTE(review): whether this black-box override belongs inside
            # the svhn branch is an assumption from the flattened layout.
            if attack_type == "black":
                attack_params['max_iterations'], attack_params[
                    'binary_search_steps'] = 8, 2
        if dataset == "mnist":
            attack_params['learning_rate'], attack_params[
                'initial_const'] = 1e-1, 1e-3
            attack_params['binary_search_steps'], attack_params[
                'max_iterations'] = 4, 8
            if attack_type == "black":
                attack_params["max_iterations"], attack_params[
                    'binary_search_steps'] = 12, 5
    elif attack_name == "virtual":
        attack_object = VirtualAdversarialMethod(model, sess=session)
        attack_params['xi'] = 1e-6
        attack_params['num_iterations'], attack_params['eps'] = 1, 2.0
        if attack_type == "black":
            attack_params['num_iterations'] = 3
            attack_params['xi'], attack_params['eps'] = 1e-4, 3.0
        if dataset == "mnist":
            attack_params['num_iterations'] = 6
            attack_params['xi'], attack_params['eps'] = 1e0, 5.0
            if attack_type == "black":
                attack_params['num_iterations'], attack_params['eps'] = 10, 8.0
    elif attack_name == "madry":
        attack_object = MadryEtAl(model, sess=session)
        attack_params['nb_iter'], attack_params['eps'] = 5, 0.1
        if dataset == "mnist":
            attack_params['eps'], attack_params['nb_iter'] = 0.3, 15
            if attack_type == "black":
                attack_params['nb_iter'] = 20
    elif attack_name == "jsma":
        attack_object = SaliencyMapMethod(model, sess=session)
        attack_params['gamma'], attack_params['theta'] = 0.1, 1.0
    elif attack_name == "carlini":
        # Only cifar10 gets explicit CW tuning; other datasets use the
        # attack's defaults.
        if dataset == "cifar10":
            attack_params["confidence"], attack_params[
                "max_iterations"] = 0.0, 100
            attack_params["binary_search_steps"], attack_params[
                "abort_early"] = 20, False
            attack_params["initial_const"] = 1e-4
        attack_object = CarliniWagnerL2(model, sess=session)
    else:
        raise ValueError('Mentioned attack not implemented')

    print(attack_name, ":", attack_params)
    return attack_object, attack_params
attack = FastGradientMethod(model=model, sess=sess) if attackMethod == "LBFGS": print ("Using LBFGS attack method!") attack = LBFGS(model=model, sess=sess) if attackMethod == "CarliniWagnerL2": print ("Using Carlini and Wagner attack method!") attack = CarliniWagnerL2(model=model, sess=sess) if attackMethod == "SPSA": print ("Using SPSA attack method!") attack = SPSA(model=model, sess=sess) if attackMethod == "MadryEtAl": print ("Using Madry et al. attack method!") attack = MadryEtAl(model=model, sess=sess) if attackMethod == "ElasticNet": print ("Using Elastic Net attack method!") attack = ElasticNetMethod(model=model, sess=sess) if attackMethod == "DeepFool": print ("Using Deep Fool attack method!") attack = DeepFool(model=model, sess=sess) if attackMethod == "MomentumIterative": print ("Using Momentum Iterative attack method!") attack = MomentumIterativeMethod(model=model, sess=sess) if attackMethod == "BasicIterative": print ("Using Basic Iterative attack method!") attack = BasicIterativeMethod(model=model, sess=sess) if attackMethod == "SaliencyMap": print ("Using Saliency Map attack method!") attack = SaliencyMapMethod(model=model, sess=sess) if attackMethod == "SPSA": adversarialOp = attack.generate(x=xPlaceholder, y=yPlaceholder, epsilon=Cfg.epsilon * 5.0, num_steps=Cfg.attackIterations)
def setUp(self):
    """Prepare each test with a fresh session, model and EAD attack."""
    super(TestElasticNetMethod, self).setUp()

    # Build the fixtures, then publish them on the test instance.
    session = tf.Session()
    model = SimpleModel()

    self.sess = session
    self.model = model
    self.attack = ElasticNetMethod(model, sess=session)
class TestElasticNetMethod(CleverHansTest):
    """Unit tests for the ElasticNet (EAD) attack on a tiny 2-class model."""

    def setUp(self):
        super(TestElasticNetMethod, self).setUp()
        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = ElasticNetMethod(self.model, sess=self.sess)

    def test_generate_np_untargeted_gives_adversarial_example(self):
        # Untargeted attack should flip almost every prediction.
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, max_iterations=100,
                                        binary_search_steps=3,
                                        initial_const=1,
                                        clip_min=-5, clip_max=5,
                                        batch_size=10)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_generate_np_targeted_gives_adversarial_example(self):
        # Targeted attack should reach the requested class >90% of the time.
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        feed_labs = np.zeros((100, 2))
        # NOTE(review): randint(0, 1, 100) always yields 0, so every target
        # is class 0 — confirm whether randint(0, 2, 100) was intended.
        feed_labs[np.arange(100), np.random.randint(0, 1, 100)] = 1
        x_adv = self.attack.generate_np(x_val, max_iterations=100,
                                        binary_search_steps=3,
                                        initial_const=1,
                                        clip_min=-5, clip_max=5,
                                        batch_size=100, y_target=feed_labs)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(np.mean(np.argmax(feed_labs, axis=1) == new_labs) >
                        0.9)

    def test_generate_gives_adversarial_example(self):
        # Same as the untargeted test, but via the symbolic generate() API.
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        feed_labs = np.zeros((100, 2))
        feed_labs[np.arange(100), orig_labs] = 1
        x = tf.placeholder(tf.float32, x_val.shape)
        y = tf.placeholder(tf.float32, feed_labs.shape)

        x_adv_p = self.attack.generate(x, max_iterations=100,
                                       binary_search_steps=3,
                                       initial_const=1,
                                       clip_min=-5, clip_max=5,
                                       batch_size=100, y=y)
        x_adv = self.sess.run(x_adv_p, {x: x_val, y: feed_labs})

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_generate_np_gives_clipped_adversarial_examples(self):
        # Adversarial examples must respect clip_min/clip_max (small slack
        # for float rounding).
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, max_iterations=10,
                                        binary_search_steps=1,
                                        learning_rate=1e-3,
                                        initial_const=1,
                                        clip_min=-0.2, clip_max=0.3,
                                        batch_size=100)

        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)

    def test_generate_np_high_confidence_targeted_examples(self):
        # NOTE(review): this test (and the one below) instantiates
        # CarliniWagnerL2 inside the ElasticNet test class — presumably
        # copied from the CW tests; confirm whether ElasticNetMethod was
        # intended here.
        trivial_model = TrivialModel()

        for CONFIDENCE in [0, 2.3]:
            x_val = np.random.rand(10, 1) - .5
            x_val = np.array(x_val, dtype=np.float32)

            feed_labs = np.zeros((10, 2))
            feed_labs[np.arange(10), np.random.randint(0, 2, 10)] = 1
            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            x_adv = attack.generate_np(x_val,
                                       max_iterations=100,
                                       binary_search_steps=2,
                                       learning_rate=1e-2,
                                       initial_const=1,
                                       clip_min=-10, clip_max=10,
                                       confidence=CONFIDENCE,
                                       y_target=feed_labs,
                                       batch_size=10)

            new_labs = self.sess.run(trivial_model.get_logits(x_adv))

            good_labs = new_labs[np.arange(10), np.argmax(feed_labs, axis=1)]
            bad_labs = new_labs[np.arange(
                10), 1 - np.argmax(feed_labs, axis=1)]

            # The target logit should beat the other logit by ~CONFIDENCE.
            self.assertTrue(np.isclose(
                0, np.min(good_labs - (bad_labs + CONFIDENCE)), atol=1e-1))
            self.assertTrue(np.mean(np.argmax(new_labs, axis=1) ==
                                    np.argmax(feed_labs, axis=1)) > .9)

    def test_generate_np_high_confidence_untargeted_examples(self):
        trivial_model = TrivialModel()

        for CONFIDENCE in [0, 2.3]:
            x_val = np.random.rand(10, 1) - .5
            x_val = np.array(x_val, dtype=np.float32)

            orig_labs = np.argmax(self.sess.run(trivial_model.get_logits(x_val)),
                                  axis=1)
            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            x_adv = attack.generate_np(x_val,
                                       max_iterations=100,
                                       binary_search_steps=2,
                                       learning_rate=1e-2,
                                       initial_const=1,
                                       clip_min=-10, clip_max=10,
                                       confidence=CONFIDENCE,
                                       batch_size=10)

            new_labs = self.sess.run(trivial_model.get_logits(x_adv))

            # With only two classes, "good" is the non-original class.
            good_labs = new_labs[np.arange(10), 1 - orig_labs]
            bad_labs = new_labs[np.arange(10), orig_labs]

            self.assertTrue(np.mean(np.argmax(new_labs, axis=1) == orig_labs)
                            == 0)
            self.assertTrue(np.isclose(
                0, np.min(good_labs - (bad_labs + CONFIDENCE)), atol=1e-1))
def JSMA_FGSM_BIM(train_start=0, train_end=60000, test_start=0,
                  test_end=10000, nb_epochs=6, batch_size=128,
                  learning_rate=0.001,
                  clean_train=True,
                  testing=False,
                  backprop_through_attack=False,
                  nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param clean_train: if true, train on clean examples
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    # NOTE(review): within this visible span `report` is populated but never
    # returned — confirm a trailing `return report` exists downstream.
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    source_samples = batch_size

    # Use label smoothing
    # Hopefully this doesn't screw up JSMA...
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_par = {'batch_size': batch_size}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        print("#####Starting attacks on clean model#####")
        #################################################################
        # Clean test against JSMA
        jsma_params = {'theta': 1., 'gamma': 0.1,
                       'clip_min': 0., 'clip_max': 1.,
                       'y_target': None}

        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        adv_x = jsma.generate(x, **jsma_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc)

        ################################################################
        # Clean test against FGSM
        fgsm_params = {'eps': 0.3,
                       'clip_min': 0.,
                       'clip_max': 1.}

        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc)

        ################################################################
        # Clean test against BIM
        bim_params = {'eps': 0.3,
                      'eps_iter': 0.01,
                      'nb_iter': 100,
                      'clip_min': 0.,
                      'clip_max': 1.}
        bim = BasicIterativeMethod(model, sess=sess)
        adv_x = bim.generate(x, **bim_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc)

        ################################################################
        # Clean test against EN
        en_params = {'binary_search_steps': 1,
                     # 'y': None,
                     'max_iterations': 100,
                     'learning_rate': 0.1,
                     'batch_size': source_samples,
                     'initial_const': 10}
        en = ElasticNetMethod(model, back='tf', sess=sess)
        adv_x = en.generate(x, **en_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Clean test accuracy on EN adversarial examples: %0.4f' % acc)

        ################################################################
        # Clean test against DF
        deepfool_params = {'nb_candidate': 10,
                           'overshoot': 0.02,
                           'max_iter': 50,
                           'clip_min': 0.,
                           'clip_max': 1.}
        deepfool = DeepFool(model, sess=sess)
        adv_x = deepfool.generate(x, **deepfool_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Clean test accuracy on DF adversarial examples: %0.4f' % acc)

        ################################################################
        # Clean test against VAT
        vat_params = {'eps': 2.0,
                      'num_iterations': 1,
                      'xi': 1e-6,
                      'clip_min': 0.,
                      'clip_max': 1.}
        vat = VirtualAdversarialMethod(model, sess=sess)
        adv_x = vat.generate(x, **vat_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc)

        ################################################################
        print("Repeating the process, using adversarial training\n")

        # Redefine TF model graph
        model_2 = make_basic_cnn(nb_filters=nb_filters)
        preds_2 = model_2(x)

        #################################################################
        # Adversarial test against JSMA
        # NOTE(review): these adversarial-training graphs are built against
        # `model` (the clean model), not `model_2` — confirm that is intended.
        jsma_params = {'theta': 1., 'gamma': 0.1,
                       'clip_min': 0., 'clip_max': 1.,
                       'y_target': None}

        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        adv_x = jsma.generate(x, **jsma_params)
        preds_adv_jsma = model.get_probs(adv_x)

        ################################################################
        # Adversarial test against FGSM
        fgsm_params = {'eps': 0.3,
                       'clip_min': 0.,
                       'clip_max': 1.}

        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv_fgsm = model.get_probs(adv_x)

        ################################################################
        # Adversarial test against BIM
        bim_params = {'eps': 0.3,
                      'eps_iter': 0.01,
                      'nb_iter': 100,
                      'clip_min': 0.,
                      'clip_max': 1.}
        bim = BasicIterativeMethod(model, sess=sess)
        adv_x = bim.generate(x, **bim_params)
        preds_adv_bim = model.get_probs(adv_x)

        ################################################################
        # Adversarial test against EN
        en_params = {'binary_search_steps': 5,
                     # 'y': None,
                     'max_iterations': 100,
                     'learning_rate': 0.1,
                     'batch_size': source_samples,
                     'initial_const': 10}
        en = ElasticNetMethod(model, back='tf', sess=sess)
        adv_x = en.generate(x, **en_params)
        preds_adv_en = model.get_probs(adv_x)

        ################################################################
        # Adversarial test against DF
        deepfool_params = {'nb_candidate': 10,
                           'overshoot': 0.02,
                           'max_iter': 200,
                           'clip_min': 0.,
                           'clip_max': 1.}
        deepfool = DeepFool(model, sess=sess)
        adv_x = deepfool.generate(x, **deepfool_params)
        preds_adv_df = model.get_probs(adv_x)

        ################################################################
        # Adversarial test against VAT
        vat_params = {'eps': 2.0,
                      'num_iterations': 1,
                      'xi': 1e-6,
                      'clip_min': 0.,
                      'clip_max': 1.}
        vat = VirtualAdversarialMethod(model, sess=sess)
        adv_x = vat.generate(x, **vat_params)
        preds_adv_vat = model.get_probs(adv_x)

        ################################################################
        print("#####Evaluate trained model#####")

        def evaluate_2():
            # Evaluate the accuracy of the MNIST model on JSMA adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on JSMA adversarial examples: %0.4f' % acc)

            # Evaluate the accuracy of the MNIST model on FGSM adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on FGSM adversarial examples: %0.4f' % acc)

            # Evaluate the accuracy of the MNIST model on BIM adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on BIM adversarial examples: %0.4f' % acc)

            # Evaluate the accuracy of the MNIST model on EN adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on EN adversarial examples: %0.4f' % acc)

            # Evaluate the accuracy of the MNIST model on DF adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on DF adversarial examples: %0.4f' % acc)

            # Evaluate the accuracy of the MNIST model on VAT adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on VAT adversarial examples: %0.4f\n' % acc)

        # Only the first three attacks are used as adversarial-training
        # signal; EN and DF remain commented out.
        preds_2_adv = [
            preds_adv_jsma,
            preds_adv_fgsm,
            preds_adv_bim
            # ,preds_adv_en
            # ,preds_adv_df
        ]

        model_train(sess, x, y, preds_2, X_train, Y_train,
                    predictions_adv=preds_2_adv,
                    evaluate=evaluate_2, args=train_params, rng=rng)
class TestElasticNetMethod(CleverHansTest):
    """Tests for the Elastic Net (EAD) attack on a tiny fixed-weight net."""

    def setUp(self):
        super(TestElasticNetMethod, self).setUp()
        import tensorflow as tf

        # A minimal two-layer network with hand-picked constant weights:
        # 2-unit sigmoid hidden layer feeding a 2-way linear output.
        def my_model(x):
            hidden_w = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
            hidden = tf.nn.sigmoid(tf.matmul(x, hidden_w))
            out_w = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
            return tf.matmul(hidden, out_w)

        self.sess = tf.Session()
        self.model = my_model
        self.attack = ElasticNetMethod(self.model, sess=self.sess)

    def test_generate_np_untargeted_gives_adversarial_example(self):
        # Untargeted attack should flip the label on nearly every input.
        samples = np.random.rand(100, 2).astype(np.float32)

        attack_args = dict(max_iterations=100,
                           binary_search_steps=3,
                           initial_const=1,
                           clip_min=-5, clip_max=5,
                           batch_size=10)
        perturbed = self.attack.generate_np(samples, **attack_args)

        labels_before = np.argmax(self.sess.run(self.model(samples)), axis=1)
        labels_after = np.argmax(self.sess.run(self.model(perturbed)), axis=1)
        self.assertTrue(np.mean(labels_before == labels_after) < 0.1)

    def test_generate_np_targeted_gives_adversarial_example(self):
        # Targeted attack should reach the requested class almost always.
        samples = np.random.rand(100, 2).astype(np.float32)

        targets = np.zeros((100, 2))
        targets[np.arange(100), np.random.randint(0, 1, 100)] = 1

        perturbed = self.attack.generate_np(samples,
                                            max_iterations=100,
                                            binary_search_steps=3,
                                            initial_const=1,
                                            clip_min=-5, clip_max=5,
                                            batch_size=100,
                                            y_target=targets)

        labels_after = np.argmax(self.sess.run(self.model(perturbed)), axis=1)
        hit_rate = np.mean(np.argmax(targets, axis=1) == labels_after)
        self.assertTrue(hit_rate > 0.9)

    def test_generate_gives_adversarial_example(self):
        # Same as the untargeted test, but via the symbolic generate() API
        # with explicit placeholders and labels supplied through `y`.
        import tensorflow as tf

        samples = np.random.rand(100, 2).astype(np.float32)
        labels_before = np.argmax(self.sess.run(self.model(samples)), axis=1)
        one_hot = np.zeros((100, 2))
        one_hot[np.arange(100), labels_before] = 1

        x_ph = tf.placeholder(tf.float32, samples.shape)
        y_ph = tf.placeholder(tf.float32, one_hot.shape)
        adv_op = self.attack.generate(x_ph,
                                      max_iterations=100,
                                      binary_search_steps=3,
                                      initial_const=1,
                                      clip_min=-5, clip_max=5,
                                      batch_size=100, y=y_ph)
        perturbed = self.sess.run(adv_op, {x_ph: samples, y_ph: one_hot})

        labels_after = np.argmax(self.sess.run(self.model(perturbed)), axis=1)
        self.assertTrue(np.mean(labels_before == labels_after) < 0.1)

    def test_generate_np_gives_clipped_adversarial_examples(self):
        # Results must respect clip_min/clip_max (small tolerance allowed).
        samples = np.random.rand(100, 2).astype(np.float32)

        perturbed = self.attack.generate_np(samples,
                                            max_iterations=10,
                                            binary_search_steps=1,
                                            learning_rate=1e-3,
                                            initial_const=1,
                                            clip_min=-0.2, clip_max=0.3,
                                            batch_size=100)

        self.assertTrue(-0.201 < np.min(perturbed))
        self.assertTrue(np.max(perturbed) < .301)

    def test_generate_np_high_confidence_targeted_examples(self):
        import tensorflow as tf

        # One-feature linear model: the two logits are x and -x.
        def trivial_model(x):
            weights = tf.constant([[1, -1]], dtype=tf.float32)
            return tf.matmul(x, weights)

        for confidence in [0, 2.3]:
            samples = (np.random.rand(10, 1) - .5).astype(np.float32)
            targets = np.zeros((10, 2))
            targets[np.arange(10), np.random.randint(0, 2, 10)] = 1

            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            perturbed = attack.generate_np(samples,
                                           max_iterations=100,
                                           binary_search_steps=2,
                                           learning_rate=1e-2,
                                           initial_const=1,
                                           clip_min=-10, clip_max=10,
                                           confidence=confidence,
                                           y_target=targets,
                                           batch_size=10)

            logits = self.sess.run(trivial_model(perturbed))
            target_idx = np.argmax(targets, axis=1)
            good_labs = logits[np.arange(10), target_idx]
            bad_labs = logits[np.arange(10), 1 - target_idx]

            # Target logit should clear the other one by ~confidence,
            # and nearly all predictions should match the target class.
            self.assertTrue(np.isclose(
                0, np.min(good_labs - (bad_labs + confidence)), atol=1e-1))
            self.assertTrue(
                np.mean(np.argmax(logits, axis=1) == target_idx) > .9)

    def test_generate_np_high_confidence_untargeted_examples(self):
        import tensorflow as tf

        # One-feature linear model: the two logits are x and -x.
        def trivial_model(x):
            weights = tf.constant([[1, -1]], dtype=tf.float32)
            return tf.matmul(x, weights)

        for confidence in [0, 2.3]:
            samples = (np.random.rand(10, 1) - .5).astype(np.float32)
            labels_before = np.argmax(
                self.sess.run(trivial_model(samples)), axis=1)

            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            perturbed = attack.generate_np(samples,
                                           max_iterations=100,
                                           binary_search_steps=2,
                                           learning_rate=1e-2,
                                           initial_const=1,
                                           clip_min=-10, clip_max=10,
                                           confidence=confidence,
                                           batch_size=10)

            logits = self.sess.run(trivial_model(perturbed))
            good_labs = logits[np.arange(10), 1 - labels_before]
            bad_labs = logits[np.arange(10), labels_before]

            # Every prediction must have flipped, with the new class's
            # logit ahead of the old one by ~confidence.
            self.assertTrue(
                np.mean(np.argmax(logits, axis=1) == labels_before) == 0)
            self.assertTrue(np.isclose(
                0, np.min(good_labs - (bad_labs + confidence)), atol=1e-1))
def mnist_tutorial_jsma(train_start=0, train_end=5500, test_start=0,
                        test_end=1000, nb_epochs=8,
                        batch_size=100, nb_classes=10,
                        nb_filters=64,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA).

    Trains a baseline CNN on MNIST, measures its clean accuracy and its
    accuracy against several attacks (JSMA, FGSM, BIM, Elastic Net,
    DeepFool, VAT), then builds a JSMA-augmented training set (each
    training sample plus one targeted adversarial example per wrong
    class) and adversarially trains a second CNN on it, logging
    evaluations against all attacks after each epoch.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param nb_filters: number of convolutional filters (not used in this
        body; make_basic_cnn() is called without it — TODO confirm)
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Label smoothing: soften the one-hot training targets.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    # sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    print("x_train shape: ", X_train.shape)
    print("y_train shape: ", Y_train.shape)

    # do not log (verbose=False suppresses per-batch output)
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                verbose=False, rng=rng)

    # Log file for the clean (non-adversarially-trained) model's results.
    f_out_clean = open("Clean_jsma_elastic_against5.log", "w")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    f_out_clean.write('Test accuracy on legitimate test examples: ' +
                      str(accuracy) + '\n')

    # Clean test against JSMA
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    adv_x_jsma = jsma.generate(x, **jsma_params)
    preds_adv_jsma = model.get_probs(adv_x_jsma)

    # Evaluate the accuracy of the MNIST model on JSMA adversarial examples
    acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on JSMA adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against FGSM
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}

    fgsm = FastGradientMethod(model, sess=sess)
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    preds_adv_fgsm = model.get_probs(adv_x_fgsm)

    # Evaluate the accuracy of the MNIST model on FGSM adversarial examples
    acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on FGSM adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against BIM
    bim_params = {'eps': 0.3,
                  'eps_iter': 0.01,
                  'nb_iter': 100,
                  'clip_min': 0.,
                  'clip_max': 1.}
    bim = BasicIterativeMethod(model, sess=sess)
    adv_x_bim = bim.generate(x, **bim_params)
    preds_adv_bim = model.get_probs(adv_x_bim)

    # Evaluate the accuracy of the MNIST model on BIM adversarial examples
    acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on BIM adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against EN (Elastic Net)
    en_params = {'binary_search_steps': 1,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': batch_size,
                 'initial_const': 10}
    en = ElasticNetMethod(model, back='tf', sess=sess)
    adv_x_en = en.generate(x, **en_params)
    preds_adv_en = model.get_probs(adv_x_en)

    # Evaluate the accuracy of the MNIST model on EN adversarial examples
    acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on EN adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on EN adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against DF (DeepFool)
    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': 50,
                       'clip_min': 0.,
                       'clip_max': 1.}
    deepfool = DeepFool(model, sess=sess)
    adv_x_df = deepfool.generate(x, **deepfool_params)
    preds_adv_df = model.get_probs(adv_x_df)

    # Evaluate the accuracy of the MNIST model on DF adversarial examples
    acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on DF adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on DF adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against VAT
    vat_params = {'eps': 2.0,
                  'num_iterations': 1,
                  'xi': 1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.}
    vat = VirtualAdversarialMethod(model, sess=sess)
    adv_x_vat = vat.generate(x, **vat_params)
    preds_adv_vat = model.get_probs(adv_x_vat)

    # Evaluate the accuracy of the MNIST model on VAT adversarial examples
    acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc)
    f_out_clean.write('Clean test accuracy on VAT adversarial examples: ' +
                      str(acc) + '\n')

    f_out_clean.close()

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(X_train.shape[0]) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Second model, to be trained adversarially below. All attacks from
    # here on are constructed against model_2.
    model_2 = make_basic_cnn()
    # NOTE(review): this uses `model` (the first CNN), not `model_2` —
    # looks suspicious but the inline comment suggests it is deliberate;
    # confirm intent before changing.
    preds_2 = model(x)  # need this for constructing the array
    sess.run(tf.global_variables_initializer())  # run this again
    # sess.run(tf.global_variables_initializer())

    # 1. Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model_2, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
    adv_random = jsma.generate(x, **jsma_params)
    preds_adv_random = model_2.get_probs(adv_random)

    # 2. Instantiate FGSM attack
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_2, sess=sess)
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    preds_adv_fgsm = model_2.get_probs(adv_x_fgsm)

    # 3. Instantiate Elastic net attack
    en_params = {'binary_search_steps': 5,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': batch_size,
                 'initial_const': 10}
    enet = ElasticNetMethod(model_2, sess=sess)
    adv_x_en = enet.generate(x, **en_params)
    preds_adv_elastic_net = model_2.get_probs(adv_x_en)

    # 4. Deepfool
    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': 50,
                       'clip_min': 0.,
                       'clip_max': 1.}
    deepfool = DeepFool(model_2, sess=sess)
    adv_x_df = deepfool.generate(x, **deepfool_params)
    preds_adv_deepfool = model_2.get_probs(adv_x_df)

    # 5. Base Iterative
    bim_params = {'eps': 0.3,
                  'eps_iter': 0.01,
                  'nb_iter': 100,
                  'clip_min': 0.,
                  'clip_max': 1.}
    base_iter = BasicIterativeMethod(model_2, sess=sess)
    adv_x_bi = base_iter.generate(x, **bim_params)
    preds_adv_base_iter = model_2.get_probs(adv_x_bi)

    # 6. C & W Attack
    cw = CarliniWagnerL2(model_2, back='tf', sess=sess)
    cw_params = {'binary_search_steps': 1,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': batch_size,
                 'initial_const': 10}
    adv_x_cw = cw.generate(x, **cw_params)
    preds_adv_cw = model_2.get_probs(adv_x_cw)

    # 7. Virtual Adversarial Training perturbation
    vat_params = {'eps': 2.0,
                  'num_iterations': 1,
                  'xi': 1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.}
    vat = VirtualAdversarialMethod(model_2, sess=sess)
    adv_x = vat.generate(x, **vat_params)
    preds_adv_vat = model_2.get_probs(adv_x)

    # ==> generate targeted adversarial examples for every training sample
    # This call runs the Jacobian-based saliency map approach
    # Loop over the samples we want to perturb into adversarial examples
    X_train_adv_set = []
    Y_train_adv_set = []
    for index in range(X_train.shape[0]):
        print('--------------------------------------')
        x_val = X_train[index:(index+1)]
        y_val = Y_train[index]

        # add the unmodified (normal) sample to the augmented set as well
        X_train_adv_set.append(x_val)
        Y_train_adv_set.append(y_val)

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the dataset label)
        current_class = int(np.argmax(y_val))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            # print('Generating adv. example for target class %i' % target)
            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(x_val, **jsma_params)

            # append to X_train_adv_set and Y_train_adv_set; the original
            # (clean) label is kept for the adversarial sample
            X_train_adv_set.append(adv_x)
            Y_train_adv_set.append(y_val)

            # shape is: (1, 28, 28, 1)
            # print("adv_x shape is: ", adv_x.shape)

            # check for success rate
            # res = int(model_argmax(sess, x, preds, adv_x) == target)

    print('-------------Finished Generating Np Adversarial Data-------------------------')

    X_train_data = np.concatenate(X_train_adv_set, axis=0)
    Y_train_data = np.stack(Y_train_adv_set, axis=0)
    print("X_train_data shape is: ", X_train_data.shape)
    print("Y_train_data shape is: ", Y_train_data.shape)

    # saves the output so later there is no need to re-run this file
    np.savez("jsma_training_data.npz", x_train=X_train_data,
             y_train=Y_train_data)
    # >>> data = np.load('/tmp/123.npz')
    # >>> data['a']

    # Log file for the adversarially-trained model's results.
    f_out = open("Adversarial_jsma_elastic_against5.log", "w")

    # Per-epoch evaluation callback: measures accuracy on clean data and
    # on each attack's adversarial examples, logging results to f_out.
    def evaluate_against_all():
        # 1 Clean Data
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                              args=eval_params)
        print('Legitimate accuracy: %0.4f' % accuracy)

        tmp = 'Legitimate accuracy: ' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 2 JSMA
        accuracy = model_eval(sess, x, y, preds_adv_random, X_test,
                              Y_test, args=eval_params)
        print('JSMA accuracy: %0.4f' % accuracy)
        tmp = 'JSMA accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 3 FGSM
        accuracy = model_eval(sess, x, y, preds_adv_fgsm, X_test,
                              Y_test, args=eval_params)
        print('FGSM accuracy: %0.4f' % accuracy)
        tmp = 'FGSM accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 4 Base Iterative
        accuracy = model_eval(sess, x, y, preds_adv_base_iter, X_test,
                              Y_test, args=eval_params)
        print('Base Iterative accuracy: %0.4f' % accuracy)
        tmp = 'Base Iterative accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 5 Elastic Net
        accuracy = model_eval(sess, x, y, preds_adv_elastic_net, X_test,
                              Y_test, args=eval_params)
        print('Elastic Net accuracy: %0.4f' % accuracy)
        tmp = 'Elastic Net accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 6 DeepFool
        accuracy = model_eval(sess, x, y, preds_adv_deepfool, X_test,
                              Y_test, args=eval_params)
        print('DeepFool accuracy: %0.4f' % accuracy)
        tmp = 'DeepFool accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 7 C & W Attack
        accuracy = model_eval(sess, x, y, preds_adv_cw, X_test,
                              Y_test, args=eval_params)
        print('C & W accuracy: %0.4f' % accuracy)
        tmp = 'C & W accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)
        f_out.write("*******End of Epoch***********\n\n")

        # 8 Virtual Adversarial
        accuracy = model_eval(sess, x, y, preds_adv_vat, X_test,
                              Y_test, args=eval_params)
        print('VAT accuracy: %0.4f' % accuracy)
        tmp = 'VAT accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)
        f_out.write("*******End of Epoch***********\n\n")
        print("*******End of Epoch***********\n\n")

        # report.adv_train_adv_eval = accuracy

    print("Now Adversarial Training with Elastic Net + modified X_train and Y_train")
    # trained_model.out
    # NOTE(review): hard-coded absolute train_dir path — will only work on
    # the original author's machine; consider parameterizing.
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': '/home/stephen/PycharmProjects/jsma-runall-mac/',
        'filename': 'trained_model.out'
    }
    # Adversarial training on the JSMA-augmented set, with the Elastic Net
    # adversarial predictions included in the training loss.
    model_train(sess, x, y, preds_2, X_train_data, Y_train_data,
                predictions_adv=preds_adv_elastic_net,
                evaluate=evaluate_against_all, verbose=False,
                args=train_params, rng=rng)

    # Close TF session
    sess.close()
    return report
} elif attack_method == 'PGD' and order == 2: op = ProjectedGradientDescent(cleverhans_model, sess=sess) params = { 'eps': eps, 'eps_iter': eps_iter, 'nb_iter': nb_iter, 'ord': 2, 'clip_max': 1., 'clip_min': 0 } elif attack_method == 'JSMA': op = SaliencyMapMethod(cleverhans_model, sess=sess) params = {'gamma': eps} elif attack_method == 'EAD': op = ElasticNetMethod(cleverhans_model, sess=sess) params = {'confidence': eps, 'abort_early': True, 'max_iterations': 100} elif attack_method == 'CW': op = CarliniWagnerL2(cleverhans_model, sess=sess) params = {'confidence': eps} x_test = x_test[eps_iter:eps_iter + decay_factor] y_test = y_test[eps_iter:eps_iter + decay_factor] # generate adversarial examples adv_x_op = op.generate(x_op, **params) y_test = to_categorical(y_test) # Run an evaluation of our model against fgsm total = 0 correct = 0 advs = []