def main(_): # Images for inception classifier are normalized to be in [-1, 1] interval, # eps is a difference between pixels so it should be in [0, 2] interval. # Renormalizing epsilon from [0, 255] to [0, 2]. eps = 2.0 * FLAGS.max_epsilon / 255.0 batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3] num_classes = 1001 tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): # Prepare graph x_input = tf.placeholder(tf.float32, shape=batch_shape) model = InceptionModel(num_classes) salmap = SaliencyMapMethod(model) noisy_images = x_input + 0.05 * tf.sign(tf.random_normal(batch_shape)) x_adv = salmap.generate(noisy_images, clip_min=-1., clip_max=1.) #x_adv = x_input + tf.clip_by_value(x_adv - x_input, -eps, eps) # Run computation saver = tf.train.Saver(slim.get_model_variables()) session_creator = tf.train.ChiefSessionCreator( scaffold=tf.train.Scaffold(saver=saver), checkpoint_filename_with_path=FLAGS.checkpoint_path, master=FLAGS.master) with tf.train.MonitoredSession( session_creator=session_creator) as sess: for filenames, images in load_images(FLAGS.input_dir, batch_shape): adv_images = sess.run(x_adv, feed_dict={x_input: images}) save_images(adv_images, filenames, FLAGS.output_dir)
def main(_): # Images for inception classifier are normalized to be in [-1, 1] interval, # eps is a difference between pixels so it should be in [0, 2] interval. # Renormalizing epsilon from [0, 255] to [0, 2]. eps = 2.0 * FLAGS.max_epsilon / 255.0 batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3] num_classes = 1001 BUILD_MODEL = True from cleverhans.attacks_tf import jacobian_graph, jsma_batch tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default() as d_graph: # Prepare graph x_input = tf.placeholder(tf.float32, shape=batch_shape) model = InceptionModel(num_classes) preds = model(x_input) #grads = jacobian_graph(preds, x_input, num_classes) saver = tf.train.Saver(slim.get_model_variables()) # Run computation with tf.Session() as sess: #print("Session is closed:",sess._is_closed()) saver.restore(sess, FLAGS.checkpoint_path) salmap = SaliencyMapMethod(model, sess = sess) x_adv = salmap.generate(x_input, eps=eps, clip_min=-1., clip_max=1.) for filenames, images in load_images(FLAGS.input_dir, batch_shape): adv_images = sess.run(x_adv, feed_dict={x_input: images}) save_images(adv_images, filenames, FLAGS.output_dir)
class SaliencyMapAttack(AdversarialAttack): def __init__(self, model, theta=1.0, gamma=1.0, clip_min=-1.0, clip_max=1.0, targeted=False, symbolic_impl=True): super().__init__(model=model, clip_min=clip_min, clip_max=clip_max) self._targeted = targeted self._theta = theta self._gamma = gamma self._symbolic_impl = symbolic_impl with self.graph.as_default(): self._method = SaliencyMapMethod(self._model, sess=self.session, theta=self._theta, gamma=self._gamma, nb_classes=self._n_classes, clip_min=self._clip_min, clip_max=self._clip_max, symbolic_impl=self._symbolic_impl) def attack_method(self, labels): if self._targeted: return self._method.generate(x=self._x_clean, y_target=labels) return self._method.generate(x=self._x_clean)
def saliency_map_method(model, X, Y, theta, gamma, clip_min=None, clip_max=None): nb_classes = Y.shape[1] X_adv = np.zeros_like(X) # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': theta, 'gamma': gamma, 'clip_min': clip_min, 'clip_max': clip_max, 'y_target': None } for i in tqdm(range(len(X))): # Get the sample sample = X[i:(i + 1)] # First, record the current class of the sample current_class = int(np.argmax(Y[i])) # Randomly choose a target class target_class = np.random.choice( other_classes(nb_classes, current_class)) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target_class] = 1 jsma_params['y_target'] = one_hot_target X_adv[i] = jsma.generate_np(sample, **jsma_params) return X_adv
def generate_jsma_examples(sess, model, x, y, X, Y, attack_params, verbose, attack_log_fpath): """ Targeted attack, with target classes in Y. """ Y_target = Y nb_classes = Y.shape[1] jsma = SaliencyMapMethod(KerasModelWrapper(model), back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1.} adv_x_list = [] with click.progressbar(range(0, len(X)), file=sys.stderr, show_pos=True, width=40, bar_template=' [%(bar)s] JSMA Attacking %(info)s', fill_char='>', empty_char='-') as bar: # Loop over the samples we want to perturb into adversarial examples for sample_ind in bar: sample = X[sample_ind:(sample_ind + 1)] adv_x = jsma.generate_np(sample, **jsma_params) adv_x_list.append(adv_x) return np.vstack(adv_x_list)
def attack(model, x_input, input_img): wrap = KerasModelWrapper(model) jsma = SaliencyMapMethod(wrap, sess=sess) jsma_params = { 'theta': 0.1, 'gamma': 0.1, 'clip_min': -1., 'clip_max': 1., 'y_target': None } adv_x = jsma.generate_np(input_img, **jsma_params) return adv_x
def _get_or_create_jsma(sess, x, classifier, in_x, save_file): if not os.path.exists(save_file): jsma = SaliencyMapMethod(classifier) adv = jsma.generate(x, clip_min=0., clip_max=1., gamma=FLAGS.gamma) jsma_adv = np.zeros(in_x.shape) for i in range(in_x.shape[0] // 10000): jsma_adv[i * 10000:(i + 1) * 10000] = adv.eval( feed_dict={x: in_x[i * 10000:(i + 1) * 10000]}) print(f'done computing jsma axs for first {(i+1)*10000}') np.save(save_file, jsma_adv) else: return np.load(save_file)
def mnist_jsma_attack(sample, target, model, sess): jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 3, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma_params['y_target'] = target adv = jsma.generate_np(sample, **jsma_params) return adv
def setUp(self): super(TestSaliencyMapMethod, self).setUp() self.sess = tf.Session() self.sess.as_default() self.model = DummyModel() self.attack = SaliencyMapMethod(self.model, sess=self.sess) # initialize model with tf.name_scope('dummy_model'): self.model(tf.placeholder(tf.float32, shape=(None, 1000))) self.sess.run(tf.global_variables_initializer()) self.attack = SaliencyMapMethod(self.model, sess=self.sess)
def setUp(self): super(TestSaliencyMapMethod, self).setUp() self.sess = tf.Session() self.sess.as_default() self.model = DummyModel() self.attack = SaliencyMapMethod(self.model, sess=self.sess) # initialize model with tf.name_scope('dummy_model'): self.model(tf.placeholder(tf.float32, shape=(None, 1000))) self.sess.run(tf.global_variables_initializer()) self.attack = SaliencyMapMethod(self.model, sess=self.sess)
class TestSaliencyMapMethod(CleverHansTest): def setUp(self): super(TestSaliencyMapMethod, self).setUp() import tensorflow as tf import tensorflow.contrib.slim as slim def dummy_model(x): net = slim.fully_connected(x, 60) return slim.fully_connected(net, 10, activation_fn=None) self.sess = tf.Session() self.sess.as_default() self.model = tf.make_template('dummy_model', dummy_model) self.attack = SaliencyMapMethod(self.model, sess=self.sess) # initialize model with tf.name_scope('dummy_model'): self.model(tf.placeholder(tf.float32, shape=(None, 1000))) self.sess.run(tf.global_variables_initializer()) self.attack = SaliencyMapMethod(self.model, sess=self.sess) def test_generate_np_targeted_gives_adversarial_example(self): x_val = np.random.rand(10, 1000) x_val = np.array(x_val, dtype=np.float32) feed_labs = np.zeros((10, 10)) feed_labs[np.arange(10), np.random.randint(0, 9, 10)] = 1 x_adv = self.attack.generate_np(x_val, clip_min=-5., clip_max=5., y_target=feed_labs) new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1) worked = np.mean(np.argmax(feed_labs, axis=1) == new_labs) self.assertTrue(worked > .9)
class TestSaliencyMapMethod(CleverHansTest): def setUp(self): super(TestSaliencyMapMethod, self).setUp() self.sess = tf.Session() self.sess.as_default() self.model = DummyModel() self.attack = SaliencyMapMethod(self.model, sess=self.sess) # initialize model with tf.name_scope('dummy_model'): self.model(tf.placeholder(tf.float32, shape=(None, 1000))) self.sess.run(tf.global_variables_initializer()) self.attack = SaliencyMapMethod(self.model, sess=self.sess) def test_generate_np_targeted_gives_adversarial_example(self): x_val = np.random.rand(10, 1000) x_val = np.array(x_val, dtype=np.float32) feed_labs = np.zeros((10, 10)) feed_labs[np.arange(10), np.random.randint(0, 9, 10)] = 1 x_adv = self.attack.generate_np(x_val, clip_min=-5., clip_max=5., y_target=feed_labs) new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1) worked = np.mean(np.argmax(feed_labs, axis=1) == new_labs) self.assertTrue(worked > .9)
class TestSaliencyMapMethod(CleverHansTest): def setUp(self): super(TestSaliencyMapMethod, self).setUp() self.sess = tf.Session() self.sess.as_default() self.model = DummyModel() self.attack = SaliencyMapMethod(self.model, sess=self.sess) # initialize model with tf.name_scope('dummy_model'): self.model(tf.placeholder(tf.float32, shape=(None, 1000))) self.sess.run(tf.global_variables_initializer()) self.attack = SaliencyMapMethod(self.model, sess=self.sess) def test_generate_np_targeted_gives_adversarial_example(self): x_val = np.random.rand(10, 1000) x_val = np.array(x_val, dtype=np.float32) feed_labs = np.zeros((10, 10)) feed_labs[np.arange(10), np.random.randint(0, 9, 10)] = 1 x_adv = self.attack.generate_np(x_val, clip_min=-5., clip_max=5., y_target=feed_labs) new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1) worked = np.mean(np.argmax(feed_labs, axis=1) == new_labs) self.assertTrue(worked > .9)
def setUp(self): super(TestSaliencyMapMethod, self).setUp() import tensorflow as tf import tensorflow.contrib.slim as slim def dummy_model(x): net = slim.fully_connected(x, 60) return slim.fully_connected(net, 10, activation_fn=None) self.sess = tf.Session() self.sess.as_default() self.model = tf.make_template('dummy_model', dummy_model) self.attack = SaliencyMapMethod(self.model, sess=self.sess) # initialize model with tf.name_scope('dummy_model'): self.model(tf.placeholder(tf.float32, shape=(None, 1000))) self.sess.run(tf.global_variables_initializer()) self.attack = SaliencyMapMethod(self.model, sess=self.sess)
def adv_jsma_train(self, epoch=5000): print('training {} for epoch={}'.format(self.MODEL_NAME, epoch)) params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } x, y = self.cnn.make_inputs() probs = self.cnn.make_model(x) self.cnn.start_session() self.cnn.init_session_and_restore() jsma = SaliencyMapMethod(self.cnn, sess=self.cnn.sess) adv_x = jsma.generate(x, **params) adv_probs = self.cnn.make_model(adv_x) self.cnn.train(probs, x, y, epoch, self.dataset, adv_preds=adv_probs) # cnn.test(gtsrb) self.cnn.end_session()
def jsma_craft(surrogate_model, source_samples, target_info): import tensorflow as tf import numpy as np from dataset import Data from cleverhans.attacks import SaliencyMapMethod with tf.variable_scope(surrogate_model.scope): sess = tf.get_default_session() jsma = SaliencyMapMethod(surrogate_model.tf(), back='tf', sess=sess) one_hot_target = np.zeros((1, 10), dtype=np.float32) one_hot_target[0, target_info['target_class']] = 1 jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': one_hot_target } # Loop over the samples we want to perturb into adversarial examples adv_xs = [] for sample_ind in range(len(source_samples)): sample = source_samples.getx(sample_ind) adv_xs.append(jsma.generate_np(sample, **jsma_params)) # TODO remove break if sample_ind == 2: break adv_ys = np.concatenate([one_hot_target] * len(adv_xs)) adv_xs_d = Data(np.concatenate(adv_xs), adv_ys) return source_samples, adv_xs_d
def __init__(self, model, theta=1.0, gamma=1.0, clip_min=-1.0, clip_max=1.0, targeted=False, symbolic_impl=True): super().__init__(model=model, clip_min=clip_min, clip_max=clip_max) self._targeted = targeted self._theta = theta self._gamma = gamma self._symbolic_impl = symbolic_impl with self.graph.as_default(): self._method = SaliencyMapMethod(self._model, sess=self.session, theta=self._theta, gamma=self._gamma, nb_classes=self._n_classes, clip_min=self._clip_min, clip_max=self._clip_max, symbolic_impl=self._symbolic_impl)
def generate(sess, model, data_feeder, source, target, adv_dump_dir, nb_samples, perturbation_step, max_perturbation): wrap = KerasModelWrapper(model.model) jsma = SaliencyMapMethod(wrap, sess=sess) jsma_params = { 'gamma': max_perturbation, 'theta': perturbation_step, 'clip_min': 0.0, 'clip_max': 1.0, 'y_target': data_feeder.get_labels(target, nb_samples) } craft_data = data_feeder.get_evasion_craft_data(source_class=source, total_count=nb_samples) adv_data = jsma.generate_np(craft_data, **jsma_params) # Commit data adv_writer = AdversarialWriterEvasion(source_class=source, target_class=target, attack_params={ 'step': perturbation_step, 'max_perturbation':max_perturbation}, adversarial_data_path=adv_dump_dir) adv_writer.batch_put(craft_data, adv_data) adv_writer.commit()
def get_adversarial_attack_and_params(attack_name, wrap, sess): params = None stop_gradient = False if attack_name == "fgsm": attack = FastGradientMethod(wrap, sess=sess) params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} stop_gradient = True if attack_name == "deepfool": attack = DeepFool(wrap, sess=sess) if attack_name == "lbfgs": attack = LBFGS(wrap, sess=sess) if attack_name == "saliency": attack = SaliencyMapMethod(wrap, sess=sess) if attack_name == "bim": attack = BasicIterativeMethod(wrap, sess=sess) return attack, params, stop_gradient
def main(type="Resnet", dataset="CIFAR10", attack_type="FGM"): size = 256 eval_params = {'batch_size': 128} ############################################# Prepare the Data ##################################################### if dataset == 'CIFAR10': (_, _), (x_test, y_test) = prepare_CIFAR10() num_classes = 10 input_dim = 32 elif dataset == 'CIFAR100': (_, _), (x_test, y_test) = prepare_CIFAR100() num_classes = 100 input_dim = 32 else: (_, _), (x_test, y_test) = prepare_SVHN("./Data/") num_classes = 10 input_dim = 32 x_test = x_test / 255. y_test = keras.utils.to_categorical(y_test, num_classes) ############################################# Prepare the Data ##################################################### config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: # prepare the placeholders x = tf.placeholder(tf.float32, [None, input_dim, input_dim, 3]) y = tf.placeholder(tf.float32, [None, num_classes]) input_output = [] def modelBuilder(x, num_classes, dataset, type, sess, input_output): if len(input_output) == 0: reuse = False # Model/Graph if type == 'End2End': _, tf_model = \ prepare_GBP_End2End(num_classes, inputT=x, sess=sess, checkpoint_dir='./{}_{}/'.format(dataset, type), reuse=reuse) else: _, tf_model = \ prepare_Resnet(num_classes, inputT=x, sess=sess, checkpoint_dir='./{}_{}/'.format(dataset, type), reuse=reuse) input_output.append(x) input_output.append(tf_model.logits) else: reuse = True # Model/Graph if type == 'End2End': _, tf_model = \ prepare_GBP_End2End(num_classes, inputT=x, reuse=reuse) else: _, tf_model = \ prepare_Resnet(num_classes, inputT=x, reuse=reuse) input_output.append(x) input_output.append(tf_model.logits) return tf_model.logits # create an attackable model for the cleverhans model = CallableModelWrapper(lambda placeholder: modelBuilder(placeholder, num_classes, dataset, type, sess, input_output), 'logits') # TODO: check the configurations if attack_type == "FGM": # pass attack = FastGradientMethod(model, back='tf', sess=sess) params = { 'eps' : 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "CWL2": # pass attack = CarliniWagnerL2(model, back='tf', sess=sess) params = { 'confidence': 0.9, 'batch_size': 128, 'learning_rate': 0.005, } elif attack_type == "DF": # pass attack = DeepFool(model, back='tf', sess=sess) params = { } elif attack_type == "ENM": # configurations checked, quickly tested attack = ElasticNetMethod(model, back='tf', sess=sess) params = { 'confidence': 0.9, 'batch_size': 128, 'learning_rate': 0.005, } elif attack_type == "FFA": # configuration checked attack = FastFeatureAdversaries(model, back='tf', sess=sess) params = { 'eps': 0.06, 'eps_iter': 0.005, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "LBFGS": attack = LBFGS(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "MEA": attack = MadryEtAl(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "MIM": attack = MomentumIterativeMethod(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "SMM": attack = SaliencyMapMethod(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "SPSA": attack = SPSA(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "VATM": attack = vatm(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } elif attack_type == "VAM": attack = VirtualAdversarialMethod(model, back='tf', sess=sess) params = { 'eps': 0.06, 'clip_min': 0., 'clip_max': 1. } else: raise Exception("I don't recognize {} this attack type. I will use FGM instead.".format(attack_type)) # tf operation adv_x = attack.generate(x, **params) # generate the adversarial examples adv_vals = sess.run(adv_x, feed_dict={x: x_test[:size]}) # notice that "adv_vals" may contain NANs because of the failure of the attack # also the input may not be perturbed at all because of the failure of the attack to_delete = [] for idx, adv in enumerate(adv_vals): # for nan if np.isnan(adv).any(): to_delete.append(idx) # for no perturbation if np.array_equiv(adv, x_test[idx]): to_delete.append(idx) # cleanings adv_vals_cleaned = np.delete(adv_vals, to_delete, axis=0) ori_cleaned = np.delete(x_test[:size], to_delete, axis=0) y_cleaned = np.delete(y_test[:size], to_delete, axis=0) if len(adv_vals_cleaned) == 0: print("No adversarial example is generated!") return print("{} out of {} adversarial examples are generated.".format(len(adv_vals_cleaned), size)) print("The average L_inf distortion is {}".format( np.mean([np.max(np.abs(adv - ori_cleaned[idx])) for idx, adv in enumerate(adv_vals_cleaned)]))) # TODO: visualize the adv_vals accuracy = model_eval(sess, input_output[0], y, tf.nn.softmax(input_output[1]), x_test[:size], y_test[:size], args=eval_params) print('Test accuracy on normal examples: %0.4f' % accuracy) accuracy = model_eval(sess, input_output[0], y, tf.nn.softmax(input_output[1]), adv_vals_cleaned, y_cleaned, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy)
def JSMA_FGSM_BIM(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) source_samples = batch_size # Use label smoothing # Hopefully this doesn't screw up JSMA... assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_par = {'batch_size': batch_size} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) print("#####Starting attacks on clean model#####") ################################################################# #Clean test against JSMA jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x = jsma.generate(x, **jsma_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc) ################################################################ #Clean test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc) ################################################################ #Clean test against BIM bim_params = { 'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1. } bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x, **bim_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc) ################################################################ #Clean test against EN en_params = { 'binary_search_steps': 1, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10 } en = ElasticNetMethod(model, back='tf', sess=sess) adv_x = en.generate(x, **en_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on EN adversarial examples: %0.4f' % acc) ################################################################ #Clean test against DF deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 50, 'clip_min': 0., 'clip_max': 1. } deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on DF adversarial examples: %0.4f' % acc) ################################################################ #Clean test against VAT vat_params = { 'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1. } vat = VirtualAdversarialMethod(model, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc) ################################################################ print("Repeating the process, using adversarial training\n") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) ################################################################# #Adversarial test against JSMA jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x = jsma.generate(x, **jsma_params) preds_adv_jsma = model.get_probs(adv_x) ################################################################ #Adversarial test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv_fgsm = model.get_probs(adv_x) ################################################################ #Adversarial test against BIM bim_params = { 'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1. } bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x, **bim_params) preds_adv_bim = model.get_probs(adv_x) ################################################################ #Adversarial test against EN en_params = { 'binary_search_steps': 5, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10 } en = ElasticNetMethod(model, back='tf', sess=sess) adv_x = en.generate(x, **en_params) preds_adv_en = model.get_probs(adv_x) ################################################################ #Adversarial test against DF deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 200, 'clip_min': 0., 'clip_max': 1. } deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv_df = model.get_probs(adv_x) ################################################################ #Adversarial test against VAT vat_params = { 'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1. } vat = VirtualAdversarialMethod(model, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv_vat = model.get_probs(adv_x) ################################################################ print("#####Evaluate trained model#####") def evaluate_2(): # Evaluate the accuracy of the MNIST model on JSMA adversarial examples acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test, args=eval_par) print('Test accuracy on JSMA adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test, args=eval_par) print('Test accuracy on FGSM adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on BIM adversarial examples acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test, args=eval_par) print('Test accuracy on BIM adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on EN adversarial examples acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test, args=eval_par) print('Test accuracy on EN adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on DF adversarial examples acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test, args=eval_par) print('Test accuracy on DF adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on VAT adversarial examples acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test, args=eval_par) print('Test accuracy on VAT adversarial examples: %0.4f\n' % acc) preds_2_adv = [ preds_adv_jsma, preds_adv_fgsm, preds_adv_bim # ,preds_adv_en # ,preds_adv_df ] model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, rng=rng)
def attack_classifier(sess, x, model, x_test, attack_method="fgsm", target=None, batch_size=128): if attack_method == "fgsm": from cleverhans.attacks import FastGradientMethod params = {'eps': 8/255, 'clip_min': 0., 'clip_max': 1. } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = FastGradientMethod(model, sess=sess) elif attack_method == "basic_iterative": from cleverhans.attacks import BasicIterativeMethod params = {'eps': 8./255, 'eps_iter': 1./255, 'nb_iter': 10, 'clip_min': 0., 'clip_max': 1., 'ord': np.inf } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = BasicIterativeMethod(model,sess = sess) elif attack_method == "momentum_iterative": from cleverhans.attacks import MomentumIterativeMethod params = {'eps':8/255, 'eps_iter':1/255, 'nb_iter': 10, 'clip_min': 0., 'clip_max': 1. } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = MomentumIterativeMethod(model,sess = sess) elif attack_method == "saliency": from cleverhans.attacks import SaliencyMapMethod params = {'theta':8/255, 'gamma':0.1, 'clip_min': 0., 'clip_max': 1. } assert target is None method = SaliencyMapMethod(model,sess = sess) elif attack_method == "virtual": from cleverhans.attacks import VirtualAdversarialMethod params = {'eps':8/255, 'num_iterations':10, 'xi' :1e-6, 'clip_min': 0., 'clip_max': 1. } assert target is None method = VirtualAdversarialMethod(model,sess = sess) elif attack_method == "cw": from cleverhans.attacks import CarliniWagnerL2 params = { "confidence":0, "batch_size":128, "learning_rate":1e-4, "binary_search_steps":10, "max_iterations":1000, "abort_early": True, "initial_const":1e-2, "clip_min":0, "clip_max":1 } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = CarliniWagnerL2(model,sess = sess) elif attack_method == "elastic_net": from cleverhans.attacks import ElasticNetMethod params = { "fista": "FISTA", "beta": 0.1, "decision_rule":"EN", "confidence":0, "batch_size":128, "learning_rate":1e-4, "binary_search_steps":10, "max_iterations":1000, "abort_early": True, "initial_const":1e-2, "clip_min":0, "clip_max":1 } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = ElasticNetMethod(model,sess = sess) elif attack_method == "deepfool": from cleverhans.attacks import DeepFool params = { "nb_candidate":10, "overshoot":1e-3, "max_iter":100, "nb_classes":10, "clip_min":0, "clip_max":1 } assert target is None method = DeepFool(model,sess = sess) elif attack_method == "lbfgs": from cleverhans.attacks import LBFGS params = { 'batch_size':128, "binary_search_steps":10, "max_iterations":1000, "initial_const":1e-2, 'clip_min': 0., 'clip_max': 1. } assert target is not None params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = LBFGS(model,sess = sess) elif attack_method == "madry": from cleverhans.attacks import MadryEtAl params = {'eps':8/255, 'eps_iter':1/255, 'nb_iter':10, 'ord':np.inf, 'clip_min': 0., 'clip_max': 1. } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) method = MadryEtAl(model, sess = sess) elif attack_method == "SPSA": from cleverhans.attacks import SPSA params = { 'epsilon':1/255, 'num_steps':10, 'is_targeted':False, 'early_stop_loss_threshold':None, 'learning_rate':0.01, 'delta':0.01, 'batch_size':128, 'spsa_iters':1, 'is_debug':False } if target is not None: params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0)) params["is_targeted"] = True method = SPSA(model, sess = sess) else: raise ValueError("Can not recognize this attack method: %s" % attack_method) adv_x = method.generate(x, **params) num_batch = x_test.shape[0] // batch_size adv_imgs = [] for i in range(num_batch): x_feed = x_test[i*batch_size:(i+1)*batch_size] #y_feed = y_test[i*batch_size:(i+1)*batch_size] adv_img = sess.run(adv_x, feed_dict={x: x_feed}) adv_imgs.append(adv_img) adv_imgs = np.concatenate(adv_imgs, axis=0) return adv_imgs
perturbations = np.zeros((1, source_samples), dtype=float) grads = jacobian_graph(predictions , X_placeholder, 1) X_adv = np.zeros((source_samples, X_test.shape[1])) for sample_ind in range(0, source_samples): # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) # Target the benign class for target in [0]: if (current_class == 0): break # This call runs the Jacobian-based saliency map approac adv_x , res , percent_perturb = SaliencyMapMethod(sess, X_placeholder, predictions , grads, X_test[sample_ind: (sample_ind+1)], target , theta=1, gamma =0.1, increase=True , clip_min=0, clip_max=1) X_adv[sample_ind] = adv_x results[target , sample_ind] = res perturbations[target , sample_ind] = percent_perturb # ============== Evaluation of the model with adversarial instances ============== print("Performance when using adversarial testing instances") mlp_model_eval(X_adv, Y_test, history, 2)
def main(): sess = tf.Session() train_size, test_size = 55000, 10000 batch_size = 100 lr = 0.05 epochs = 100 steps = epochs * int(train_size / batch_size) global_step = tf.Variable(0, name='global_step', trainable=False) x = tf.placeholder(tf.float32, [None, 784]) # input for real images x_adv = tf.placeholder(tf.float32, [None, 784]) y_ = tf.placeholder(tf.float32, [None, 10]) # groundtruth class label y_target = tf.placeholder(tf.float32, [None, 10]) noise = tf.placeholder(tf.float32, [None, 100]) tx = transformer(x, noise, 3) yx_2, z_norm = classifier_x(x) yx = classifier(x) ytx = classifier(tx) ytx_2, z_norm2 = classifier_x(tx) y_x_adv = classifier(x_adv) x_fgsm = attack.fgsm(x, yx_2, eps=0.1, clip_min=0, clip_max=1) y_x_fgsm = classifier(x_fgsm) jsma = SaliencyMapMethod(classifier, back='tf', sess=sess) one_hot_target = np.zeros((100, 10), dtype=np.float32) one_hot_target[:, 1] = 1 jsma_params = { 'theta': 1., 'gamma': 0.1, 'nb_classes': 10, 'clip_min': 0., 'clip_max': 1., 'targets': yx, 'y_val': one_hot_target } perturb = {} perturb['tx'] = tf.reduce_mean(tf.norm(tx - x, axis=1)) perturb['fgsm'] = tf.reduce_mean(tf.norm(x_fgsm - x, axis=1)) loss = {} loss['cx'] = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=yx_2)) loss['ctx'] = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=ytx_2)) loss['cttx'] = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(labels=y_target, logits=ytx)) loss['classifier'] = loss['cx'] #+ loss['ctx'] #loss['transformer'] = loss['cttx'] loss['transformer'] = -loss['ctx'] all_vars = tf.trainable_variables() c_vars = [var for var in all_vars if 'classifier' in var.name] t_vars = [var for var in all_vars if 'transformer' in var.name] train_op = {} train_op['classifier'] = GradientDescentOptimizer(learning_rate = lr) \ .minimize(loss['classifier'], var_list = c_vars, global_step = global_step) train_op['transformer'] = GradientDescentOptimizer(learning_rate = lr) \ .minimize(loss['transformer'], var_list = t_vars, global_step = global_step) acc = {} acc['x'] = tf.reduce_mean( tf.cast(tf.equal(tf.argmax(yx, 1), tf.argmax(y_, 1)), tf.float32)) acc['tx'] = tf.reduce_mean( tf.cast(tf.equal(tf.argmax(ytx, 1), tf.argmax(y_, 1)), tf.float32)) acc['x_fgsm'] = tf.reduce_mean( tf.cast(tf.equal(tf.argmax(y_x_fgsm, 1), tf.argmax(y_, 1)), tf.float32)) acc['x_adv'] = tf.reduce_mean( tf.cast(tf.equal(tf.argmax(y_x_adv, 1), tf.argmax(y_, 1)), tf.float32)) mnist = input_data.read_data_sets('MNIST_data', one_hot=True) x_adv_mnist_fsgm = np.load(os.path.join('data', 'x_fgsm_mnist.npy')) tf.set_random_seed(1024) sess.run(tf.global_variables_initializer()) with sess.as_default(): for t in range(1, steps + 1): batch = mnist.train.next_batch(batch_size) y_tar = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0] for i in range(batch_size)] y_tar = np.array(y_tar, dtype='float32') noise_sample = sample_Z(batch_size, 100) sess.run(train_op['classifier'], feed_dict={ x: batch[0], y_: batch[1], noise: noise_sample, y_target: y_tar }) sess.run(train_op['transformer'], feed_dict={ x: batch[0], y_: batch[1], noise: noise_sample, y_target: y_tar }) if t % int(train_size / batch_size) == 0: epoch = int(t / int(train_size / batch_size)) noise_sample2 = sample_Z(10000, 100) test_batch = mnist.test.next_batch(10000) print(test_batch[0].shape) var_list = [acc, z_norm] res = sess.run(var_list, feed_dict = {x: test_batch[0], y_: test_batch[1], noise : noise_sample2, \ x_adv: x_adv_mnist_fsgm, y_target: y_tar}) print(epoch) for r in res: print(r)
def do_jsma(): print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where( adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape( -1).shape[0] # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols)), np.reshape(adv_x, (img_rows, img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format( succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format( percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) if FLAGS.viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_jsma(train_start=0, train_end=5500, test_start=0, test_end=1000, nb_epochs=8, batch_size=100, nb_classes=10, nb_filters=64, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } # sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) print("x_train shape: ", X_train.shape) print("y_train shape: ", Y_train.shape) # do not log model_train(sess, x, y, preds, X_train, Y_train, args=train_params,verbose=False, rng=rng) f_out_clean = open("Clean_jsma_elastic_against5.log", "w") # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) f_out_clean.write('Test accuracy on legitimate test examples: ' + str(accuracy) + '\n') # Clean test against JSMA jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x_jsma = jsma.generate(x, **jsma_params) preds_adv_jsma = model.get_probs(adv_x_jsma) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test, args=eval_params) print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc) f_out_clean.write('Clean test accuracy on JSMA adversarial examples: ' + str(acc) + '\n') ################################################################ # Clean test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x_fgsm = fgsm.generate(x, **fgsm_params) preds_adv_fgsm = model.get_probs(adv_x_fgsm) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test, args=eval_params) print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc) f_out_clean.write('Clean test accuracy on FGSM adversarial examples: ' + str(acc) + '\n') ################################################################ # Clean test against BIM bim_params = {'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1.} bim = BasicIterativeMethod(model, sess=sess) adv_x_bim = bim.generate(x, **bim_params) preds_adv_bim = model.get_probs(adv_x_bim) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test, args=eval_params) print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc) f_out_clean.write('Clean test accuracy on BIM adversarial examples: ' + str(acc) + '\n') ################################################################ # Clean test against EN en_params = {'binary_search_steps': 1, # 'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': batch_size, 'initial_const': 10} en = ElasticNetMethod(model, back='tf', sess=sess) adv_x_en = en.generate(x, **en_params) preds_adv_en = model.get_probs(adv_x_en) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test, args=eval_params) print('Clean test accuracy on EN adversarial examples: %0.4f' % acc) f_out_clean.write('Clean test accuracy on EN adversarial examples: ' + str(acc) + '\n') ################################################################ # Clean test against DF deepfool_params = {'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 50, 'clip_min': 0., 'clip_max': 1.} deepfool = DeepFool(model, sess=sess) adv_x_df = deepfool.generate(x, **deepfool_params) preds_adv_df = model.get_probs(adv_x_df) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test, args=eval_params) print('Clean test accuracy on DF adversarial examples: %0.4f' % acc) f_out_clean.write('Clean test accuracy on DF adversarial examples: ' + str(acc) + '\n') ################################################################ # Clean test against VAT vat_params = {'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1.} vat = VirtualAdversarialMethod(model, sess=sess) adv_x_vat = vat.generate(x, **vat_params) preds_adv_vat = model.get_probs(adv_x_vat) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test, args=eval_params) print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc) f_out_clean.write('Clean test accuracy on VAT adversarial examples: ' + str(acc) + '\n') f_out_clean.close() ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(X_train.shape[0]) + ' * ' + str(nb_classes-1) + ' adversarial examples') model_2 = make_basic_cnn() preds_2 = model(x) # need this for constructing the array sess.run(tf.global_variables_initializer()) # run this again # sess.run(tf.global_variables_initializer()) # 1. Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model_2, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} adv_random = jsma.generate(x, **jsma_params) preds_adv_random = model_2.get_probs(adv_random) # 2. Instantiate FGSM attack fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model_2, sess=sess) adv_x_fgsm = fgsm.generate(x, **fgsm_params) preds_adv_fgsm = model_2.get_probs(adv_x_fgsm) # 3. Instantiate Elastic net attack en_params = {'binary_search_steps': 5, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': batch_size, 'initial_const': 10} enet = ElasticNetMethod(model_2, sess=sess) adv_x_en = enet.generate(x, **en_params) preds_adv_elastic_net = model_2.get_probs(adv_x_en) # 4. Deepfool deepfool_params = {'nb_candidate':10, 'overshoot':0.02, 'max_iter': 50, 'clip_min': 0., 'clip_max': 1.} deepfool = DeepFool(model_2, sess=sess) adv_x_df = deepfool.generate(x, **deepfool_params) preds_adv_deepfool = model_2.get_probs(adv_x_df) # 5. Base Iterative bim_params = {'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1.} base_iter = BasicIterativeMethod(model_2, sess=sess) adv_x_bi = base_iter.generate(x, **bim_params) preds_adv_base_iter = model_2.get_probs(adv_x_bi) # 6. C & W Attack cw = CarliniWagnerL2(model_2, back='tf', sess=sess) cw_params = {'binary_search_steps': 1, # 'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': batch_size, 'initial_const': 10} adv_x_cw = cw.generate(x, **cw_params) preds_adv_cw = model_2.get_probs(adv_x_cw) #7 vat_params = {'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1.} vat = VirtualAdversarialMethod(model_2, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv_vat = model_2.get_probs(adv_x) # ==> generate 10 targeted classes for every train data regardless # This call runs the Jacobian-based saliency map approach # Loop over the samples we want to perturb into adversarial examples X_train_adv_set = [] Y_train_adv_set = [] for index in range(X_train.shape[0]): print('--------------------------------------') x_val = X_train[index:(index+1)] y_val = Y_train[index] # add normal sample in!!!! X_train_adv_set.append(x_val) Y_train_adv_set.append(y_val) # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_val)) target_classes = other_classes(nb_classes, current_class) # Loop over all target classes for target in target_classes: # print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(x_val, **jsma_params) # append to X_train_adv_set and Y_train_adv_set X_train_adv_set.append(adv_x) Y_train_adv_set.append(y_val) # shape is: (1, 28, 28, 1) # print("adv_x shape is: ", adv_x.shape) # check for success rate # res = int(model_argmax(sess, x, preds, adv_x) == target) print('-------------Finished Generating Np Adversarial Data-------------------------') X_train_data = np.concatenate(X_train_adv_set, axis=0) Y_train_data = np.stack(Y_train_adv_set, axis=0) print("X_train_data shape is: ", X_train_data.shape) print("Y_train_data shape is: ", Y_train_data.shape) # saves the output so later no need to re-fun file np.savez("jsma_training_data.npz", x_train=X_train_data , y_train=Y_train_data) # >>> data = np.load('/tmp/123.npz') # >>> data['a'] f_out = open("Adversarial_jsma_elastic_against5.log", "w") # evaluate the function against 5 attacks # fgsm, base iterative, jsma, elastic net, and deepfool def evaluate_against_all(): # 1 Clean Data eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) print('Legitimate accuracy: %0.4f' % accuracy) tmp = 'Legitimate accuracy: '+ str(accuracy) + "\n" f_out.write(tmp) # 2 JSMA accuracy = model_eval(sess, x, y, preds_adv_random, X_test, Y_test, args=eval_params) print('JSMA accuracy: %0.4f' % accuracy) tmp = 'JSMA accuracy:'+ str(accuracy) + "\n" f_out.write(tmp) # 3 FGSM accuracy = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test, args=eval_params) print('FGSM accuracy: %0.4f' % accuracy) tmp = 'FGSM accuracy:' + str(accuracy) + "\n" f_out.write(tmp) # 4 Base Iterative accuracy = model_eval(sess, x, y, preds_adv_base_iter, X_test, Y_test, args=eval_params) print('Base Iterative accuracy: %0.4f' % accuracy) tmp = 'Base Iterative accuracy:' + str(accuracy) + "\n" f_out.write(tmp) # 5 Elastic Net accuracy = model_eval(sess, x, y, preds_adv_elastic_net, X_test, Y_test, args=eval_params) print('Elastic Net accuracy: %0.4f' % accuracy) tmp = 'Elastic Net accuracy:' + str(accuracy) + "\n" f_out.write(tmp) # 6 DeepFool accuracy = model_eval(sess, x, y, preds_adv_deepfool, X_test, Y_test, args=eval_params) print('DeepFool accuracy: %0.4f' % accuracy) tmp = 'DeepFool accuracy:' + str(accuracy) + "\n" f_out.write(tmp) # 7 C & W Attack accuracy = model_eval(sess, x, y, preds_adv_cw, X_test, Y_test, args=eval_params) print('C & W accuracy: %0.4f' % accuracy) tmp = 'C & W accuracy:' + str(accuracy) + "\n" f_out.write(tmp) f_out.write("*******End of Epoch***********\n\n") # 8 Virtual Adversarial accuracy = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test, args=eval_params) print('VAT accuracy: %0.4f' % accuracy) tmp = 'VAT accuracy:' + str(accuracy) + "\n" f_out.write(tmp) f_out.write("*******End of Epoch***********\n\n") print("*******End of Epoch***********\n\n") # report.adv_train_adv_eval = accuracy print("Now Adversarial Training with Elastic Net + modified X_train and Y_train") # trained_model.out train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': '/home/stephen/PycharmProjects/jsma-runall-mac/', 'filename': 'trained_model.out' } model_train(sess, x, y, preds_2, X_train_data, Y_train_data, predictions_adv=preds_adv_elastic_net, evaluate=evaluate_against_all, verbose=False, args=train_params, rng=rng) # Close TF session sess.close() return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) model_train(sess, x, y, preds, X_train, Y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, channels)), np.reshape(adv_x, (img_rows, img_cols, channels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def main(args): normalize = Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) transform = Compose([Resize(256), CenterCrop(224), ToTensor(), normalize]) dataset = ImageDataset(args.image_folder, transform=transform, return_paths=True) # n_images = len(dataset) dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, pin_memory=True, num_workers=0) model = models.resnet50(pretrained=True).to(args.device) model.eval() config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1, allow_soft_placement=True, device_count={'CPU': 1}) sess = tf.Session(config=config) x_op = tf.placeholder(tf.float32, shape=( None, 3, 224, 224, )) tf_model = convert_pytorch_model_to_tf(model, args.device) cleverhans_model = CallableModelWrapper(tf_model, output_layer='logits') # compute clip_min and clip_max suing a full black and a full white image clip_min = normalize(torch.zeros(3, 1, 1)).min().item() clip_max = normalize(torch.ones(3, 1, 1)).max().item() eps = args.eps / 255. eps_iter = 20 nb_iter = 10 args.ord = np.inf if args.ord < 0 else args.ord grad_params = {'eps': eps, 'ord': args.ord} common_params = {'clip_min': clip_min, 'clip_max': clip_max} iter_params = {'eps_iter': eps_iter / 255., 'nb_iter': nb_iter} attack_name = '' if args.attack == 'fgsm': attack_name = '_L{}_eps{}'.format(args.ord, args.eps) attack_op = FastGradientMethod(cleverhans_model, sess=sess) attack_params = {**common_params, **grad_params} elif args.attack == 'iter': attack_name = '_L{}_eps{}_epsi{}_i{}'.format(args.ord, args.eps, eps_iter, nb_iter) attack_op = BasicIterativeMethod(cleverhans_model, sess=sess) attack_params = {**common_params, **grad_params, **iter_params} elif args.attack == 'm-iter': attack_name = '_L{}_eps{}_epsi{}_i{}'.format(args.ord, args.eps, eps_iter, nb_iter) attack_op = MomentumIterativeMethod(cleverhans_model, sess=sess) attack_params = {**common_params, **grad_params, **iter_params} elif args.attack == 'pgd': attack_name = '_L{}_eps{}_epsi{}_i{}'.format(args.ord, args.eps, eps_iter, nb_iter) attack_op = MadryEtAl(cleverhans_model, sess=sess) attack_params = {**common_params, **grad_params, **iter_params} elif args.attack == 'jsma': attack_op = SaliencyMapMethod(cleverhans_model, sess=sess) attack_params = {'theta': eps, 'symbolic_impl': False, **common_params} elif args.attack == 'deepfool': attack_op = DeepFool(cleverhans_model, sess=sess) attack_params = common_params elif args.attack == 'cw': attack_op = CarliniWagnerL2(cleverhans_model, sess=sess) attack_params = common_params elif args.attack == 'lbfgs': attack_op = LBFGS(cleverhans_model, sess=sess) target = np.zeros((1, 1000)) target[0, np.random.randint(1000)] = 1 y = tf.placeholder(tf.float32, target.shape) attack_params = {'y_target': y, **common_params} attack_name = args.attack + attack_name print('Running [{}]. Params: {}'.format(args.attack.upper(), attack_params)) adv_x_op = attack_op.generate(x_op, **attack_params) adv_preds_op = tf_model(adv_x_op) preds_op = tf_model(x_op) n_success = 0 n_processed = 0 progress = tqdm(dataloader) for paths, x in progress: progress.set_description('ATTACK') z, adv_x, adv_z = sess.run([preds_op, adv_x_op, adv_preds_op], feed_dict={ x_op: x, y: target }) src, dst = np.argmax(z, axis=1), np.argmax(adv_z, axis=1) success = src != dst success_paths = np.array(paths)[success] success_adv_x = adv_x[success] success_src = src[success] success_dst = dst[success] n_success += success_adv_x.shape[0] n_processed += x.shape[0] progress.set_postfix( {'Success': '{:3.2%}'.format(n_success / n_processed)}) progress.set_description('SAVING') for p, a, s, d in zip(success_paths, success_adv_x, success_src, success_dst): path = '{}_{}_src{}_dst{}.npz'.format(p, attack_name, s, d) path = os.path.join(args.out_folder, path) np.savez_compressed(path, img=a)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {'allow_soft_placement' : True} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1. } jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph jsma = SaliencyMapMethod(model, sess=sess) adv_jsma_x = jsma.generate(x, **jsma_params) preds_adv_jsma = model.get_logits(adv_jsma_x) # Generate JSMA adversarial examples and save to disk dir = 'images/jsma_rotated_adv/' if not os.path.exists('images'): os.mkdir('images') if not os.path.exists(dir): os.mkdir(dir) if not os.path.exists(dir+'train/'): os.mkdir(dir+'train/') if not os.path.exists(dir+'test/'): os.mkdir(dir+'test/') for index in range(len(y_test)): print('test '+str(index)) x_ = x_test[index] label = np.argmax(y_test[index]) raw_data = (jsma.generate_np(x_.reshape((1, 28, 28, 1)), **jsma_params).reshape((28, 28)) * 255).astype('uint8') im = Image.fromarray(raw_data, mode='P') rot = im.rotate(30) rot.save(dir + 'test/' + str(label)+'_'+str(uuid.uuid4())+'.png') for index in range(len(y_train)): print('train ' + str(index)) x_ = x_train[index] label = np.argmax(y_train[index]) raw_data = (jsma.generate_np(x_.reshape((1, 28, 28, 1)), **jsma_params).reshape((28, 28)) * 255).astype('uint8') im = Image.fromarray(raw_data, mode='P') rot = im.rotate(30) rot.save(dir + 'train/' + str(label)+'_'+str(uuid.uuid4())+'.png') return report
# model = loaded_model # print("Loaded model from disk") # Generate adversarial samples for all test datapoints source_samples = X_test_scaled.shape[0] # Jacobian-based Saliency Map Attack results = np.zeros((FLAGS.nb_classes, source_samples), dtype='i') perturbations = np.zeros((FLAGS.nb_classes, source_samples), dtype='f') grads = jacobian_graph(predictions, x, FLAGS.nb_classes) X_adv = np.zeros((source_samples, X_test_scaled.shape[1])) models = KerasModelWrapper(model) jsma = SaliencyMapMethod(models, sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} for sample_ind in range(0, source_samples): sample = X_test_scaled[sample_ind: (sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) # Only target the normal class for target in [0]: if current_class == 0: break print('Generating adv. example for target class {} for sample {}'.format(target, sample_ind), end='\r')
model_train(sess, x, y, predictions, X_train_scaled, y_train, evaluate=evaluate, args=train_params) # Generate adversarial samples for all test datapoints source_samples = X_test_scaled.shape[0] # Jacobian -based Saliency Map results = np.zeros((FLAGS.nb_classes, source_samples), dtype ='i') perturbations = np.zeros((FLAGS.nb_classes, source_samples), dtype ='f') grads = jacobian_graph(predictions, x, FLAGS.nb_classes) X_adv = np.zeros((source_samples, X_test_scaled.shape[1])) print(type(model)) # <class 'keras.engine.sequential.Sequential'> wrap = KerasModelWrapper(model) jsma = SaliencyMapMethod(wrap, sess=sess) """ Take in a dictionary of parameters and applies attack-specific checks before saving them as attributes. Attack-specific parameters: :param theta: (optional float) Perturbation introduced to modified components (can be positive or negative) :param gamma: (optional float) Maximum percentage of perturbed features :param clip_min: (optional float) Minimum component value for clipping :param clip_max: (optional float) Maximum component value for clipping :param y_target: (optional) Target tensor if the attack is targeted """ # Other paper has theta = 1 and gamma = 0.1 jsma_params = {'theta': 0.1, 'gamma': 0.01,
if i > 0 and i < n - 1: z1[i] = tf.matmul(h1[i - 1], w[i]) + b[i] h1[i] = tf.nn.relu(z1[i]) if i == n - 1: z1[i] = tf.matmul(h1[i - 1], w[i]) + b[i] h1[i] = z1[i] y = classifier(x) yn = classifier_n(x) loss_cls = softmax_loss(label, yn) all_vars = tf.trainable_variables() c_vars = [var for var in all_vars if 'classifier' in var.name] train_op_classifier = GradientDescentOptimizer(learning_rate = learning_rate) \ .minimize(loss_cls, var_list = c_vars, global_step = global_step) jsma = SaliencyMapMethod(classifier, back='tf', sess=sess) x_fgsm = attack.fgsm(x, y, eps=0.2, clip_min=0, clip_max=1) def main(): mnist = input_data.read_data_sets('MNIST_data', one_hot=True) sess.run(tf.global_variables_initializer()) with sess.as_default(): acc = {} print('train classifier') for t in range(1, steps + 1): batch = mnist.train.next_batch(batch_size) f_dict = {x: batch[0], label: batch[1]} sess.run(train_op_classifier, feed_dict=f_dict) if t % 550 == 0: epoch = int(t / 550)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report