class TestDeepFool(CleverHansTest):
    def setUp(self):
        super(TestDeepFool, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = DeepFool(self.model, sess=self.sess)

    def test_generate_np_gives_adversarial_example(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        # Note: DeepFool's keyword is `overshoot`; a misspelled keyword would
        # be silently ignored and the default used instead.
        x_adv = self.attack.generate_np(x_val, overshoot=0.02, max_iter=50,
                                        nb_candidate=2, clip_min=-5,
                                        clip_max=5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_generate_gives_adversarial_example(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)
        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)

        x = tf.placeholder(tf.float32, x_val.shape)
        x_adv_p = self.attack.generate(x, overshoot=0.02, max_iter=50,
                                       nb_candidate=2, clip_min=-5,
                                       clip_max=5)
        x_adv = self.sess.run(x_adv_p, {x: x_val})

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, overshoot=0.02, max_iter=50,
                                        nb_candidate=2, clip_min=-0.2,
                                        clip_max=0.3)

        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)
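# The test above relies on `CleverHansTest` and `SimpleModel` from the
# cleverhans test utilities, which are not shown here. The stand-in below is
# a minimal sketch of a fixed-weight two-class model sufficient to make the
# test readable standalone; the exact weights in the real helper may differ.
from cleverhans.model import Model


class SimpleModel(Model):
    """A trivial two-layer, two-class model with hard-coded weights."""

    def fprop(self, x, **kwargs):
        W1 = tf.constant([[1.5, 0.3], [-2.0, 0.3]], dtype=tf.float32)
        h1 = tf.nn.sigmoid(tf.matmul(x, W1))
        W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
        logits = tf.matmul(h1, W2)
        return {self.O_LOGITS: logits, self.O_PROBS: tf.nn.softmax(logits)}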
def attack_batch(model, in_im, net_name, attack_name, im_list, gt_labels,
                 sample_size, batch_size):
    logging.basicConfig(
        filename='Logs/' + net_name + "_" + attack_name + '.log',
        level=logging.INFO,
        format='%(asctime)s:%(levelname)s:%(message)s')
    config = tf.ConfigProto(device_count={'GPU': 2})

    imgs = open(im_list).readlines()  # [::10]
    gt_labels = open(gt_labels).readlines()  # [::10]
    top_1 = 0
    top_1_real = 0
    fool_rate = 0
    isotropic, size = get_params(net_name)
    imageModel = CallableModelWrapper(model, 'logits')

    with tf.Session(config=config) as sess:
        if attack_name == 'fgsm':
            attack = FastGradientMethod(imageModel, back='tf')
            adv_x = attack.generate(in_im, eps=8, clip_min=-124, clip_max=155)
        if attack_name == 'ifgsm':
            attack = BasicIterativeMethod(imageModel, back='tf')
            adv_x = attack.generate(in_im, eps=8, eps_iter=1, nb_iter=12,
                                    clip_min=-124, clip_max=155)
        if attack_name == 'cw2':
            attack = CarliniWagnerL2(imageModel, back='tf')
            adv_x = attack.generate(in_im, clip_min=-124, clip_max=155)
        if attack_name == 'jsma':
            attack = SaliencyMapMethod(imageModel, back='tf')
            adv_x = attack.generate(in_im)
        if attack_name == 'pgd':
            attack = MadryEtAl(imageModel, back='tf')
            adv_x = attack.generate(in_im, eps=8, eps_iter=1, nb_iter=12,
                                    clip_min=-124, clip_max=155)
        if attack_name == 'deepfool':
            # DeepFool needs the session at construction time, not in generate()
            attack = DeepFool(imageModel, back='tf', sess=sess)
            adv_x = attack.generate(in_im, clip_min=-124, clip_max=155)

        sess.run(tf.global_variables_initializer())
        img_loader = loader_func(net_name, sess, isotropic, size)
        batch_im = np.zeros((batch_size, size, size, 3))

        for i in range(sample_size // batch_size):
            lim = min(batch_size, len(imgs) - i * batch_size)
            for j in range(lim):
                im = img_loader(imgs[i * batch_size + j].strip())
                batch_im[j] = np.copy(im)
            gt = np.array([int(gt_labels[i * batch_size + j].strip())
                           for j in range(lim)])

            adv_x_np = adv_x.eval(feed_dict={in_im: batch_im})

            # Calculate the model probabilities on adversarial and clean inputs
            y_adv_prob = tf.nn.softmax(model(in_im), name="yadv").eval(
                feed_dict={in_im: adv_x_np})
            y_adv = np.argmax(y_adv_prob, 1)
            y_true_prob = tf.nn.softmax(model(in_im), name="ypred").eval(
                feed_dict={in_im: batch_im})
            y_true = np.argmax(y_true_prob, 1)

            # Accumulate the top-1, top-1-true counts and the fooling count
            top_1 += np.sum(y_adv == gt)
            top_1_real += np.sum(y_true == gt)
            fool_rate += np.sum(y_true != y_adv)

            if i != 0 and i % 2 == 0:
                logging.info("batch: {} ====================================="
                             "=============================================".format(i))
                logging.info("fooling rate {}".format(
                    fool_rate / float((i + 1) * batch_size) * 100))

    logging.info("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logging.info('Real Top-1 Accuracy = {}'.format(top_1_real / float(sample_size) * 100))
    logging.info('Top-1 Accuracy = {}'.format(top_1 / float(sample_size) * 100))
    logging.info('Top-1 Fooling Rate = {}'.format(fool_rate / float(sample_size) * 100))
    logging.info("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
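# `get_params` and `loader_func` are referenced above but not defined in this
# snippet; the helpers below are illustrative guesses at their contracts, not
# the original implementations.
def get_params(net_name):
    """Return (isotropic, input_size) preprocessing settings per network."""
    presets = {'vgg16': (True, 224), 'inception_v3': (False, 299)}
    return presets.get(net_name, (True, 224))


def loader_func(net_name, sess, isotropic, size):
    """Return a callable that loads one image path into an HxWx3 float array."""
    import cv2  # assumption: OpenCV is available for image I/O

    def load(path):
        im = cv2.imread(path)[:, :, ::-1].astype(np.float32)  # BGR -> RGB
        return cv2.resize(im, (size, size))

    return load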
# Define the session
sess = backend.get_session()

# Define input TF placeholders
x = tf.placeholder(tf.float32, shape=(None, 784))
y = tf.placeholder(tf.float32, shape=(None, 10))

pred = np.argmax(keras_model.predict(x_test), axis=1)
acc = np.mean(np.equal(pred, y_test))
print("The test accuracy is: {}".format(acc))

# ########################## Adversarial Attack (DeepFool on x_test) ##########################
wrap = KerasModelWrapper(keras_model)
df = DeepFool(wrap, back='tf', sess=sess)
df_params = {
    'overshoot': 0.09,
    'max_iter': 10,
    'clip_max': 1,
    'clip_min': 0,
    'nb_candidate': 10
}
adv_x = df.generate_np(x_test, **df_params)

adv_conf = keras_model.predict(adv_x)
adv_pred = np.argmax(adv_conf, axis=1)
adv_acc = np.mean(np.equal(adv_pred, y_test))
print("The adversarial accuracy is: {}".format(adv_acc))

# ################################## Original Image ##################################
def build_adv(make_obs_tf, q_func, num_actions, epsilon, attack):
    with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE):
        obs_tf_in = make_obs_tf("observation")
        stochastic_ph_adv = tf.placeholder(tf.bool, (), name="stochastic_adv")
        update_eps_ph_adv = tf.placeholder(tf.float32, (), name="update_eps_adv")
        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))
        update_eps_expr_adv = eps.assign(
            tf.cond(update_eps_ph_adv >= 0, lambda: update_eps_ph_adv, lambda: eps))

        def wrapper(x):
            return q_func(x, num_actions, scope="q_func", concat_softmax=False)

        if attack == 'fgsm':
            adversary = FastGradientMethod(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            adv_observations = adversary.generate(
                obs_tf_in.get(), eps=epsilon,
                clip_min=0.0, clip_max=255.0, ord=np.inf)
        elif attack == 'bim':
            adversary = BasicIterativeMethod(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            adv_observations = adversary.generate(
                obs_tf_in.get(), eps=epsilon, eps_iter=epsilon / 10, nb_iter=10,
                clip_min=0.0, clip_max=255.0, ord=np.inf)
        elif attack == 'deepfool':
            adversary = DeepFool(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            adv_observations = adversary.generate(
                obs_tf_in.get(), clip_min=0.0, clip_max=255.0,
                nb_candidate=num_actions)
        elif attack == 'momentum':
            adversary = MomentumIterativeMethod(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            adv_observations = adversary.generate(
                obs_tf_in.get(), eps=epsilon, eps_iter=epsilon / 10, nb_iter=10,
                clip_min=0.0, clip_max=255.0)
        elif attack == 'jsma':
            adversary = SaliencyMapMethod(
                CallableModelWrapper(wrapper, 'logits'), sess=U.get_session())
            adv_observations = adversary.generate(
                obs_tf_in.get(), clip_min=0.0, clip_max=255.0)

        craft_adv_obs = U.function(
            inputs=[obs_tf_in, stochastic_ph_adv, update_eps_ph_adv],
            outputs=adv_observations,
            givens={update_eps_ph_adv: -1.0, stochastic_ph_adv: True},
            updates=[update_eps_expr_adv])
        return craft_adv_obs
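# Hypothetical usage sketch: crafting adversarial observations during
# evaluation. `make_obs_ph`, `q_func`, `env`, and `act` stand in for the usual
# OpenAI-baselines deepq objects and are assumptions, not part of the snippet.
craft_adv_obs = build_adv(make_obs_ph, q_func, env.action_space.n,
                          epsilon=1.0 / 255.0, attack='deepfool')
obs, done = env.reset(), False
while not done:
    # The `givens` above supply the stochastic/update_eps defaults, so only
    # the observation needs to be fed.
    adv_obs = craft_adv_obs(np.array(obs)[None])[0]
    action = act(np.array(adv_obs)[None], stochastic=False)[0]
    obs, reward, done, _ = env.step(action)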
def test_attacks(batch_size=128, source_samples=10,
                 model_path=os.path.join("models", "mnist"), targeted=True):
    """
    Test many attacks on MNIST with deep Bayes classifier.
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    from cleverhans.utils_mnist import data_mnist
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    img_rows, img_cols, channels = X_train[0].shape
    nb_classes = Y_train.shape[1]

    # Define input TF placeholder
    batch_size = min(batch_size, source_samples)
    x = tf.placeholder(tf.float32,
                       shape=(batch_size, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(batch_size, nb_classes))

    # Define TF model graph
    model_name = str(sys.argv[1])
    if model_name == 'bayes':
        from load_bayes_classifier import BayesModel
        conv = True
        checkpoint = 0  # int(sys.argv[1])
        K = int(sys.argv[3])
        use_mean = True
        model = BayesModel(sess, 'mnist', conv, K, checkpoint=checkpoint,
                           attack_snapshot=False, use_mean=use_mean)
        if use_mean:
            model_name = 'bayes_mean_mlp'
        else:
            model_name = 'bayes_K%d' % K
    if model_name == 'cnn':
        from load_cnn_classifier import CNNModel
        model = CNNModel(sess, 'mnist')
    if model_name == 'wgan':
        from load_wgan_classifier import WGANModel
        conv = True
        checkpoint = 0  # int(sys.argv[1])
        K = int(sys.argv[3])
        T = int(sys.argv[4])
        model = WGANModel(sess, 'mnist', conv, K, T, checkpoint=checkpoint)
        model_name = 'wgan_K%d_T%d' % (K, T)

    preds = model.predict(x, softmax=True)  # output probabilities
    print("Defined TensorFlow model graph.")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Craft adversarial examples
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Make adversarial inputs and labels for the attack if targeted
    if targeted:
        adv_inputs = np.array([[instance] * nb_classes
                               for instance in X_test[:source_samples]],
                              dtype=np.float32)
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, 1))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
            (source_samples * nb_classes, nb_classes))
    else:
        adv_inputs = X_test[:source_samples]
        adv_ys = Y_test[:source_samples]

    # Instantiate an attack object
    attack_method = str(sys.argv[2])
    if attack_method == 'fgsm':
        from cleverhans.attacks import FastGradientMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = FastGradientMethod(model_prob, sess=sess)
        from attack_config import config_fgsm
        attack_params = config_fgsm(targeted, adv_ys)
    if attack_method == 'bim':
        from cleverhans.attacks import BasicIterativeMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = BasicIterativeMethod(model_prob, sess=sess)
        from attack_config import config_bim
        attack_params = config_bim(targeted, adv_ys)
    if attack_method == 'mim':
        from cleverhans.attacks import MomentumIterativeMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = MomentumIterativeMethod(model_prob, sess=sess)
        from attack_config import config_mim
        attack_params = config_mim(targeted, adv_ys)
    if attack_method == 'jsma':
        from cleverhans.attacks import SaliencyMapMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = SaliencyMapMethod(model_prob, sess=sess)
        from attack_config import config_jsma
        attack_params = config_jsma(targeted, adv_ys)
    if attack_method == 'vat':
        from cleverhans.attacks import VirtualAdversarialMethod
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = VirtualAdversarialMethod(model_logit, sess=sess)
        from attack_config import config_vat
        attack_params = config_vat(targeted, adv_ys)
    if attack_method == 'cw':
        from cleverhans.attacks import CarliniWagnerL2
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = CarliniWagnerL2(model_logit, sess=sess)
        from attack_config import config_cw
        attack_params = config_cw(targeted, adv_ys)
    if attack_method == 'elastic':
        from cleverhans.attacks import ElasticNetMethod
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = ElasticNetMethod(model_logit, sess=sess)
        from attack_config import config_elastic
        attack_params = config_elastic(targeted, adv_ys)
    if attack_method == 'deepfool':
        from cleverhans.attacks import DeepFool
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = DeepFool(model_logit, sess=sess)
        from attack_config import config_deepfool
        attack_params = config_deepfool(targeted, adv_ys)
    if attack_method == 'madry':
        from cleverhans.attacks import MadryEtAl
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = MadryEtAl(model_prob, sess=sess)
        from attack_config import config_madry
        attack_params = config_madry(targeted, adv_ys)

    attack_params['batch_size'] = batch_size
    print('batchsize', batch_size)

    # Perform the attack!
    adv = []
    n_batch = int(adv_inputs.shape[0] / batch_size)
    for i in range(n_batch):
        adv_batch = adv_inputs[i * batch_size:(i + 1) * batch_size]
        adv.append(attack.generate_np(adv_batch, **attack_params))
    adv = np.concatenate(adv, axis=0)

    for _ in range(5):
        y_adv = []
        for i in range(n_batch):
            adv_batch = adv[i * batch_size:(i + 1) * batch_size]
            y_adv.append(sess.run(preds, {x: adv_batch}))
        y_adv = np.concatenate(y_adv, axis=0)
        print('--------------------------------------')
        for i in range(10):
            print(np.argmax(y_adv[i * 10:(i + 1) * 10], 1))

    correct_pred = np.asarray(np.argmax(y_adv, 1) == np.argmax(adv_ys, 1),
                              dtype='f')
    adv_accuracy = np.mean(correct_pred)
    if not targeted:
        adv_accuracy = 1. - adv_accuracy

    print('--------------------------------------')
    print(np.argmax(adv_ys[:10], 1))
    print(np.argmax(y_adv[:10], 1))
    for i in range(5):
        tmp = sess.run(preds, {x: adv[:100]})
        print(np.argmax(tmp[:10], 1))

    # Compute the rate of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Visualisation
    vis_adv = True
    if vis_adv:
        N_vis = 100
        sys.path.append('../../utils')
        from visualisation import plot_images
        if channels == 1:
            shape = (img_rows, img_cols)
        else:
            shape = (img_rows, img_cols, channels)
        path = 'figs/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
        else:
            filename = filename + '_untargeted'
        plot_images(adv_inputs[:N_vis], shape, path, filename + '_data')
        plot_images(adv[:N_vis], shape, path, filename + '_adv')

    save_result = True
    if save_result:
        path = 'results/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
            y_input = adv_ys
        else:
            filename = filename + '_untargeted'
            y_input = Y_test[:source_samples]
        results = [adv_inputs, y_input, adv, y_adv]
        import pickle
        # Pickle requires a binary file handle
        pickle.dump(results, open(path + filename + '.pkl', 'wb'))
        print("results saved at %s.pkl" % filename)

    return report
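# The per-attack parameter dictionaries come from an `attack_config` module
# that is not shown here; the function below is an illustrative guess at the
# shape of `config_deepfool`, not the original implementation.
def config_deepfool(targeted, adv_ys):
    """Return DeepFool keyword arguments for Attack.generate_np."""
    if targeted:
        raise ValueError('DeepFool is an untargeted attack.')
    return {
        'nb_candidate': 10,
        'overshoot': 0.02,
        'max_iter': 50,
        'clip_min': 0.,
        'clip_max': 1.,
    }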
def generate(sess, model, X, Y, attack_method, dataset, attack_params):
    """
    Generate adversarial examples.
    :param model: the target Keras model. Models are named in the form of
                  model-<dataset>-<architecture>-<transform_type>.h5
    :param attack_method: attack for generating adversarial examples
    :param X: examples to be attacked
    :param Y: correct labels of the examples
    :return: adversarial examples
    """
    batch_size = 128
    img_rows, img_cols, nb_channels = X.shape[1:4]
    nb_classes = Y.shape[1]

    # Label smoothing
    label_smoothing_rate = 0.1
    Y -= label_smoothing_rate * (Y - 1. / nb_classes)

    # To be able to call the model in the custom loss, we need to call it
    # once before. See https://github.com/tensorflow/tensorflow/issues/23769
    model(model.input)

    # Wrap the Keras model so it fits the cleverhans framework
    wrap_model = KerasModelWrapper(model)

    # Initialize the attack object
    attacker = None
    if attack_method == ATTACK.FGSM:
        # The Fast Gradient Sign Method,
        # by Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy, 2014
        # link: https://arxiv.org/abs/1412.6572
        attacker = FastGradientMethod(wrap_model, sess=sess)
    elif attack_method == ATTACK.JSMA:
        # The Jacobian-based Saliency Map Method,
        # by Nicolas Papernot, Patrick McDaniel, Somesh Jha, Matt Fredrikson,
        # Z. Berkay Celik, Ananthram Swami, 2016
        # link: https://arxiv.org/abs/1511.07528
        batch_size = 64
        attacker = SaliencyMapMethod(wrap_model, sess=sess)
    elif attack_method == ATTACK.CW_L2:
        # Untargeted attack
        attacker = CarliniWagnerL2(wrap_model, sess=sess)
    elif attack_method == ATTACK.CW_Linf:
        # Untargeted attack
        # TODO: bug fix --- cannot compute gradients correctly
        # attacker = CarliniWagnerLinf(wrap_model, sess=sess)
        raise NotImplementedError('CW_Linf is not supported yet.')
    elif attack_method == ATTACK.CW_L0:
        # Untargeted attack
        # TODO: bug fix --- cannot compute gradients correctly
        # attacker = CarliniWagnerL0(wrap_model, sess=sess)
        raise NotImplementedError('CW_L0 is not supported yet.')
    elif attack_method == ATTACK.DEEPFOOL:
        # The DeepFool Method, an untargeted & iterative attack based on an
        # iterative linearization of the classifier,
        # by Seyed-Mohsen Moosavi-Dezfooli, Alhussein Fawzi,
        # Pascal Frossard, 2016
        # link: https://arxiv.org/abs/1511.04599
        batch_size = 64
        ord = attack_params.pop('ord')
        if ord == 2:
            # cleverhans supports only the l2 norm so far
            attacker = DeepFool(wrap_model, sess=sess)
        elif ord == np.inf:
            # TODO
            pass
        else:
            raise ValueError('DeepFool supports only l2 and l-inf norms.')
    elif attack_method == ATTACK.BIM:
        # The Basic Iterative Method (also, iterative FGSM),
        # by Alexey Kurakin, Ian Goodfellow, Samy Bengio, 2016
        # link: https://arxiv.org/abs/1607.02533
        attacker = BasicIterativeMethod(wrap_model, back='tf', sess=sess)
    elif attack_method == ATTACK.PGD:
        # The Projected Gradient Descent approach.
        attacker = ProjectedGradientDescent(wrap_model)
    elif attack_method == ATTACK.MIM:
        # The Momentum Iterative Method,
        # by Yinpeng Dong, Fangzhou Liao, Tianyu Pang, Hang Su, Jun Zhu,
        # Xiaolin Hu, Jianguo Li, 2018
        # link: https://arxiv.org/abs/1710.06081
        attacker = MomentumIterativeMethod(wrap_model, sess=sess)
    else:
        raise ValueError('{} attack is not supported.'.format(
            attack_method.upper()))

    # Define a custom loss function for the adversary
    compile_params = get_compile_params(
        dataset, get_adversarial_metric(model, attacker, attack_params))
    print(compile_params)

    print('#### Recompile the model')
    model.compile(optimizer=compile_params['optimizer'],
                  loss=keras.losses.categorical_crossentropy,
                  metrics=['accuracy', compile_params['metrics']])

    # Define the graph
    print('define the graph')
    adv_x = attacker.generate(model.input, **attack_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)

    # Generate the adversarial examples
    print('generating adversarial examples...')
    adv_examples, = batch_eval(sess, [model.input, wrap_model(adv_x)],
                               [adv_x], [X, Y], batch_size=batch_size)

    if MODE.DEBUG:
        score = model.evaluate(adv_examples, Y, verbose=2)
        print('*** Evaluation on adversarial examples: {}'.format(score))

    return adv_examples, Y
def DF(torch_model, dataset, eps_list, opt, c, h, w, clip_min, clip_max):
    # Note: DeepFool takes no epsilon; eps_list only controls how many
    # independent runs are performed.
    if opt == 'evaluate':
        acclist = []
        for eps in eps_list:
            sess = tf.Session()
            x_op = tf.placeholder(tf.float32, shape=(None, c, h, w,))

            # Convert the pytorch model to a tf model and wrap it for cleverhans
            tf_model_fn = convert_pytorch_model_to_tf(torch_model)
            cleverhans_model = CallableModelWrapper(tf_model_fn,
                                                    output_layer='logits')

            # Create a DeepFool attack
            atk_op = DeepFool(cleverhans_model, sess=sess)
            atk_params = {'clip_min': clip_min, 'clip_max': clip_max}
            adv_x_op = atk_op.generate(x_op, **atk_params)
            adv_preds_op = tf_model_fn(adv_x_op)

            # Run an evaluation of our model against DeepFool
            total = 0
            correct = 0
            for xs, ys in dataset:
                xs, ys = xs.to(device), ys.to(device)
                adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
                correct += (np.argmax(adv_preds, axis=1) ==
                            ys.cpu().detach().numpy()).sum()
                total += dataset.batch_size

            acc = float(correct) / total
            print('Adv accuracy: {:.3f}'.format(acc * 100))
            acclist.append(acc)
        return acclist

    elif opt == 'generate':
        advpacklist = []
        for eps in eps_list:
            advlist = []
            sess = tf.Session()
            x_op = tf.placeholder(tf.float32, shape=(None, c, h, w,))

            # Convert the pytorch model to a tf model and wrap it for cleverhans
            tf_model_fn = convert_pytorch_model_to_tf(torch_model)
            cleverhans_model = CallableModelWrapper(tf_model_fn,
                                                    output_layer='logits')

            # Create a DeepFool attack
            atk_op = DeepFool(cleverhans_model, sess=sess)
            atk_params = {'clip_min': clip_min, 'clip_max': clip_max}
            adv_x_op = atk_op.generate(x_op, **atk_params)

            # Keep only adversarial examples that flip a correct prediction
            for xs, ys in dataset:
                xs, ys = xs.to(device), ys.to(device)
                adv = torch.from_numpy(sess.run(adv_x_op,
                                                feed_dict={x_op: xs}))
                if ys == np.argmax(torch_model(xs).data.cpu().numpy()):
                    pred = np.argmax(torch_model(adv).data.cpu().numpy())
                    if ys != pred:
                        adv = adv.numpy()
                        advlist.append(adv)
                        print(len(advlist))
            advpacklist.append(advlist)
        return advpacklist
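# Hypothetical driver for the function above, assuming a torchvision MNIST
# DataLoader with batch_size=1 (the 'generate' branch compares `ys` against a
# scalar argmax); torch/torchvision imports are assumptions of this sketch.
import torchvision

test_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('.', train=False, download=True,
                               transform=torchvision.transforms.ToTensor()),
    batch_size=1)
adv_packs = DF(torch_model, test_loader, eps_list=[0.0], opt='generate',
               c=1, h=28, w=28, clip_min=0.0, clip_max=1.0)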
    attack = LBFGS(model=model, sess=sess)
if attackMethod == "CarliniWagnerL2":
    print("Using Carlini and Wagner attack method!")
    attack = CarliniWagnerL2(model=model, sess=sess)
if attackMethod == "SPSA":
    print("Using SPSA attack method!")
    attack = SPSA(model=model, sess=sess)
if attackMethod == "MadryEtAl":
    print("Using Madry et al. attack method!")
    attack = MadryEtAl(model=model, sess=sess)
if attackMethod == "ElasticNet":
    print("Using Elastic Net attack method!")
    attack = ElasticNetMethod(model=model, sess=sess)
if attackMethod == "DeepFool":
    print("Using Deep Fool attack method!")
    attack = DeepFool(model=model, sess=sess)
if attackMethod == "MomentumIterative":
    print("Using Momentum Iterative attack method!")
    attack = MomentumIterativeMethod(model=model, sess=sess)
if attackMethod == "BasicIterative":
    print("Using Basic Iterative attack method!")
    attack = BasicIterativeMethod(model=model, sess=sess)
if attackMethod == "SaliencyMap":
    print("Using Saliency Map attack method!")
    attack = SaliencyMapMethod(model=model, sess=sess)

if attackMethod == "SPSA":
    adversarialOp = attack.generate(x=xPlaceholder, y=yPlaceholder,
                                    epsilon=Cfg.epsilon * 5.0,
                                    num_steps=Cfg.attackIterations)
elif attackMethod == "LBFGS":
    adversarialOp = attack.generate(
        x=xPlaceholder,
        y_target=tf.one_hot(yPlaceholder, depth=Cfg.numClasses),
        **attackParams)
else:
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=0.1, method='FGSM'):
    """
    MNIST tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :param method: attack used to craft adversarial examples
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    print('y_train: ', y_train.shape)
    x_train = np.pad(x_train, ((0, 0), (2, 2), (2, 2), (0, 0)),
                     mode='constant')
    x_test = np.pad(x_test, ((0, 0), (2, 2), (2, 2), (0, 0)),
                    mode='constant')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    print('img_rows: {}, img_cols: {}, nchannels: {}'.format(
        img_rows, img_cols, nchannels))
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    if train_dir == 'mnist_ff_model':
        model = mnist_ff_model()
    elif train_dir == 'mnist_BP_model':
        model = mnist_model(img_rows=img_rows, img_cols=img_cols,
                            channels=nchannels, nb_filters=64,
                            nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)
        print('Training done!')

    # Calculate training error
    print('testing param:', testing)
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the chosen attack object and graph
    if method == 'FGSM':
        clw = FastGradientMethod(wrap, sess=sess)
    elif method == 'BIM':
        clw = BasicIterativeMethod(wrap, sess=sess)
    elif method == 'DeepFool':
        clw = DeepFool(wrap, sess=sess)
    else:
        raise NotImplementedError
    print('method chosen: ', method)
    clw_params = {}
    adv_x = clw.generate(x, **clw_params)

    with sess.as_default():
        feed_dict = {x: x_test, y: y_test}
        store_data = adv_x.eval(feed_dict=feed_dict)
        print('store_data: {}'.format(store_data.shape))
        save_name = '{}/mnist_{}_data.pkl'.format(train_dir, method)
        with open(save_name, 'wb') as fw:
            pickle.dump(store_data, fw, protocol=2)
            print('data stored as {}'.format(save_name))

    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculate train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train, y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    return report
def baseline_deepfool(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, nb_epochs=6, batch_size=128,
                      learning_rate=0.001, clean_train=True,
                      testing=False, backprop_through_attack=False,
                      nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    deepfool_params = {
        'nb_candidate': 10,
        'overshoot': 0.02,
        'max_iter': 50,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the DeepFool attack object and graph
        deepfool = DeepFool(model, sess=sess)
        adv_x = deepfool.generate(x, **deepfool_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on DeepFool adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")

    # Redefine TF model graph (the first model is already trained, so we
    # need a new one, model_2)
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    deepfool2 = DeepFool(model_2, sess=sess)
    adv_x_2 = deepfool2.generate(x, **deepfool_params)
    if not backprop_through_attack:
        # For an attack with zero gradient (like FGSM in this tutorial),
        # enabling this flag does not change the gradient. For some other
        # attacks, enabling this flag increases the cost of training, but
        # gives the defender the ability to anticipate how the attacker
        # will change their strategy in response to updates to the
        # defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    # Also generate FGSM examples (not used for training yet)
    fgsm = FastGradientMethod(model_2, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    if not backprop_through_attack:
        adv_x_fgsm = tf.stop_gradient(adv_x_fgsm)
    preds_2_fgsm = model_2(adv_x_fgsm)

    def evaluate_2():
        # Accuracy of the adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on FGSM examples
        accuracy = model_eval(sess, x, y, preds_2_fgsm, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on FGSM adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

        # Accuracy of the DeepFool-adv-trained model on DeepFool examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on DeepFool adversarial examples: %0.4f'
              % accuracy)

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
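# For intuition about what `nb_candidate`, `overshoot`, and `max_iter`
# control, here is a numpy sketch of one DeepFool linearization step in the
# spirit of Moosavi-Dezfooli et al. (2016). `f` and `grad` are hypothetical
# callables (logits and per-class gradients); this is not the cleverhans
# implementation.
def deepfool_step(f, grad, x, orig_label, nb_candidate, overshoot):
    """Return x pushed across the closest linearized decision boundary."""
    logits = f(x)
    grads = grad(x)  # shape: (nb_candidate, *x.shape)
    w = grads - grads[orig_label]          # gradients relative to orig class
    f_diff = logits - logits[orig_label]   # logits relative to orig class
    # Distance to each candidate's linearized boundary: |f_k'| / ||w_k'||
    norms = np.linalg.norm(w.reshape(nb_candidate, -1), axis=1) + 1e-8
    dists = np.abs(f_diff) / norms
    dists[orig_label] = np.inf
    l = np.argmin(dists)                   # closest boundary
    # Minimal perturbation toward boundary l, inflated by (1 + overshoot)
    r = (dists[l] / norms[l]) * w[l] * norms[l]**0 * (np.abs(f_diff[l]) / norms[l]**2) / (dists[l] / norms[l] + 1e-8) * w[l] if False else (np.abs(f_diff[l]) / norms[l]**2) * w[l]
    return x + (1 + overshoot) * r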
def attack_classifier(sess, x, y, model, x_test, y_test,
                      attack_method="fgsm", target=None, batch_size=128):
    tf.set_random_seed(1822)
    set_log_level(logging.DEBUG)

    # Initialize attack
    if attack_method == "fgsm":
        from cleverhans.attacks import FastGradientMethod
        params = {'eps': 8 / 255, 'clip_min': 0., 'clip_max': 1.}
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = FastGradientMethod(model, sess=sess)

    elif attack_method == "basic_iterative":
        from cleverhans.attacks import BasicIterativeMethod
        params = {'eps': 8 / 255, 'eps_iter': 1 / 255, 'nb_iter': 10,
                  'clip_min': 0., 'clip_max': 1.}
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = BasicIterativeMethod(model, sess=sess)

    elif attack_method == "momentum_iterative":
        from cleverhans.attacks import MomentumIterativeMethod
        params = {'eps': 8 / 255, 'eps_iter': 1 / 255, 'nb_iter': 10,
                  'clip_min': 0., 'clip_max': 1.}
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = MomentumIterativeMethod(model, sess=sess)

    elif attack_method == "saliency":
        from cleverhans.attacks import SaliencyMapMethod
        params = {'theta': 8 / 255, 'gamma': 0.1,
                  'clip_min': 0., 'clip_max': 1.}
        assert target is None
        method = SaliencyMapMethod(model, sess=sess)

    elif attack_method == "virtual":
        from cleverhans.attacks import VirtualAdversarialMethod
        params = {'eps': 8 / 255, 'num_iterations': 10, 'xi': 1e-6,
                  'clip_min': 0., 'clip_max': 1.}
        assert target is None
        method = VirtualAdversarialMethod(model, sess=sess)

    elif attack_method == "cw":
        from cleverhans.attacks import CarliniWagnerL2
        params = {
            "confidence": 0,
            "batch_size": 128,
            "learning_rate": 1e-4,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "abort_early": True,
            "initial_const": 1e-2,
            "clip_min": 0,
            "clip_max": 1
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = CarliniWagnerL2(model, sess=sess)

    elif attack_method == "elastic_net":
        from cleverhans.attacks import ElasticNetMethod
        params = {
            "fista": "FISTA",
            "beta": 0.1,
            "decision_rule": "EN",
            "confidence": 0,
            "batch_size": 128,
            "learning_rate": 1e-4,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "abort_early": True,
            "initial_const": 1e-2,
            "clip_min": 0,
            "clip_max": 1
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = ElasticNetMethod(model, sess=sess)

    elif attack_method == "deepfool":
        from cleverhans.attacks import DeepFool
        params = {
            "nb_candidate": 10,
            "overshoot": 1e-3,
            "max_iter": 100,
            "nb_classes": 10,
            "clip_min": 0,
            "clip_max": 1
        }
        assert target is None
        method = DeepFool(model, sess=sess)

    elif attack_method == "lbfgs":
        from cleverhans.attacks import LBFGS
        params = {
            'batch_size': 128,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "initial_const": 1e-2,
            'clip_min': 0.,
            'clip_max': 1.
        }
        assert target is not None
        params["y_target"] = tf.constant(
            np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = LBFGS(model, sess=sess)

    elif attack_method == "madry":
        from cleverhans.attacks import MadryEtAl
        params = {'eps': 8 / 255, 'eps_iter': 1 / 255, 'nb_iter': 10,
                  'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = MadryEtAl(model, sess=sess)

    elif attack_method == "SPSA":
        from cleverhans.attacks import SPSA
        params = {
            'epsilon': 1 / 255,
            'num_steps': 10,
            'is_targeted': False,
            'early_stop_loss_threshold': None,
            'learning_rate': 0.01,
            'delta': 0.01,
            'batch_size': 128,
            'spsa_iters': 1,
            'is_debug': False
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
            params["is_targeted"] = True
        method = SPSA(model, sess=sess)

    else:
        raise ValueError("Can not recognize this attack method")

    adv_x = method.generate(x, **params)
    num_batch = x_test.shape[0] // batch_size
    adv_imgs = []
    for i in range(num_batch):
        if (i + 1) * batch_size >= x_test.shape[0]:
            adv_imgs.append(
                sess.run(adv_x, feed_dict={
                    x: x_test[i * batch_size:],
                    y: y_test[i * batch_size:]
                }))
        else:
            adv_imgs.append(
                sess.run(adv_x, feed_dict={
                    x: x_test[i * batch_size:(i + 1) * batch_size],
                    y: y_test[i * batch_size:(i + 1) * batch_size]
                }))
    adv_imgs = np.concatenate(adv_imgs, axis=0)
    return adv_imgs
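# Hypothetical call, assuming a cleverhans model wrapper plus CIFAR-10-style
# placeholders and test arrays are already defined in the calling scope:
adv_imgs = attack_classifier(sess, x, y, model, x_test, y_test,
                             attack_method="deepfool", target=None,
                             batch_size=128)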
#     name='my_target',
#     sparse=K.is_sparse(model.outputs[0]),
#     dtype=K.dtype(model.outputs[0])
# )
# model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy',
#               metrics=['acc'], target=[target])
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.load_weights(model_path)

# ------------ Unsupervised Attacking -------------
y_test = y_test.flatten()
y_test_pre = np.argmax(model.predict(x_test), axis=1)

ch_model = KerasModelWrapper(model)
deepfool = DeepFool(ch_model, back='tf', sess=K.get_session())

raw_acc = np.sum(y_test_pre == y_test) / len(y_test_pre)
# print(np.sum(y_test_pre == y_test), y_test_pre.shape, y_test.shape)
print(raw_acc)

v, fool_list = universal_perturbation(model, deepfool, x_train)
print(v, '\n', fool_list)
adv_x = x_test + v
print(v.shape)
# np.savez('adv_x_test.npz', x=adv_x)

# df_params = {'max_iter': 50, 'nb_candidate': 3,
#              'clip_min': -1., 'clip_max': 1.}
# adv_x = deepfool.generate_np(x_test, **df_params)
# v = adv_x - x_test
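# `universal_perturbation` is not shown in this snippet; the stub below is an
# illustrative guess at its contract in the spirit of Moosavi-Dezfooli et al.
# (2017): accumulate per-sample DeepFool perturbations into one universal
# vector. The real algorithm also projects v onto a norm ball, which this
# simplified sketch omits.
def universal_perturbation(model, deepfool, X, target_rate=0.8, max_rounds=5):
    v = np.zeros_like(X[0:1])
    fool_rates = []
    for _ in range(max_rounds):
        for i in range(len(X)):
            xi = X[i:i + 1] + v
            if np.argmax(model.predict(xi)) == np.argmax(model.predict(X[i:i + 1])):
                # Push this sample across the boundary and fold the extra
                # perturbation into v
                adv = deepfool.generate_np(xi)
                v += adv - xi
        base = np.argmax(model.predict(X), axis=1)
        pert = np.argmax(model.predict(X + v), axis=1)
        rate = np.mean(pert != base)
        fool_rates.append(rate)
        if rate >= target_rate:
            break
    return v, fool_rates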
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64,
                   num_threads=None):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    dp_params = {'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)
        init = tf.group(tf.global_variables_initializer(),
                        tf.local_variables_initializer())
        sess.run(init)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Record the gap between the largest and second-largest predicted
        # probability for each legitimate test example
        s = []
        for i in range(0, len(X_test), 1):
            pred = sess.run(preds, {x: X_test[i:i + 1]})
            print(pred)
            print(Y_test[i:i + 1])
            s.append(np.sort(pred)[0, -1] - np.sort(pred)[0, -2])

        # Draw a histogram
        def draw_hist(myList, Title, Xlabel, Ylabel):
            plt.hist(myList, np.arange(0, 1, 0.01), normed=True,
                     stacked=True, facecolor='blue')
            plt.xlabel(Xlabel)
            plt.ylabel(Ylabel)
            plt.title(Title)
            plt.show()

        draw_hist(myList=s, Title='legitimate',
                  Xlabel='difference between max and second largest',
                  Ylabel='Probability')

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the DeepFool attack object and graph
        deepfool = DeepFool(model, back='tf', sess=sess)
        adv_x = deepfool.generate(x, **dp_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        # (the same confidence-gap histogram can be drawn for adversarial
        # examples by replacing `preds` with `preds_adv` in the loop above)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

    return report
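# The per-example loop in mnist_tutorial above runs one sess.run per test
# image; the same confidence gaps can be computed in one pass. This sketch
# assumes `sess`, `preds`, `x`, and `X_test` from the function body are in
# scope.
probs_all = sess.run(preds, {x: X_test})
sorted_probs = np.sort(probs_all, axis=1)           # ascending per row
gaps = sorted_probs[:, -1] - sorted_probs[:, -2]    # max minus runner-up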
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   train_dir=TRAIN_DIR, filename=FILENAME,
                   load_model=LOAD_MODEL, testing=True, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to "
            "channels_last format")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Craft DeepFool adversarial examples for the full train and test sets
    df = DeepFool(wrap, sess=sess)
    adv_x = df.generate(x)

    batch = 1000
    x_adv_test = None
    x_adv_train = None
    for i in tqdm(range(int(len(x_test) / batch))):
        tmp = sess.run(adv_x, feed_dict={x: x_test[i * batch:(i + 1) * batch]})
        if x_adv_test is None:
            x_adv_test = tmp
        else:
            x_adv_test = np.concatenate((x_adv_test, tmp))
    for i in tqdm(range(int(len(x_train) / batch))):
        tmp = sess.run(adv_x,
                       feed_dict={x: x_train[i * batch:(i + 1) * batch]})
        if x_adv_train is None:
            x_adv_train = tmp
        else:
            x_adv_train = np.concatenate((x_adv_train, tmp))

    def evaluate_adv():
        # Evaluate the accuracy of the MNIST model on the DeepFool
        # adversarial test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_adv_test, y_test,
                         args=eval_params)
        report.clean_train_adv_eval = acc
        print('Test accuracy on adversarial examples: %0.4f' % acc)

    evaluate_adv()

    save_list = [x_adv_train, x_adv_test]
    print(x_adv_train.shape)
    print(x_adv_test.shape)
    pickle.dump(save_list, open("./df.pkl", 'wb'))
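# The batching loops above drop the remainder when the dataset size is not a
# multiple of `batch` (harmless for MNIST's 10000/60000 with batch=1000).
# A remainder-safe sketch of the same pattern:
def run_in_batches(sess, op, x_ph, data, batch=1000):
    out = []
    for i in range(0, len(data), batch):
        # the final slice may be shorter than `batch`; numpy handles it
        out.append(sess.run(op, feed_dict={x_ph: data[i:i + batch]}))
    return np.concatenate(out, axis=0)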
def adv_generate(nb_epochs=25, batch_size=128, learning_rate=0.001,
                 clean_train=True, testing=False, nb_filters=64,
                 num_threads=None, data='cifar', adv_attack='fgsm',
                 save_dir='data'):
    """
    Train a model and generate adversarial examples for MNIST or CIFAR-10.
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before crafting adversarial examples
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param data: 'mnist' or 'cifar'
    :param adv_attack: attack used to craft the examples
    :param save_dir: directory where the adversarial sets are written
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    config = tf.ConfigProto(**config_args)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    if data == "mnist":
        # Get MNIST test data
        X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                      train_end=60000,
                                                      test_start=0,
                                                      test_end=10000)
    else:
        X_train, Y_train, X_test, Y_test = data_cifar10()

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    if data == 'mnist':
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Train the model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    rng = np.random.RandomState([2018, 7, 18])

    if clean_train:
        if data == 'mnist':
            model = build_model(0.01, 1e-6)
        else:
            model = build_model_cifar(0.01, 1e-6)
        preds = model(x)

        def evaluate():
            # Evaluate the accuracy of the model on legitimate test examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == 10000, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

    # Initialize the attack object and graph
    if adv_attack == "FGSM":
        print("FGSM ATTACK...")
        fgsm_params = {'eps': 0.1, 'clip_min': 0., 'clip_max': 1.}
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model(adv_x)
    elif adv_attack == "CWL2":
        print("CWL2 ATTACK...")
        cwl2_params = {'batch_size': 8}
        cwl2 = CarliniWagnerL2(model, sess=sess)
        adv_x = cwl2.generate(x, **cwl2_params)
        preds_adv = model(adv_x)
    elif adv_attack == "JSMA":
        print("JSMA ATTACK...")
        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        jsma_params = {'theta': 1., 'gamma': 0.1,
                       'clip_min': 0., 'clip_max': 1.}
        adv_x = jsma.generate(x, **jsma_params)
        preds_adv = model(adv_x)
    elif adv_attack == "DeepFool":
        print("DeepFool ATTACK...")
        deepfool = DeepFool(model, sess=sess)
        deepfool_params = {'nb_candidate': 10, 'overshoot': 0.02,
                           'max_iter': 50, 'clip_min': 0.0, 'clip_max': 1.0}
        adv_x = deepfool.generate(x, **deepfool_params)
        preds_adv = model(adv_x)
    elif adv_attack == "LBFGS":
        print("LBFGS ATTACK...")
        lbfgs_params = {'y_target': y, 'batch_size': 100}
        lbfgs = LBFGS(model, sess=sess)
        adv_x = lbfgs.generate(x, **lbfgs_params)
        preds_adv = model(adv_x)

    # Craft adversarial examples for the train and test sets
    eval_par = {'batch_size': batch_size}
    adv_imgs = []
    adv_imgs_test = []
    if not adv_attack == "LBFGS":
        for i in range(5000):
            adv_imgs_train, _ = sess.run(
                [adv_x, preds_adv],
                feed_dict={x: X_train[i * 10:(i + 1) * 10]})
            adv_imgs.append(adv_imgs_train)
        adv_imgs = np.vstack(adv_imgs)
        print(adv_imgs.shape)
        for i in range(1000):
            adv_imgs_tmp, _ = sess.run(
                [adv_x, preds_adv],
                feed_dict={x: X_test[i * 10:(i + 1) * 10]})
            adv_imgs_test.append(adv_imgs_tmp)
        adv_imgs_test = np.vstack(adv_imgs_test)
    else:
        for i in range(500):
            # Target the next class (mod 10) for the targeted LBFGS attack
            target = np_utils.to_categorical(
                (np.argmax(Y_train[i * 100:(i + 1) * 100], axis=1) + 1) % 10,
                10)
            adv_imgs_train, _ = sess.run(
                [adv_x, preds_adv],
                feed_dict={x: X_train[i * 100:(i + 1) * 100], y: target})
            print('train image: %s' % str(i))
            adv_imgs.append(adv_imgs_train)
        adv_imgs = np.vstack(adv_imgs)
        print(adv_imgs.shape)
        for i in range(100):
            target = np_utils.to_categorical(
                (np.argmax(Y_test[i * 100:(i + 1) * 100], axis=1) + 1) % 10,
                10)
            adv_imgs_tmp, _ = sess.run(
                [adv_x, preds_adv],
                feed_dict={x: X_test[i * 100:(i + 1) * 100], y: target})
            adv_imgs_test.append(adv_imgs_tmp)
            print('test image: %s' % str(i))
        adv_imgs_test = np.vstack(adv_imgs_test)

    label_truth_train = np.argmax(Y_train, axis=1)
    label_truth_test = np.argmax(Y_test, axis=1)

    save_dir = os.path.join(save_dir, adv_attack)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    print(adv_imgs.shape, adv_imgs_test.shape)
    provider.save_h5(adv_imgs, label_truth_train,
                     os.path.join(save_dir, "train_adv.h5"))
    provider.save_h5(adv_imgs_test, label_truth_test,
                     os.path.join(save_dir, "test_adv.h5"))

    # Evaluate the accuracy of the model on adversarial examples
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculate training error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    return report
def train(alpha, eps2_ratio, gen_ratio, fgsm_eps, LR, logfile): logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , eps2_ratio \t %d , gen_ratio \t %d \n"%(fgsm_eps, LR, alpha, eps2_ratio, gen_ratio)) ############################# ##Hyper-parameter Setting#### ############################# hk = 256; #number of hidden units at the last layer Delta2 = (14*14+2)*25; #global sensitivity for the first hidden layer Delta3_adv = 2*hk #10*(hk + 1/4 * hk**2) #10*(hk) #global sensitivity for the output layer Delta3_benign = 2*hk #10*(hk); #global sensitivity for the output layer D = 50000; #size of the dataset L = 2499; #batch size image_size = 28; padding = 4; #numHidUnits = 14*14*32 + 7*7*64 + M + 10; #number of hidden units #gen_ratio = 1 epsilon1 = 0.0; #0.175; #epsilon for dpLRP epsilon2 = 0.1*(1 + gen_ratio); #epsilon for the first hidden layer epsilon3 = 0.1*(1); #epsilon for the last hidden layer total_eps = epsilon1 + epsilon2 + epsilon3 print(total_eps) uncert = 0.1; #uncertainty modeling at the output layer infl = 1; #inflation rate in the privacy budget redistribution R_lowerbound = 1e-5; #lower bound of the LRP c = [0, 40, 50, 200] #norm bounds epochs = 200; #number of epochs preT_epochs = 50; #number of epochs T = int(D/L*epochs + 1); #number of steps T pre_T = int(D/L*preT_epochs + 1); step_for_epoch = int(D/L); #number of steps for one epoch broken_ratio = 1 #alpha = 9.0 # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] #eps2_ratio = 10; # [1/10, 1/8, 1/6, 1/4, 1/2, 1, 2, 4, 6, 8, 10] #eps_benign = 1/(1+eps2_ratio)*(2*epsilon2) #eps_adv = eps2_ratio/(1+eps2_ratio)*(2*epsilon2) #fgsm_eps = 0.1 rand_alpha = 0.05 ##Robustness## robustness_T = (fgsm_eps*18*18*L*epsilon2)/Delta2; #### LRPfile = os.getcwd() + '/Relevance_R_0_075.txt'; ############################# mnist = input_data.read_data_sets("MNIST_data/", one_hot = True); ############################# ##Construct the Model######## ############################# #Step 4: Randomly initiate the noise, Compute 1/|L| * Delta3 for the output layer# #Compute the 1/|L| * Delta3 for the last hidden layer# """eps3_ratio = Delta3_adv/Delta3_benign; eps3_benign = 1/(1+eps3_ratio)*(epsilon3) eps3_adv = eps3_ratio/(1+eps3_ratio)*(epsilon3)""" loc, scale3_benign, scale3_adv = 0., Delta3_benign/(epsilon3*L), Delta3_adv/(epsilon3*L); ### #End Step 4# # Parameters Declarification W_conv1 = weight_variable('W_conv1', [5, 5, 1, 32], collect=[AECODER_VARIABLES]); b_conv1 = bias_variable('b_conv1', [32], collect=[AECODER_VARIABLES]); shape = W_conv1.get_shape().as_list() w_t = tf.reshape(W_conv1, [-1, shape[-1]]) w = tf.transpose(w_t) sing_vals = tf.svd(w, compute_uv=False) sensitivity = tf.reduce_max(sing_vals) gamma = 2*(14*14 + 2)*25/(L*sensitivity) dp_epsilon=1.0 #0.1 delta_r = fgsm_eps*(image_size**2); #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon) W_conv2 = weight_variable('W_conv2', [5, 5, 32, 64], collect=[CONV_VARIABLES]); b_conv2 = bias_variable('b_conv2', [64], collect=[CONV_VARIABLES]); W_fc1 = weight_variable('W_fc1', [4 * 4 * 64, hk], collect=[CONV_VARIABLES]); b_fc1 = bias_variable('b_fc1', [hk], collect=[CONV_VARIABLES]); W_fc2 = weight_variable('W_fc2', [hk, 10], collect=[CONV_VARIABLES]); b_fc2 = bias_variable('b_fc2', [10], collect=[CONV_VARIABLES]); """scale2 = tf.Variable(tf.ones([hk])) beta2 = tf.Variable(tf.zeros([hk])) tf.add_to_collections([CONV_VARIABLES], scale2) 
tf.add_to_collections([CONV_VARIABLES], beta2)""" params = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] ### #Step 5: Create the model# noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]); adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]); keep_prob = tf.placeholder(tf.float32); x = tf.placeholder(tf.float32, [None, image_size*image_size]); x_image = tf.reshape(x, [-1,image_size,image_size,1]); #perturbFMx = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28) #perturbFMx = np.reshape(perturbFMx, [-1, 28, 28, 1]); # pretrain ### #Enc_Layer1 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) #pretrain = Enc_Layer1.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, epsilon = 2*epsilon2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise) ########### adv_x = tf.placeholder(tf.float32, [None, image_size*image_size]); adv_image = tf.reshape(adv_x, [-1,image_size,image_size,1]); #perturbFMx_adv = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28) #perturbFMx_adv = np.reshape(perturbFMx_adv, [-1, 28, 28, 1]); # pretrain adv ### #perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*32) #perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]); FM_h = tf.placeholder(tf.float32, [None, 14, 14, 32]); Enc_Layer2 = EncLayer(inpt=adv_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = adv_noise, perturbFM_h = FM_h) Enc_Layer3 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise, perturbFM_h = FM_h) ########### x_image += noise; x_image = tf.clip_by_value(x_image, -10, 10) #Clip the values of each input feature. adv_image += adv_noise; adv_image = tf.clip_by_value(adv_image, -10, 10) #Clip the values of each input feature. 
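    # Functional-mechanism noise: perturbFM below is Laplace noise with scale
    # Delta3_benign/(epsilon3*L) (scale3_benign above), drawn once and injected
    # into the output layer so the label-dependent loss coefficients are
    # perturbed, as derived in the Taylor-expansion notes further down.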
#perturbFM = np.random.laplace(0.0, scale3_benign, hk) #perturbFM = np.reshape(perturbFM, [hk]); perturbFM = np.random.laplace(0.0, scale3_benign, hk * 10) perturbFM = np.reshape(perturbFM, [hk, 10]); y_conv = inference(x_image, perturbFM, hk, FM_h, params); softmax_y_conv = tf.nn.softmax(y_conv) #robust_mask = inference_robust_mask(y_conv, Delta2, L, epsilon2, robustness_T) #perturbFM = np.random.laplace(0.0, scale3_adv, hk) #perturbFM = np.reshape(perturbFM, [hk]); y_adv_conv = inference(adv_image, perturbFM, hk, FM_h, params); #adv_robust_mask = inference_robust_mask(y_adv_conv, Delta2, L, epsilon2, robustness_T) # test model perturbFM_test = np.random.laplace(0.0, 0, hk) perturbFM_test = np.reshape(perturbFM_test, [hk]); x_test = tf.reshape(x, [-1,image_size,image_size,1]); y_test = inference(x_test, perturbFM_test, hk, FM_h, params); #test_robust_mask = inference_robust_mask(y_test, Delta2, L, epsilon2, robustness_T) #Define a placeholder for the output label# y_ = tf.placeholder(tf.float32, [None, 10]); adv_y_ = tf.placeholder(tf.float32, [None, 10]); #End Step 5# ############################# ############################# ##Define loss and Optimizer## ############################# ''' Computes differentially private sigmoid cross entropy given `logits`. Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For brevity, let `x = logits`, `z = labels`. The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) `logits` and `labels` must have the same type and shape. Let neg_abs_logits denote -abs(y_conv) = -abs(h_fc1 * W_fc2). By applying Taylor expansion, we have: Taylor = max(y_conv, 0) - y_conv * y_ + log(1 + exp(-abs(y_conv))); = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) = F1 + F2 where: F1 = max(h_fc1 * W_fc2, 0) + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) and F2 = - (y_ * h_fc1) * W_fc2 To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the term y_ * h_fc1 * W_fc2. Note that h_fc1 is differentially private, since its computation on top of the DP affine transformation does not access the original data. Therefore, F1 should be differentially private. We need to preserve DP in F2, which reads the ground-truth label y_, as follows: By applying the Functional Mechanism, we perturb (y_ * h_fc1) * W_fc2 as ((y_ * h_fc1) + perturbFM) * W_fc2 = (y_ * h_fc1)*W_fc2 + (perturbFM * W_fc2): perturbFM = np.random.laplace(0.0, scale3, hk * 10) perturbFM = np.reshape(perturbFM/L, [hk, 10]); where scale3 = Delta3/(epsilon3) = 2*hk/(epsilon3); To allow computing gradients at zero, we define custom versions of the max and abs functions [Tensorflow]. 
Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow ''' ### Taylor for benign x zeros = array_ops.zeros_like(y_conv, dtype=y_conv.dtype) cond = (y_conv >= zeros) relu_logits = array_ops.where(cond, y_conv, zeros) neg_abs_logits = array_ops.where(cond, -y_conv, y_conv) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor_benign = math_ops.add(relu_logits - y_conv * y_, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) - tf.reduce_sum(perturbFM*W_fc2) #Taylor_benign = tf.abs(y_conv - y_) ### Taylor for adv_x zeros_adv = array_ops.zeros_like(y_adv_conv, dtype=y_conv.dtype) cond_adv = (y_adv_conv >= zeros_adv) relu_logits_adv = array_ops.where(cond_adv, y_adv_conv, zeros_adv) neg_abs_logits_adv = array_ops.where(cond_adv, -y_adv_conv, y_adv_conv) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor_adv = math_ops.add(relu_logits_adv - y_adv_conv * adv_y_, math.log(2.0) + 0.5*neg_abs_logits_adv + 1.0/8.0*neg_abs_logits_adv**2) - tf.reduce_sum(perturbFM*W_fc2) #Taylor_adv = tf.abs(y_adv_conv - adv_y_) ### Adversarial training loss adv_loss = (1/(L + L*alpha))*(Taylor_benign + alpha * Taylor_adv) '''Sometimes, learning rate decay can help stabilize the training process. However, use it carefully, since it may affect convergence speed.''' global_step = tf.Variable(0, trainable=False) pretrain_var_list = tf.get_collection(AECODER_VARIABLES) train_var_list = tf.get_collection(CONV_VARIABLES) #print(pretrain_var_list) #print(train_var_list) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): pretrain_step = tf.train.AdamOptimizer(LR).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list); train_step = tf.train.AdamOptimizer(LR).minimize(adv_loss, global_step=global_step, var_list=train_var_list); sess = tf.InteractiveSession(); # Define the correct prediction and accuracy # This needs to be changed to "Robust Prediction" correct_prediction_x = tf.equal(tf.argmax(y_test,1), tf.argmax(y_,1)); accuracy_x = tf.reduce_mean(tf.cast(correct_prediction_x, tf.float32)); ############# # use these to get predictions wrt robust conditions """robust_correct_prediction_x = tf.multiply(test_robust_mask, tf.cast(correct_prediction_x, tf.float32)) accuracy_x_robust = tf.reduce_sum(robust_correct_prediction_x) / tf.reduce_sum(test_robust_mask) #certified_utility = 2/(1/accuracy_x_robust + 1/(tf.reduce_sum(test_robust_mask)/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32)))) certified_utility = (1.0*tf.reduce_sum(test_robust_mask))/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))""" ############# # craft adversarial samples from x for training dynamic_eps = tf.placeholder(tf.float32); emsemble_L = int(L/3) softmax_y = tf.nn.softmax(y_test) #c_x_adv = fgsm(x, softmax_y, eps=fgsm_eps, clip_min=0.0, clip_max=1.0) c_x_adv = fgsm(x, softmax_y, eps=(dynamic_eps)/10, clip_min=-1.0, clip_max=1.0) # for I-FGSM x_adv = tf.reshape(c_x_adv, [emsemble_L,image_size*image_size]); #====================== attack ========================= #attack_switch = {'randfgsm':True, 'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True} #attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True} attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 
'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':False} #other possible attacks: # ElasticNetMethod # FastFeatureAdversaries # LBFGS # SaliencyMapMethod # VirtualAdversarialMethod # y_test = logits (before softmax) # softmax_y_test = preds (probs, after softmax) softmax_y_test = tf.nn.softmax(y_test) # create saver saver = tf.train.Saver(tf.all_variables()) sess.run(W_conv1.initializer) _gamma = sess.run(gamma) _gamma_x = Delta2/L epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x) print(epsilon2_update/_gamma + epsilon2_update/_gamma_x) print(epsilon2_update) _sensitivityW = sess.run(sensitivity) delta_h = _sensitivityW*(14**2) dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon) ############################# iterativeStep = 100 # load the most recent model _global_step = 0 ckpt = tf.train.get_checkpoint_state(os.getcwd() + '/tmp/train') if ckpt and ckpt.model_checkpoint_path: print(ckpt.model_checkpoint_path); saver.restore(sess, ckpt.model_checkpoint_path) _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) else: print('No checkpoint file found') start_time = time.time(); # adv pretrain model (Auto encoder layer) cost = tf.reduce_sum(Enc_Layer2.cost); logfile.write("pretrain: \n") # define cleverhans abstract models for using cleverhans attacks ch_model_logits = CustomCallableModelWrapper(callable_fn=inference_test_input, output_layer='logits', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise) ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise) # rand+fgsm # if attack_switch['randfgsm']: # randfgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess) # x_randfgsm_t = (fgsm_eps - rand_alpha) * randfgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0) # x_rand_t = rand_alpha * tf.sign(tf.random_normal(shape=tf.shape(x), mean=0.0, stddev=1.0)) # define each attack method's tensor mu_alpha = tf.placeholder(tf.float32, [1]); attack_tensor_dict = {} # FastGradientMethod if attack_switch['fgsm']: print('creating attack tensor of FastGradientMethod') fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess) #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now attack_tensor_dict['fgsm'] = x_adv_test_fgsm # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init) # default: eps_iter=0.05, nb_iter=10 if attack_switch['ifgsm']: print('creating attack tensor of BasicIterativeMethod') ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm # Deepfool if attack_switch['deepfool']: print('creating attack tensor of DeepFool') deepfool_obj = DeepFool(model=ch_model_logits, sess=sess) #x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, 
clip_min=-1.0, clip_max=1.0) attack_tensor_dict['deepfool'] = x_adv_test_deepfool # MomentumIterativeMethod # default: eps_iter=0.06, nb_iter=10 if attack_switch['mim']: print('creating attack tensor of MomentumIterativeMethod') mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, decay_factor=1.0, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['mim'] = x_adv_test_mim # SPSA # note: here the epsilon is the infinity norm instead of a percentage of the perturbation # Maybe exclude this method first, since it seems to have some constraints on the data value range if attack_switch['spsa']: print('creating attack tensor of SPSA') spsa_obj = SPSA(model=ch_model_logits, sess=sess) #x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1, ord=2) x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1) attack_tensor_dict['spsa'] = x_adv_test_spsa # CarliniWagnerL2 # confidence=0 is from their paper # it is said to be slow, maybe exclude it first if attack_switch['cwl2']: print('creating attack tensor of CarliniWagnerL2') cwl2_obj = CarliniWagnerL2(model=ch_model_logits, sess=sess) #x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['cwl2'] = x_adv_test_cwl2 # MadryEtAl (Projected Gradient with random init, same as rand+fgsm) # default: eps_iter=0.01, nb_iter=40 if attack_switch['madry']: print('creating attack tensor of MadryEtAl') madry_obj = MadryEtAl(model=ch_model_probs, sess=sess) #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['madry'] = x_adv_test_madry # SpatialTransformationMethod # the params are pretty different from those in the paper # so I use the defaults # excluded since there's a bug if attack_switch['stm']: print('creating attack tensor of SpatialTransformationMethod') stm_obj = SpatialTransformationMethod(model=ch_model_probs, sess=sess) #x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6, ord=2) x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6) attack_tensor_dict['stm'] = x_adv_test_stm #====================== attack ========================= sess.run(tf.initialize_all_variables()); ##perturb h for training perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) perturbFM_h = np.reshape(perturbFM_h, [-1, 
14, 14, 32]); ##perturb h for testing perturbFM_h_test = np.random.laplace(0.0, 0, 14*14*32) perturbFM_h_test = np.reshape(perturbFM_h_test, [-1, 14, 14, 32]); '''for i in range(_global_step, _global_step + pre_T): d_eps = random.random(); batch = mnist.train.next_batch(L); #Get a random batch. adv_images = sess.run(x_adv, feed_dict = {x:batch[0], y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps}) for iter in range(0, 9): adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps}) """batch = mnist.train.next_batch(emsemble_L) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]}) batch = mnist.train.next_batch(emsemble_L) adv_images_madry = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]}) train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)""" batch_2 = mnist.train.next_batch(L); pretrain_step.run(feed_dict={adv_x: np.append(adv_images, batch_2[0], axis = 0), adv_noise: AdvLnoise, FM_h: perturbFM_h}); if i % int(5*step_for_epoch) == 0: cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32) logfile.write("step \t %d \t %g \n"%(i, cost_value)) print(cost_value) pre_train_finish_time = time.time() print('pre_train finished in: ' + parse_time(pre_train_finish_time - start_time))''' # train and test model with adv samples max_benign_acc = -1; max_robust_benign_acc = -1 #max_adv_acc = -1; test_size = len(mnist.test.images) AdvLnoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); AdvLnoise_test = generateIdLMNoise(image_size, 0, epsilon2_update, test_size); Lnoise_empty = generateIdLMNoise(image_size, 0, epsilon2_update, L); BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); last_eval_time = -1 accum_time = 0 accum_epoch = 0 max_adv_acc_dict = {} max_robust_adv_acc_dict = {} #max_robust_adv_utility_dict = {} for atk in attack_switch.keys(): if atk not in max_adv_acc_dict: max_adv_acc_dict[atk] = -1 max_robust_adv_acc_dict[atk] = -1 for i in range(_global_step, _global_step + T): # this batch is for generating adv samples batch = mnist.train.next_batch(emsemble_L); #Get a random batch. y_adv_batch = batch[1] #Print out the evaluation results every 10 epochs (matching the condition below). 
if i % int(10*step_for_epoch) == 0 and i > int(10*step_for_epoch): cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32) print(cost_value) if last_eval_time < 0: last_eval_time = time.time() #===================benign samples===================== predictions_form_argmax = np.zeros([test_size, 10]) #test_bach = mnist.test.next_batch(test_size) softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: BenignLNoise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 1): _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]); for j in range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: (BenignLNoise + _BenignLNoise/2), FM_h: (perturbFM_h + _perturbFM_h/2)}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult) is_robust.append(robustness_from_argmax >= fgsm_eps) acc = np.sum(is_correct)*1.0/test_size robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_utility = np.sum(is_robust)*1.0/test_size max_benign_acc = max(max_benign_acc, acc) max_robust_benign_acc = max(max_robust_benign_acc, robust_acc*robust_utility) log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(i, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility) #===================adv samples===================== #log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(i, total_eps) """adv_images_dict = {} for atk in attack_switch.keys(): if attack_switch[atk]: adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_:mnist.test.labels}) print("Done with the generating of Adversarial samples")""" #===================adv samples===================== adv_acc_dict = {} robust_adv_acc_dict = {} robust_adv_utility_dict = {} for atk in attack_switch.keys(): if atk not in adv_acc_dict: adv_acc_dict[atk] = -1 robust_adv_acc_dict[atk] = -1 robust_adv_utility_dict[atk] = -1 if attack_switch[atk]: adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_: mnist.test.labels, adv_noise: AdvLnoise_test, mu_alpha:[fgsm_eps]}) ### PixelDP Robustness ### predictions_form_argmax = np.zeros([test_size, 10]) softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 2000): if n_draws % 1000 == 0: print(n_draws) _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]); for j in range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = 
softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (perturbFM_h + _perturbFM_h/2)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (BenignLNoise + _BenignLNoise/2), FM_h: perturbFM_h}) #softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (_perturbFM_h)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult) is_robust.append(robustness_from_argmax >= fgsm_eps) adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_size robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_size ############################## for atk in attack_switch.keys(): if attack_switch[atk]: # added robust prediction log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk]) max_adv_acc_dict[atk] = max(max_adv_acc_dict[atk], adv_acc_dict[atk]) max_robust_adv_acc_dict[atk] = max(max_robust_adv_acc_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk]) print(log_str) logfile.write(log_str + '\n') # logfile.write("step \t %d \t %g \t %g \n"%(i, benign_acc, adv_acc)) # print("step \t %d \t %g \t %g"%(i, benign_acc, adv_acc)); # estimate end time """if i > 0 and i % int(10*step_for_epoch) == 0: current_time_interval = time.time() - last_eval_time last_eval_time = time.time() print('during last eval interval, {} epoch takes {}'.format(10, parse_time(current_time_interval))) accum_time += current_time_interval accum_epoch += 10 estimate_time = ((_global_step + T - i) / step_for_epoch) * (accum_time / accum_epoch) print('estimate finish in: {}'.format(parse_time(estimate_time)))""" #print("step \t %d \t adversarial test accuracy \t %g"%(i, accuracy_x.eval(feed_dict={x: adv_images, y_: mnist.test.labels, noise: Lnoise_empty}))); """checkpoint_path = os.path.join(os.getcwd() + '/tmp/train', 'model.ckpt') saver.save(sess, checkpoint_path, global_step=i);""" d_eps = random.random(); y_adv = batch[1] adv_images = sess.run(attack_tensor_dict['ifgsm'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) """for iter in range(0, 9): adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})""" batch = mnist.train.next_batch(emsemble_L) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) y_adv = np.append(y_adv, batch[1], axis = 0) batch = mnist.train.next_batch(emsemble_L) adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) y_adv = np.append(y_adv, batch[1], axis = 0) train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0) batch = mnist.train.next_batch(L); #Get a random batch. 
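    # Ensemble adversarial batch: train_images concatenates three emsemble_L-sized
    # batches crafted with I-FGSM, MIM, and MadryEtAl, each using a fresh random
    # eps (d_eps, fed via mu_alpha), so together they match the benign batch size L.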
# train with benign and adv samples pretrain_step.run(feed_dict={adv_x: train_images, x: batch[0], adv_noise: AdvLnoise_test, noise: BenignLNoise, FM_h: perturbFM_h}); train_step.run(feed_dict={x: batch[0], adv_x: train_images, y_: batch[1], adv_y_: y_adv, noise: BenignLNoise, adv_noise: AdvLnoise_test, FM_h: perturbFM_h}); duration = time.time() - start_time; # print(parse_time(duration)); #print running time duration# max_acc_string = "max acc: benign: \t{:.4f} {:.4f}".format(max_benign_acc, max_robust_benign_acc) for atk in attack_switch.keys(): if attack_switch[atk]: max_acc_string += " {}: \t{:.4f} {:.4f}".format(atk, max_adv_acc_dict[atk], max_robust_adv_acc_dict[atk]) logfile.write(max_acc_string + '\n') logfile.write(str(duration) + '\n')
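# A standalone sanity check (not used by train above) of the second-order Taylor
# approximation from the derivation in the docstring: the numerically stable
# sigmoid cross entropy max(x, 0) - x*z + log(1 + exp(-|x|)) versus the polynomial
# surrogate that replaces log(1 + exp(-|x|)) with log(2) + 0.5*(-|x|) + (1/8)*(-|x|)**2.
# Assumes only numpy; the sample points are illustrative.
import numpy as np

def stable_sigmoid_xent(x, z):
    # Stable form, as in tf.nn.sigmoid_cross_entropy_with_logits.
    return np.maximum(x, 0) - x * z + np.log1p(np.exp(-np.abs(x)))

def taylor_sigmoid_xent(x, z):
    # Second-order Taylor surrogate used for the functional mechanism above.
    neg_abs = -np.abs(x)
    return np.maximum(x, 0) - x * z + (np.log(2.0) + 0.5 * neg_abs + neg_abs ** 2 / 8.0)

logits = np.linspace(-2.0, 2.0, 9)
labels = np.ones_like(logits)
# The gap stays below roughly 0.07 on [-2, 2] and grows for larger |logits|.
print(np.max(np.abs(stable_sigmoid_xent(logits, labels) - taylor_sigmoid_xent(logits, labels))))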
def get_adversarial_version(self, x, y=None, eps=0.3, iterations=100, attack='FGSM', targeted=False, y_tar=None, clip_min=0.0, clip_max=1.0, nb_candidate=10, num_params=100): """ Desc: Calculate the adversarial version of the points x using the selected attack x: matrix of n x input_shape samples y: matrix of n x input_label samples eps: perturbation size for FGSM-style attacks attack: one of 'FGSM', 'BIM', 'CW-l2', 'CW-l0', 'DF', 'JSMA' """ if self.dataset == 'cifar10': model = KerasModelWrapper(self.model) else: model = KerasModelWrapper(self.model.model) if attack == 'CW-l2': K.set_learning_phase(0) # Instantiate a CW attack object cw = CarliniWagnerL2(model, sess=self.sess) cw_params = { 'batch_size': 10, 'confidence': 0, 'learning_rate': 1e-2, 'binary_search_steps': 5, 'max_iterations': iterations, 'abort_early': True, 'initial_const': 1e-4, 'clip_min': 0.0, 'clip_max': 1.0 } x_adv = cw.generate_np(x, **cw_params) elif attack == 'CW-l0': K.set_learning_phase(0) # Instantiate a CW attack object cw = CarliniWagnerL0(model, sess=self.sess) cw_params = { 'batch_size': 1, 'confidence': 0., 'learning_rate': 1e-2, 'binary_search_steps': 5, 'max_iterations': iterations, 'abort_early': True, 'initial_const': 1e-4, 'clip_min': 0.0, 'clip_max': 1.0 } x_adv = cw.generate_np(x, **cw_params) elif attack == 'DF': K.set_learning_phase(0) df = DeepFool(model, sess=self.sess) df_params = {'nb_candidate': nb_candidate} x_adv = df.generate_np(x, **df_params) elif attack == 'JSMA': K.set_learning_phase(0) jsma = SaliencyMapMethod(model, sess=self.sess) jsma_params = { 'theta': 1., 'gamma': 0.03, 'clip_min': clip_min, 'clip_max': clip_max, 'y_target': y_tar } x_adv = jsma.generate_np(x, **jsma_params) elif attack == 'FGSM': K.set_learning_phase(0) fgsm = FastGradientMethod(model, sess=self.sess) fgsm_params = { 'eps': 0.15, 'clip_min': clip_min, 'clip_max': clip_max, 'y_target': y_tar } x_adv = fgsm.generate_np(x, **fgsm_params) elif attack == 'BIM': K.set_learning_phase(0) fgsm = BasicIterativeMethod(model, sess=self.sess) fgsm_params = { 'eps': 0.015, 'eps_iter': 0.005, 'nb_iter': 100, 'clip_min': clip_min, 'clip_max': clip_max, 'y_target': y_tar } x_adv = fgsm.generate_np(x, **fgsm_params) return x_adv
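# Hypothetical usage sketch for get_adversarial_version; `detector` and `x_batch`
# are illustrative names, not part of the original code. The instance is assumed
# to carry self.model, self.sess and self.dataset as set up elsewhere.
# x_fgsm = detector.get_adversarial_version(x_batch, attack='FGSM')
# x_cw = detector.get_adversarial_version(x_batch, iterations=1000, attack='CW-l2')
# x_df = detector.get_adversarial_version(x_batch, attack='DF', nb_candidate=10)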
def train_child(t, p, m, load_dict=False): # model = nn.DataParallel(TestCNN().cuda(1), device_ids=[1, 2, 3]) raw_model = TestCNN().cuda(0) model = TestCNN().cuda(0) tf_model = convert_pytorch_model_to_tf(model) cleverhans_model = CallableModelWrapper(tf_model, output_layer='logits') session = tf.Session() x_op = tf.placeholder(tf.float32, shape=(None, 3, 32, 32)) fgsm = FastGradientMethod(cleverhans_model, sess=session) # stm = SpatialTransformationMethod(cleverhans_model, sess=session) # cw2 = CarliniWagnerL2(cleverhans_model, sess=session) pgd = ProjectedGradientDescent(cleverhans_model, sess=session) noise = Noise(cleverhans_model, sess=session) mim = MomentumIterativeMethod(cleverhans_model, sess=session) df = DeepFool(cleverhans_model, sess=session) tf_raw_model = convert_pytorch_model_to_tf(raw_model) cleverhans_raw_model = CallableModelWrapper(tf_raw_model, output_layer='logits') pgd_raw = ProjectedGradientDescent(cleverhans_raw_model, sess=session) def fgsm_op(x, eps): att = fgsm.generate(x_op, eps=eps) return session.run(att, feed_dict={x_op: x}) # def stm_op(x, eps): # att = stm.generate(x_op, batch_size=len(x), dx_min=-0.1*eps, dx_max=0.1*eps, dy_min=-0.1*eps, dy_max=0.1*eps, angle_min=-30*eps, angle_max=30*eps) # return session.run(att, feed_dict={x_op: x}) # def cw2_op(x, eps): # att = cw2.generate(x_op, max_iterations=3) def pgd_op(x, eps): att = pgd.generate(x_op, eps=eps, eps_iter=eps * 0.2, nb_iter=3) return session.run(att, feed_dict={x_op: x}) def pgd_raw_op(x, eps): att = pgd_raw.generate(x_op, eps=eps, eps_iter=eps * 0.2, nb_iter=3) return session.run(att, feed_dict={x_op: x}) def noise_op(x, eps): att = noise.generate(x_op, eps=eps) return session.run(att, feed_dict={x_op: x}) def df_op(x): att = df.generate(x_op, nb_candidate=10, max_iter=3) return session.run(att, feed_dict={x_op: x}) def mim_op(x, eps): att = mim.generate(x_op, eps=eps, eps_iter=eps * 0.2) return session.run(att, feed_dict={x_op: x}) def attack_train(x): attacks = [fgsm_op, noise_op, mim_op] attacks_name = ['FGSM', 'Noise', 'MIM'] eps = [[0.03, 0.3], [0.03, 0.3], [0.03, 0.3]] train_x_adv = x.copy() adv_type = np.random.randint(SUBPOLICY_COUNT, size=len(train_x_adv)) for i, (ti, pi, mi) in enumerate( tqdm(zip(t, p, m), total=len(t), desc='Subpolicy: ', leave=False)): adv_i = train_x_adv[adv_type == i] for j, (tj, pj, mj) in enumerate( tqdm(zip(ti, pi, mi), total=len(ti), desc='Operation: ', leave=False)): tj, pj, mj = (*tj, *pj, *mj) op_idx = np.random.rand(len(adv_i)) < pj adv_j = adv_i[op_idx] for k in tqdm(range(0, len(adv_j), BATCH_SIZE), desc=attacks_name[tj] + ': ', leave=False): adv_j[k:][:BATCH_SIZE] = attacks[tj]( adv_j[k:][:BATCH_SIZE], (mj + 1) / MAGN_COUNT * (eps[tj][1] - eps[tj][0]) + eps[tj][0]) adv_i[op_idx] = adv_j train_x_adv[adv_type == i] = adv_i return train_x_adv optimizer = optim.Adam(model.parameters(), lr=1e-3) raw_optimizer = optim.Adam(raw_model.parameters(), lr=1e-3) adv_idx = np.random.rand(len(train_x)) < 0.5 train_x_adv = train_x.copy() train_x_adv[adv_idx] = attack_train(train_x_adv[adv_idx]) adv_trainset = torch.utils.data.TensorDataset( torch.tensor(train_x_adv, dtype=torch.float), torch.tensor(train_y, dtype=torch.long)) adv_trainloader = torch.utils.data.DataLoader(adv_trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4) if load_dict: model.load_state_dict(torch.load('partial_eval_runs/model.pt')) optimizer.load_state_dict(torch.load('partial_eval_runs/optimizer.pt')) raw_model.load_state_dict(torch.load('partial_eval_runs/raw_model.pt')) raw_optimizer.load_state_dict( torch.load('partial_eval_runs/raw_optimizer.pt')) 
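    # Two models are trained side by side: `model` on the attack-augmented loader
    # (subpolicies applied to roughly half of train_x), `raw_model` on clean data
    # as a baseline; both are evaluated below on clean and PGD-perturbed validation sets.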
model.train() batch_tqdm = tqdm(adv_trainloader, leave=False) for x, y in batch_tqdm: optimizer.zero_grad() output = model(x.cuda(0)) loss = criterion(output, y.cuda(0)) loss.backward() optimizer.step() acc = torch.sum(output.cpu().argmax(axis=1) == y) / y.size(0) batch_tqdm.set_description(f'adv {loss:.3f} {acc:.3f}') batch_tqdm = tqdm(trainloader, leave=False) raw_model.train() for x, y in batch_tqdm: raw_optimizer.zero_grad() output = raw_model(x.cuda(0)) loss = criterion(output, y.cuda(0)) loss.backward() raw_optimizer.step() acc = torch.sum(output.cpu().argmax(axis=1) == y) / y.size(0) batch_tqdm.set_description(f'raw {loss:.3f} {acc:.3f}') with torch.no_grad(): model.eval() batch_tqdm = tqdm(valloader, leave=False) tot_acc = 0 for x, y in batch_tqdm: output = model(x.cuda(0)) acc = float(torch.sum(output.cpu().argmax(axis=1) == y)) tot_acc += acc adv_raw_acc = tot_acc / len(val_x) val_x_adv = np.zeros_like(val_x) for i in tqdm(range(0, len(val_x_adv), BATCH_SIZE), desc='PGD: ', leave=False): val_x_adv[i:][:BATCH_SIZE] = pgd_op(val_x[i:][:BATCH_SIZE], 0.01) adv_valset = torch.utils.data.TensorDataset( torch.tensor(val_x_adv, dtype=torch.float), torch.tensor(val_y, dtype=torch.long)) adv_valloader = torch.utils.data.DataLoader(adv_valset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4) batch_tqdm = tqdm(adv_valloader, leave=False) tot_acc = 0 for x, y in batch_tqdm: output = model(x.cuda(0)) acc = float(torch.sum(output.cpu().argmax(axis=1) == y)) tot_acc += acc adv_adv_acc = tot_acc / len(val_x) raw_model.eval() batch_tqdm = tqdm(valloader, leave=False) tot_acc = 0 for x, y in batch_tqdm: output = raw_model(x.cuda(0)) acc = float(torch.sum(output.cpu().argmax(axis=1) == y)) tot_acc += acc raw_raw_acc = tot_acc / len(val_x) val_x_adv = np.zeros_like(val_x) for i in tqdm(range(0, len(val_x_adv), BATCH_SIZE), desc='PGD: ', leave=False): val_x_adv[i:][:BATCH_SIZE] = pgd_raw_op(val_x[i:][:BATCH_SIZE], 0.01) adv_valset = torch.utils.data.TensorDataset( torch.tensor(val_x_adv, dtype=torch.float), torch.tensor(val_y, dtype=torch.long)) adv_valloader = torch.utils.data.DataLoader(adv_valset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4) batch_tqdm = tqdm(adv_valloader, leave=False) tot_acc = 0 for x, y in batch_tqdm: output = raw_model(x.cuda(0)) acc = float(torch.sum(output.cpu().argmax(axis=1) == y)) tot_acc += acc raw_adv_acc = tot_acc / len(val_x) with open('partial_eval_runs/acc.csv', 'a') as f: f.write(f'{adv_raw_acc},{adv_adv_acc},{raw_raw_acc},{raw_adv_acc}\n') print( f'adv {adv_raw_acc:.3f} -> {adv_adv_acc:.3f} | raw {raw_raw_acc:.3f} -> {raw_adv_acc:.3f}' ) torch.save(model.state_dict(), 'partial_eval_runs/model.pt') torch.save(optimizer.state_dict(), 'partial_eval_runs/optimizer.pt') torch.save(raw_model.state_dict(), 'partial_eval_runs/raw_model.pt') torch.save(raw_optimizer.state_dict(), 'partial_eval_runs/raw_optimizer.pt')
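# A minimal, self-contained sketch of the PyTorch -> CleverHans bridge that
# train_child relies on. The toy linear model and shapes are assumptions (a
# stand-in for TestCNN; nn.Flatten needs torch>=1.2); the API calls mirror the
# ones used above rather than introducing anything new.
import numpy as np
import tensorflow as tf
import torch.nn as nn
from cleverhans.model import CallableModelWrapper
from cleverhans.attacks import FastGradientMethod
from cleverhans.utils_pytorch import convert_pytorch_model_to_tf

toy_model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10))  # assumed stand-in
ch_model = CallableModelWrapper(convert_pytorch_model_to_tf(toy_model), output_layer='logits')
session = tf.Session()
x_op = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))
adv_op = FastGradientMethod(ch_model, sess=session).generate(x_op, eps=0.03)
x_batch = np.random.rand(4, 3, 32, 32).astype(np.float32)  # placeholder data
x_adv = session.run(adv_op, feed_dict={x_op: x_batch})  # one FGSM batch, as in fgsm_op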
def mnist_tutorial_deepfool(train_start=0, train_end=60000, # use 60000 training examples test_start=0, test_end=10000, # use 10000 test examples viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist")): """ MNIST tutorial for DeepFool's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model (one epoch is one full forward and backward pass over the entire dataset) :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param attack_iterations: number of attack iterations :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions (28*28*1 images) img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_picklable_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2018, 8, 9]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path+".meta"): tf_model_load(sess, model_path) else: model_train(sess, x, y, preds, X_train, Y_train, args=train_params, save=os.path.exists("models"), rng=rng) print("save success") # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using DeepFool's approach ########################################################################### print('Crafting ' + str(source_samples) + ' adversarial examples') print("This could take some time ...") # Instantiate a DeepFool attack object deepfool = DeepFool(model, back='tf', sess=sess) idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][1] for i in range(10)] print("idxs:", idxs) # construct adv_inputs grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, 
dtype='f') print("grid_viz_data",grid_viz_data.shape) adv_inputs = X_test[idxs].reshape([-1,28,28,1]) deepfool_params = {'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': attack_iterations, 'nb_classes': 10, 'clip_min': 0., 'clip_max': 1.} adv = deepfool.generate_np(adv_inputs, **deepfool_params) print("adv success") adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs], args={'batch_size': 10}) for j in range(10): grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1.-adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
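# Standalone restatement of the distortion metric printed above: the average
# per-image L_2 norm of the perturbation, reduced over the (H, W, C) axes.
# The random batches here are placeholders, not MNIST data.
import numpy as np
clean = np.random.rand(10, 28, 28, 1)
adv_batch = clean + 0.01 * np.random.randn(*clean.shape)
mean_l2 = np.mean(np.sum((adv_batch - clean) ** 2, axis=(1, 2, 3)) ** .5)
print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2))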
class TestDeepFool(CleverHansTest): def setUp(self): super(TestDeepFool, self).setUp() import tensorflow as tf # The world's simplest neural network def my_model(x): W1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32) h1 = tf.nn.sigmoid(tf.matmul(x, W1)) W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32) res = tf.matmul(h1, W2) return res self.sess = tf.Session() self.model = my_model self.attack = DeepFool(self.model, sess=self.sess) def test_generate_np_gives_adversarial_example(self): x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) x_adv = self.attack.generate_np(x_val, overshoot=0.02, max_iter=50, nb_candidate=2, clip_min=-5, clip_max=5) orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1) new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1) self.assertTrue(np.mean(orig_labs == new_labs) < 0.1) def test_generate_gives_adversarial_example(self): import tensorflow as tf x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1) x = tf.placeholder(tf.float32, x_val.shape) x_adv_p = self.attack.generate(x, overshoot=0.02, max_iter=50, nb_candidate=2, clip_min=-5, clip_max=5) x_adv = self.sess.run(x_adv_p, {x: x_val}) new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1) self.assertTrue(np.mean(orig_labs == new_labs) < 0.1) def test_generate_np_gives_clipped_adversarial_examples(self): x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) x_adv = self.attack.generate_np(x_val, overshoot=0.02, max_iter=50, nb_candidate=2, clip_min=-0.2, clip_max=0.3) self.assertTrue(-0.201 < np.min(x_adv)) self.assertTrue(np.max(x_adv) < .301)
def JSMA(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) source_samples = batch_size # Use label smoothing # Hopefully this doesn't screw up JSMA... assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_par = {'batch_size': batch_size} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) print("#####Starting attacks on clean model#####") ################################################################# #Clean test against JSMA jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x = jsma.generate(x, **jsma_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on JSMA adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc) ################################################################ #Clean test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc) ################################################################ #Clean test against BIM bim_params = { 'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1. } bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x, **bim_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on BIM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc) ################################################################ #Clean test against EN en_params = { 'binary_search_steps': 1, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10 } en = ElasticNetMethod(model, back='tf', sess=sess) adv_x = en.generate(x, **en_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on EN adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on EN adversarial examples: %0.4f' % acc) ################################################################ #Clean test against DF deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 50, 'clip_min': 0., 'clip_max': 1. } deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on DF adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on DF adversarial examples: %0.4f' % acc) ################################################################ #Clean test against VAT vat_params = { 'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1. } vat = VirtualAdversarialMethod(model, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on VAT adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc) ################################################################ print("Repeating the process, using adversarial training\n") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) ################################################################# #Adversarial test against JSMA jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x = jsma.generate(x, **jsma_params) preds_adv_jsma = model.get_probs(adv_x) ################################################################ #Adversarial test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv_fgsm = model.get_probs(adv_x) ################################################################ #Adversarial test against BIM bim_params = { 'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1. 
} bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x, **bim_params) preds_adv_bim = model.get_probs(adv_x) ################################################################ #Adversarial test against EN en_params = { 'binary_search_steps': 5, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10 } en = ElasticNetMethod(model, back='tf', sess=sess) adv_x = en.generate(x, **en_params) preds_adv_en = model.get_probs(adv_x) ################################################################ #Adversarial test against DF deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 200, 'clip_min': 0., 'clip_max': 1. } deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv_df = model.get_probs(adv_x) ################################################################ #Adversarial test against VAT vat_params = { 'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1. } vat = VirtualAdversarialMethod(model, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv_vat = model.get_probs(adv_x) ################################################################ print("#####Evaluate trained model#####") def evaluate_2(): # Evaluate the accuracy of the MNIST model on JSMA adversarial examples acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test, args=eval_par) print('Test accuracy on JSMA adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test, args=eval_par) print('Test accuracy on FGSM adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on BIM adversarial examples acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test, args=eval_par) print('Test accuracy on BIM adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on EN adversarial examples acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test, args=eval_par) print('Test accuracy on EN adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on DF adversarial examples acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test, args=eval_par) print('Test accuracy on DF adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on VAT adversarial examples acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test, args=eval_par) print('Test accuracy on VAT adversarial examples: %0.4f\n' % acc) preds_2_adv = [ preds_adv_jsma # ,preds_adv_fgsm # ,preds_adv_bim # ,preds_adv_en # ,preds_adv_df ] model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, rng=rng)
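    # Note: the stock cleverhans model_train computes its adversarial loss from a
    # single predictions_adv tensor; passing a list as above presumably targets a
    # locally modified model_train that averages over several attacks. With the
    # unmodified utility, pass one tensor instead, e.g. predictions_adv=preds_adv_jsma.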
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False, rbf=False, fgsm=False, jsma=False, df=False, bim=False): """ Load model saved in model_name.json and model_name_weights.h5 and evaluate its accuracy on legitimate test samples and adversarial samples. Use cnn=True if the model is CNN based. """ # open text file and output accuracy results to it text_file = open("fmnist_results.txt", "w") # load saved model print("Load model ... ") ''' json = open('models/{}.json'.format(model_name), 'r') model = json.read() json.close() loaded_model = model_from_json(model) loaded_model.load_weights("models/{}_weights.h5".format(model_name)) ''' if rbf: loaded_model = load_model("rbfmodels/{}.h5".format(model_name), custom_objects={'RBFLayer': RBFLayer}) text_file.write('Evaluating on rbfmodels/{}.h5\n\n'.format(model_name)) else: loaded_model = load_model("models/{}.h5".format(model_name)) text_file.write('Evaluating on models/{}.h5\n\n'.format(model_name)) # Set placeholders if cnn: x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) else: x = tf.placeholder(tf.float32, shape=(None, 784)) y = tf.placeholder(tf.float32, shape=(None, 10)) predictions = loaded_model(x) accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args={"batch_size": 128}) text_file.write('Test accuracy on legitimate test examples: {0}\n'.format( str(accuracy))) #print('Test accuracy on legitimate test examples: ' + str(accuracy)) # Craft adversarial examples depending on the input parameters wrap = KerasModelWrapper(loaded_model) # FGSM if fgsm: fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3} adv_x = fgsm.generate(x, **fgsm_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the F-MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={"batch_size": 128}) text_file.write( 'Test accuracy on fgsm adversarial test examples: {0}\n'.format( str(accuracy))) #print('Test accuracy on fgsm adversarial test examples: ' + str(accuracy)) # JSMA if jsma: jsma = SaliencyMapMethod(wrap, sess=sess) jsma_params = { 'theta': 2., 'gamma': 0.145, 'clip_min': 0., 'clip_max': 1., 'y_target': None } adv_x = jsma.generate(x, **jsma_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the F-MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={"batch_size": 128}) text_file.write( 'Test accuracy on jsma adversarial test examples: {0}\n'.format( str(accuracy))) #print('Test accuracy on jsma adversarial test examples: ' + str(accuracy)) # DeepFool if df: df = DeepFool(wrap, sess=sess) df_params = {'nb_candidate': 10, 'max_iter': 50} adv_x = df.generate(x, **df_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the F-MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={"batch_size": 128}) text_file.write( 'Test accuracy on df adversarial test examples: {0}\n'.format( str(accuracy))) #print('Test accuracy on df adversarial test examples: ' + str(accuracy)) # Basic Iterative Method (implemented here via ProjectedGradientDescent) if bim: bim = ProjectedGradientDescent(wrap, sess=sess) bim_params = {'eps': 0.3} adv_x = bim.generate(x, **bim_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the F-MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={"batch_size": 128}) 
text_file.write( 'Test accuracy on bim adversarial test examples: {0}\n'.format( str(accuracy))) #print('Test accuracy on bim adversarial test examples: ' + str(accuracy)) print('Accuracy results outputted to fmnist_results.txt') text_file.close() # Close TF session sess.close()
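# Hypothetical driver for eval(); the model name and data variables are
# assumptions, not part of the original file.
# sess = tf.Session()
# keras.backend.set_session(sess)
# X_train, Y_train, X_test, Y_test = load_fmnist()  # illustrative loader
# eval(sess, 'fmnist_cnn', X_train, Y_train, X_test, Y_test,
#      cnn=True, fgsm=True, jsma=True, df=True, bim=True)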