def _init_data(self):
    hparams = self.hparams
    batch_size = hparams.batch_size
    if hparams.dataset == 'mnist':
        # Get MNIST test data
        X_train, Y_train, X_test, Y_test = data_mnist(
            train_start=hparams.train_start,
            train_end=hparams.train_end,
            test_start=hparams.test_start,
            test_end=hparams.test_end)
        input_shape = (batch_size, 28, 28, 1)
        preproc_func = None
    elif hparams.dataset == 'cifar10':
        X_train, Y_train, X_test, Y_test = cifar_input.read_CIFAR10(
            os.path.join(hparams.data_path, hparams.dataset))
        input_shape = (batch_size, 32, 32, 3)
        preproc_func = cifar_input.cifar_tf_preprocess
    elif hparams.dataset == 'svhn':
        X_train, Y_train, X_test, Y_test = svhn_input.read_SVHN(
            os.path.join(hparams.data_path, hparams.dataset))
        input_shape = (batch_size, 32, 32, 3)
        preproc_func = svhn_input.svhn_tf_preprocess
    else:
        raise ValueError('Unknown dataset: %s' % hparams.dataset)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    self.X_train = X_train
    self.Y_train = Y_train
    self.X_test = X_test
    self.Y_test = Y_test
    self.data = (X_train, Y_train, X_test, Y_test)
    self.input_shape = input_shape
    self.preproc_func = preproc_func
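
# A minimal sketch of the label-smoothing arithmetic applied above, for
# reference: clipping a one-hot row to [eps/9, 1 - eps] sends the single 1
# to 0.9 and each of the nine 0s to 0.1/9, so every row still sums to 1.
# Self-contained NumPy demonstration; 'demo_labels' is an illustrative
# name, not part of the tutorial code.
import numpy as np

label_smooth = .1
demo_labels = np.eye(10, dtype=np.float32)[[3, 7]]  # two one-hot rows
smoothed = demo_labels.clip(label_smooth / 9., 1. - label_smooth)
assert np.allclose(smoothed.sum(axis=1), 1.)  # rows remain valid distributions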
def main(argv):
    checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if checkpoint is None:
        raise ValueError("Couldn't find latest checkpoint in " +
                         FLAGS.checkpoint_dir)

    train_start = 0
    train_end = 60000
    test_start = 0
    test_end = 10000
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    assert Y_train.shape[1] == 10

    # NOTE: for compatibility with Madry Lab downloadable checkpoints,
    # we cannot enclose this in a scope or do anything else that would
    # change the automatic naming of the variables.
    model = MadryMNIST()

    x_input = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    y = tf.placeholder(tf.float32, shape=[None, 10])

    if FLAGS.attack_type == 'fgsm':
        fgsm = FastGradientMethod(model)
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
        adv_x = fgsm.generate(x_image, **fgsm_params)
    elif FLAGS.attack_type == 'bim':
        bim = BasicIterativeMethod(model)
        bim_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                      'nb_iter': 50, 'eps_iter': .01}
        adv_x = bim.generate(x_image, **bim_params)
    else:
        raise ValueError(FLAGS.attack_type)
    preds_adv = model.get_probs(adv_x)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, checkpoint)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': FLAGS.batch_size}
        t1 = time.time()
        acc = model_eval(
            sess, x_image, y, preds_adv, X_test, Y_test, args=eval_par)
        t2 = time.time()
        print("Took", t2 - t1, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
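
# For reference, the perturbation that FastGradientMethod computes above
# follows the FGSM update rule
#     x_adv = clip(x + eps * sign(grad_x J(x, y)), clip_min, clip_max),
# with BasicIterativeMethod simply iterating it with a smaller step. A
# minimal NumPy sketch of one step under that definition; 'demo_x' and
# 'demo_grad' are hypothetical placeholders, not tensors from this script.
import numpy as np

def fgsm_step(demo_x, demo_grad, eps=0.3, clip_min=0., clip_max=1.):
    """One FGSM step: move each pixel by eps along the loss-gradient sign."""
    return np.clip(demo_x + eps * np.sign(demo_grad), clip_min, clip_max)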
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=NB_FILTERS,
                   num_threads=None, attack_string=None):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example.
    :param train_end: index of last training set example.
    :param test_start: index of first test set example.
    :param test_end: index of last test set example.
    :param nb_epochs: number of epochs to train model.
    :param batch_size: size of training batches.
    :param learning_rate: learning rate for training.
    :param clean_train: perform normal training on clean examples only
        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
        to verify that performance is adequate.
    :param backprop_through_attack: If True, backprop through adversarial
        example construction process during adversarial training.
    :param nb_filters: number of filters in the CNN used for training.
    :param num_threads: number of threads used for running the process.
    :param attack_string: attack name for crafting adversarial attacks and
        adversarial training, in string format.
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Train an MNIST model
    model_path = "models/mnist"
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }

    # Initialize the attack object
    attack_class = attack_selection(attack_string)
    attack_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

    rng = np.random.RandomState([2018, 6, 18])

    if clean_train:
        model = ModelBasicCNNTFE(nb_filters=nb_filters)

        def evaluate_clean():
            """Evaluate the accuracy of the MNIST model on legitimate test
            examples
            """
            eval_params = {'batch_size': batch_size}
            acc = model_eval(model, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        train(model, X_train, Y_train, evaluate=evaluate_clean,
              args=train_params, rng=rng, var_list=model.get_params())

        if testing:
            # Calculate training error
            eval_params = {'batch_size': batch_size}
            acc = model_eval(model, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        attack = attack_class(model)
        acc = model_eval(
            model, X_test, Y_test, args=eval_par,
            attack=attack, attack_args=attack_params)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(
                model, X_train, Y_train, args=eval_par,
                attack=attack, attack_args=attack_params)
            print('Train accuracy on adversarial examples: %0.4f\n' % acc)
            report.train_clean_train_adv_eval = acc

        # Clear the previous Variables
        for var in model.get_params():
            var = None
        attack = None
        print("Repeating the process, using adversarial training")

    model_adv_train = ModelBasicCNNTFE(nb_filters=nb_filters)
    attack = attack_class(model_adv_train)

    def evaluate_adv():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(
            model_adv_train, X_test, Y_test, args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy
        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(
            model_adv_train, X_test, Y_test, args=eval_params,
            attack=attack, attack_args=attack_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(model_adv_train, X_train, Y_train,
          evaluate=evaluate_adv, args=train_params,
          rng=rng, var_list=model_adv_train.get_params(),
          attack=attack, attack_args=attack_params)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(
            model_adv_train, X_train, Y_train, args=eval_params,
            attack=None, attack_args=None)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(
            model_adv_train, X_train, Y_train, args=eval_params,
            attack=attack, attack_args=attack_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an MNIST model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    # model_2 = cnn_model()
    # model_2 = regression_model()
    model_2 = model
    predictions_2 = model_2(x)
    # adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    adv_x_2 = adv_x
    predictions_2_adv = model_2(adv_x_2)
    # fgsm2 = FastGradientMethod(model_2, sess=sess)
    # predictions_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: %0.4f' % accuracy)

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy_adv)

    # Perform and evaluate adversarial training
    train_params['nb_epochs'] = 30
    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv,
                evaluate=evaluate_2, args=train_params)
    # after adversarial training:
    # Test accuracy on legitimate test examples: 0.9483
    # Test accuracy on adversarial examples: 0.8989

    # we test on previously generated adversarial examples
    # Test accuracy on adversarial examples: 0.1844
    accuracy = model_eval(sess, x, y, predictions_2, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))
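
# When predictions_adv is supplied, model_train above optimizes a mix of
# clean and adversarial cross-entropy (an equal-weight average in the
# cleverhans formulation). A hedged NumPy sketch of that objective; the
# function and argument names are illustrative, not tutorial code.
import numpy as np

def mixed_cross_entropy(probs_clean, probs_adv, labels):
    """0.5 * (clean cross-entropy + adversarial cross-entropy)."""
    def xent(p):
        return -np.mean(np.sum(labels * np.log(p + 1e-12), axis=1))
    return 0.5 * (xent(probs_clean) + xent(probs_adv))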
def main():
    """
    MNIST cleverhans tutorial
    :return:
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', '-b', default=1000, type=int,
                        help='Size of training batches')
    parser.add_argument('--train_dir', '-d', default='/tmp',
                        help='Directory storing the saved model.')
    parser.add_argument('--filename', '-f', default='mnist.ckpt',
                        help='Filename to save model under.')
    parser.add_argument('--nb_epochs', '-e', default=6, type=int,
                        help='Number of epochs to train model')
    parser.add_argument('--nb_iters', '-i', default=10000, type=int,
                        help='Number of iterations for crafting '
                             'adversarial examples')
    parser.add_argument('--learning_rate', '-lr', default=0.1, type=float,
                        help='Learning rate for training')
    parser.add_argument('--eps', default=0.01, type=float,
                        help='Epsilon for Carlini L2 Attack')
    parser.add_argument('--kappa', default=0.01, type=float,
                        help='Kappa for Carlini L2 Attack')
    parser.add_argument('--c', default=20, type=float)
    parser.add_argument('--load', default=None, type=str,
                        help='Model path to load')
    parser.add_argument('--dump', default=None, type=str,
                        help='Model path to dump')
    args = parser.parse_args()

    np.random.seed(126)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input Theano placeholder
    x_shape = (None, 1, 28, 28)
    y_shape = (None, 10)
    x = T.tensor4('x')
    y = T.matrix('y')

    if args.load:
        model = pickle.load(open(args.load, "rb"))
        predictions = model(x)
    else:
        # Define Theano model graph
        model = model_mnist()
        model.build(x_shape)
        predictions = model(x)
        print("Defined Theano model graph.")

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            accuracy = th_model_eval(x, y, predictions, X_test, Y_test,
                                     args=args)
            assert X_test.shape[0] == 10000, X_test.shape
            print('Test accuracy on legitimate test examples: ' +
                  str(accuracy))

        # Train an MNIST model
        th_model_train(x, y, predictions, model.trainable_weights,
                       X_train, Y_train, evaluate=evaluate, args=args)

    if args.dump:
        pickle.dump(model, open(args.dump, "wb"))

    # Craft adversarial examples using Carlini's L2 attack
    # for i in range(10):
    carlini_L2(x, predictions, X_test, Y_test, eps=args.eps,
               kappa=args.kappa, c=args.c, nb_iters=args.nb_iters,
               batch_size=args.batch_size)
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150,
                   data_aug=6, nb_epochs_s=10, lmbda=0.1):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    wrap = KerasModelWrapper(model_sub)
    fgsm = FastGradientMethod(wrap, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test, Y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
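
# train_sub above relies on the Jacobian-based dataset augmentation of
# arxiv.org/abs/1602.02697: each substitute-training point is shifted by
# lmbda along the sign of the Jacobian of the substitute's output for the
# oracle-assigned label. A minimal NumPy sketch of that update; the
# 'jacobian_for_label' argument stands in for the computed Jacobian rows
# and is a hypothetical name.
import numpy as np

def jacobian_augment(X_sub, jacobian_for_label, lmbda=0.1):
    """Return the new points x' = x + lmbda * sign(dF_label(x)/dx)."""
    return X_sub + lmbda * np.sign(jacobian_for_label)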
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH, targeted=TARGETED):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[idxs]],
                dtype=np.float32)
        else:
            adv_inputs = np.array(
                [[instance] * nb_classes for
                 instance in x_test[:source_samples]], dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': CW_LEARNING_RATE,
                 'batch_size': source_samples * nb_classes if
                 targeted else source_samples,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[idxs], args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
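
# The CarliniWagnerL2 attack instantiated above minimizes
#     ||delta||_2^2 + c * f(x + delta),
# where f is non-positive only once the (targeted) misclassification holds.
# A hedged NumPy sketch of the targeted loss term f from the paper;
# 'logits', 'target', and 'kappa' are illustrative inputs, with kappa the
# confidence margin.
import numpy as np

def cw_loss_term(logits, target, kappa=0.):
    """f = max(max_{i != t} Z_i - Z_t, -kappa); <= 0 once the target wins."""
    z_target = logits[target]
    z_other = np.max(np.delete(logits, target))
    return max(z_other - z_target, -kappa)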
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, preds, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, save=False)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
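
# For reference, each jsma.generate_np call above greedily perturbs the
# input features with the highest saliency scores. A hedged NumPy sketch
# of the increasing-feature saliency map from the JSMA paper; 'jacobian'
# (shape [nb_classes, nb_features]) and 'target' are hypothetical inputs.
import numpy as np

def saliency_map(jacobian, target):
    """S[i] = dF_t/dx_i * |sum_{j != t} dF_j/dx_i| where the signs permit."""
    grad_target = jacobian[target]
    grad_others = jacobian.sum(axis=0) - grad_target
    admissible = (grad_target >= 0) & (grad_others <= 0)
    return np.where(admissible, grad_target * np.abs(grad_others), 0.)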
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'th':
        keras.backend.set_image_dim_ordering('th')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'tf', temporarily setting to 'th'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 1, 28, 28))
    y = tf.placeholder(tf.float32, shape=(None, FLAGS.nb_classes))

    # Define TF model graph
    model = model_mnist()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    # Train an MNIST model
    tf_model_train(sess, x, y, predictions, X_train, Y_train)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test, Y_test)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test_adv, Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = model_mnist()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    # Perform adversarial training
    tf_model_train(sess, x, y, predictions_2, X_train, Y_train,
                   predictions_adv=predictions_2_adv)

    # Evaluate the accuracy of the adversarially trained MNIST model on
    # legitimate test examples
    accuracy = tf_model_eval(sess, x, y, predictions_2, X_test, Y_test)
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM) on
    # the new model, which was trained using adversarial training
    X_test_adv_2, = batch_eval(sess, [x], [adv_x_2], [X_test])
    assert X_test_adv_2.shape[0] == 10000, X_test_adv_2.shape

    # Evaluate the accuracy of the adversarially trained MNIST model on
    # adversarial examples
    accuracy_adv = tf_model_eval(sess, x, y, predictions_2, X_test_adv_2,
                                 Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy_adv))
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150,
                   data_aug=6, nb_epochs_s=10, lmbda=0.1,
                   aug_batch_size=512):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = x_test[:holdout]
    Y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
from cleverhans.utils_mnist import data_mnist
from array_to_image import array_to_image
import numpy as np

X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                              train_end=60000,
                                              test_start=0,
                                              test_end=10000)

# visualization for MNIST
array_to_image(X_train[0:5], "mnist")

# visualization for MNIST's adversarial examples
adv_image = np.load("adv_image_FGM.npy")
array_to_image(adv_image[0:5], "adv_image_FGM")

# visualization for Cifar-10's adversarial examples
adv_image = np.load("adv_image_FGM_cifar10.npy")
array_to_image(adv_image[0:5], "adv_image_cifar10", channels=3, size=32)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=0.1, method='FGSM'):
    """
    MNIST tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    print('y_train: ', y_train.shape)
    x_train = np.pad(x_train, ((0, 0), (2, 2), (2, 2), (0, 0)),
                     mode='constant')
    x_test = np.pad(x_test, ((0, 0), (2, 2), (2, 2), (0, 0)),
                    mode='constant')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    print('img_rows: {}, img_cols: {}, nchannels: {}'.format(
        img_rows, img_cols, nchannels))
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    if train_dir == 'mnist_ff_model':
        model = mnist_ff_model()
    elif train_dir == 'mnist_BP_model':
        model = mnist_model(img_rows=img_rows, img_cols=img_cols,
                            channels=nchannels, nb_filters=64,
                            nb_classes=nb_classes)
    else:
        raise ValueError('Unknown train_dir: %s' % train_dir)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        # assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)
        print('Training done!')

    # Calculate training error
    print('testing param:', testing)
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the chosen attack object and graph
    # fgsm = FastGradientMethod(wrap, sess=sess)
    if method == 'FGSM':
        clw = FastGradientMethod(wrap, sess=sess)
    elif method == 'BIM':
        clw = BasicIterativeMethod(wrap, sess=sess)
    elif method == 'DeepFool':
        clw = DeepFool(wrap, sess=sess)
    else:
        raise NotImplementedError
    print('method chosen: ', method)
    clw_params = {}
    adv_x = clw.generate(x, **clw_params)
    with sess.as_default():
        feed_dict = {x: x_test, y: y_test}
        store_data = adv_x.eval(feed_dict=feed_dict)
        print('store_data: {}'.format(store_data.shape))
        save_name = '{}/mnist_{}_data.pkl'.format(train_dir, method)
        with open(save_name, 'wb') as fw:
            pickle.dump(store_data, fw, protocol=2)
            print('data stored as {}'.format(save_name))

    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train, y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    return report
def mnist_fgsm_saliency(
        train_start=0, train_end=10, test_start=0,
        test_end=5, nb_epochs=2, batch_size=128,
        learning_rate=0.001,
        clean_train=True,
        testing=False,
        backprop_through_attack=False,
        nb_filters=64, nb_classes=10,
        source_samples=10,
):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    # this way, all the 9 zeroes -> 0.1/9 because the one-bit becomes 0.9
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # placeholder for y_target --> for saliency tensor
    y_target = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    ###########################################################################
    # Training the CNN model using TensorFlow: model --> base model
    ###########################################################################
    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model.get_probs(x)

    if clean_train:
        # (the base model and preds were already created above)
        # model = make_basic_cnn(nb_filters=nb_filters)
        # preds = model.get_probs(x)
        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        #######################################################################
        # MODEL Train: training the basic model, using train_params
        #######################################################################
        model_train(sess, x, y, preds, X_train, Y_train,
                    evaluate=evaluate, args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

    ###########################################################################
    # Generate FGSM Adversarial based on model, and
    # Compute Base Model Accuracy
    ###########################################################################

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(model, sess=sess)

    # TODO: follow the paper and run Cleverhans output?
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}

    # adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = fgsm.generate(x, **fgsm_params_y)
    preds_adv = model.get_probs(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculate training error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    ###########################################################################
    # Generate Saliency Map Adversarial Example and
    # Compute base model accuracy (only 10)
    ###########################################################################
    print("Saliency Map Attack On The Base Model")
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Instantiate a SaliencyMapMethod attack object
    # --> modify y_target for each test_data again
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    # Keep track of success (adversarial example classified in target)
    # Need this info to compute the success rate
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # each sample will get 9 adversarial samples
    # adv_x_set: place_holder for all the x variations
    # correct_y_set: correct_y_output used for training
    adv_x_set = None
    adv_y_target = None

    # we need multi x_train_saliency / y_train_saliency
    x_train_saliency = None
    y_train_saliency = None

    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Saliency Attacking input %i/%i' %
              (sample_ind + 1, source_samples))
        sample = X_train[sample_ind:(sample_ind + 1)]
        y_sample = Y_train[sample_ind:(sample_ind + 1)]

        current_class = int(np.argmax(Y_train[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # Create x_train_saliency, corresponding to y_train_saliency
            if x_train_saliency is not None:
                x_train_saliency = np.concatenate(
                    (x_train_saliency, sample), axis=0)
                y_train_saliency = np.concatenate(
                    (y_train_saliency, y_sample), axis=0)
            else:
                x_train_saliency = sample
                y_train_saliency = y_sample
                print("sample shape: ", x_train_saliency.shape)
                print("y_sample shape: ", y_train_saliency.shape)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target

            adv_x_np = jsma.generate_np(sample, **jsma_params)

            # Add to adv_x_set, correct_y_set
            if adv_x_set is not None:
                adv_y_target = np.concatenate(
                    (adv_y_target, one_hot_target), axis=0)
                adv_x_set = np.concatenate((adv_x_np, adv_x_set), axis=0)
            else:
                adv_y_target = one_hot_target
                adv_x_set = adv_x_np
                print("adv_y_target shape(one-hot-encoding): ",
                      adv_y_target.shape)
                print("adv_x_set(np) shape: ", adv_x_np.shape)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x_np) == target)

            # Update the arrays for later analysis
            results[target, sample_ind] = res

    print('--------------------------------------')
    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful Saliency adv. examples {0:.4f}'.format(
        succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # here we have successfully stacked up x_adversarial_set, y_correct_set
    # these can be used to provide training to our model now
    print("\n\n\n*****************************")
    print("Checking x_adv_set shape: ", adv_x_set.shape)
    print("Checking correct_y_set shape: ", adv_y_target.shape)

    print("x_training_saliency shape:", x_train_saliency.shape)
    print("y_training_saliency shape:", y_train_saliency.shape)

    # now construct model 3, define output -> input relationship tensor
    model_3 = make_basic_cnn(nb_filters=nb_filters)
    # define the x, the placeholder input -> preds_3 output
    preds_3 = model_3(x)

    # jsma3 = SaliencyMapMethod(model_3, sess=sess)
    #
    # jsma_params = {'theta': 1., 'gamma': 0.1,
    #                'clip_min': 0., 'clip_max': 1.,
    #                'y_target': y_target}
    #
    # # create adv_saliency set tensor, using x_train data and jsma_params
    # # containing adv_y_target
    # adv_jsma = jsma3.generate(x, jsma_params)
    # # create adv preds tensor
    # preds_jsma_adv = model_3(adv_jsma)

    # define saliency training model accuracy
    def evaluate_saliency():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_3, x_train_saliency,
                              y_train_saliency, args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

    ###########################################################################
    # MODEL Train for Saliency Map
    ###########################################################################
    # Train the model with samples of normal and adversarial examples
    model_train(sess, x, y, preds_3, x_train_saliency, y_train_saliency,
                evaluate=evaluate_saliency, args=train_params, rng=rng)

    # TODO: use jsma to create adversarial testing or training?

    # Redefine TF model for FGSM
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)

    # parameter for FGSM
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}
    adv_x_2 = fgsm2.generate(x, **fgsm_params_y)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    ###########################################################################
    # MODEL Train for FGSM
    ###########################################################################
    # Perform and evaluate adversarial training with the FGSM model
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_blackbox(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_classes=10, batch_size=128, learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6, nb_epochs_s=10, lmbda=0.1, attack="fgsm", targeted=False): """ MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697 :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :return: a dictionary with: * black-box model accuracy on test set * substitute model accuracy on test set * black-box model accuracy on adversarial examples transferred from the substitute model """ keras.layers.core.K.set_learning_phase(0) # Dictionary used to keep track and return key accuracies accuracies = {} # Perform tutorial setup assert setup_tutorial() # Create TF session and set as Keras backend session gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) keras.backend.set_session(sess) # Get MNIST data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Initialize substitute training set reserved for adversary X_sub = X_test[:holdout] Y_sub = np.argmax(Y_test[:holdout], axis=1) # Redefine test set as remaining samples unavailable to adversaries X_test = X_test[holdout:] Y_test = Y_test[holdout:] X_test = X_test[:FLAGS.n_attack] Y_test = Y_test[:FLAGS.n_attack] # Define input and output TF placeholders x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Simulate the black-box model locally # You could replace this by a remote labeling API for instance print("Preparing the black-box model.") prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test, nb_epochs, batch_size, learning_rate) model, bbox_preds, accuracies['bbox'] = prep_bbox_out # Train substitute using method from https://arxiv.org/abs/1602.02697 time_start = time.time() print("Training the substitute model.") train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s, batch_size, learning_rate, data_aug, lmbda) model_sub, preds_sub = train_sub_out time_end = time.time() print("Substitute model training time:", time_end - time_start) # Evaluate the substitute model on clean test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params) accuracies['sub'] = acc print('Substitute model accuracy:', acc) # Find the correctly predicted labels original_predict = batch_eval(sess, [x], [bbox_preds], [X_test], args=eval_params)[0] original_class = np.argmax(original_predict, axis=1) true_class = np.argmax(Y_test, axis=1) mask = true_class == original_class print(np.sum(mask), "out of", mask.size, "are correctly labeled,", len(X_test[mask])) # Initialize the Fast Gradient Sign Method (FGSM) attack object.
wrap = KerasModelWrapper(model_sub) # Craft adversarial examples using the substitute eval_params = {'batch_size': batch_size} if attack == "fgsm": attacker_params = {'eps': 0.4, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(wrap, sess=sess) x_adv_sub = fgsm.generate(x, **attacker_params) attacker = fgsm adv_inputs = X_test ori_labels = Y_test print("Running FGSM attack...") else: print("Running Carlini and Wagner\'s L2 attack...") yname = "y" adv_ys = None # wrap = KerasModelWrapper(model) cwl2 = CarliniWagnerL2(wrap, back='tf', sess=sess) attacker_params = {'binary_search_steps': 9, 'max_iterations': 2000, 'abort_early': True, 'learning_rate': 0.01, 'batch_size': 1, 'initial_const': 0.01, 'confidence': 20} # generate targeted labels, 9 for each test example if targeted: adv_ys = [] targeted_class = [] for i in range(0, X_test.shape[0]): for j in range(0, 10): # skip the original image label if j == np.argmax(Y_test[i]): continue adv_ys.append(np.eye(10)[j]) targeted_class.append(j) attacker_params['y_target'] = np.array(adv_ys, dtype=np.float32) # duplicate the inputs 9 times adv_inputs = np.array([[instance] * 9 for instance in X_test], dtype=np.float32) adv_inputs = adv_inputs.reshape((X_test.shape[0] * 9, 28, 28, 1)) # also update the mask mask = np.repeat(mask, 9) ori_labels = np.repeat(Y_test, 9, axis=0) else: adv_inputs = X_test ori_labels = Y_test attacker = cwl2 if attack == "fgsm": # Evaluate the accuracy of the "black-box" model on adversarial examples accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs, ori_labels, args=eval_params) print('Test accuracy of oracle on adversarial examples generated ' 'using the substitute: ' + str(accuracy)) accuracies['bbox_on_sub_adv_ex'] = accuracy time_start = time.time() # Evaluate the accuracy of the "black-box" model on adversarial examples x_adv_sub_np = attacker.generate_np(adv_inputs, **attacker_params) accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_np, ori_labels, args=eval_params) print('Test accuracy of oracle on adversarial examples generated ' 'using the substitute (NP): ' + str(accuracy)) accuracies['bbox_on_sub_adv_ex_np'] = accuracy time_end = time.time() print('Attack time:', time_end - time_start) # Evaluate the targeted attack bbox_adv_predict = batch_eval(sess, [x], [bbox_preds], [x_adv_sub_np], args=eval_params)[0] bbox_adv_class = np.argmax(bbox_adv_predict, axis=1) true_class = np.argmax(ori_labels, axis=1) untargeted_success = np.mean(bbox_adv_class != true_class) print('Untargeted attack success rate:', untargeted_success) accuracies['untargeted_success'] = untargeted_success if targeted: targeted_success = np.mean(bbox_adv_class == targeted_class) print('Targeted attack success rate:', targeted_success) accuracies['targeted_success'] = targeted_success if attack == "cwl2": # Compute the L2 perturbations of the generated adversarial examples percent_perturbed = np.sum((x_adv_sub_np - adv_inputs)**2, axis=(1, 2, 3))**.5 # print(percent_perturbed) # print('Avg. L_2 norm of perturbations {0:.4f}'.format(np.mean(percent_perturbed))) # when computing the mean, drop the failed attacks first print('Avg. 
L_2 norm of perturbations {0:.4f}'.format(np.mean(percent_perturbed[percent_perturbed > 1e-8]))) # Sanity check: evaluate the "black-box" model on the clean inputs it originally classified correctly accuracy = model_eval(sess, x, y, bbox_preds, adv_inputs[mask], ori_labels[mask], args=eval_params) print('Test accuracy on clean examples, excluding originally incorrect labels (should be 1.0): ' + str(accuracy)) accuracies['bbox_on_sub_adv_ex_exc_ori'] = accuracy if attack == "fgsm": # Evaluate the accuracy of the "black-box" model on adversarial examples (excluding originally incorrect labels) accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs[mask], ori_labels[mask], args=eval_params) print('Test accuracy of oracle on adversarial examples generated ' 'using the substitute (excluding originally incorrect labels): ' + str(accuracy)) accuracies['bbox_on_sub_adv_ex_exc'] = accuracy # Evaluate the accuracy of the "black-box" model on adversarial examples (excluding originally incorrect labels, NP) x_adv_sub_mask_np = x_adv_sub_np[mask] accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_mask_np, ori_labels[mask], args=eval_params) print('Test accuracy of oracle on adversarial examples generated ' 'using the substitute (excluding originally incorrect labels, NP): ' + str(accuracy)) accuracies['bbox_on_sub_adv_ex_exc_np'] = accuracy return accuracies
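# --- Hedged sketch of the perturbation bookkeeping above: the comment in the code
# suggests CarliniWagnerL2 returns the unmodified input when it fails, so zero-norm
# rows are dropped before averaging. The arrays here are synthetic stand-ins.
import numpy as np

def mean_l2_of_successful(adv, clean, tol=1e-8):
    l2 = np.sqrt(np.sum((adv - clean) ** 2, axis=(1, 2, 3)))  # per-example L2
    success = l2 > tol  # failed attacks leave the input untouched
    return l2[success].mean() if success.any() else 0.0

clean = np.zeros((4, 28, 28, 1), dtype=np.float32)
adv = clean.copy()
adv[0] += 0.1  # pretend one attack succeeded
print(mean_l2_of_successful(adv, clean))  # ~2.8 for this synthetic case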
def main(): """ MNIST CleverHans tutorial :return: """ if not hasattr(backend, "theano"): raise RuntimeError("This tutorial requires keras to be configured" " to use the Theano backend.") # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'th': keras.backend.set_image_dim_ordering('th') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'tf', temporarily setting to 'th'") parser = argparse.ArgumentParser() parser.add_argument('--batch_size', '-b', default=128, help='Size of training batches') parser.add_argument('--train_dir', '-d', default='/tmp', help='Directory storing the saved model.') parser.add_argument('--filename', '-f', default='mnist.ckpt', help='Filename to save model under.') parser.add_argument('--nb_epochs', '-e', default=6, type=int, help='Number of epochs to train model') parser.add_argument('--learning_rate', '-lr', default=0.5, type=float, help='Learning rate for training') args = parser.parse_args() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist() print("Loaded MNIST test data.") assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input Theano placeholder x_shape = (None, 1, 28, 28) x = T.tensor4('x') y = T.matrix('y') # Define Theano model graph model = cnn_model() model.build(x_shape) predictions = model(x) print("Defined Theano model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples accuracy = th_model_eval(x, y, predictions, X_test, Y_test, args=args) assert X_test.shape[0] == 10000, X_test.shape print('Test accuracy on legitimate test examples: ' + str(accuracy)) pass # Train an MNIST model th_model_train(x, y, predictions, model.trainable_weights, X_train, Y_train, evaluate=evaluate, args=args) # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(model, back='th') adv_x = fgsm.generate(x, params={'eps': 0.3}) # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = th_model_eval(x, y, model(adv_x), X_test, Y_test, args=args) print('Test accuracy on adversarial examples: ' + str(accuracy)) print("Repeating the process, using adversarial training") # Redefine Theano model graph model_2 = cnn_model() model_2.build(x_shape) preds_2 = model_2(x) fgsm = FastGradientMethod(model_2, back='th') preds_2_adv = model_2(fgsm.generate(x, params={'eps': 0.3})) def evaluate_2(): # Evaluate the accuracy of the adversarialy trained MNIST model on # legitimate test examples accuracy = th_model_eval(x, y, preds_2, X_test, Y_test, args=args) print('Test accuracy on legitimate test examples: ' + str(accuracy)) # Evaluate the accuracy of the adversarially trained MNIST model on # adversarial examples acc_adv = th_model_eval(x, y, preds_2_adv, X_test, Y_test, args=args) print('Test accuracy on adversarial examples: ' + str(acc_adv)) # Perform adversarial training th_model_train(x, y, preds_2, model_2.trainable_weights, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=args)
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x, y, x_train, y_train, args=train_params, save=os.path.exists("models"), rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for 
i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" cw_params = {'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': 10} adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval( sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ idxs], args=eval_params) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ :source_samples], args=eval_params) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
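# --- Hedged sketch of the targeted-batch construction used by the CW runs above:
# every source image is paired with every class as a one-hot target, giving
# source_samples * nb_classes attack instances. Equivalent to the list-based
# construction in the tutorial, written with repeat/tile.
import numpy as np

def make_targeted_batch(x, nb_classes):
    adv_inputs = np.repeat(x, nb_classes, axis=0)  # each image repeated nb_classes times
    adv_ys = np.tile(np.eye(nb_classes, dtype=np.float32), (x.shape[0], 1))
    return adv_inputs, adv_ys

x_demo = np.zeros((10, 28, 28, 1), dtype=np.float32)
adv_inputs, adv_ys = make_targeted_batch(x_demo, 10)
print(adv_inputs.shape, adv_ys.shape)  # (100, 28, 28, 1) (100, 10)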
def main(argv=None): """ MNIST cleverhans tutorial :return: """ keras.layers.core.K.set_learning_phase(0) # Perform tutorial setup assert setup_tutorial() # Create TF session and set as Keras backend session config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) # sess = tf.Session() keras.backend.set_session(sess) if args.ae: autoencoder.restore(sess, args.ae) # Restore model weights from previously saved model # Get MNIST data X_train, Y_train, X_test, Y_test = data_mnist() # Initialize substitute training set reserved for adversary X_sub = X_test[:FLAGS.holdout] Y_sub = np.argmax(Y_test[:FLAGS.holdout], axis=1) # Shrink training data. # X_train = X_train[:10000] # Y_train = Y_train[:10000] # Redefine test set as remaining samples unavailable to adversaries X_test = X_test[FLAGS.holdout:] Y_test = Y_test[FLAGS.holdout:] # Define input and output TF placeholders x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) print("Preparing the black-box model.") if args.alg == "cnn": model, bbox = prep_cnn_bbox(sess, x, y, X_train, Y_train, X_test, Y_test) elif is_not_nn(): bbox = prep_boost_bbox(X_train, Y_train, X_test, Y_test) print("Training the substitute model.") model_sub, preds_sub = train_substitute(sess, x, y, bbox, X_sub, Y_sub) # Initialize the Fast Gradient Sign Method (FGSM) attack object. fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model_sub, sess=sess) # Craft adversarial examples using the substitute print("Crafting the adversarial examples.") eval_params = {'batch_size': FLAGS.batch_size} x_adv_sub = fgsm.generate(x, **fgsm_par) X_test_adv, = batch_eval(sess, [x], [x_adv_sub], [X_test], args=eval_params) # Dump adversarial examples. example_file = "example/{}.data".format(model_name) with open(example_file, "wb") as f: pickle.dump(X_test_adv, f) if args.ae: print("Denoising...") num_data = X_test_adv.shape[0] autoencoder.visualize(sess, X_test_adv.reshape(num_data, -1), "adv") filtered_data = autoencoder.run(sess, X_test_adv.reshape(num_data, -1)) X_test_adv = filtered_data.reshape(num_data, 28, 28, 1) # Evaluate the accuracy of the "black-box" model on adversarial examples if args.alg == "cnn": accuracy = model_eval(sess, x, y, bbox, X_test_adv, Y_test, args=eval_params) elif is_not_nn(): x_test_adv = X_test_adv.reshape(X_test_adv.shape[0], -1) y_test = np.argmax(Y_test, axis=1) accuracy = bbox.score(x_test_adv, y_test) print("Test adversarial accuracy = {}".format(accuracy)) log_file = "log/{}.log".format(model_name) with open(log_file, "a") as f: if args.ae: f.write("{}. Test adversarial accuracy = {}\n".format(args.ae, accuracy)) else: f.write("Test adversarial accuracy = {}\n".format(accuracy))
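# --- Hedged sketch of the example-dumping step above: crafted inputs are pickled
# to disk so a later pass (e.g. the autoencoder denoising) can rerun without
# repeating the attack. The file name is illustrative.
import pickle
import numpy as np

X_test_adv = np.zeros((16, 28, 28, 1), dtype=np.float32)  # stand-in for crafted examples
with open("adv_examples.data", "wb") as f:
    pickle.dump(X_test_adv, f)
with open("adv_examples.data", "rb") as f:
    X_reloaded = pickle.load(f)
assert np.array_equal(X_test_adv, X_reloaded)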
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, train_dir="train_dir", filename="mnist.ckpt", load_model=False, testing=False, label_smoothing=True): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] if label_smoothing: label_smooth = .1 y_train = y_train.clip(label_smooth / (nb_classes-1), 1. 
- label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = LossCrossEntropy(wrap, smoothing=0.1) train(sess, loss, x, y, x_train, y_train, evaluate=evaluate, args=train_params, save=True, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculating train error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_train, y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) wrap_2 = KerasModelWrapper(model_2) preds_2 = model_2(x) fgsm2 = FastGradientMethod(wrap_2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) preds_2_adv = model_2(attack(x)) loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_test, y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, x_test, y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial 
training train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2, args=train_params, save=False, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_train, y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, x_train, y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
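# --- Hedged sketch of the restore-or-train pattern used in the tutorial above:
# reuse a checkpoint when one exists, otherwise train and save. `train_fn` is an
# illustrative stand-in for the tutorial's train(...) call, and the model's
# variables are assumed to already exist in the graph.
import os
import tensorflow as tf

def restore_or_train(sess, train_dir, filename, train_fn):
    ckpt = tf.train.get_checkpoint_state(train_dir)
    saver = tf.train.Saver()
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model loaded from: {}".format(ckpt.model_checkpoint_path))
    else:
        train_fn()
        saver.save(sess, os.path.join(train_dir, filename))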
def main(): """ MNIST cleverhans tutorial :return: """ import argparse parser = argparse.ArgumentParser() parser.add_argument('--batch_size', '-b', default=128, help='Size of training batches') parser.add_argument('--train_dir', '-d', default='/tmp', help='Directory storing the saved model.') parser.add_argument('--filename', '-f', default='mnist.ckpt', help='Filename to save model under.') parser.add_argument('--nb_epochs', '-e', default=6, type=int, help='Number of epochs to train model') parser.add_argument('--learning_rate', '-lr', default=0.5, type=float, help='Learning rate for training') args = parser.parse_args() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist() print("Loaded MNIST test data.") assert Y_train.shape[1] == 10. label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input Theano placeholder x_shape = (None, 1, 28, 28) y_shape = (None, 10) x = T.tensor4('x') y = T.matrix('y') # Define Theano model graph model = model_mnist() model.build(x_shape) predictions = model(x) print("Defined Theano model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples accuracy = th_model_eval(x, y, predictions, X_test, Y_test, args=args) assert X_test.shape[0] == 10000, X_test.shape print('Test accuracy on legitimate test examples: ' + str(accuracy)) pass # Train an MNIST model th_model_train(x, y, predictions, model.trainable_weights, X_train, Y_train, evaluate=evaluate, args=args) # Craft adversarial examples using Fast Gradient Sign Method (FGSM) adv_x = fgsm(x, predictions, eps=0.3) X_test_adv, = batch_eval([x], [adv_x], [X_test], args=args) assert X_test_adv.shape[0] == 10000, X_test_adv.shape # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = th_model_eval(x, y, predictions, X_test_adv, Y_test, args=args) print('Test accuracy on adversarial examples: ' + str(accuracy)) print("Repeating the process, using adversarial training") # Redefine Theano model graph model_2 = model_mnist() model_2.build(x_shape) predictions_2 = model_2(x) adv_x_2 = fgsm(x, predictions_2, eps=0.3) predictions_2_adv = model_2(adv_x_2) def evaluate_2(): # Evaluate the accuracy of the adversarialy trained MNIST model on # legitimate test examples accuracy = th_model_eval(x, y, predictions_2, X_test, Y_test, args=args) print('Test accuracy on legitimate test examples: ' + str(accuracy)) # Evaluate the accuracy of the adversarially trained MNIST model on # adversarial examples accuracy_adv = th_model_eval(x, y, predictions_2_adv, X_test, Y_test, args=args) print('Test accuracy on adversarial examples: ' + str(accuracy_adv)) # Perform adversarial training th_model_train(x, y, predictions_2, model_2.trainable_weights, X_train, Y_train, predictions_adv=predictions_2_adv, evaluate=evaluate_2, args=args)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we 
want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Compute the number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations[results == 1]) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
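# --- Hedged sketch of the per-example feature-perturbation rate computed in the
# JSMA loop above: count the features the attack actually changed and divide by
# the total feature count. Arrays are synthetic stand-ins.
import numpy as np

def perturbation_rate(adv_x, x):
    changed = np.sum(adv_x.reshape(-1) != x.reshape(-1))
    return float(changed) / x.size

x_demo = np.zeros((1, 28, 28, 1), dtype=np.float32)
adv_demo = x_demo.copy()
adv_demo[0, 0, :7, 0] = 1.  # pretend JSMA flipped 7 pixels
print(perturbation_rate(adv_demo, x_demo))  # 7 / 784 ~= 0.0089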
def mnist_blackbox(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_classes=10, batch_size=128, learning_rate=0.001, nb_epochs=10, holdout=150, nb_filters=64, data_aug=6, nb_epochs_s=10, lmbda=0.1, use_rec_err=True, model_arch=None, model_arch_sub=None, attack_name=None, use_cross_err=None, dataset_name=None, blocking_option=None, opt_type='adam'): """ :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :return: a dictionary with: * black-box model accuracy on test set * substitute model accuracy on test set * black-box model accuracy on adversarial examples transferred from the substitute model """ get_rec_err = rec_err_fct(use_rec_err, blocking_option) merged = None #XXX switched this off for the moment # Set logging level to see debug information set_log_level(logging.DEBUG) # Dictionary used to keep track and return key accuracies accuracies = {} # Perform tutorial setup assert setup_tutorial() # Create TF session sess = tf.Session() # Get data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Initialize substitute training set reserved for adversary X_sub = X_test[:holdout] Y_sub = np.argmax(Y_test[:holdout], axis=1) # Redefine test set as remaining samples unavailable to adversaries X_test = X_test[holdout:] Y_test = Y_test[holdout:] # Define input and output TF placeholders x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Seed random number generator so tutorial is reproducible rng = np.random.RandomState([2017, 8, 30]) # Simulate the black-box model locally # You could replace this by a remote labeling API for instance print("Preparing the black-box model.") prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test, nb_filters, nb_epochs, batch_size, learning_rate, rng=rng, use_rec_err=use_rec_err, model_arch=model_arch, attack_name=attack_name, use_cross_err=use_cross_err, dataset_name=dataset_name, blocking_option=blocking_option, opt_type=opt_type, merged=merged) #model, bbox_preds, accuracies['bbox'] = prep_bbox_out model, _, _ = prep_bbox_out bbox_preds = model(x) # Train substitute using method from https://arxiv.org/abs/1602.02697 print("Training the substitute model.") model_sub, preds_sub = train_sub( sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s, batch_size, learning_rate, data_aug, lmbda, rng=rng, model_arch_sub=model_arch_sub, merged=merged, opt_type=opt_type, blocking_option=blocking_option, ) # Evaluate the substitute model on clean test examples eval_params = {'batch_size': batch_size} acc_sub = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params) accuracies['sub'] = acc_sub #XXX evaluating on clean samples after training sub preds = model(x) pre_ae_states, post_ae_states = model.get_ae_states() rec_err = get_rec_err(pre_ae_states, post_ae_states) accuracy, rec_loss_eval = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params, aux_loss_lst=[rec_err]) print('Test accuracy of oracle on clean examples: ' + str(accuracy)) print('reconstr. err. 
of oracle on clean examples: ' + str(rec_loss_eval)) #XXX --> the result should be as before # Initialize the attack attack = attacks[attack_name](model_sub, sess=sess) x_adv_sub = attack.generate(x, **attack_par[dataset_name][attack_name]) preds_adv = model(x_adv_sub) pre_ae_states_adv, post_ae_states_adv = model.get_ae_states() rec_err_adv = get_rec_err(pre_ae_states_adv, post_ae_states_adv) # Evaluate the accuracy of the "black-box" model on adversarial examples accuracy, rec_loss_eval = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_params, aux_loss_lst=[rec_err_adv]) print('Test accuracy of oracle on adversarial examples: ' + str(accuracy)) print('reconstr. err. of oracle on adversarial examples: ' + str(rec_loss_eval)) accuracies['bbox_on_sub_adv_ex'] = accuracy print('Accuracies', accuracies) return accuracies
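# --- Hedged sketch of a reconstruction-error auxiliary loss in the spirit of
# rec_err_fct above; the tutorial's exact definition is not shown, so this mean
# squared error over matched pre-/post-autoencoder states is an assumption
# (non-empty state lists assumed).
import tensorflow as tf

def reconstruction_error(pre_ae_states, post_ae_states):
    errs = [tf.reduce_mean(tf.squared_difference(pre, post))
            for pre, post in zip(pre_ae_states, post_ae_states)]
    return tf.add_n(errs) / float(len(errs))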
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :return: an AccuracyReport object """ nb_classes = 10 # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(4264) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Get MNISTnotMNIST data # with np.load("MNISTnotMNIST.npz") as data: # X_train, Y_train, X_test, Y_test = data['X_train'], data['Y_train'], data['X_test'], data['Y_test'] # Use label smoothing # assert Y_train.shape[1] == 10 # label_smooth = .1 # Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "./" model_name = "clean_trained_mnist_model" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': model_path, 'filename': model_name } fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([443, 224, 39]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters, nb_classes=nb_classes) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, save=True, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculate training error if testing: eval_par = {'batch_size': 
batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph # model_2 = make_basic_cnn(nb_filters=nb_filters) # preds_2 = model_2(x) # fgsm2 = FastGradientMethod(model_2, sess=sess) # adv_x_2 = fgsm2.generate(x, **fgsm_params) # if not backprop_through_attack: # # For the fgsm attack used in this tutorial, the attack has zero # # gradient so enabling this flag does not change the gradient. # # For some other attacks, enabling this flag increases the cost of # # training, but gives the defender the ability to anticipate how # # the attacker will change their strategy in response to updates to # # the defender's parameters. # adv_x_2 = tf.stop_gradient(adv_x_2) # preds_2_adv = model_2(adv_x_2) # # def evaluate_2(): # # Accuracy of adversarially trained model on legitimate test inputs # eval_params = {'batch_size': batch_size} # accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, # args=eval_params) # print('Test accuracy on legitimate examples: %0.4f' % accuracy) # report.adv_train_clean_eval = accuracy # # # Accuracy of the adversarially trained model on adversarial examples # accuracy = model_eval(sess, x, y, preds_2_adv, X_test, # Y_test, args=eval_params) # print('Test accuracy on adversarial examples: %0.4f' % accuracy) # report.adv_train_adv_eval = accuracy # # # Perform and evaluate adversarial training # model_train(sess, x, y, preds_2, X_train, Y_train, # predictions_adv=preds_2_adv, evaluate=evaluate_2, # args=train_params, rng=rng) # # # Calculate training errors # if testing: # eval_params = {'batch_size': batch_size} # accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train, # args=eval_params) # report.train_adv_train_clean_eval = accuracy # accuracy = model_eval(sess, x, y, preds_2_adv, X_train, # Y_train, args=eval_params) # report.train_adv_train_adv_eval = accuracy return report
def cnn_model(sess=tf.get_default_session(), debug=False, w=0, rel=2): """ Defines an experimental CNN model using the Keras functional API :param sess: TF session to register with Keras (defaults to the current default session) :param debug: if True, return the model before training, for inspection :param w: width of the (currently commented-out) perception ensemble :param rel: number of (currently commented-out) ReLU branches :return: a trained keras.Model """ # Define the layers successively (convolution layers are version dependent) # if keras.backend.image_dim_ordering() == 'th': # input_shape = (channels, img_rows, img_cols) # else: # input_shape = (img_rows, img_cols, channels) # # layers = [Reshape((784,),input_shape=(28,28,1)), # Dense(10), # Activation('relu'), # Dense(50), # Activation('relu'), # Dense(nb_classes)] inpt = Input(batch_shape=(None, 28, 28, 1)) k = Reshape((784,))(inpt) # k=Activation(activation=binary_filter_tf)(k) #if you want to use feature squeezing # if w == 1: # print("No perception ensemble, using only one neuron") # x2 = Activation(micromodel)(k) # x2 = Dropout(0.05)(k) # else: # # for i in range(w): # percepfilter = Lambda(lambda x: micromodel(x))(k) # percepfilter = Dropout(0.05)(percepfilter) # perceptionlayer.append(percepfilter) # x2 = Concatenate()(perceptionlayer) perceptionlayer = [] g = tf.get_default_graph() with g.gradient_override_map({"Sign": "FakeHS"}): # for i in range(2): # x = TrainableScaler(activation=Paranograd)(k) # x = Dropout(0.15)(x) # x = TrainableScaler(use_bias=False)(x) # x = Dropout(0.15)(x) # perceptionlayer.append(x) # # for relusnumber in range(0): # relux = TrainableScaler(activation="relu")(k) # relux = Dropout(0.20)(relux) # relux = TrainableScaler(use_bias=False)(relux) # relux = Dropout(0.20)(relux) # perceptionlayer.append(relux) # # print(perceptionlayer) # # x2 = Add()(perceptionlayer) # x3 = Multiply()(perceptionlayer) # x3 = Lambda(lambda x: -2 * x, output_shape=(784,))(x3) # print(x2) # print(x3) # x2 = Add()([x2, x3]) # x2 = TrainableOffset()(x2) x = TrainableOffset(bias_constraint=IsNegProb(), bias_initializer=keras.initializers.Constant(value=-0.5), bias_regularizer=antientropy, activation=Heaviside)(k) x2 = Reshape((28, 28, 1))(x) x2 = Conv2D(64, (6, 6))(x2) print(x2) x2 = Reshape((-1,))(x2) x2 = TrainableOffset( bias_initializer=keras.initializers.Constant(value=0.0), activation='relu')(x2) # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2) # x2 = Activation('relu')(x2) x2 = Reshape((23, 23, 64))(x2) x2 = Conv2D(64, (4, 4))(x2) x2 = Flatten()(x2) x2 = Dense(40, activation="relu")(x2) # x2 = Dense(400, activation=Paranograd)(x2) # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2) x2 = Dense(10)(x2) predictions2 = Activation(activation="softmax")(x2) model = keras.Model(inputs=inpt, outputs=predictions2) if debug: return model X_train, Y_train, X_test, Y_test = data_mnist(train_start=0, train_end=60000, test_start=0, test_end=10000) K.set_session(sess) model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) model.fit(np.array(X_train), np.array(Y_train), 128, 3, verbose=1) print(model.evaluate(np.array(X_test), np.array(Y_test))) print(model.get_weights()) return model
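# --- Hedged sketch of the gradient-override trick cnn_model relies on: a hard
# sign/Heaviside activation has zero gradient almost everywhere, so its gradient
# is swapped for a surrogate (here the identity, a straight-through estimator).
# The name "StraightThroughSign" is illustrative; the code above registers its
# own override as "FakeHS".
import tensorflow as tf

@tf.RegisterGradient("StraightThroughSign")
def _straight_through_sign_grad(op, grad):
    return grad  # pass the incoming gradient through unchanged

def hard_sign(x):
    g = tf.get_default_graph()
    with g.gradient_override_map({"Sign": "StraightThroughSign"}):
        return tf.sign(x)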
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = make_basic_picklable_cnn() preds = model.get_logits(x) assert len(model.get_params()) > 0 loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x, y, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) with sess.as_default(): save("clean_model.joblib", model) # Now that the model has been saved, you can evaluate it in a # separate process using `evaluate_pickled_model.py`. # You should get exactly the same result for both clean and # adversarial accuracy as you get within this program. 
# Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') # Create a new model and train it to be robust to FastGradientMethod model2 = make_basic_picklable_cnn() fgsm2 = FastGradientMethod(model2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the attacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2, args=train_params, rng=rng, var_list=model2.get_params()) with sess.as_default(): save("adv_model.joblib", model2) # Now that the model has been saved, you can evaluate it in a # separate process using `evaluate_pickled_model.py`. # You should get exactly the same result for both clean and # adversarial accuracy as you get within this program. # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
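# --- Hedged sketch of the separate-process evaluation mentioned above, assuming
# the cleverhans serialization helper (cleverhans.serial.load) that pairs with
# the save(...) calls in the tutorial; the file name matches the one saved
# above, and x_test/y_test would come from data_mnist() as in the tutorials.
import tensorflow as tf
from cleverhans.serial import load
from cleverhans.utils_tf import model_eval

with tf.Session() as sess:
    with sess.as_default():
        model = load("clean_model.joblib")
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    preds = model.get_logits(x)
    # acc = model_eval(sess, x, y, preds, x_test, y_test,
    #                  args={'batch_size': 128})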
def unprotected_network(sess=tf.get_default_session(), debug=False, train=False, save=False): inpt = Input(batch_shape=(None, 28, 28, 1)) x2 = Conv2D(32, (3, 3), activation='relu')(inpt) x2 = Conv2D(32, (3, 3), activation='relu')(x2) x2 = MaxPooling2D(pool_size=(2, 2))(x2) x2 = Conv2D(32, (3, 3), activation='relu')(x2) x2 = Conv2D(32, (3, 3), activation='relu')(x2) x2 = MaxPooling2D(pool_size=(2, 2))(x2) # x2 = Flatten()(x2) # layer = [] # for i in range(w): # layer.append(TrainableOffset(bias_constraint=IsNegProb(), # bias_initializer=keras.initializers.random_normal(-0.1), # bias_regularizer=antientropy, # activation=Heaviside)(x2)) # x2 = Add()(layer) # x3 = Multiply()(layer) # x3 = Lambda(lambda x:2*x)(x3) # x2 = Subtract()([x2,x3]) # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2) # x2 = Reshape((23, 23, 64))(x2) # x2 = Conv2D(64, (3, 3))(x2) # x2 = MaxPool2D()(x2) # x2 = Conv2D(128, (3, 3))(x2) # x2 = MaxPool2D()(x2) x2 = Flatten()(x2) x2 = Dropout(0.2)(x2) layer = [] # for i in range(w): # layer.append(TrainableOffset(bias_constraint=IsNegProb(), # bias_initializer=keras.initializers.random_normal(-0.1), # bias_regularizer=antientropy, # activation=Heaviside)(x2)) # x2 = XOR()(layer) x2 = Dense(50, activation="relu")(x2) x2 = Dense(50, activation='relu')(x2) # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2) x2 = Dense(10)(x2) predictions2 = Activation(activation="softmax")(x2) model = keras.Model(inputs=inpt, outputs=predictions2) if debug: model.summary() if not train: return model X_train, Y_train, X_test, Y_test = data_mnist(train_start=0, train_end=60000, test_start=0, test_end=10000) K.set_session(sess) model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) model.fit(np.array(X_train), np.array(Y_train), 128, 6, verbose=1) print(model.evaluate(np.array(X_test), np.array(Y_test))) # print(model.get_weights()) if not save: return model save_path = os.path.join("/tmp", "mnist.ckpt") saver = tf.train.Saver() saver.save(sess, save_path) print("Completed model training and saved at: " + str(save_path)) return model
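# --- Hedged sketch of how a plain Keras model such as unprotected_network's
# return value is typically handed to a cleverhans attack: wrap it first, then
# build the attack graph on a placeholder.
import tensorflow as tf
from cleverhans.attacks import FastGradientMethod
from cleverhans.utils_keras import KerasModelWrapper

def fgsm_graph_for_keras_model(model, sess):
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    return x, fgsm.generate(x, eps=0.3, clip_min=0., clip_max=1.)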
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of iterations the attack runs for
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)]
    if targeted:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        one_hot = np.zeros((10, 10))
        one_hot[np.arange(10), np.arange(10)] = 1

        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': 100 if targeted else 10,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args={'batch_size': 10})
    else:
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                      args={'batch_size': 10})

    for j in range(10):
        if targeted:
            for i in range(10):
                grid_viz_data[i, j] = adv[i * 10 + j]
        else:
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the rate of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average L_2 perturbation introduced by the algorithm
    mean_l2_perturbation = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(
        mean_l2_perturbation))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
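For reference, the distortion statistic printed above reduces to a one-line numpy computation. A standalone sketch (the helper name and the random stand-in data are illustrative, not part of the tutorial):

import numpy as np

def mean_l2_perturbation(adv, clean):
    # Per-example L2 norm of the perturbation, averaged over the batch:
    # mean_i ||adv_i - clean_i||_2, summing squared differences over all
    # pixel axes before taking the root.
    return np.mean(np.sqrt(np.sum((adv - clean) ** 2, axis=(1, 2, 3))))

# Example with random stand-ins shaped like the MNIST batches above.
clean = np.random.rand(100, 28, 28, 1).astype(np.float32)
adv = np.clip(clean + np.random.uniform(-0.1, 0.1, clean.shape), 0., 1.)
print(mean_l2_perturbation(adv, clean))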
def spartan_network(sess=None, debug=False, w=4, train=False,
                    create_surrogate=False):
    # Resolve the session lazily: a default argument of
    # tf.get_default_session() would be evaluated once, at import time,
    # and would capture whatever session (often None) existed then.
    if sess is None:
        sess = tf.get_default_session()
    # we want to integrate an attention mechanism with a feature squeezer
    inpt = Input(batch_shape=(None, 28, 28, 1))
    g = tf.get_default_graph()
    speclayer = []
    with g.gradient_override_map({"Sign": "FakeHS"}):
        x2 = Conv2D(4, (1, 1), activation=Paranograd,
                    bias_initializer=keras.initializers.random_uniform(
                        minval=0.05, maxval=0.5),
                    kernel_initializer=keras.initializers.random_uniform(
                        minval=1, maxval=1.5))(inpt)
        # x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        # x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation=Paranograd)(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        # x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        # x2 = Flatten()(x2)
        # layer = []
        # for i in range(w):
        #     layer.append(TrainableOffset(
        #         bias_constraint=IsNegProb(),
        #         bias_initializer=keras.initializers.random_normal(-0.1),
        #         bias_regularizer=antientropy,
        #         activation=Heaviside)(x2))
        # x2 = Add()(layer)
        # x3 = Multiply()(layer)
        # x3 = Lambda(lambda x: 2 * x)(x3)
        # x2 = Subtract()([x2, x3])
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1',
        #                      activation=Heaviside)(x2)
        # x2 = Reshape((23, 23, 64))(x2)
        # x2 = Conv2D(64, (3, 3))(x2)
        # x2 = MaxPool2D()(x2)
        # x2 = Conv2D(128, (3, 3))(x2)
        # x2 = MaxPool2D()(x2)
        x2 = Flatten()(x2)
        # x2 = Dropout(0.1)(x2)
        layer = []
        # for i in range(w):
        #     layer.append(TrainableOffset(
        #         bias_constraint=IsNegProb(),
        #         bias_initializer=keras.initializers.random_normal(-0.1),
        #         bias_regularizer=antientropy,
        #         activation=Heaviside)(x2))
        # x2 = XOR()(layer)
        x2 = Dense(100, activation="relu")(x2)
        # x2 = Dense(50, activation='relu')(x2)
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1',
        #                      activation=Heaviside)(x2)
        x2 = Dense(10)(x2)
        predictions2 = Activation(activation="softmax")(x2)
    model = keras.Model(inputs=inpt, outputs=predictions2)
    model.summary()
    if not train:
        return model
    hist = LossHistory()
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    K.set_session(sess)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(np.array(X_train), np.array(Y_train), 128, 8, verbose=1,
              callbacks=[hist])
    print(model.evaluate(np.array(X_test), np.array(Y_test)))
    print(model.get_weights())
    crntm = time.localtime()
    stnct = str(str(crntm.tm_mday) + "_" + str(crntm.tm_mon) + "_" +
                str(crntm.tm_year) + "_" + str(crntm.tm_hour) + "_" +
                str(crntm.tm_min))
    with open('Experiments/' + stnct, "ab") as f:
        import pickle
        pickle.dump([model.get_weights(), hist.losses,
                     model.evaluate(np.array(X_test), np.array(Y_test))], f)
    if create_surrogate:
        inpt = Input(batch_shape=(None, 28, 28, 1))
        x2 = Conv2D(4, (1, 1), activation=soft_heaviside,
                    kernel_initializer=keras.initializers.random_uniform(
                        minval=0, maxval=30))(inpt)
        x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation=soft_heaviside)(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation=soft_heaviside)(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        x2 = Flatten()(x2)
        x2 = Dropout(0.2)(x2)
        x2 = Dense(50, activation="relu")(x2)
        x2 = Dense(50, activation='relu')(x2)
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1',
        #                      activation=Heaviside)(x2)
        x2 = Dense(10)(x2)
        predictions_surr = Activation(activation="softmax")(x2)
        model_surrogate = keras.Model(inputs=inpt, outputs=predictions_surr)
        model_surrogate.compile(optimizer='adam',
                                loss='categorical_crossentropy',
                                metrics=['accuracy'])
        # CAUTION: set_weights only succeeds if both models yield
        # identically-shaped weight lists; the Dense(100) trunk above and
        # the Dense(50)/Dense(50) trunk here do not obviously line up.
        model_surrogate.set_weights(model.get_weights())
        print(model.get_weights()[2], model_surrogate.get_weights()[2])
        return [model_surrogate, model]
    return model
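The gradient_override_map above assumes a custom gradient named "FakeHS" was registered elsewhere; its definition is not part of this excerpt. A minimal sketch, under that assumption, of how a straight-through override for Sign is typically registered in TF1:

import tensorflow as tf

# Hypothetical registration: pass the incoming gradient straight through
# tf.sign, whose true derivative is zero almost everywhere.
@tf.RegisterGradient("FakeHS")
def _fake_hs_grad(op, grad):
    return grad

g = tf.get_default_graph()
x = tf.constant([-2.0, 0.5, 3.0])
with g.gradient_override_map({"Sign": "FakeHS"}):
    s = tf.sign(x)
dx = tf.gradients(s, x)[0]  # [1., 1., 1.] instead of all zeros

with tf.Session() as sess:
    print(sess.run(dx))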
def main(measures, src_model_names):
    np.random.seed(0)
    tf.set_random_seed(0)
    flags.DEFINE_integer('BATCH_SIZE', 32, 'Size of batches')
    set_mnist_flags()

    x = K.placeholder(
        (None, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS, FLAGS.NUM_CHANNELS))
    y = K.placeholder((None, FLAGS.NUM_CLASSES))

    X_train, Y_train, X_test, Y_test = data_mnist()

    # source models for crafting adversarial examples
    src_models = [None] * len(src_model_names)
    accuracy = [None] * len(src_model_names)
    for i in range(len(src_model_names)):
        src_models[i] = load_model(src_model_names[i])

    if measures == "Q":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        N = len(X_test)
        k = len(src_model_names)
        Qij = [([None] * k) for p in range(k)]
        for i in range(k - 1):
            for j in range(i + 1, k):
                a = b = c = d = 0.0
                for n in range(N):
                    src_model_i = src_models[i]
                    src_model_j = src_models[j]
                    Ci = tf_compute_C(src_model_i, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    Cj = tf_compute_C(src_model_j, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    # Use boolean 'and', not bitwise '&': '&' binds tighter
                    # than '==', so the original chained comparisons
                    # mis-counted the disagreement cells.
                    if Ci[0] == 1 and Cj[0] == 1:
                        a += 1
                    elif Ci[0] == 0 and Cj[0] == 0:
                        d += 1
                    elif Ci[0] == 0 and Cj[0] == 1:
                        c += 1
                    elif Ci[0] == 1 and Cj[0] == 0:
                        b += 1
                print(a, b, c, d)
                Qij[i][j] = (a * d - b * c) / (a * d + b * c)
        Qij_SUM = 0.0
        for i in range(k - 1):
            for j in range(i + 1, k):
                Qij_SUM += Qij[i][j]
        QAV = (2.0 / (k * (k - 1))) * Qij_SUM
        print('The value of the Q statistic: %.4f' % (QAV))
        return

    if measures == "p":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        N = len(X_test)
        k = len(src_model_names)
        Pij = [([None] * k) for p in range(k)]
        for i in range(k - 1):
            for j in range(i + 1, k):
                a = b = c = d = 0.0
                for n in range(N):
                    src_model_i = src_models[i]
                    src_model_j = src_models[j]
                    Ci = tf_compute_C(src_model_i, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    Cj = tf_compute_C(src_model_j, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    if Ci[0] == 1 and Cj[0] == 1:
                        a += 1
                    elif Ci[0] == 0 and Cj[0] == 0:
                        d += 1
                    elif Ci[0] == 0 and Cj[0] == 1:
                        c += 1
                    elif Ci[0] == 1 and Cj[0] == 0:
                        b += 1
                print(a, b, c, d)
                Pij[i][j] = (a * d - b * c) / math.sqrt(
                    (a + b) * (a + c) * (b + d) * (d + c))
        Pij_SUM = 0.0
        for i in range(k - 1):
            for j in range(i + 1, k):
                Pij_SUM += Pij[i][j]
        PAV = (2.0 / (k * (k - 1))) * Pij_SUM
        print('The value of the correlation coefficient: %.4f' % (PAV))
        return

    if measures == "Ent":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        k = len(src_model_names)
        N = len(X_test)
        num = 0
        for i in range(N):
            lxt = 0
            print(i)
            for (name, src_model) in zip(src_model_names, src_models):
                C = tf_compute_C(src_model, x, y, X_test[i:i + 1],
                                 Y_test[i:i + 1])
                # lxt denotes the number of substitutes that accurately
                # recognize sample x.
                lxt += C[0]
            # lxt = 0, 1, 2, 3
            m = min(lxt, k - lxt)
            num += ((1.0 / (k - math.ceil(k / 2.0))) * m)
        Ent = (1.0 / N) * num
        print('The value of the entropy measure: %.4f' % (Ent))
        return

    if measures == "KW":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        k = len(src_model_names)
        N = len(X_test)
        num = 0
        for i in range(N):
            lxt = 0
            print(i)
            for (name, src_model) in zip(src_model_names, src_models):
                C = tf_compute_C(src_model, x, y, X_test[i:i + 1],
                                 Y_test[i:i + 1])
                # lxt denotes the number of substitutes that accurately
                # recognize sample x.
                lxt += C[0]
            # lxt = 0, 1, 2, 3
            num += (lxt * (k - lxt))
        KW = (1.0 / (N * math.pow(k, 2))) * num
        print('The value of the Kohavi-Wolpert variance: %.4f' % (KW))
        return

    if measures == "test":
        X_test = X_test[0:5]
        Y_test = Y_test[0:5]
        # display_leg_sample(X_test)
        for j in range(1):
            for (name, src_model) in zip(src_model_names, src_models):
                # the number of substitutes from D that correctly recognize
                # X_test[j]
                num = tf_test_acc_num(src_model, x, y, X_test, Y_test)
                # output 1, 1, 1, 1, 1, 1
                print(num)
        return
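As a sanity check on the pairwise statistics above: with a (both substitutes correct), d (both wrong), and b, c the one-correct counts over N samples, Q_ij = (ad - bc)/(ad + bc) and the correlation is rho_ij = (ad - bc)/sqrt((a+b)(a+c)(b+d)(c+d)). A worked example with made-up counts:

import math

a, b, c, d = 40.0, 10.0, 5.0, 45.0  # hypothetical counts, a + b + c + d = N = 100
q = (a * d - b * c) / (a * d + b * c)
rho = (a * d - b * c) / math.sqrt((a + b) * (a + c) * (b + d) * (d + c))
print('Q = %.4f, rho = %.4f' % (q, rho))
# Q = 0.9459, rho = 0.7035: these two substitutes succeed and fail on
# largely the same inputs, i.e. they contribute little diversity.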
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=100, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=True, w=2, rel=0):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param w: number of perceptive neurons
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    # tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    keras.layers.core.K.set_learning_phase(1)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    models = spartan_network(sess=sess, w=w, train=True,
                             create_surrogate=True)
    model_to_attack = models[0]
    spartan_model = models[1]

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(ckpt)
    trainvalue = ckpt is None
    # model_to_attack = unprotected_network(sess=sess, train=trainvalue,
    #                                       save=True)
    preds = model_to_attack(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    rng = None
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        keras.layers.core.K.set_learning_phase(1)
        # model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
        #             args=train_params, save=True, rng=rng)

    keras.layers.core.K.set_learning_phase(0)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        print("With no Dropout : %s" % acc)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model_to_attack)
    global mdl
    mdl = model_to_attack
    fgsm = FastGradientMethod(wrap, sess=sess)
    for epstep in range(40):
        fgsm_params = {'eps': 0.01 + 0.02 * epstep,
                       'clip_min': 0.,
                       'clip_max': 1.}
        # cw_params = {'confidence': 0.5,
        #              'batch_size': 4,
        #              'learning_rate': 2e-2,
        #              'max_iterations': 400,
        #              'clip_min': 0.,
        #              'clip_max': 1.}
        adv_x = fgsm.generate(x, **fgsm_params)
        # Consider the attack to be constant
        adv_x = tf.stop_gradient(adv_x)
        # cwattack = atk.CarliniWagnerL2(model_to_attack, sess=sess)
        # adv_x = cwattack.generate(x, **cw_params)
        # adv_x = tf.stop_gradient(adv_x)
        # adv_x_np = cwattack.generate_np(X_test[500:704], **cw_params)
        # from matplotlib import pyplot as plt
        # plt.rc('figure', figsize=(12.0, 12.0))
        # for j in range(40):
        #     plt.imshow(adv_x_np[j].reshape((28, 28)), cmap="gray")
        #     plt.pause(0.15)
        # return
        preds_adv = spartan_model(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on epsilon-%0.4f-adversarial examples: %0.4f\n'
              % (fgsm_params["eps"], acc))
        report.clean_train_adv_eval = acc
        # NOTE: this return exits after the first epsilon, so the rest of the
        # sweep and the adversarial-training section below never run; it is
        # kept where the original placed it.
        return report

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(w=w, rel=rel)
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy
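The epsilon sweep above relies on the update rule that FastGradientMethod builds symbolically: x_adv = clip(x + eps * sign(grad_x J(x, y)), 0, 1). A minimal numpy sketch of one such step, assuming a precomputed loss gradient (the helper name and random data are illustrative):

import numpy as np

def fgsm_step(x, loss_grad, eps, clip_min=0., clip_max=1.):
    # One fast-gradient-sign step: move every pixel by +/- eps in the
    # direction that locally increases the loss, then clip to valid range.
    return np.clip(x + eps * np.sign(loss_grad), clip_min, clip_max)

x = np.random.rand(1, 28, 28, 1)
loss_grad = np.random.randn(1, 28, 28, 1)  # stand-in for dJ/dx
for eps in (0.01, 0.03, 0.05):  # a tiny version of the 40-step sweep above
    print(eps, np.abs(fgsm_step(x, loss_grad, eps) - x).max())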
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64,
                   num_threads=None, label_smoothing=True):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: number of filters in the CNN used for training
    :param num_threads: number of threads used for running the process
    :param label_smoothing: if true, apply label smoothing to the training
                            labels
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session, using the requested number of threads if given
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=num_threads)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes - 1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=0.1)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = LossCrossEntropy(model2, smoothing=0.1, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
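The clip-based label smoothing used throughout these tutorials maps a one-hot row into [eps/(K-1), 1-eps]. A quick numeric check with K = 10 and eps = 0.1 (standalone, illustrative data):

import numpy as np

label_smooth = 0.1
nb_classes = 10
y = np.eye(3, nb_classes)  # three one-hot rows
y_s = y.clip(label_smooth / (nb_classes - 1), 1. - label_smooth)
print(y_s[0])        # zeros become 0.0111..., the 1 becomes 0.9
print(y_s[0].sum())  # ~= 1.0 (0.9 + 9 * 0.0111...)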
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :return:
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    ###########################################################################
    # Define the dataset and model
    ###########################################################################

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename))
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, predictions, X_train, Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes - 1) + ' adversarial examples')

    # This array indicates whether an adversarial example was found for each
    # test set sample and target class
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # This array contains the fraction of perturbed features for each test set
    # sample and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Define the TF graph for the model's Jacobian
    grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

    # Initialize our array for grid visualization
    grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                  FLAGS.img_cols, FLAGS.nb_channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    np.save("xorg_jsma.npy", X_test)
    np.save("ytest_jsma.npy", Y_test)
    np.save("xtrain_jsma.npy", X_train)
    np.save("ytrain_jsma.npy", Y_train)
    adv_examples = []
    org_examples = []
    ytest_examples = []
    ytarget_examples = []

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, FLAGS.source_samples):
        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            X_test[sample_ind:(sample_ind + 1)],
            (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

        # Loop over all target classes
        for target in target_classes:
            print('--------------------------------------')
            print('Creating adv. example for target class ' + str(target))

            # This call runs the Jacobian-based saliency map approach
            adv_x, res, percent_perturb = jsma(
                sess, x, predictions, grads,
                X_test[sample_ind:(sample_ind + 1)], target, theta=1,
                gamma=0.1, increase=True, back='tf', clip_min=0, clip_max=1)
            adv_examples.append(adv_x)
            org_examples.append(X_test[sample_ind:(sample_ind + 1)])
            ytest_examples.append(current_class)
            ytarget_examples.append(target)

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                if 'figure' not in vars():
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind + 1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x,
                                   (FLAGS.img_rows, FLAGS.img_cols)))
                else:
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind + 1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x,
                                   (FLAGS.img_rows, FLAGS.img_cols)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))

    adv_examples = np.array(adv_examples)
    org_examples = np.array(org_examples)
    ytest_examples = np.array(ytest_examples)
    ytarget_examples = np.array(ytarget_examples)
    np.save("xorg2_jsma.npy", org_examples)
    np.save("ytest2_jsma.npy", ytest_examples)
    np.save("xadv_jsma.npy", adv_examples)
    np.save("ytarget_jsma.npy", ytarget_examples)

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.2f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.2f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
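Under the hood, jsma scores input features with the saliency map of Papernot et al. (2016): for an increasing attack, a pixel is a candidate only if it raises the target logit (alpha > 0) while lowering the other classes in aggregate (beta < 0). A small numpy sketch of that scoring, assuming a precomputed Jacobian (the helper and data here are illustrative, not the library's implementation):

import numpy as np

def saliency_scores(jacobian, target):
    # jacobian: (nb_classes, n_features) array of dF_c/dx_i values.
    alpha = jacobian[target]             # effect on the target class
    beta = jacobian.sum(axis=0) - alpha  # aggregate effect on the rest
    mask = (alpha > 0) & (beta < 0)      # increasing-feature condition
    return np.where(mask, alpha * np.abs(beta), 0.)

jac = np.random.randn(10, 784)  # stand-in for the MNIST Jacobian above
scores = saliency_scores(jac, target=3)
print(np.argsort(scores)[-5:])  # five most promising pixels to perturb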
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()

    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = model_mnist()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an MNIST model
    model_train(sess, x, y, predictions, X_train, Y_train, evaluate=evaluate)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = model_mnist()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Evaluate the accuracy of the adversarially trained MNIST model on
        # legitimate test examples
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained MNIST model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv,
                                  X_test, Y_test)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv, evaluate=evaluate_2)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param clean_train: if true, train on clean examples
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': FLAGS.eps}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn()
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            logger.info('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        if not backprop_through_attack:
            # For the fgsm attack used in this tutorial, the attack has zero
            # gradient so enabling this flag does not change the gradient.
            # For some other attacks, enabling this flag increases the cost of
            # training, but gives the defender the ability to anticipate how
            # the attacker will change their strategy in response to updates
            # to the defender's parameters.
            adv_x = tf.stop_gradient(adv_x)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        logger.info('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

    logger.info("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn()
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        logger.info('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                              args=eval_params)
        logger.info('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
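Passing predictions_adv to model_train makes the training objective an even mix of the clean and adversarial cross-entropy terms (in the CleverHans versions these scripts target, the two losses are averaged). A hedged TF1 sketch of that objective, with a hypothetical helper name:

import tensorflow as tf

def adversarial_training_loss(y, logits_clean, logits_adv):
    # 0.5 * J(x, y) + 0.5 * J(x_adv, y): the clean and adversarial
    # cross-entropy terms weighted equally, as in adversarial training.
    ce = tf.nn.softmax_cross_entropy_with_logits
    loss_clean = tf.reduce_mean(ce(labels=y, logits=logits_clean))
    loss_adv = tf.reduce_mean(ce(labels=y, logits=logits_adv))
    return 0.5 * loss_clean + 0.5 * loss_adv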
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        # assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train, y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
              args=train_params, save=True)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, x, y, preds_2, X_train, Y_train,
          predictions_adv=preds_2_adv, evaluate=evaluate_2,
          args=train_params, save=False)

    # Get a random slice of the data for linear extrapolation plots
    random_idx = np.random.randint(0, X_train.shape[0])
    X_slice = X_train[random_idx]
    Y_slice = Y_train[random_idx]

    # Plot the linear extrapolation plot for clean model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice, 'lep_clean.png')

    # Plot the linear extrapolation plot for adv model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap_2, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice, 'lep_adv.png')

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
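get_logits_over_interval evaluates the model's logits along the one-dimensional slice x + t * eta for a range of perturbation scales t, which linear_extrapolation_plot then draws. A rough numpy sketch of that sampling, with hypothetical names and a toy linear "model" standing in for the library internals:

import numpy as np

def logits_over_interval(logits_fn, x, eta, eps=0.3, n_points=21):
    # Sample the logits along the straight line through x in direction eta,
    # for scales t in [-eps, eps], clipping back into the valid pixel range.
    ts = np.linspace(-eps, eps, n_points)
    return ts, np.stack([logits_fn(np.clip(x + t * eta, 0., 1.))
                         for t in ts])

rng = np.random.RandomState(0)
W = rng.randn(784, 10)
logits_fn = lambda im: im.reshape(-1) @ W  # toy linear classifier
x = rng.rand(28, 28, 1)
eta = np.sign(rng.randn(28, 28, 1))        # FGSM-like direction
ts, logits = logits_over_interval(logits_fn, x, eta)
print(logits.shape)  # (21, 10): one logit row per interpolation point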
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of iterations the attack runs for
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")