def substitute_model(img_rows=28, img_cols=28, nb_classes=10):
    """
    Defines the model architecture to be used by the substitute. Use
    the example model interface.
    :param img_rows: number of rows in input
    :param img_cols: number of columns in input
    :param nb_classes: number of classes in output
    :return: tensorflow model
    """
    input_shape = (None, img_rows, img_cols, 1)

    # Define a convolutional model (it's different from the black-box)
    layers = [Conv2D(64, (8, 8), (2, 2), "SAME"),
              ReLU(),
              Flatten(),
              Linear(200),
              ReLU(),
              Linear(100),
              ReLU(),
              Linear(nb_classes),
              Softmax()]

    # NOTE: a stray "return make_basic_cnn()" used to precede this line,
    # making the MLP below unreachable
    return MLP(layers, input_shape)

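
# A minimal usage sketch (hedged): instantiate the substitute once, then
# apply it to an input placeholder to get its output probabilities, as the
# black-box tutorials below do with their models.
def _substitute_model_demo():
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    substitute = substitute_model()   # MLP wrapping the layer list above
    preds_sub = substitute(x)         # output probabilities tensor
    return preds_sub
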
def make_model(task):
    if task == 'mnist':
        model = make_basic_cnn(nb_classes=2)
    elif task == 'abalone':
        model = abalone_mlp(nb_classes=2, input_shape=[None, 7, 1, 1])
    else:
        model = basic_mlp(nb_classes=2, input_shape=[None, 2, 1, 1])
    return model

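
# Usage sketch (hedged): make_model dispatches on the task name; any task
# other than 'mnist' or 'abalone' falls through to the two-feature MLP.
# The 'toy' name below is an arbitrary illustration, not a real task.
def _make_model_demo():
    mnist_model = make_model('mnist')      # binary CNN
    abalone_model = make_model('abalone')  # 7-feature binary MLP
    other_model = make_model('toy')        # falls through to basic_mlp
    return mnist_model, abalone_model, other_model
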
def make_basic_ngpu(nb_classes=10, input_shape=(None, 28, 28, 1),
                    **kwargs):
    """
    Create a multi-GPU model similar to the basic cnn in the tutorials.
    """
    # forward the arguments so nb_classes and input_shape are not ignored
    model = make_basic_cnn(nb_classes=nb_classes, input_shape=input_shape,
                           **kwargs)
    layers = model.layers
    model = MLPnGPU(layers, input_shape)
    return model

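
# Usage sketch (hedged): the multi-GPU wrapper is called exactly like
# make_basic_cnn, since it reuses the same layer list under MLPnGPU.
def _make_basic_ngpu_demo():
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    model = make_basic_ngpu(nb_classes=10, input_shape=(None, 28, 28, 1))
    preds = model(x)
    return preds
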
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
              nb_epochs, batch_size, learning_rate, rng):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.
    :param sess: the TF session
    :param x: the input placeholder for MNIST
    :param y: the output placeholder for MNIST
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :return: the trained model, its prediction tensor and its test accuracy
    """
    # sess = tf_debug.LocalCLIDebugWrapperSession(sess)  # DEBUGGING

    # Define TF model graph (for the black-box model)
    model = make_basic_cnn()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                args=train_params, rng=rng)

    # Print out the accuracy on legitimate data
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    print('Test accuracy of black-box on legitimate test '
          'examples: ' + str(accuracy))

    return model, predictions, accuracy

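
# Usage sketch (hedged): wiring prep_bbox into a fresh session.
# data_mnist, model_train and model_eval are the cleverhans helpers
# already used throughout this file.
def _prep_bbox_demo():
    sess = tf.Session()
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    X_train, Y_train, X_test, Y_test = data_mnist()
    rng = np.random.RandomState([2017, 8, 30])
    model, bbox_preds, acc = prep_bbox(sess, x, y,
                                       X_train, Y_train, X_test, Y_test,
                                       nb_epochs=6, batch_size=128,
                                       learning_rate=0.001, rng=rng)
    return model, bbox_preds, acc
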
def tutorial():
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)
    sess = tf.Session()

    # Get MNIST test data
    train_start = 0
    train_end = 60000
    test_start = 0
    test_end = 10000
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    # model_path = "models/mnist"

    # Train an MNIST model
    batch_size = 128
    train_params = {
        'nb_epochs': 6,
        'batch_size': batch_size,
        'learning_rate': 0.001
    }
    rng = np.random.RandomState([2017, 8, 30])

    model = make_basic_cnn(nb_filters=64)
    preds = model.get_probs(x)

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test
        # examples
        eval_params = {'batch_size': batch_size, 'adversarial': False}
        acc = model_eval(sess, x, y, preds, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                args=train_params, rng=rng)
    eval_params = {'batch_size': batch_size, 'adversarial': False}
    acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)

    epsilons = [0.01, 0.03, 0.07, 0.1, 0.2, 0.3]
    for eps in epsilons:
        fgsm_params = {'eps': eps, 'clip_min': 0., 'clip_max': 1.}

        # Initialize the Fast Gradient Sign Method (FGSM) attack object
        # and graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Define adversarial examples placeholder (currently unused)
        adv_examples = tf.placeholder(tf.float32, [None, 28, 28, 1])

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size, 'adversarial': True}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        filename = "./examples/fgsm_mnist_adv_x_1000_" + str(eps)
        # Write the adversarial examples to a file
        np_examples = adv_x.eval(session=sess, feed_dict={x: X_test})
        np.save(filename, np_examples)
        np.save("./examples/fgsm_mnist_adv_y_1000", Y_test)

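
# Usage sketch (hedged): the tutorial above saves one .npy file of
# adversarial inputs per epsilon, plus a single file of labels.
# np.save appends the ".npy" suffix automatically, hence the names here.
def _load_fgsm_examples_demo(eps=0.3):
    adv_x = np.load("./examples/fgsm_mnist_adv_x_1000_%s.npy" % str(eps))
    adv_y = np.load("./examples/fgsm_mnist_adv_y_1000.npy")
    return adv_x, adv_y
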
def baseline_jsma(train_start=0, train_end=60000, test_start=0,
                  test_end=10000, nb_epochs=6, batch_size=128,
                  learning_rate=0.001, clean_train=True, testing=False,
                  nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, perform normal training on clean examples
                        only before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    # assert Y_train.shape[1] == 10
    # label_smooth = .1
    # Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        # NOTE: this model gets trained here, so the adversarial training
        # further below needs a fresh one (model_2)
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the JSMA attack object and graph
        jsma = SaliencyMapMethod(model, sess=sess)
        adv_x = jsma.generate(x, **jsma_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    jsma2 = SaliencyMapMethod(model_2, sess=sess)
    adv_x_2 = jsma2.generate(x, **jsma_params)
    preds_2_adv = model_2(adv_x_2)
    # Let's also generate FGSM examples for model_2
    fgsm = FastGradientMethod(model_2, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    preds_2_fgsm = model_2(adv_x_fgsm)
    # DON'T WANT TO TRAIN on FGSM adv examples yet

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on JSMA adversarial
        # examples (the print label was previously swapped with FGSM)
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on JSMA adversarial examples: %0.4f'
              % accuracy)
        report.adv_train_adv_eval = accuracy

        # Accuracy of the JSMA adv trained model on FGSM adv examples
        accuracy = model_eval(sess, x, y, preds_2_fgsm, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on FGSM adversarial examples: %0.4f'
              % accuracy)

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report

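
# Usage sketch (hedged): baseline_jsma returns an AccuracyReport whose
# fields are filled in by the evaluate callbacks above; nb_epochs=1 keeps
# the run short for a smoke test.
def _baseline_jsma_demo():
    report = baseline_jsma(nb_epochs=1, testing=True)
    print('clean model, clean data: %0.4f' % report.clean_train_clean_eval)
    print('clean model, JSMA data:  %0.4f' % report.clean_train_adv_eval)
    print('adv model, clean data:   %0.4f' % report.adv_train_clean_eval)
    print('adv model, JSMA data:    %0.4f' % report.adv_train_adv_eval)
    return report
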
def prep_bbox(sess, logits_scalar, x, y, X_train, Y_train, X_test, Y_test,
              img_rows, img_cols, channels, nb_epochs, batch_size,
              learning_rate, rng, phase=None, binary=False, scale=False,
              nb_filters=64, model_path=None, adv=0, delay=0, eps=0.3):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.
    :param sess: the TF session
    :param logits_scalar: scalar applied to the logits
    :param x: the input placeholder for MNIST
    :param y: the output placeholder for MNIST
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param img_rows: number of rows in input
    :param img_cols: number of columns in input
    :param channels: number of input channels
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :param phase: placeholder distinguishing training from evaluation
    :param binary: if true, use a binarized model
    :param scale: if true, use the scaled binary variant
    :param model_path: if given, load/save the model at this path
    :param adv: adversarial training mode (0 disables it)
    :param delay: number of clean-training epochs before adversarial
                  training starts
    :param eps: perturbation size for FGSM adversarial training
    :return: the trained model, its predictions, the test accuracy and
             the model path
    """

    # Define TF model graph (for the black-box model)
    save = False
    train_from_scratch = False

    if model_path is not None:
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                binary, scale, nb_filters, batch_size, learning_rate, \
                    nb_epochs, adv = parse_model_settings(model_path)
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, binary, batch_size, nb_filters,
                    learning_rate, nb_epochs, adv, delay, scale)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    if binary:
        if scale:
            # from cleverhans_tutorials.tutorial_models import (
            #     make_scaled_binary_cnn)
            # model = make_scaled_binary_cnn(phase, 'bb_binsc_', ...)
            from cleverhans_tutorials.tutorial_models import (
                make_scaled_binary_rand_cnn)
            model = make_scaled_binary_rand_cnn(
                phase, logits_scalar, 'bb_binsc_',
                input_shape=(None, img_rows, img_cols, channels),
                nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import (
                make_basic_binary_cnn)
            model = make_basic_binary_cnn(
                phase, logits_scalar, 'bb_bin_',
                input_shape=(None, img_rows, img_cols, channels),
                nb_filters=nb_filters)
    else:
        from cleverhans_tutorials.tutorial_models import make_basic_cnn
        model = make_basic_cnn(
            phase, logits_scalar, 'bb_fp_',
            input_shape=(None, img_rows, img_cols, channels),
            nb_filters=nb_filters)

    preds = model(x, reuse=False)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Print out the accuracy on legitimate data
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase,
                         args=eval_params)
        print('Test accuracy of black-box on legitimate test '
              'examples: %.4f' % acc)
        return acc

    # Train an MNIST model
    train_params = {
        'binary': binary,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'bb train loss',
        'filename': 'bb_model',
        'train_scope': 'bb_model',
        'reuse_global_step': False,
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            nb_iter = 20
            train_attack_params = {'eps': MAX_EPS,
                                   'eps_iter': 0.01,
                                   'nb_iter': nb_iter}
            train_attacker = MadryEtAl(model, sess=sess)
        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            # the FGSM branch previously left train_attack_params
            # undefined; use the eps argument, which was otherwise unused
            train_attack_params = {'eps': eps}
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)

        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase,
                                              **train_attack_params)
        preds_adv = model.get_probs(adv_x_train)

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs
        model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                    evaluate=evaluate, args=train_params, save=save,
                    rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" %
                  (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        predictions_adv=preds_adv, evaluate=evaluate,
                        args=train_params, save=save, rng=rng)
    else:
        tf_model_load(sess, model_path)
        print('Restored model from %s' % model_path)

    # evaluate() returns the test accuracy, so accuracy is defined on both
    # the train-from-scratch and the restore paths
    accuracy = evaluate()

    return model, preds, accuracy, model_path

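
# Usage sketch (hedged): train the oracle with delayed FGSM adversarial
# training. ADVERSARIAL_TRAINING_FGSM is the module-level constant checked
# inside prep_bbox; a logits_scalar of 1.0 is an assumed neutral value.
def _prep_bbox_adv_demo(sess, x, y, phase, data):
    X_train, Y_train, X_test, Y_test = data
    rng = np.random.RandomState([2017, 8, 30])
    return prep_bbox(sess, 1.0, x, y, X_train, Y_train, X_test, Y_test,
                     img_rows=28, img_cols=28, channels=1,
                     nb_epochs=10, batch_size=128, learning_rate=0.001,
                     rng=rng, phase=phase,
                     adv=ADVERSARIAL_TRAINING_FGSM, delay=5, eps=0.3)
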
def mnist_fgsm_saliency(train_start=0, train_end=10, test_start=0,
                        test_end=5, nb_epochs=2, batch_size=128,
                        learning_rate=0.001, clean_train=True,
                        testing=False, backprop_through_attack=False,
                        nb_filters=64, nb_classes=10, source_samples=10):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, perform normal training on clean examples
                        only before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: number of convolutional filters
    :param nb_classes: number of output classes
    :param source_samples: number of training inputs to attack with JSMA
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing: the nine zeros of each one-hot label become
    # 0.1/9 because the one becomes 0.9
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # placeholder for y_target --> for the saliency tensor
    y_target = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    ###########################################################################
    # Train the base CNN model using TensorFlow
    ###########################################################################
    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model.get_probs(x)

    if clean_train:
        # (the CNN is created above; these lines are kept for reference)
        # model = make_basic_cnn(nb_filters=nb_filters)
        # preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        ###########################################################################
        # Model training
        ###########################################################################
        # training the basic model, using train_params
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

    ###########################################################################
    # Generate FGSM adversarial examples based on the model and
    # compute the base model accuracy
    ###########################################################################
    # Initialize the Fast Gradient Sign Method (FGSM) attack object and
    # graph
    fgsm = FastGradientMethod(model, sess=sess)

    # todo: follow the paper and run Cleverhans Output?
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}

    # adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = fgsm.generate(x, **fgsm_params_y)
    preds_adv = model.get_probs(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculate training error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    ###########################################################################
    # Generate saliency map (JSMA) adversarial examples and
    # compute the base model accuracy (only source_samples inputs)
    ###########################################################################
    print("Saliency Map Attack On The Base Model")
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Instantiate a SaliencyMapMethod attack object; y_target is modified
    # again for each input
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    # Keep track of success (adversarial example classified in target).
    # Need this info to compute the success rate
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # each sample will get nb_classes - 1 adversarial examples:
    # adv_x_set collects all the x variations and adv_y_target the
    # corresponding target labels
    adv_x_set = None
    adv_y_target = None

    # we also accumulate x_train_saliency / y_train_saliency
    # (both must be initialized, since the loop below checks them)
    x_train_saliency = None
    y_train_saliency = None

    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Saliency Attacking input %i/%i' %
              (sample_ind + 1, source_samples))

        sample = X_train[sample_ind:(sample_ind + 1)]
        y_sample = Y_train[sample_ind:(sample_ind + 1)]

        current_class = int(np.argmax(Y_train[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)
            # Create x_train_saliency, corresponding to y_train_saliency
            if x_train_saliency is not None:
                x_train_saliency = np.concatenate(
                    (x_train_saliency, sample), axis=0)
                y_train_saliency = np.concatenate(
                    (y_train_saliency, y_sample), axis=0)
            else:
                x_train_saliency = sample
                y_train_saliency = y_sample
            print("sample shape: ", x_train_saliency.shape)
            print("y_sample shape: ", y_train_saliency.shape)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x_np = jsma.generate_np(sample, **jsma_params)

            # Add to adv_x_set, adv_y_target
            if adv_x_set is not None:
                adv_y_target = np.concatenate(
                    (adv_y_target, one_hot_target), axis=0)
                adv_x_set = np.concatenate((adv_x_np, adv_x_set), axis=0)
            else:
                adv_y_target = one_hot_target
                adv_x_set = adv_x_np

            print("adv_y_target shape (one-hot encoding): ",
                  adv_y_target.shape)
            print("adv_x_set (np) shape: ", adv_x_np.shape)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x_np) == target)

            # Update the arrays for later analysis
            results[target, sample_ind] = res

    print('--------------------------------------')
    # Compute the number of adversarial examples that were successfully
    # found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful Saliency adv. examples {0:.4f}'.format(
        succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # here we have successfully stacked up adv_x_set and adv_y_target;
    # these can be used to provide extra training data for the model
    print("\n\n\n*****************************")
    print("Checking adv_x_set shape: ", adv_x_set.shape)
    print("Checking adv_y_target shape: ", adv_y_target.shape)
    print("x_train_saliency shape:", x_train_saliency.shape)
    print("y_train_saliency shape:", y_train_saliency.shape)

    # now construct model_3 and define the input -> output relationship
    model_3 = make_basic_cnn(nb_filters=nb_filters)
    # x is the placeholder input; preds_3 is the output tensor
    preds_3 = model_3(x)

    # jsma3 = SaliencyMapMethod(model_3, sess=sess)
    #
    # jsma_params = {'theta': 1., 'gamma': 0.1,
    #                'clip_min': 0., 'clip_max': 1.,
    #                'y_target': y_target}
    #
    # # create adv_saliency set tensor, using x_train data and jsma_params
    # # containing adv_y_target
    # adv_jsma = jsma3.generate(x, jsma_params)
    # # create adv preds tensor
    # preds_jsma_adv = model_3(adv_jsma)

    # define saliency training model accuracy
    def evaluate_saliency():
        # Accuracy of the saliency-trained model on its training inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_3, x_train_saliency,
                              y_train_saliency, args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

    ###########################################################################
    # Model training for the saliency map data
    ###########################################################################
    # Train the model with a mix of normal and adversarial examples.
    # NOTE: model_train expects the prediction tensor (preds_3), not the
    # model object, and evaluate must be passed as a callback, not called.
    model_train(sess, x, y, preds_3, x_train_saliency, y_train_saliency,
                evaluate=evaluate_saliency, args=train_params, rng=rng)

    # todo: use jsma to create adversarial testing??? or training???

    # Redefine TF model for FGSM adversarial training
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)

    # parameters for FGSM
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}
    adv_x_2 = fgsm2.generate(x, **fgsm_params_y)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates
        # to the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial
        # examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    ###########################################################################
    # Model training for FGSM
    ###########################################################################
    # Perform and evaluate adversarial training with the FGSM model
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report

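
# Minimal sketch (hedged) of the backprop_through_attack switch above:
# with tf.stop_gradient, gradients of the training loss do not flow back
# through the adversarial-example construction graph.
def _stop_gradient_demo():
    x = tf.constant([1.0, 2.0])
    adv = x + 0.3 * tf.sign(x)   # stand-in for an attack graph
    adv = tf.stop_gradient(adv)  # treat adv as a constant input
    loss = tf.reduce_sum(adv * adv)
    grads = tf.gradients(loss, [x])
    return grads                 # [None]: no gradient path back to x
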
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4254264)
    set_log_level(logging.DEBUG)

    # Get MNIST test data
    # X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
    #                                               train_end=train_end,
    #                                               test_start=test_start,
    #                                               test_end=test_end)

    # Get notMNIST data
    # with np.load("notmnist.npz") as data:
    #     X_train, Y_train, X_test, Y_test = (data['examples_train'],
    #                                         data['labels_train'],
    #                                         data['examples_test'],
    #                                         data['labels_test'])

    # Get MNIST data
    with np.load("mnist.npz") as data:
        X_train, Y_train, X_test, Y_test = (data['X_train'],
                                            data['Y_train'],
                                            data['X_test'],
                                            data['Y_test'])

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    # Define TF model graph
    model_path = "./"
    model_name = "clean_trained_mnist_model"
    model = make_basic_cnn(nb_classes=nb_classes)
    if tf_model_load(sess, file_path=os.path.join(model_path, model_name)):
        print(model_name, " reloaded.")
    preds = model.get_probs(x)
    # print('shape is', preds.get_shape())

    # clean_train = True
    # if clean_train:
    #     train_params = {
    #         'nb_epochs': nb_epochs,
    #         'batch_size': batch_size,
    #         'learning_rate': learning_rate
    #     }
    #     model_path = "./"
    #     model_name = "clean_trained__model_notmnist"
    #     rng = np.random.RandomState([1989, 12, 13])
    #     model = make_basic_cnn()
    #     preds = model.get_probs(x)
    #
    #     def evaluate():
    #         # Evaluate the accuracy of the MNIST model on legitimate test
    #         # examples
    #         eval_params = {'batch_size': batch_size}
    #         acc = model_eval(
    #             sess, x, y, preds, X_test, Y_test, args=eval_params)
    #         report.clean_train_clean_eval = acc
    #         assert X_test.shape[0] == test_end - test_start, X_test.shape
    #         print('Test accuracy on legitimate examples: %0.4f' % acc)
    #
    #     model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
    #                 args=train_params, rng=rng)
    #
    #     save_path = os.path.join(model_path, model_name)
    #     saver = tf.train.Saver()
    #     saver.save(sess, save_path)
    #     _logger.info("Completed model training and saved at: " +
    #                  str(save_path))
    #     print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    # Train an MNIST model
    # train_params = {
    #     'nb_epochs': nb_epochs,
    #     'batch_size': batch_size,
    #     'learning_rate': learning_rate,
    #     'train_dir': model_path,
    #     'filename': model_name
    # }
    sess.run(tf.global_variables_initializer())
    # rng = np.random.RandomState([2017, 8, 30])
    # model_train(sess, x, y, preds, X_train, Y_train, save=True,
    #             args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    # report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map
    # approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')
    # misclassification (adversarial example classified as anything but
    # the correct class)
    results2 = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    # grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    # grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1, 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None

    rng = np.random.RandomState([1358, 23, 234])
    index_shuf = list(range(len(X_test)))
    rng.shuffle(index_shuf)
    X_test = X_test[index_shuf]
    Y_test = Y_test[index_shuf]

    # create a dictionary to keep track of the occurrence of each class,
    # and a 2D array to keep track of successful attacks
    occurrence = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0,
                  5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
    # 10: 0, 11: 0, 12: 0, 13: 0, 14: 0,
    # 15: 0, 16: 0, 17: 0, 18: 0, 19: 0}
    rate_table = np.zeros((nb_classes, nb_classes), dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # add one to the current class occurrence count
        occurrence[current_class] += 1

        # For the grid visualization, keep original images along the
        # diagonal
        # grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
        #     sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)
            # misclassification check
            res2 = int(model_argmax(sess, x, preds, adv_x) != current_class)

            # if successful, add one to the success-rate table
            if res == 1:
                rate_table[current_class, target] += 1.
            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(
                adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            # if viz_enabled:
            #     figure = pair_visual(
            #         np.reshape(sample, (img_rows, img_cols)),
            #         np.reshape(adv_x, (img_rows, img_cols)), figure)

            # Add our adversarial example to our grid data
            # grid_viz_data[target, current_class, :, :, :] = np.reshape(
            #     adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            results2[target, sample_ind] = res2
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Close TF session
    sess.close()

    # Compute the success rate of each class attacking each target
    for cur in range(nb_classes):
        if occurrence[cur] != 0:
            rate_table[cur, :] /= float(occurrence[cur])

    print("The table of successful attack rates is shown below")
    print(rate_table)
    print("the number of occurrences of each class is ", occurrence)

    # Compute the number of adversarial examples that were successfully
    # found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    # misclassification rate
    succ_rate2 = float(np.sum(results2)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    print('Avg. rate of misclassified adv. examples {0:.4f}'.format(
        succ_rate2))
    # report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(
        percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Finally, block & display a grid of all the adversarial examples
    # if viz_enabled:
    #     import matplotlib.pyplot as plt
    #     plt.close(figure)
    #     _ = grid_visual(grid_viz_data)

    return report

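
# Usage sketch (hedged): mnist_tutorial_jsma above expects a local
# "mnist.npz" with the keys X_train / Y_train / X_test / Y_test; one way
# to produce it is from the cleverhans data_mnist helper.
def _make_mnist_npz_demo():
    X_train, Y_train, X_test, Y_test = data_mnist()
    np.savez("mnist.npz", X_train=X_train, Y_train=Y_train,
             X_test=X_test, Y_test=Y_test)
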
def JSMA_FGSM_BIM(train_start=0, train_end=60000, test_start=0,
                  test_end=10000, nb_epochs=6, batch_size=128,
                  learning_rate=0.001, clean_train=True, testing=False,
                  backprop_through_attack=False, nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, perform normal training on clean examples
                        only before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    source_samples = batch_size

    # Use label smoothing
    # Hopefully this doesn't screw up JSMA...
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_par = {'batch_size': batch_size}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        print("#####Starting attacks on clean model#####")

        #################################################################
        # Clean test against JSMA
        jsma_params = {'theta': 1., 'gamma': 0.1,
                       'clip_min': 0., 'clip_max': 1.,
                       'y_target': None}

        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        adv_x = jsma.generate(x, **jsma_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on JSMA adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Clean test accuracy on JSMA adversarial examples: %0.4f'
              % acc)

        ################################################################
        # Clean test against FGSM
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Clean test accuracy on FGSM adversarial examples: %0.4f'
              % acc)

        ################################################################
        # Clean test against BIM
        bim_params = {'eps': 0.3, 'eps_iter': 0.01,
                      'nb_iter': 100, 'clip_min': 0., 'clip_max': 1.}
        bim = BasicIterativeMethod(model, sess=sess)
        adv_x = bim.generate(x, **bim_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on BIM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Clean test accuracy on BIM adversarial examples: %0.4f'
              % acc)

        ################################################################
        # Clean test against EN
        en_params = {'binary_search_steps': 1,
                     # 'y': None,
                     'max_iterations': 100,
                     'learning_rate': 0.1,
                     'batch_size': source_samples,
                     'initial_const': 10}
        en = ElasticNetMethod(model, back='tf', sess=sess)
        adv_x = en.generate(x, **en_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on EN adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Clean test accuracy on EN adversarial examples: %0.4f'
              % acc)

        ################################################################
        # Clean test against DF
        deepfool_params = {'nb_candidate': 10,
                           'overshoot': 0.02,
                           'max_iter': 50,
                           'clip_min': 0.,
                           'clip_max': 1.}
        deepfool = DeepFool(model, sess=sess)
        adv_x = deepfool.generate(x, **deepfool_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on DeepFool adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Clean test accuracy on DF adversarial examples: %0.4f'
              % acc)

        ################################################################
        # Clean test against VAT
        vat_params = {'eps': 2.0,
                      'num_iterations': 1,
                      'xi': 1e-6,
                      'clip_min': 0.,
                      'clip_max': 1.}
        vat = VirtualAdversarialMethod(model, sess=sess)
        adv_x = vat.generate(x, **vat_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on VAT adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Clean test accuracy on VAT adversarial examples: %0.4f\n'
              % acc)
        ################################################################

    print("Repeating the process, using adversarial training\n")

    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)

    #################################################################
    # Adversarial training against JSMA (the attacks below target
    # model_2, the model being adversarially trained, not the clean model)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    jsma = SaliencyMapMethod(model_2, back='tf', sess=sess)
    adv_x = jsma.generate(x, **jsma_params)
    preds_adv_jsma = model_2.get_probs(adv_x)

    ################################################################
    # Adversarial training against FGSM
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_2, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv_fgsm = model_2.get_probs(adv_x)

    ################################################################
    # Adversarial training against BIM
    bim_params = {'eps': 0.3, 'eps_iter': 0.01,
                  'nb_iter': 100, 'clip_min': 0., 'clip_max': 1.}
    bim = BasicIterativeMethod(model_2, sess=sess)
    adv_x = bim.generate(x, **bim_params)
    preds_adv_bim = model_2.get_probs(adv_x)

    ################################################################
    # Adversarial training against EN
    en_params = {'binary_search_steps': 5,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': source_samples,
                 'initial_const': 10}
    en = ElasticNetMethod(model_2, back='tf', sess=sess)
    adv_x = en.generate(x, **en_params)
    preds_adv_en = model_2.get_probs(adv_x)

    ################################################################
    # Adversarial training against DF
    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': 200,
                       'clip_min': 0.,
                       'clip_max': 1.}
    deepfool = DeepFool(model_2, sess=sess)
    adv_x = deepfool.generate(x, **deepfool_params)
    preds_adv_df = model_2.get_probs(adv_x)

    ################################################################
    # Adversarial training against VAT
    vat_params = {'eps': 2.0,
                  'num_iterations': 1,
                  'xi': 1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.}
    vat = VirtualAdversarialMethod(model_2, sess=sess)
    adv_x = vat.generate(x, **vat_params)
    preds_adv_vat = model_2.get_probs(adv_x)
    ################################################################

    print("#####Evaluate trained model#####")

    def evaluate_2():
        # Evaluate the accuracy of the MNIST model on JSMA adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on JSMA adversarial examples: %0.4f' % acc)

        # Evaluate the accuracy of the MNIST model on FGSM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on FGSM adversarial examples: %0.4f' % acc)

        # Evaluate the accuracy of the MNIST model on BIM adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on BIM adversarial examples: %0.4f' % acc)

        # Evaluate the accuracy of the MNIST model on EN adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on EN adversarial examples: %0.4f' % acc)

        # Evaluate the accuracy of the MNIST model on DF adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on DF adversarial examples: %0.4f' % acc)

        # Evaluate the accuracy of the MNIST model on VAT adversarial
        # examples
        acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on VAT adversarial examples: %0.4f\n' % acc)

    preds_2_adv = [preds_adv_jsma,
                   preds_adv_fgsm,
                   preds_adv_bim
                   # , preds_adv_en
                   # , preds_adv_df
                   ]

    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2, args=train_params, rng=rng)

    return report

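
# Hedged refactoring sketch: the six near-identical evaluations inside
# evaluate_2 above can be driven by a list of (name, preds) pairs instead
# of repeated blocks; model_eval is the same cleverhans helper.
def _evaluate_attacks_demo(sess, x, y, attack_preds, X_test, Y_test,
                           batch_size=128):
    eval_par = {'batch_size': batch_size}
    accs = {}
    for name, preds in attack_preds:
        accs[name] = model_eval(sess, x, y, preds, X_test, Y_test,
                                args=eval_par)
        print('Test accuracy on %s adversarial examples: %0.4f' %
              (name, accs[name]))
    return accs
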
def mnist_tutorial_jsma(train_start=0, train_end=5500, test_start=0,
                        test_end=1000, nb_epochs=8, batch_size=100,
                        nb_classes=10, nb_filters=64, learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param nb_filters: number of convolutional filters
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    # sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])

    print("x_train shape: ", X_train.shape)
    print("y_train shape: ", Y_train.shape)

    # do not log
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                verbose=False, rng=rng)

    f_out_clean = open("Clean_jsma_elastic_against5.log", "w")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    f_out_clean.write('Test accuracy on legitimate test examples: ' +
                      str(accuracy) + '\n')

    # Clean test against JSMA
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    adv_x_jsma = jsma.generate(x, **jsma_params)
    preds_adv_jsma = model.get_probs(adv_x_jsma)

    # Evaluate the accuracy of the MNIST model on JSMA adversarial examples
    acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on JSMA adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against FGSM
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

    fgsm = FastGradientMethod(model, sess=sess)
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    preds_adv_fgsm = model.get_probs(adv_x_fgsm)

    # Evaluate the accuracy of the MNIST model on FGSM adversarial examples
    acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on FGSM adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against BIM
    bim_params = {'eps': 0.3, 'eps_iter': 0.01,
                  'nb_iter': 100, 'clip_min': 0., 'clip_max': 1.}
    bim = BasicIterativeMethod(model, sess=sess)
    adv_x_bim = bim.generate(x, **bim_params)
    preds_adv_bim = model.get_probs(adv_x_bim)

    # Evaluate the accuracy of the MNIST model on BIM adversarial examples
    acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on BIM adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against EN
    en_params = {'binary_search_steps': 1,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': batch_size,
                 'initial_const': 10}
    en = ElasticNetMethod(model, back='tf', sess=sess)
    adv_x_en = en.generate(x, **en_params)
    preds_adv_en = model.get_probs(adv_x_en)

    # Evaluate the accuracy of the MNIST model on EN adversarial examples
    acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on EN adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on EN adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against DF
    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': 50,
                       'clip_min': 0.,
                       'clip_max': 1.}
    deepfool = DeepFool(model, sess=sess)
    adv_x_df = deepfool.generate(x, **deepfool_params)
    preds_adv_df = model.get_probs(adv_x_df)

    # Evaluate the accuracy of the MNIST model on DeepFool adversarial
    # examples
    acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on DF adversarial examples: %0.4f' % acc)
    f_out_clean.write('Clean test accuracy on DF adversarial examples: ' +
                      str(acc) + '\n')

    ################################################################
    # Clean test against VAT
    vat_params = {'eps': 2.0,
                  'num_iterations': 1,
                  'xi': 1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.}
    vat = VirtualAdversarialMethod(model, sess=sess)
    adv_x_vat = vat.generate(x, **vat_params)
    preds_adv_vat = model.get_probs(adv_x_vat)

    # Evaluate the accuracy of the MNIST model on VAT adversarial examples
    acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test,
                     args=eval_params)
    print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc)
    f_out_clean.write('Clean test accuracy on VAT adversarial examples: ' +
                      str(acc) + '\n')

    f_out_clean.close()

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map
    # approach
    ###########################################################################
    print('Crafting ' + str(X_train.shape[0]) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    model_2 = make_basic_cnn()
    # preds_2 is needed for constructing the training graph below; it must
    # come from model_2, the freshly defined model (not the clean model)
    preds_2 = model_2(x)

    sess.run(tf.global_variables_initializer())

    # 1. Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model_2, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
    adv_random = jsma.generate(x, **jsma_params)
    preds_adv_random = model_2.get_probs(adv_random)

    # 2. Instantiate FGSM attack
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_2, sess=sess)
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    preds_adv_fgsm = model_2.get_probs(adv_x_fgsm)

    # 3. Instantiate Elastic Net attack
    en_params = {'binary_search_steps': 5,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': batch_size,
                 'initial_const': 10}
    enet = ElasticNetMethod(model_2, sess=sess)
    adv_x_en = enet.generate(x, **en_params)
    preds_adv_elastic_net = model_2.get_probs(adv_x_en)

    # 4. DeepFool
    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': 50,
                       'clip_min': 0.,
                       'clip_max': 1.}
    deepfool = DeepFool(model_2, sess=sess)
    adv_x_df = deepfool.generate(x, **deepfool_params)
    preds_adv_deepfool = model_2.get_probs(adv_x_df)

    # 5. Basic Iterative Method
    bim_params = {'eps': 0.3, 'eps_iter': 0.01,
                  'nb_iter': 100, 'clip_min': 0., 'clip_max': 1.}
    base_iter = BasicIterativeMethod(model_2, sess=sess)
    adv_x_bi = base_iter.generate(x, **bim_params)
    preds_adv_base_iter = model_2.get_probs(adv_x_bi)

    # 6. C&W attack
    cw = CarliniWagnerL2(model_2, back='tf', sess=sess)
    cw_params = {'binary_search_steps': 1,
                 # 'y': None,
                 'max_iterations': 100,
                 'learning_rate': 0.1,
                 'batch_size': batch_size,
                 'initial_const': 10}
    adv_x_cw = cw.generate(x, **cw_params)
    preds_adv_cw = model_2.get_probs(adv_x_cw)

    # 7. Virtual Adversarial Training (VAT)
    vat_params = {'eps': 2.0,
                  'num_iterations': 1,
                  'xi': 1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.}
    vat = VirtualAdversarialMethod(model_2, sess=sess)
    adv_x = vat.generate(x, **vat_params)
    preds_adv_vat = model_2.get_probs(adv_x)

    # ==> generate nb_classes - 1 targeted examples for every training
    # input, regardless of its class. This loop runs the Jacobian-based
    # saliency map approach over the samples we want to perturb into
    # adversarial examples.
    X_train_adv_set = []
    Y_train_adv_set = []
    for index in range(X_train.shape[0]):
        print('--------------------------------------')
        x_val = X_train[index:(index + 1)]
        y_val = Y_train[index]

        # also add the normal sample in
        X_train_adv_set.append(x_val)
        Y_train_adv_set.append(y_val)

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(y_val))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            # print('Generating adv. example for target class %i' % target)
            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(x_val, **jsma_params)

            # append to X_train_adv_set and Y_train_adv_set
            X_train_adv_set.append(adv_x)
            Y_train_adv_set.append(y_val)

            # shape is: (1, 28, 28, 1)
            # print("adv_x shape is: ", adv_x.shape)

            # check for success rate
            # res = int(model_argmax(sess, x, preds, adv_x) == target)

    print('-------------Finished Generating Np Adversarial Data'
          '-------------------------')

    X_train_data = np.concatenate(X_train_adv_set, axis=0)
    Y_train_data = np.stack(Y_train_adv_set, axis=0)
    print("X_train_data shape is: ", X_train_data.shape)
    print("Y_train_data shape is: ", Y_train_data.shape)

    # save the output so there is no need to re-run the generation later
    np.savez("jsma_training_data.npz",
             x_train=X_train_data,
             y_train=Y_train_data)
    # >>> data = np.load('jsma_training_data.npz')
    # >>> data['x_train']

    f_out = open("Adversarial_jsma_elastic_against5.log", "w")

    # evaluate the model against the attacks:
    # fgsm, basic iterative, jsma, elastic net, deepfool, C&W and VAT
    def evaluate_against_all():
        # 1 Clean Data
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                              args=eval_params)
        print('Legitimate accuracy: %0.4f' % accuracy)
        tmp = 'Legitimate accuracy: ' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 2 JSMA
        accuracy = model_eval(sess, x, y, preds_adv_random, X_test,
                              Y_test, args=eval_params)
        print('JSMA accuracy: %0.4f' % accuracy)
        tmp = 'JSMA accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 3 FGSM
        accuracy = model_eval(sess, x, y, preds_adv_fgsm, X_test,
                              Y_test, args=eval_params)
        print('FGSM accuracy: %0.4f' % accuracy)
        tmp = 'FGSM accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 4 Basic Iterative
        accuracy = model_eval(sess, x, y, preds_adv_base_iter, X_test,
                              Y_test, args=eval_params)
        print('Basic Iterative accuracy: %0.4f' % accuracy)
        tmp = 'Basic Iterative accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 5 Elastic Net
        accuracy = model_eval(sess, x, y, preds_adv_elastic_net, X_test,
                              Y_test, args=eval_params)
        print('Elastic Net accuracy: %0.4f' % accuracy)
        tmp = 'Elastic Net accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 6 DeepFool
        accuracy = model_eval(sess, x, y, preds_adv_deepfool, X_test,
                              Y_test, args=eval_params)
        print('DeepFool accuracy: %0.4f' % accuracy)
        tmp = 'DeepFool accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 7 C&W attack
        accuracy = model_eval(sess, x, y, preds_adv_cw, X_test, Y_test,
                              args=eval_params)
        print('C&W accuracy: %0.4f' % accuracy)
        tmp = 'C&W accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        # 8 Virtual Adversarial
        accuracy = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test,
                              args=eval_params)
        print('VAT accuracy: %0.4f' % accuracy)
        tmp = 'VAT accuracy:' + str(accuracy) + "\n"
        f_out.write(tmp)

        f_out.write("*******End of Epoch***********\n\n")
        print("*******End of Epoch***********\n\n")

        # report.adv_train_adv_eval = accuracy

    print("Now Adversarial Training with Elastic Net "
          "+ modified X_train and Y_train")
    # trained_model.out
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': '/home/stephen/PycharmProjects/jsma-runall-mac/',
        'filename': 'trained_model.out'
    }
    model_train(sess, x, y, preds_2, X_train_data, Y_train_data,
                predictions_adv=preds_adv_elastic_net,
                evaluate=evaluate_against_all, verbose=False,
args=train_params, rng=rng) # Close TF session sess.close() return report
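
# A minimal helper sketch (not part of the original script): reload the
# adversarial training set saved by the np.savez call above, so the slow
# JSMA generation loop does not have to be re-run. Assumes the archive
# layout written above (keys 'x_train' and 'y_train').
def load_jsma_training_data(path="jsma_training_data.npz"):
    with np.load(path) as data:
        return data["x_train"], data["y_train"]  # (N, 28, 28, 1), (N, 10)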
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Get MNIST test data # X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, # train_end=train_end, # test_start=test_start, # test_end=test_end) # Get notMNIST data with np.load("notmnist.npz") as data: X_train, Y_train, X_test, Y_test = data['examples_train'], data[ 'labels_train'], data['examples_test'], data['labels_test'] # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "./" model_name = "adv_trained_fgsm_model_mix_data_notmnist" fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([1992, 8, 3]) model = make_basic_cnn(nb_filters=nb_filters) preds = model(x) # Create TF session sess = tf.Session() fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model(adv_x) mixed_x = tf.concat([x, adv_x], 0) mixed_y = tf.concat([y, y], 0) # length = tf.shape(mixed_x)[0] index_shuffle = list(range(batch_size * 2)) rng.shuffle(index_shuffle) mixed_x = tf.gather(mixed_x, index_shuffle) mixed_y = tf.gather(mixed_y, index_shuffle) preds_mixed = model(mixed_x) loss = model_loss(mixed_y, preds_mixed) train_step = tf.train.AdamOptimizer(learning_rate=learning_rate) train_step = train_step.minimize(loss) tf.global_variables_initializer().run(session=sess) for epoch in xrange(nb_epochs): print('Training for epoch %i/%i' % (epoch, nb_epochs - 1)) # Compute number of batches nb_batches = int(math.ceil(float(len(X_train)) / batch_size)) assert nb_batches * batch_size >= len(X_train) # Indices to shuffle training set index_shuf = list(range(len(X_train))) rng.shuffle(index_shuf) prev = time.time() for batch in range(nb_batches): # re-instantiate FGSM object with new trained model # fgsm = FastGradientMethod(model, sess=sess) # adv_x = fgsm.generate(x, **fgsm_params) print('--------------------------------------') # create an array for storing adv examples print('batch: %i/%i' % (batch + 1, nb_batches)) # adv_examples = np.empty([1,28,28,1]) start, end = batch_indices(batch, len(X_train), batch_size) X_this_batch = X_train[index_shuf[start:end]] Y_this_batch = 
Y_train[index_shuf[start:end]] # adv_examples = sess.run(adv_x, feed_dict={x:X_this_batch}) # for target labels #adv_targets = np.empty([1,10]) # corresponding clean/correct label # adv_clean_labels = np.empty([1,10]) # correspongding clean data # adv_clean_examples = np.empty([1,28,28,1]) # adv_examples = np.reshape(adv_examples, (batch_size*(nb_classes-1),28,28,1)) # adv_clean_examples = np.reshape(adv_clean_examples, (batch_size*(nb_classes-1),28,28,1)) # mixed_X = np.concatenate((X_this_batch, adv_examples), axis=0) # mixed_Y = np.concatenate((Y_this_batch, Y_this_batch), axis=0) # print('mixed data have shape', np.shape(mixed_X)) # print('mixed labels have shape', np.shape(mixed_Y)) #shuffle the mixed data before training # index_of_batch = list(range(np.shape(mixed_Y)[0])) # rng.shuffle(index_of_batch) # mixed_X = mixed_X[index_of_batch] # mixed_Y = mixed_Y[index_of_batch] feed_dict = {x: X_this_batch, y: Y_this_batch} train_step.run(feed_dict=feed_dict, session=sess) cur = time.time() _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) + " seconds") eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) acc2 = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on adversarial examples: %0.4f' % acc2) print('Training finished.') # reload fgsm successfully attacking adv test data # with np.load("adversarial_fgsm.npz") as data: # adv_X_test, adv_clean_Y_test, adv_clean_X_test = data['adv_examples'], data['adv_clean_labels'], data['adv_clean_examples'] # print('FGSM adversarial data are successfully reloaded.') # preds_adv_test = model(x1) # # Evaluate the accuracy of the MNIST model on adversarial examples # # eval_par = {'batch_size': 10} # acc = model_eval(sess, x1, y, preds_adv_test, adv_X_test, adv_clean_Y_test, args=eval_par) # print('Test accuracy on pre-generated adversarial examples of fgsm: %0.4f\n' % acc) # # reload fgsm successfully attacking adv test data # with np.load("adversarial_mnist_test_from_1500.npz") as data: # adv_X_test, adv_clean_Y_test, adv_clean_X_test = data['adv_examples'], data['adv_clean_labels'], data['adv_clean_examples'] # print('JSMA adversarial data are successfully reloaded.') # # Evaluate the accuracy of the MNIST model on adversarial examples # acc2 = model_eval(sess, x1, y, preds_adv_test, adv_X_test, adv_clean_Y_test, args=eval_par) # print('Test accuracy on pre-generated adversarial examples of jsma: %0.4f\n' % acc2) save_path = os.path.join(model_path, model_name) saver = tf.train.Saver() saver.save(sess, save_path) _logger.info("Completed model training and saved at: " + str(save_path)) # Close TF session sess.close() return
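
# A numpy illustration (a sketch, not part of the original script) of the
# in-graph clean/adversarial batch mixing used above: adversarial examples
# keep their clean labels, and the doubled batch is shuffled so the two
# halves are interleaved. Note the graph version precomputes indices for a
# fixed batch of size batch_size * 2, so it assumes every batch is full.
def demo_mixed_batch_shuffle(clean_x, adv_x, labels, rng):
    mixed_x = np.concatenate([clean_x, adv_x], axis=0)
    mixed_y = np.concatenate([labels, labels], axis=0)  # labels are reused
    perm = rng.permutation(len(mixed_x))                # shuffle within batch
    return mixed_x[perm], mixed_y[perm]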
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() model_path = "./" model_name = "clean_trained__model_notmnist" # Set TF random seed to improve reproducibility tf.set_random_seed(7895) # Set logging level to see debug information set_log_level(logging.DEBUG) # Get MNIST test data # X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, # train_end=train_end, # test_start=test_start, # test_end=test_end) # Get notMNIST data with np.load("notmnist.npz") as data: X_train, Y_train, X_test, Y_test = data['examples_train'], data[ 'labels_train'], data['examples_test'], data['labels_test'] # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} # Define TF model graph model = make_basic_cnn() # Create TF session sess = tf.Session() if tf_model_load(sess, file_path=os.path.join(model_path, model_name)): print(model_name, " reloaded.") # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc return report
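
# Sketch of the guard this restore pattern relies on (an assumption drawn
# from the checkpoint check used elsewhere in this file): a TF1 Saver
# checkpoint named <name> leaves a <name>.meta file next to the data files,
# so its presence is a cheap proxy for "safe to restore".
def checkpoint_exists(model_path, model_name):
    return os.path.exists(os.path.join(model_path, model_name) + ".meta")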
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=False, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=1, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility #tf.set_random_seed(1234) # Create TF session config = tf.ConfigProto() config.gpu_options.allow_growth = True #config.log_device_placement=True sess = tf.Session(config=config) print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: model_train(sess, x, y, preds, X_train, Y_train, args=train_params, save=os.path.exists("models"), rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) fgsm = FastGradientMethod(model, sess=sess) result = np.zeros((5,len(X_test))) strength = np.zeros((3,len(X_test))) adv_ys = None yname = "y" cw_params = {'binary_search_steps': 1, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples, 
'initial_const': 10}

    fgsm_eps = [0.1, 0.3, 0.5]
    # collect the crafted adversarial images per epsilon so the savemat call
    # below has defined inputs (the original referenced undefined
    # adv_01/adv_03/adv_05)
    adv_by_eps = {}
    for j in fgsm_eps:
        fgsm_params = {'eps': j, 'clip_min': 0., 'clip_max': 1.}
        adv_images = []
        for i in range(len(X_test)):
            feed_dict = {x: X_test[i].reshape((1, 28, 28, 1))}
            Classes0 = preds.eval(feed_dict=feed_dict, session=sess)
            Class0 = np.argmax(Classes0)
            result[0, i] = Class0
            adv_inputs = X_test[i].reshape((1, 28, 28, 1))

            # adv = cw.generate_np(adv_inputs, **cw_params)
            adv = fgsm.generate_np(adv_inputs, **fgsm_params)
            # pdb.set_trace()  # leftover debugging hook, disabled
            adv_images.append(adv)
            feed_dict = {x: adv}
            Classes1 = preds.eval(feed_dict=feed_dict, session=sess)
            Class1 = np.argmax(Classes1)
            result[1, i] = Class1

            # Compute the distortion introduced by the FGSM step
            percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                               axis=(1, 2, 3))**.5)
            strength[0, i] = percent_perturbed

            # Chain a C&W attack on the FGSM output
            adv2 = cw.generate_np(adv, **cw_params)
            feed_dict = {x: adv2}
            Classes2 = preds.eval(feed_dict=feed_dict, session=sess)
            Class2 = np.argmax(Classes2)
            result[2, i] = Class2

            # Compute the distortion introduced by the C&W step
            percent_perturbed2 = np.mean(np.sum((adv2 - adv)**2,
                                                axis=(1, 2, 3))**.5)
            strength[1, i] = percent_perturbed2

            # Median-filter the FGSM output and classify it again
            adv_f = sig.medfilt(adv, (1, 3, 3, 1))
            feed_dict = {x: adv_f}
            Classes1 = preds.eval(feed_dict=feed_dict, session=sess)
            Class1 = np.argmax(Classes1)
            result[3, i] = Class1

            # Chain a C&W attack on the filtered image
            adv2_f = cw.generate_np(adv_f, **cw_params)
            feed_dict = {x: adv2_f}
            Classes2 = preds.eval(feed_dict=feed_dict, session=sess)
            Class2 = np.argmax(Classes2)
            result[4, i] = Class2

            percent_perturbed2 = np.mean(np.sum((adv2_f - adv_f)**2,
                                                axis=(1, 2, 3))**.5)
            strength[2, i] = percent_perturbed2

            if i % 100 == 0:
                print(i)
        adv_by_eps[j] = np.concatenate(adv_images, axis=0)

    # Close TF session
    sess.close()

    # note: result/strength hold the values from the last epsilon only,
    # since they are overwritten on every pass of the outer loop
    sio.savemat('fgsm_mnist.mat', {'adv_01': adv_by_eps[0.1],
                                   'adv_03': adv_by_eps[0.3],
                                   'adv_05': adv_by_eps[0.5],
                                   'strength': strength})
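
# The feature-squeezing step used above, isolated as a sketch: a
# (1, 3, 3, 1) kernel applies a 3x3 spatial median filter to each image
# while leaving the batch and channel dimensions untouched, e.g.
# median_filter_images(np.random.rand(1, 28, 28, 1)).shape == (1, 28, 28, 1).
def median_filter_images(adv):
    return sig.medfilt(adv, (1, 3, 3, 1))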
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=False, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=1,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # `filename` was undefined in the original; point it at the .mat archive
    # of pre-generated attacks before running (placeholder path)
    filename = "attacks.mat"
    mat_contents = read_mat_file(filename)
    label = mat_contents["label"]
    data = mat_contents["data"]
    # data[data > 1] = 1
    # data[data < 0] = 0

    # seven attack variants are stored round-robin along axis 0
    adv_data = data[10000:80000, :, :, :]
    cw = adv_data[0::7, :, :, :]
    fgsm01 = adv_data[1::7, :, :, :]
    fgsm03 = adv_data[2::7, :, :, :]
    fgsm05 = adv_data[3::7, :, :, :]
    gaussian01 = adv_data[4::7, :, :, :]
    gaussian03 = adv_data[5::7, :, :, :]
    gaussian05 = adv_data[6::7, :, :, :]

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    # tf.set_random_seed(1234)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.log_device_placement = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])

    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    # pdb.set_trace()  # leftover debugging hook, disabled
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy
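
# Sketch of the round-robin slicing above (an illustration, not part of the
# original script): the archive is assumed to store the seven attack
# variants interleaved along axis 0, so adv_data[k::7] recovers the k-th
# variant (cw, fgsm01/03/05, gaussian01/03/05 in the order used above).
def split_attack_variants(adv_data, nb_variants=7):
    return [adv_data[k::nb_variants] for k in range(nb_variants)]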
def model_run(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=False, testing=False, backprop_through_attack=False, nb_filters=64): # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(7076) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() rng = np.random.RandomState([2017, 11, 1]) # Get MNIST clean data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Get adversarial data with np.load("adversarial_mnist_test_from_1500.npz") as data: adv_X_test, adv_Y_test, adv_clean_Y_test, adv_clean_X_test = data[ 'adv_examples'], data['adv_targets'], data[ 'adv_clean_labels'], data['adv_clean_examples'] with np.load("adversarial_mnist_train_from_6000.npz") as data: adv_X_train, adv_Y_train, adv_clean_Y_train, adv_clean_X_train = data[ 'adv_examples'], data['adv_targets'], data[ 'adv_clean_labels'], data['adv_clean_examples'] print('Adversarial data are successfully reloaded.') def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds, adv_X_test, adv_clean_Y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # the second option: we concatenate clean training data and adversarial training data, and do classic/traditional training model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } training_data = np.concatenate((X_train, adv_X_train), axis=0) training_target = np.concatenate((Y_train, adv_clean_Y_train), axis=0) print("the shape of training data for adversarial training: ", np.shape(training_data)) # test_data = np.concatenate((X_test, adv_X_test), axis=0) # test_target = np.concatenate((Y_test, adv_clean_Y_test), axis=0) # Perform and evaluate adversarial training model_train(sess, x, y, preds, training_data, training_target, evaluate=evaluate_2, args=train_params, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, training_data, training_target, args=eval_params) report.train_adv_train_adv_eval = accuracy with np.load("adv_test_fgsm_data.npz") as data: adv_X_test, adv_clean_Y_test, adv_clean_X_test = data[ 'adv_examples'], data['adv_clean_labels'], data[ 'adv_clean_examples'] print('Adversarial data are successfully reloaded.') preds_adv = model(x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, adv_X_test, adv_clean_Y_test, args=eval_par) print('Test accuracy on adversarial examples of fgsm: %0.4f\n' % acc) report.clean_train_adv_eval = acc return report
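
# Sketch of the "second option" above (an illustration, not part of the
# original script): classic training on clean data concatenated with
# pre-generated adversarial data. Note the adversarial images are paired
# with the clean labels of the examples they were crafted from
# (adv_clean_labels), not with the attack's target labels.
def build_mixed_training_set(X_clean, Y_clean, X_adv, Y_adv_clean_labels):
    training_data = np.concatenate((X_clean, X_adv), axis=0)
    training_target = np.concatenate((Y_clean, Y_adv_clean_labels), axis=0)
    return training_data, training_target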
def main(argv=None): """ CIFAR10 CleverHans tutorial :return: """ # CIFAR10-specific dimensions img_rows = 32 img_cols = 32 channels = 3 nb_classes = 10 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) set_log_level(logging.WARNING) # Get CIFAR10 test data X_train, Y_train, X_test, Y_test = data_cifar10() assert Y_train.shape[1] == 10. label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, 10)) phase = tf.placeholder(tf.bool, name="phase") model_path = FLAGS.model_path targeted = True if FLAGS.targeted else False binary = True if FLAGS.binary else False scale = True if FLAGS.scale else False learning_rate = FLAGS.learning_rate nb_filters = FLAGS.nb_filters batch_size = FLAGS.batch_size nb_samples = FLAGS.nb_samples nb_epochs = FLAGS.nb_epochs delay = FLAGS.delay eps = FLAGS.eps adv = FLAGS.adv attack = FLAGS.attack attack_iterations = FLAGS.attack_iterations save = False train_from_scratch = False if model_path is not None: if os.path.exists(model_path): # check for existing model in immediate subfolder if any(f.endswith('.meta') for f in os.listdir(model_path)): binary, scale, nb_filters, batch_size, learning_rate, nb_epochs, adv = parse_model_settings( model_path) train_from_scratch = False else: model_path = build_model_save_path( model_path, binary, batch_size, nb_filters, learning_rate, nb_epochs, adv, delay, scale) print(model_path) save = True train_from_scratch = True else: train_from_scratch = True # train from scratch, but don't save since no path given if binary: if scale: from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn model = make_scaled_binary_cnn(phase, 'bin_', input_shape=( None, img_rows, img_cols, channels), nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn model = make_basic_binary_cnn(phase, 'bin_', input_shape=( None, img_rows, img_cols, channels), nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_basic_cnn model = make_basic_cnn(phase, 'fp_', input_shape=( None, img_rows, img_cols, channels), nb_filters=nb_filters) preds = model(x, reuse=False) print("Defined TensorFlow model graph.") rng = np.random.RandomState([2017, 8, 30]) def evaluate(): # Evaluate the accuracy of the CIFAR10 model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_test, Y_test, phase=phase, args=eval_params) assert X_test.shape[0] == 10000, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an CIFAR10 model train_params = { 'binary': binary, 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'loss_name': 'train loss', 'filename': 'model', 'reuse_global_step': False, 'train_scope': 'train', 'is_training': True } if adv: from cleverhans.attacks import FastGradientMethod fgsm = 
FastGradientMethod(model, back='tf', sess=sess) fgsm_params = {'eps': eps, 'clip_min': 0., 'clip_max': 1.} adv_x_train = fgsm.generate(x, phase, **fgsm_params) preds_adv = model.get_probs(adv_x_train) if train_from_scratch: if save: train_params.update({'log_dir': model_path}) if adv and delay > 0: train_params.update({'nb_epochs': delay}) # do clean training for 'nb_epochs' or 'delay' epochs model_train(sess, x, y, preds, X_train, Y_train, phase=phase, evaluate=evaluate, args=train_params, save=save, rng=rng) # optionally do additional adversarial training if adv: print("Adversarial training for %d epochs" % (nb_epochs - delay)) train_params.update({'nb_epochs': nb_epochs - delay}) train_params.update({'reuse_global_step': True}) model_train(sess, x, y, preds, X_train, Y_train, phase=phase, predictions_adv=preds_adv, evaluate=evaluate, args=train_params, save=save, rng=rng) else: tf_model_load(sess, model_path) print('Restored model from %s' % model_path) evaluate() # Evaluate the accuracy of the CIFAR10 model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase, feed={phase: False}, args=eval_params) print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) ########################################################################### # Build dataset ########################################################################### if targeted: from cleverhans.utils import build_targeted_dataset adv_inputs, true_labels, adv_ys = build_targeted_dataset( X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows, img_cols, channels) else: adv_inputs = X_test[:nb_samples] ########################################################################### # Craft adversarial examples using generic approach ########################################################################### if targeted: att_batch_size = np.clip( nb_samples * (nb_classes - 1), a_max=MAX_BATCH_SIZE, a_min=1) nb_adv_per_sample = nb_classes - 1 yname = "y_target" else: att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE) nb_adv_per_sample = 1 adv_ys = None yname = "y" print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) + ' adversarial examples') print("This could take some time ...") if attack == ATTACK_CARLINI_WAGNER_L2: from cleverhans.attacks import CarliniWagnerL2 attacker = CarliniWagnerL2(model, back='tf', sess=sess) attack_params = {'binary_search_steps': 1, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': att_batch_size, 'initial_const': 10, } elif attack == ATTACK_JSMA: from cleverhans.attacks import SaliencyMapMethod attacker = SaliencyMapMethod(model, back='tf', sess=sess) attack_params = {'theta': 1., 'gamma': 0.1} elif attack == ATTACK_FGSM: from cleverhans.attacks import FastGradientMethod attacker = FastGradientMethod(model, back='tf', sess=sess) attack_params = {'eps': eps} elif attack == ATTACK_MADRYETAL: from cleverhans.attacks import MadryEtAl attacker = MadryEtAl(model, back='tf', sess=sess) attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter} else: print("Attack undefined") sys.exit(1) attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.}) X_test_adv = attacker.generate_np(adv_inputs, phase, **attack_params) ''' adv_x = attacker.generate(x, phase, **attack_params) # Craft adversarial examples using Fast Gradient Sign Method (FGSM) eval_params = {'batch_size': att_batch_size} X_test_adv, = batch_eval(sess, [x], [adv_x], [adv_inputs], feed={ phase: 
False}, args=eval_params)
    '''

    if targeted:
        assert X_test_adv.shape[0] == nb_samples * \
            (nb_classes - 1), X_test_adv.shape
        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, X_test_adv, true_labels,
                                  phase=phase, args=eval_params)
    else:
        assert X_test_adv.shape[0] == nb_samples, X_test_adv.shape
        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        print("Evaluating un-targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, X_test_adv, Y_test,
                                  phase=phase, args=eval_params)

    # Compute the number of adversarial examples that were successfully found
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((X_test_adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Friendly output for pasting into spreadsheet
    print('{0:.4f},'.format(accuracy))
    print('{0:.4f},'.format(adv_accuracy))
    print('{0:.4f},'.format(percent_perturbed))

    sess.close()

    '''
    print("Repeating the process, using adversarial training")

    def evaluate_2():
        # Evaluate the accuracy of the adversarially trained CIFAR10 model on
        # legitimate test examples
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                              phase=phase, args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained CIFAR10 model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                                  phase=phase, args=eval_params)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    train_params.update({'reuse_global_step': True})
    model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                predictions_adv=preds_adv, evaluate=evaluate_2,
                args=train_params)
    '''
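
# The distortion metric printed above, isolated as a sketch: per-example
# L2 norm of the perturbation, averaged over the batch (axes 1-3 are the
# image dimensions).
def mean_l2_perturbation(adv, clean):
    return np.mean(np.sum((adv - clean) ** 2, axis=(1, 2, 3)) ** .5)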
def mnist_attack(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_filters=64, nb_samples=10, learning_rate=0.001, eps=0.3, attack=0, attack_iterations=100, model_path=None, targeted=False, binary=False, scale=False, rand=False, debug=None, test=False, data_dir=None, delay=0, adv=0, nb_iter=40): """ MNIST tutorial for generic attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param nb_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 nb_classes = 10 # Set TF random seed to improve reproducibility tf.set_random_seed(1237) # Create TF session sess = tf.Session() print("Created TensorFlow session.") if debug: set_log_level(logging.DEBUG) else: set_log_level(logging.WARNING) # for running on sharcnet # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(datadir=data_dir, train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) phase = tf.placeholder(tf.bool, name='phase') # for attempting to break unscaled network. 
logits_scalar = tf.placeholder_with_default(INIT_T, shape=(), name="logits_temperature") save = False train_from_scratch = False if model_path is not None: if os.path.exists(model_path): # check for existing model in immediate subfolder if any(f.endswith('.meta') for f in os.listdir(model_path)): binary, scale, nb_filters, batch_size, learning_rate, nb_epochs, adv = parse_model_settings( model_path) train_from_scratch = False else: model_path = build_model_save_path(model_path, binary, batch_size, nb_filters, learning_rate, nb_epochs, adv, delay, scale) print(model_path) save = True train_from_scratch = True else: train_from_scratch = True # train from scratch, but don't save since no path given # Define TF model graph if binary: print('binary=True') if scale: print('scale=True') if rand: print('rand=True') from cleverhans_tutorials.tutorial_models import make_scaled_binary_rand_cnn model = make_scaled_binary_rand_cnn( phase, logits_scalar, 'binsc_', input_shape=(None, img_rows, img_cols, channels), nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn model = make_scaled_binary_cnn(phase, logits_scalar, 'binsc_', input_shape=(None, img_rows, img_cols, channels), nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn model = make_basic_binary_cnn(phase, logits_scalar, 'bin_', nb_filters=nb_filters) else: if rand: print('rand=True') from cleverhans_tutorials.tutorial_models import make_scaled_rand_cnn model = make_scaled_rand_cnn(phase, logits_scalar, 'fp_rand', nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_basic_cnn model = make_basic_cnn(phase, logits_scalar, 'fp_', nb_filters=nb_filters) preds = model(x, reuse=False) # * logits_scalar print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### rng = np.random.RandomState([2017, 8, 30]) # Train an MNIST model train_params = { 'binary': binary, 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'loss_name': 'train loss', 'filename': 'model', 'reuse_global_step': False, 'train_scope': 'train', 'is_training': True } if adv != 0: if adv == ADVERSARIAL_TRAINING_MADRYETAL: from cleverhans.attacks import MadryEtAl train_attack_params = { 'eps': MAX_EPS, 'eps_iter': 0.01, 'nb_iter': nb_iter } train_attacker = MadryEtAl(model, sess=sess) elif adv == ADVERSARIAL_TRAINING_FGSM: from cleverhans.attacks import FastGradientMethod stddev = int(np.ceil((MAX_EPS * 255) // 2)) train_attack_params = { 'eps': tf.abs( tf.truncated_normal(shape=(batch_size, 1, 1, 1), mean=0, stddev=stddev)) } train_attacker = FastGradientMethod(model, back='tf', sess=sess) # create the adversarial trainer train_attack_params.update({'clip_min': 0., 'clip_max': 1.}) adv_x_train = train_attacker.generate(x, phase, **train_attack_params) preds_adv_train = model.get_probs(adv_x_train) eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.} adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params) preds_adv_eval = model.get_probs(adv_x_eval) # * logits_scalar def evaluate(): # Evaluate the accuracy of the MNIST model on clean test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - 
test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) if adv != 0: # Accuracy of the adversarially trained model on adversarial # examples acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test, phase=phase, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % acc) acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test, phase=phase, args=eval_params, feed={logits_scalar: ATTACK_T}) print('Test accuracy on adversarial examples (scaled): %0.4f' % acc) if train_from_scratch: if save: train_params.update({'log_dir': model_path}) if adv and delay > 0: train_params.update({'nb_epochs': delay}) # do clean training for 'nb_epochs' or 'delay' epochs if test: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, evaluate=evaluate, args=train_params, save=save, rng=rng) else: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, args=train_params, save=save, rng=rng) # optionally do additional adversarial training if adv: print("Adversarial training for %d epochs" % (nb_epochs - delay)) train_params.update({'nb_epochs': nb_epochs - delay}) train_params.update({'reuse_global_step': True}) if test: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, predictions_adv=preds_adv_train, evaluate=evaluate, args=train_params, save=save, rng=rng) else: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, predictions_adv=preds_adv_train, args=train_params, save=save, rng=rng) else: tf_model_load(sess, model_path) print('Restored model from %s' % model_path) evaluate() # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase, feed={phase: False}, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Build dataset ########################################################################### if viz_enabled: assert nb_samples == nb_classes idxs = [ np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] viz_rows = nb_classes if targeted else 2 # Initialize our array for grid visualization grid_shape = (nb_classes, viz_rows, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') if targeted: from cleverhans.utils import build_targeted_dataset if viz_enabled: from cleverhans.utils import grid_visual adv_inputs, true_labels, adv_ys = build_targeted_dataset( X_test, Y_test, idxs, nb_classes, img_rows, img_cols, channels) else: adv_inputs, true_labels, adv_ys = build_targeted_dataset( X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows, img_cols, channels) else: if viz_enabled: from cleverhans.utils import pair_visual adv_inputs = X_test[idxs] else: adv_inputs = X_test[:nb_samples] ########################################################################### # Craft adversarial examples using generic approach ########################################################################### if targeted: att_batch_size = np.clip(nb_samples * (nb_classes - 1), a_max=MAX_BATCH_SIZE, a_min=1) nb_adv_per_sample = nb_classes - 1 yname = "y_target" else: att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE) nb_adv_per_sample = 1 adv_ys = None yname = "y" print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) + ' adversarial examples') 
print("This could take some time ...") if attack == ATTACK_CARLINI_WAGNER_L2: print('Attack: CarliniWagnerL2') from cleverhans.attacks import CarliniWagnerL2 attacker = CarliniWagnerL2(model, back='tf', sess=sess) attack_params = { 'binary_search_steps': 1, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': att_batch_size, 'initial_const': 10, } elif attack == ATTACK_JSMA: print('Attack: SaliencyMapMethod') from cleverhans.attacks import SaliencyMapMethod attacker = SaliencyMapMethod(model, back='tf', sess=sess) attack_params = {'theta': 1., 'gamma': 0.1} elif attack == ATTACK_FGSM: print('Attack: FastGradientMethod') from cleverhans.attacks import FastGradientMethod attacker = FastGradientMethod(model, back='tf', sess=sess) attack_params = {'eps': eps} elif attack == ATTACK_MADRYETAL: print('Attack: MadryEtAl') from cleverhans.attacks import MadryEtAl attacker = MadryEtAl(model, back='tf', sess=sess) attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter} elif attack == ATTACK_BASICITER: print('Attack: BasicIterativeMethod') from cleverhans.attacks import BasicIterativeMethod attacker = BasicIterativeMethod(model, back='tf', sess=sess) attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter} else: print("Attack undefined") sys.exit(1) attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.}) adv_np = attacker.generate_np(adv_inputs, phase, **attack_params) ''' name = 'm_fgsm_eps%s_n%s.npy' % (eps, nb_samples) fpath = os.path.join( '/scratch/gallowaa/mnist/adversarial_examples/cleverhans/', name) np.savez(fpath, x=adv_np, y=Y_test[:nb_samples]) ''' ''' adv_x = attacker.generate(x, phase, **attack_params) adv_np, = batch_eval(sess, [x], [adv_x], [adv_inputs], feed={ phase: False}, args=eval_params) ''' eval_params = {'batch_size': att_batch_size} if targeted: print("Evaluating targeted results") adv_accuracy = model_eval(sess, x, y, preds, adv_np, true_labels, phase=phase, args=eval_params) else: print("Evaluating untargeted results") if viz_enabled: adv_accuracy = model_eval(sess, x, y, preds, adv_np, Y_test[idxs], phase=phase, args=eval_params) else: adv_accuracy = model_eval(sess, x, y, preds, adv_np, Y_test[:nb_samples], phase=phase, args=eval_params) if viz_enabled: n = nb_classes - 1 for i in range(nb_classes): if targeted: for j in range(nb_classes): if i != j: if j != 0 and i != n: grid_viz_data[i, j] = adv_np[j * n + i] if j == 0 and i > 0 or i == n and j > 0: grid_viz_data[i, j] = adv_np[j * n + i - 1] else: grid_viz_data[i, j] = adv_inputs[j * n] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv_np[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv_np - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Compute number of modified features (L_0 norm) nb_changed = np.where(adv_np != adv_inputs)[0].shape[0] percent_perturb = np.mean(float(nb_changed) / adv_np.reshape(-1).shape[0]) # Compute the average distortion introduced by the algorithm print('Avg. 
rate of perturbed features {0:.4f}'.format(percent_perturb)) # Friendly output for pasting into spreadsheet print('{0:.4f}'.format(accuracy)) print('{0:.4f}'.format(adv_accuracy)) print('{0:.4f}'.format(percent_perturbed)) print('{0:.4f}'.format(percent_perturb)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
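
# Companion sketch for the L_0-style metric above (an illustration, not
# part of the original script): the fraction of input features the attack
# modified at all, using the same exact inequality test as the code above
# (a tolerance threshold is a common refinement).
def perturbed_feature_rate(adv, clean):
    nb_changed = np.where(adv != clean)[0].shape[0]
    return float(nb_changed) / adv.reshape(-1).shape[0]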
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64):
    nb_classes = 10
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4264)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "./"
    model_name = "clean_trained_mnist_model"

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': model_path,
        'filename': model_name
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([443, 224, 39])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters, nb_classes=nb_classes)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    save=True, args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

    # (assumes clean_train=True so that `model` is defined below)
    mnist = input_data.read_data_sets("../MNIST_data/", one_hot=True)

    # Variables
    xx = tf.placeholder(tf.float32, [None, 784])
    y_ = tf.placeholder(tf.float32, [None, 10])
    p_x = tf.reshape(xx, [-1, 28, 28, 1])
    preds_x = model.get_probs(p_x)
    decoded = mk_nn_model(xx, y_)
    p_decoded = tf.reshape(decoded, [-1, 28, 28, 1])

    # The autoencoder is trained to reconstruct the input while pushing the
    # classifier's prediction on the reconstruction away from its prediction
    # on the original
    mse = tf.losses.mean_squared_error(xx, decoded)
    pred_decoded = model.get_probs(p_decoded)
    pred_loss = -abs(tf.losses.absolute_difference(preds_x, pred_decoded))
    loss = tf.reduce_mean(mse + pred_loss)
    train_step = tf.train.AdagradOptimizer(0.1).minimize(loss)
    # initialize_all_variables() is deprecated; use the current API.
    # NB: this also re-initializes the CNN trained above; scope the
    # initializer to the autoencoder's variables if that is not intended.
    init = tf.global_variables_initializer()

    # the session is closed automatically when this block exits, so all
    # session-dependent evaluation happens inside it
    with sess as sess:
        print('Training...')
        sess.run(init)
        for i in range(10001):
            batch_xs, batch_ys = mnist.train.next_batch(128)
            train_step.run({xx: batch_xs, y_: batch_ys})
            if i % 1000 == 0:
                train_loss = loss.eval({xx: batch_xs, y_: batch_ys})
                print(' step, loss = %6d: %6.3f' % (i, train_loss))

        # generate decoded image with test data
        test_fd = {xx: mnist.test.images, y_: mnist.test.labels}
        decoded_imgs = decoded.eval(test_fd)
        print('loss (test) = ', loss.eval(test_fd))

        adv_x = tf.reshape(decoded_imgs, [-1, 28, 28, 1])
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

    x_test = mnist.test.images
    n = 10  # how many digits we will display
    plt.figure(figsize=(20, 4))
    for i in range(n):
        # display original
        ax = plt.subplot(2, n, i + 1)
        plt.imshow(x_test[i].reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

        # display reconstruction
        ax = plt.subplot(2, n, i + 1 + n)
        plt.imshow(decoded_imgs[i].reshape(28, 28))
        plt.gray()
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    # plt.show()
    plt.savefig('mnist_ae2.png')

    return report
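
# Standalone sketch of the original/reconstruction strip rendered above
# (an illustration, not part of the original script): inputs are flattened
# 784-pixel images; savefig is used rather than show so the script also
# works on headless machines.
def save_comparison_strip(originals, reconstructions, n=10,
                          fname='mnist_ae2_demo.png'):
    plt.figure(figsize=(20, 4))
    for i in range(n):
        for row, imgs in enumerate((originals, reconstructions)):
            ax = plt.subplot(2, n, i + 1 + row * n)
            plt.imshow(imgs[i].reshape(28, 28))
            plt.gray()
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
    plt.savefig(fname)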
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) print(adv_x) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} # Define accuracy symbolically if LooseVersion(tf.__version__) >= LooseVersion('1.0.0'): correct_preds = tf.not_equal(tf.argmax(y, axis=-1), tf.argmax(preds_adv, axis=-1)) else: correct_preds = tf.not_equal(tf.argmax(y, axis=tf.rank(y) - 1), tf.argmax(preds_adv, axis=tf.rank(preds_adv) - 1)) # print("the shape of correct_preds is ", correct_preds.get_shape()) # correct_preds is a boolean Tensor with shape (size,) success_adv_x = tf.boolean_mask(adv_x, correct_preds) success_clean_x = tf.boolean_mask(x, correct_preds) 
success_clean_y = tf.boolean_mask(y, correct_preds) fgsm_adv_x, fgsm_clean_x, fgsm_clean_y = sess.run([success_adv_x, success_clean_x, success_clean_y], feed_dict={x:X_test,y:Y_test}) np.savez('adversarial_fgsm',adv_examples=fgsm_adv_x, adv_clean_labels=fgsm_clean_y, adv_clean_examples=fgsm_clean_x) print("the shape of adversarial examples we save is ", np.shape(fgsm_adv_x)) print("the shape of clean targets we save is ", np.shape(fgsm_clean_y)) acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples fgsm: %0.4f\n' % acc) report.clean_train_adv_eval = acc adv_x_test_for_save = sess.run(adv_x, {x: X_test}) np.savez("adv_test_fgsm_data.npz", adv_examples=adv_x_test_for_save, adv_clean_labels=Y_test, adv_clean_examples=X_test) # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) fgsm2 = FastGradientMethod(model_2, sess=sess) adv_x_2 = fgsm2.generate(x, **fgsm_params) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x_2 = tf.stop_gradient(adv_x_2) preds_2_adv = model_2(adv_x_2) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training print("pred_adv", preds_2_adv.get_shape()) model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### source_samples = 10000 nb_classes = 10 print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = 
SaliencyMapMethod(model_2, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # create an array for storing adv examples adv_examples = np.empty([1,28,28,1]) # for target labels adv_targets = np.empty([1,10]) # corresponding clean/correct label adv_clean_labels = np.empty([1,10]) # correspongding clean data adv_clean_examples = np.empty([1,28,28,1]) # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind+1)] # generate from testing data # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) # generate from testing data target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal # grid_viz_data[current_class, current_class, :, :, :] = np.reshape( # sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) #create fake target one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # print('adv_x\'shape is ', np.shape(adv_x)) # (1,28,28,1) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # if succeeds if res == 1: # append new adv_x to adv_examples array # append sample here, so that the number of times sample is appended mmatches number of adv_ex. adv_examples = np.append(adv_examples, adv_x, axis=0) adv_targets = np.append(adv_targets, one_hot_target, axis=0) adv_clean_labels = np.append(adv_clean_labels, np.expand_dims(Y_test[sample_ind],axis=0), axis=0) # generate from testing data adv_clean_examples = np.append(adv_clean_examples, sample, axis=0) adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') adv_examples = adv_examples[1:,:,:,:] adv_targets = adv_targets[1:,:] adv_clean_labels = adv_clean_labels[1:,:] adv_clean_examples = adv_clean_examples[1:,:,:,:] np.savez('adversarial_jsma_actual_full',adv_examples=adv_examples, adv_targets=adv_targets, adv_clean_labels=adv_clean_labels,adv_clean_examples=adv_clean_examples) print(np.shape(adv_targets)[0], "adversarial examples have been saved.") print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_test_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. 
rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) return report
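# A minimal reload sketch for the archive written by the JSMA loop above.
# The filename and array keys are taken from the np.savez call; how the
# reloaded examples are then evaluated is left to the caller.
def load_jsma_examples(path='adversarial_jsma_actual_full.npz'):
    with np.load(path) as data:
        adv_x = data['adv_examples']          # (N, 28, 28, 1) perturbed inputs
        adv_targets = data['adv_targets']     # (N, 10) one-hot attack targets
        clean_y = data['adv_clean_labels']    # (N, 10) true labels
        clean_x = data['adv_clean_examples']  # (N, 28, 28, 1) original inputs
    return adv_x, adv_targets, clean_y, clean_x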
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64, num_threads=None): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using 
adversarial training") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) fgsm2 = FastGradientMethod(model_2, sess=sess) adv_x_2 = fgsm2.generate(x, **fgsm_params) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x_2 = tf.stop_gradient(adv_x_2) preds_2_adv = model_2(adv_x_2) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64, num_threads=None): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) nb_classes = 10 source_samples = 10 img_rows = 28 img_cols = 28 channels = 1 model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': FLAGS.nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } fgsm_params = {'eps': FLAGS.fgsm_eps, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) prune_factor = FLAGS.prune_factor eval_par = {'batch_size': batch_size} if clean_train: prune_percent = { 'conv1_w': 5, 'conv2_w': 5, 'conv3_w': 5, 'conv4_w': 5, 'fc1_w': prune_factor, 'fc2_w': prune_factor, 'fc3_w': prune_factor } model = make_basic_cnn(nb_filters=nb_filters, prune_percent=prune_percent) initialize_uninitialized_global_variables(sess) preds = model.get_probs(x) saver = tf.train.Saver() def fgsm_combo(): acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) print('Test accuracy on legitimate examples: %0.4f\n' % acc) fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print( 'Test accuracy on adversarial examples generated by fgsm: %0.4f\n' % acc) bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x) preds_adv = model.get_probs(adv_x) acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print( 'Test accuracy on adversarial examples generated by IterativeMethod: %0.4f\n' % acc) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape 
print('Test accuracy on legitimate examples: %0.4f' % acc) ckpt_name = './mnist_model.ckpt' if not FLAGS.resume: model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) saver.save(sess, ckpt_name) if FLAGS.resume: saver = tf.train.import_meta_graph(ckpt_name + '.meta') print("loading pretrained model") saver.restore(sess, ckpt_name) acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) print('Test accuracy on pretrained model: %0.4f\n' % acc) if not FLAGS.resume: import sys sys.exit() def do_jsma(): print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Compute the number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where( adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape( -1).shape[0] # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols)), np.reshape(adv_x, (img_rows, img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format( succ_rate)) report.clean_train_adv_eval = 1. 
- succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format( percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) if FLAGS.viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report def do_cw(): nb_adv_per_sample = str(nb_classes - 1) if FLAGS.targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if FLAGS.viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if FLAGS.targeted: if FLAGS.viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in X_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in X_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, 1)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if FLAGS.viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = X_test[idxs] else: adv_inputs = X_test[:source_samples] adv_ys = None yname = "y" cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': FLAGS.attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if FLAGS.targeted else source_samples, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) if FLAGS.targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_par) else: if FLAGS.viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ idxs], args=eval_par) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ :source_samples], args=eval_par) if FLAGS.viz_enabled: for j in range(nb_classes): if FLAGS.targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format( adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. 
L_2 norm of perturbations {0:.4f}'.format( percent_perturbed)) # Close TF session # sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report print("before pruning and gradient inhibition\n") fgsm_combo() do_cw() do_jsma() preds = model.get_probs(x) loss = model_loss(y, preds) if not FLAGS.load_pruned_model: print("start iterative pruning") for i in range(FLAGS.prune_iterations): print("iteration %d" % (i)) start = time.time() dict_nzidx = model.apply_prune(sess) trainer = tf.train.AdamOptimizer(learning_rate) grads = trainer.compute_gradients(loss) grads = model.apply_prune_on_grads(grads, dict_nzidx) end = time.time() print('until grad compute elapsed %f' % (end - start)) prune_args = {'trainer': trainer, 'grads': grads} train_params = { 'nb_epochs': FLAGS.retrain_epoch, 'batch_size': batch_size, 'learning_rate': FLAGS.retrain_lr } start = time.time() model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng, prune_args=prune_args, retrainindex=i) end = time.time() print('model_train function takes %f' % (end - start)) # preds_adv is not defined in this scope, so rebuild the FGSM graph before # evaluating adversarial accuracy after each pruning round fgsm = FastGradientMethod(model, sess=sess) preds_adv = model.get_probs(fgsm.generate(x, **fgsm_params)) eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) saver.save(sess, './pruned_mnist_model.ckpt') else: print("loading pruned model") saver = tf.train.import_meta_graph( './pruned_mnist_model.ckpt.meta') saver.restore(sess, './pruned_mnist_model.ckpt') print("before applying gradient inhibition") fgsm_combo() print("before gradient inhibition, doing c&w") do_cw() do_jsma() if FLAGS.do_inhibition: model.inhibition(sess, original_method=FLAGS.use_inhibition_original, inhibition_eps=FLAGS.inhibition_eps) fgsm_combo() do_cw() do_jsma()
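# The pruning/inhibition variant above reads many values from FLAGS. The flag
# names below are collected from the code itself, but every default and help
# string is an assumption; treat this as a reference sketch, not the original
# flag definitions.
def define_pruning_flags(flags):
    flags.DEFINE_integer('nb_epochs', 6, 'Number of epochs to train model')
    flags.DEFINE_float('fgsm_eps', 0.3, 'Epsilon used by the FGSM attack')
    flags.DEFINE_integer('prune_factor', 80, 'Percent of FC weights to prune')
    flags.DEFINE_integer('prune_iterations', 10, 'Prune/retrain cycles')
    flags.DEFINE_integer('retrain_epoch', 2, 'Epochs per retraining cycle')
    flags.DEFINE_float('retrain_lr', 1e-4, 'Learning rate while retraining')
    flags.DEFINE_bool('resume', False, 'Restore the pretrained checkpoint')
    flags.DEFINE_bool('load_pruned_model', False, 'Skip pruning, load ckpt')
    flags.DEFINE_bool('do_inhibition', True, 'Apply gradient inhibition')
    flags.DEFINE_bool('use_inhibition_original', False,
                      'Use the original inhibition method')
    flags.DEFINE_float('inhibition_eps', 0.1, 'Gradient-inhibition strength')
    flags.DEFINE_bool('viz_enabled', False, 'Show attack visualizations')
    flags.DEFINE_bool('targeted', True, 'Run CW as a targeted attack')
    flags.DEFINE_integer('attack_iterations', 100, 'CW attack iterations')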
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64, num_threads=None): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) sess.run(init) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) s = [] for i in range(0,len(X_test),1): pred = sess.run(preds, {x: X_test[i:i+1]}) print(pred) print(Y_test[i:i+1]) s.append(np.sort(pred)[0,-1]-np.sort(pred)[0,-2]) #Draw a histogram def draw_hist(myList,Title,Xlabel,Ylabel): plt.hist(myList,np.arange(0,1,0.01),normed=True,stacked=True,facecolor='blue') plt.xlabel(Xlabel) plt.ylabel(Ylabel) plt.title(Title) plt.show() draw_hist(myList=s,Title='legitimate',Xlabel='difference between the max and second largest', Ylabel='Probability') # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, 
sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) ''' s = [] for i in range(0,len(X_test),1): pred=sess.run(adv_x, {x: X_test[i:i+1]}) pred1 = sess.run(preds_adv, {x: X_test[i:i+1]}) print(pred1) print(Y_test[i:i+1]) #difference array s s.append(np.sort(pred1)[0,-1]-np.sort(pred1)[0,-2]) #Draw a histogram def draw_hist(myList,Title,Xlabel,Ylabel): plt.hist(myList,np.arange(0,1,0.01),normed=True,stacked=True,facecolor='blue') plt.xlabel(Xlabel) plt.ylabel(Ylabel) plt.title(Title) plt.show() draw_hist(myList=s,Title='legitimate',Xlabel='difference between the max and second largest', Ylabel='Probability') ''' # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc return report
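# The histogram loop above runs the network once per test example. The same
# top-1 minus top-2 probability margin can be computed in a single batched
# run; a minimal sketch using the same placeholders (chunk the feed for a
# very large X):
def prediction_margins(sess, x, preds, X):
    probs = sess.run(preds, {x: X})        # (N, 10) softmax outputs
    sorted_probs = np.sort(probs, axis=1)  # ascending along the class axis
    return sorted_probs[:, -1] - sorted_probs[:, -2]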
def effective_train_jsma(train_start=0, train_end=20, test_start=0, test_end=10000, viz_enabled=False, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001): # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set logging level to see debug information set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") model_path = "./" model_name = "adv_trained_jsma_model_alpha0.4_fortest" # sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) # Define input TF placeholder x1 = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) # for clean data x2 = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) # for adv data y = tf.placeholder(tf.float32, shape=(None, 10)) # for adv clean targets # Initialize the model model = make_basic_cnn() preds = model(x1) preds_adv = model(x2) # Instantiate a SaliencyMapMethod attack object # jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } # Define loss loss = 0.4 * model_loss(y, preds) + 0.6 * model_loss(y, preds_adv) train_step = tf.train.AdamOptimizer(learning_rate=learning_rate) train_step = train_step.minimize(loss) def evaluate_2(adv_examples_last_batch, adv_clean_labels_last_batch): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x1, y, preds, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x2, y, preds_adv, adv_examples_last_batch, adv_clean_labels_last_batch, args=eval_params) print('Test accuracy on last batch of adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy with sess.as_default(): tf.global_variables_initializer().run() for epoch in xrange(nb_epochs): print('Training for epoch %i/%i' % (epoch, nb_epochs - 1)) # Compute number of batches nb_batches = int(math.ceil(float(len(X_train)) / batch_size)) assert nb_batches * batch_size >= len(X_train) # Indices to shuffle training set index_shuf = list(range(len(X_train))) rng.shuffle(index_shuf) prev = time.time() for batch in range(nb_batches): # re-instantiate Saliency object with new trained model jsma = SaliencyMapMethod(model, back='tf', sess=sess) print('--------------------------------------') # create an array for storing adv examples print('batch: %i/%i' % (batch + 1, nb_batches)) # adv_examples = np.empty([1,28,28,1]) adv_examples = [] # for target labels #adv_targets = np.empty([1,10]) # corresponding clean/correct label # adv_clean_labels = np.empty([1,10]) adv_clean_labels = [] # corresponding clean data # adv_clean_examples = np.empty([1,28,28,1]) adv_clean_examples = [] for sample_ind in xrange(0, batch_size): print('Attacking input %i/%i' % (sample_ind + 1, batch_size)) # Compute batch start and end indices start, end = batch_indices(batch, len(X_train), batch_size) X_this_batch = X_train[index_shuf[start:end]] Y_this_batch = Y_train[index_shuf[start:end]] # Perform one training step # feed_dict = {x: X_train[index_shuf[start:end]],y: Y_train[index_shuf[start:end]]} 
sample = X_this_batch[sample_ind:( sample_ind + 1)] # generate from training data # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_this_batch[sample_ind]) ) # generate from training data target_classes = other_classes(nb_classes, current_class) print('Current class is ', current_class) # For the grid visualization, keep original images along the diagonal # grid_viz_data[current_class, current_class, :, :, :] = np.reshape( # sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) # create fake target one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np( sample, **jsma_params ) # get numpy array (1, 28, 28, 1), not Tensor # Check if success was achieved # res = int(model_argmax(sess, x, preds, adv_x) == target) # if succeeds # if res == 1: # append new adv_x to adv_examples array # append sample here, so that the number of times sample is appended matches number of adv_ex. # adv_examples = np.append(adv_examples, adv_x, axis=0) adv_examples.append(adv_x) #adv_targets = np.append(adv_targets, one_hot_target, axis=0) # adv_clean_labels = np.append(adv_clean_labels, np.expand_dims(Y_this_batch[sample_ind],axis=0), axis=0) # generate from training data adv_clean_labels.append(Y_this_batch[sample_ind]) # adv_clean_examples = np.append(adv_clean_examples, sample, axis=0) adv_clean_examples.append(sample) # what we have for this batch, batch_size * 9 data # adv_examples = adv_examples[1:,:,:,:] #adv_targets = adv_targets[1:,:] # adv_clean_labels = adv_clean_labels[1:,:] # adv_clean_examples = adv_clean_examples[1:,:,:,:] adv_examples = np.reshape( adv_examples, (batch_size * (nb_classes - 1), 28, 28, 1)) adv_clean_examples = np.reshape(adv_clean_examples, (batch_size * (nb_classes - 1), 28, 28, 1)) feed_dict = { x1: adv_clean_examples, x2: adv_examples, y: adv_clean_labels } train_step.run(feed_dict=feed_dict) cur = time.time() _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) + " seconds") evaluate_2(adv_examples, adv_clean_labels) print('Training finished.') # report on clean test data preds_test = model(x1) eval_par = {'batch_size': 10} acc_clean = model_eval(sess, x1, y, preds_test, X_test, Y_test, args=eval_par) print('Test accuracy on legitimate examples: %0.4f\n' % acc_clean) # reload fgsm successfully attacking adv test data # with np.load("adversarial_fgsm.npz") as data: # adv_X_test, adv_clean_Y_test, adv_clean_X_test = data['adv_examples'], data['adv_clean_labels'], data['adv_clean_examples'] # print('FGSM adversarial data are successfully reloaded.') # preds_adv_test = model(x1) # # Evaluate the accuracy of the MNIST model on adversarial examples # # eval_par = {'batch_size': 10} # acc = model_eval(sess, x1, y, preds_adv_test, adv_X_test, adv_clean_Y_test, args=eval_par) # print('Test accuracy on pre-generated adversarial examples of fgsm: %0.4f\n' % acc) # # reload fgsm successfully attacking adv test data # with np.load("adversarial_mnist_test_from_1500.npz") as data: # adv_X_test, adv_clean_Y_test, adv_clean_X_test = data['adv_examples'], data['adv_clean_labels'], data['adv_clean_examples'] # print('JSMA adversarial data are successfully reloaded.') # # Evaluate the accuracy of the 
MNIST model on adversarial examples # acc2 = model_eval(sess, x1, y, preds_adv_test, adv_X_test, adv_clean_Y_test, args=eval_par) # print('Test accuracy on pre-generated adversarial examples of jsma: %0.4f\n' % acc2) save_path = os.path.join(model_path, model_name) saver = tf.train.Saver() saver.save(sess, save_path) _logger.info("Completed model training and saved at: " + str(save_path)) # Close TF session sess.close()
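# Illustrative invocation of effective_train_jsma; these argument values are
# assumptions for a small smoke run, not settings from the original
# experiments. Note the inner loop crafts batch_size * (nb_classes - 1) JSMA
# examples per batch, so train_end should be at least batch_size or the final
# reshape will fail.
def run_jsma_training_demo():
    effective_train_jsma(train_end=128, nb_epochs=1, batch_size=128)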
X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_start + train_end, test_start=test_start, test_end=test_end) xDim = np.shape(X_train)[1:3] tf.reset_default_graph() orig_preds = L0_Utils.getModelPreds(X_train, make_basic_cnn, cnnModelFile) filterSample = L0_Utils.filterSampleSet(Y_train, advTarget, orig_preds) advGenSample = X_train[filterSample][0:advSampleSetSize] tf.reset_default_graph() cnn_model = make_basic_cnn() saver = tf.train.Saver() perb = genUniAdvPerb(x_sample=advGenSample, t=advTarget, model=cnn_model, nb_classes=10, clip_min=0., clip_max=1., L0_Max=0.01, theta=1.0) with tf.Session() as sess: saver.restore(sess, cnnModelFile) perb_np = perb.eval()
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: model_train(sess, x, y, preds, X_train, Y_train, args=train_params, save=os.path.exists("models"), rng=rng, loss_type=KEYWORDS.MSE) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess, loss_type=KEYWORDS.MSE) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(Y_test, axis=1) == 
i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in X_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in X_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, 1)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = X_test[idxs] else: adv_inputs = X_test[:source_samples] adv_ys = None yname = "y" cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ idxs], args=eval_params) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ :source_samples], args=eval_params) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
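# Example invocation of the CW tutorial above. With viz_enabled=True the code
# asserts source_samples == nb_classes, so a quick untargeted run disables
# visualization instead; these argument values are illustrative only.
def run_cw_demo():
    return mnist_tutorial_cw(viz_enabled=False, nb_epochs=1,
                             source_samples=10, attack_iterations=50,
                             targeted=False)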
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test, nb_epochs, batch_size, learning_rate, rng): """ Define and train a model that simulates the "remote" black-box oracle described in the original paper. :param sess: the TF session :param x: the input placeholder for MNIST :param y: the output placeholder for MNIST :param X_train: the training data for the oracle :param Y_train: the training labels for the oracle :param X_test: the testing data for the oracle :param Y_test: the testing labels for the oracle :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param rng: numpy.random.RandomState :return: """ # Define TF model graph (for the black-box model) model = make_basic_cnn() predictions = model(x) fgsm_params = { 'eps': FLAGS.training_eps, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1. } fgsm = FastGradientMethod(model, sess=sess) predictions_adv = model(fgsm.generate(x, **fgsm_params)) logger.info("Defined TensorFlow model graph.") # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } model_train(sess, x, y, predictions, X_train, Y_train, verbose=False, args=train_params, rng=rng, predictions_adv=predictions_adv) # Log the accuracy on legitimate data eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params) logger.info( 'Test accuracy of adversarially trained black-box on legitimate test ' 'examples: ' + str(accuracy)) return model, predictions, accuracy
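# Sketch of how this adversarially trained oracle is typically wired into a
# black-box tutorial. The placeholder shapes match the other MNIST tutorials
# in this file, but the surrounding setup here is assumed rather than taken
# from the original script.
def demo_prep_bbox(sess, X_train, Y_train, X_test, Y_test):
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    rng = np.random.RandomState([2017, 8, 30])
    return prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                     nb_epochs=6, batch_size=128, learning_rate=0.001,
                     rng=rng)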
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) source_samples = batch_size # Use label smoothing # Hopefully this doesn't screw up JSMA... # assert Y_train.shape[1] == 10 # label_smooth = .1 # Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) print("evaluate 1") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval( sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc) # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc ################################################################ # Init the Elastic Network Method attack object and 
graph en = ElasticNetMethod(model, back='tf', sess=sess) en_params = {'binary_search_steps': 1, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10} adv_x_2 = en.generate(x, **en_params) preds_adv_2 = model.get_probs(adv_x_2) en_eval_params = {'batch_size': source_samples} # Evaluate the accuracy of the MNIST model on EN adversarial examples acc = model_eval(sess, x, y, preds_adv_2, X_test, Y_test, args=en_eval_params) print('Test accuracy on EN adversarial examples: %0.4f\n' % acc) ############################################################### # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) fgsm2 = FastGradientMethod(model_2, sess=sess) adv_x_fgsm = fgsm2.generate(x, **fgsm_params) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the attacker will change their strategy in response to updates to # the defender's parameters. # Reassign so the stop_gradient actually takes effect adv_x_fgsm = tf.stop_gradient(adv_x_fgsm) preds_2_adv_fgsm = model_2(adv_x_fgsm) ########################################## en2 = ElasticNetMethod(model_2, back='tf',sess=sess) en_params = {'binary_search_steps': 1, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10} adv_x_en = en2.generate(x, **en_params) preds_2_adv_en = model_2(adv_x_en) print("evaluate 2") def evaluate_2(): # evaluate the final result of the model eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) # Accuracy of the adversarially trained model on FGSM adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv_fgsm, X_test, Y_test, args=eval_params) print('Test accuracy on FGSM adversarial examples: %0.4f' % accuracy) # Accuracy of the adversarially trained model on EN Method adversarial examples en_eval_params = {'batch_size': source_samples} accuracy = model_eval(sess, x, y, preds_2_adv_en, X_test, Y_test, args=en_eval_params) print('Test accuracy on EN adversarial examples: %0.4f' % accuracy) # Perform and evaluate adversarial training # model_train does not accept several adversarial prediction tensors at # once, so only the EN predictions are passed here; see the combined-loss # sketch after this function for one way to train against both attacks train_params = { 'nb_epochs': nb_epochs, 'batch_size': source_samples, 'learning_rate': learning_rate } model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=[preds_2_adv_en],evaluate = evaluate_2, args=train_params, rng=rng) return report
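# The training call above can only pass one adversarial prediction tensor.
# One way to train against both the FGSM and EN examples at once (a sketch,
# not part of the original code) is to build a combined loss with the same
# model_loss helper used by effective_train_jsma above, then minimize it with
# an explicit train step instead of model_train:
def combined_adv_loss(y, preds_clean, preds_adv_list, adv_weight=0.5):
    # Clean cross-entropy plus a weighted term per attack
    loss = model_loss(y, preds_clean)
    for preds_adv in preds_adv_list:
        loss += adv_weight * model_loss(y, preds_adv)
    return loss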
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) model_train(sess, x, y, preds, X_train, Y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') 
print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Compute the number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, channels)), np.reshape(adv_x, (img_rows, img_cols, channels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
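# results is a (nb_classes, source_samples) indicator matrix with a 1 wherever
# the attack reached its target class. A per-target-class breakdown (which the
# tutorial does not print) has to account for the fact that a sample is never
# attacked with its own true class; a small sketch:
def per_target_success_rate(results, Y_test, source_samples):
    true_classes = np.argmax(Y_test[:source_samples], axis=1)
    rates = np.zeros(results.shape[0])
    for target in range(results.shape[0]):
        tried = np.sum(true_classes != target)  # samples actually attacked
        if tried > 0:
            rates[target] = results[target].sum() / float(tried)
    return rates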
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.")