def plot_results( adv_inputs, adv, recon_orig, recon_adv): nb_classes = 10 img_rows = img_cols = 28 nchannels = 1 grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') grid_viz_data_1 = np.zeros(grid_shape, dtype='f') curr_class = 0 for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] _ = grid_visual(grid_viz_data) _ = grid_visual(grid_viz_data_1)
def do_jsma(): print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where( adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape( -1).shape[0] # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols)), np.reshape(adv_x, (img_rows, img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format( succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format( percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) if FLAGS.viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def generate_images(): print('==> Preparing data..') if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print( "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session config = tf.ConfigProto() config.gpu_options.per_process_gpu_memory_fraction = 0.5 sess = tf.Session(config=config) keras.backend.set_session(sess) print "==> Beginning Session" # Get CIFAR10 test data X_train, Y_train, X_test, Y_test = data_cifar10() assert Y_train.shape[1] == 10. label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Load model print "==> loading vgg model" args = load_args() if args.model == 'vgg6': model = vggbn(top=True, pool=args.pool) if args.model == 'vgg15': model = vgg15(top=True, pool=args.pool) if args.model == 'generic': model = generic(top=True, pool=args.pool) if args.model == 'resnet18': model = resnet.build_resnet_18(args.pool) predictions = model(x) model.load_weights(args.load) eval_params = {'batch_size': FLAGS.batch_size} accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params) print '==> Accuracy : {}'.format(accuracy) def evaluate(): # Evaluate the accuracy of the CIFAR10 model on legitimate test examples eval_params = {'batch_size': FLAGS.batch_size} accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params) assert X_test.shape[0] == 10000, X_test.shape print('Test accuracy on legitimate test examples: ' + str(accuracy)) # Train an CIFAR10 model train_params = { 'nb_epochs': FLAGS.nb_epochs, 'batch_size': FLAGS.batch_size, 'learning_rate': FLAGS.learning_rate } im_base = '/im_' model_name = args.model + '_p' + str(args.pool) if args.attack == 'fgsm' or args.attack == 'FGSM': result_dir = os.getcwd() + '/images/fgsm/' print "==> creating fgsm adversarial wrapper" adv_x = fgsm(x, predictions, eps=0.3) print "==> sending to batch evaluator to finalize adversarial images" eval_params = {'batch_size': FLAGS.batch_size} X_train_adv, = batch_eval(sess, [x], [adv_x], [X_train], args=eval_params) i = 0 if not os.path.exists(result_dir + model_name): os.makedirs(result_dir + model_name) print "==> saving images to {}".format(result_dir + model_name) for ad in X_train_adv: scipy.misc.imsave( result_dir + model_name + im_base + str(i) + '.png', ad) i += 1 sess.close() """ JSMA """ if args.attack == 'jsma' or args.attack == 'JSMA': result_dir = os.getcwd() + '/images/jsma/trial_single_adv' print('Crafting ' + str(FLAGS.source_samples) + ' * ' + str(FLAGS.nb_classes - 1) + ' adversarial examples') results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i') # This array contains the fraction of perturbed features for each test set perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='f') # Define the TF graph for the model's Jacobian grads = jacobian_graph(predictions, x, FLAGS.nb_classes) # Initialize our array for grid visualization grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels) grid_viz_data = np.zeros(grid_shape, dtype='f') i_saved = 0 n_image = 0 # Loop over the samples we want to perturb into adversarial examples print "==> saving images to {}".format(result_dir + model_name) for sample_ind in xrange(7166, FLAGS.source_samples): # We want to find an adversarial example for each possible target class current_class = int(np.argmax(Y_train[sample_ind])) target_classes = other_classes(FLAGS.nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( X_train[sample_ind:(sample_ind + 1)], (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) # Loop over all target classes adversarials = [] for idx, target in enumerate(target_classes): print "image {}".format(sample_ind) # here we hold all successful adversarials for this iteration # since we dont want 500k images, we will uniformly sample an image to save after each target print('--------------------------------------') print('Creating adv. example for target class ' + str(target)) # This call runs the Jacobian-based saliency map approach adv_x, res, percent_perturb = jsma( sess, x, predictions, grads, X_train[sample_ind:(sample_ind + 1)], target, theta=1, gamma=0.1, increase=True, back='tf', clip_min=0, clip_max=1) # Display the original and adversarial images side-by-side adversarial = np.reshape( adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) original = np.reshape( X_train[sample_ind:(sample_ind + 1)], (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) if FLAGS.viz_enabled: if 'figure' not in vars(): figure = pair_visual(original, adversarial) else: figure = pair_visual(original, adversarial, figure) if not os.path.exists(result_dir + model_name): os.makedirs(result_dir + model_name) if res == 1: adversarials.append(adversarial) if idx == FLAGS.nb_classes - 2: try: if len(adversarials) == 1: idx_uniform = 0 else: idx_uniform = np.random.randint( 0, len(adversarials) - 1) print idx_uniform scipy.misc.imsave( result_dir + model_name + im_base + str(sample_ind) + '.png', adversarials[idx_uniform]) i_saved += 1 print "==> images saved: {}".format(i_saved) except: print "No adversarials generated" # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb n_image += 1 # Compute the number of adversarial examples that were successfuly found nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print( 'Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate)) # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.2f}'.format( percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print( 'Avg. rate of perturbed features for successful ' 'adversarial examples {0:.2f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: _ = grid_visual(grid_viz_data)
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x, y, x_train, y_train, args=train_params, save=os.path.exists("models"), rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" cw_params = {'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': 10} adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval( sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ idxs], args=eval_params) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ :source_samples], args=eval_params) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
format = '%i,%i,%i' np.savetxt("./DataAnalysis/attack_fgsm_eps10_ord2_target.csv", result_log, fmt=format, delimiter=",") print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = source_samples succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session #sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: plt.close(figure) _ = grid_visual(grid_viz_data)
def mnist_tutorial_jsma( train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, ): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST( train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, ) x_train, y_train = mnist.get_set("train") x_test, y_test = mnist.get_set("test") # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN("model1", nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { "nb_epochs": nb_epochs, "batch_size": batch_size, "learning_rate": learning_rate, } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {"batch_size": batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print("Test accuracy on legitimate test examples: {0}".format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print("Crafting " + str(source_samples) + " * " + str(nb_classes - 1) + " adversarial examples") # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype="i") # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype="f") # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype="f") # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, sess=sess) jsma_params = { "theta": 1.0, "gamma": 0.1, "clip_min": 0.0, "clip_max": 1.0, "y_target": None, } figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print("--------------------------------------") print("Attacking input %i/%i" % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print("Generating adv. example for target class %i" % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params["y_target"] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Compute number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure, ) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print("--------------------------------------") # Compute the number of adversarial examples that were successfully found nb_targets_tried = (nb_classes - 1) * source_samples succ_rate = float(np.sum(results)) / nb_targets_tried print("Avg. rate of successful adv. examples {0:.4f}".format(succ_rate)) report.clean_train_adv_eval = 1.0 - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations[np.where(perturbations != 0)]) print("Avg. rate of perturbed features {0:.4f}".format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean( perturbations[np.where(perturbations != 0)] * (results[np.where(perturbations != 0)] == 1)) print("Avg. rate of perturbed features for successful " "adversarial examples {0:.4f}".format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def zoo(viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED): """ :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) if DATASET == 'MNIST': train_start = 0 train_end = 60000 test_start = 0 test_end = 10000 ds = dataset.MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, center=False) elif DATASET == 'SVHN': train_start = 0 train_end = 73257 test_start = 0 test_end = 26032 ds = dataset.SVHN(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) elif DATASET == 'CIFAR10': train_start = 0 train_end = 60000 test_start = 0 test_end = 10000 ds = dataset.CIFAR10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, center=False) x_train, y_train, x_test, y_test = ds.get_set('train') + ds.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN(DATASET, nb_classes, nb_filters, (None, img_rows, img_cols, nchannels)) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2018, 10, 22]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a Zoo attack object zoo = Zoo(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" zoo_params = { 'binary_search_steps': BINARY_SEARCH_STEPS, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': ZOO_LEARNING_RATE, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': INIT_CONST, 'solver': SOLVER, 'image_shape': [img_rows, img_cols, nchannels], 'nb_classes': nb_classes } adv = zoo.generate_np(adv_inputs, **zoo_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = 1 - model_eval( sess, x, y, preds, adv, y_test[idxs], args=eval_params) else: adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, y_test[:source_samples], args=eval_params) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, model_path_cls=MODEL_PATH, targeted=TARGETED): # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) rng = np.random.RandomState() # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) nb_latent_size = 100 # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) x_t = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) y_t = tf.placeholder(tf.float32, shape=(None, nb_classes)) z = tf.placeholder(tf.float32, shape=(None, nb_latent_size)) z_t = tf.placeholder(tf.float32, shape=(None, nb_latent_size)) #nb_filters = 64 nb_layers = 500 # Define TF model graph model = ModelBasicAE('model', nb_layers, nb_latent_size) cl_model = ModelCls('cl_model') #preds = model.get_logits(x) recons = model.get_layer(x, 'RECON') loss = SquaredError(model) print("Defined TensorFlow model graph.") loss_cls = CrossEntropy(cl_model) y_logits = cl_model.get_layer(z, 'LOGITS') ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } train_params_cls = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path_cls)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model #if os.path.exists(model_path + ".meta"): # tf_model_load(sess, model_path) #else: eval_params_cls = {'batch_size': batch_size} # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerAE(model, cl_model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') grid_viz_data_1 = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * (nb_classes - 1) for instance in x_test[idxs]], dtype=np.float32) #adv_input_y = np.array([[instance]*(nb_classes-1) for instance in y_test[idxs]]) adv_input_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes - 1): targ.append(y_test[idxs[curr_num]]) adv_input_y.append(targ) adv_input_y = np.array(adv_input_y) adv_target_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if (id != curr_num): targ.append(y_test[idxs[id]]) adv_target_y.append(targ) adv_target_y = np.array(adv_target_y) #print("adv_input_y: \n", adv_input_y) #print("adv_target_y: \n", adv_target_y) adv_input_targets = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if (id != curr_num): targ.append(x_test[idxs[id]]) adv_input_targets.append(targ) adv_input_targets = np.array(adv_input_targets) adv_inputs = adv_inputs.reshape((source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_targets = adv_input_targets.reshape( (source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_y = adv_input_y.reshape( source_samples * (nb_classes - 1), 10) adv_target_y = adv_target_y.reshape( source_samples * (nb_classes - 1), 10) #print("adv_input_y: \n", adv_input_y) #print("adv_target_y: \n", adv_target_y) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 train_ae(sess, loss, x_train, x_train, args=train_params, rng=rng, var_list=model.get_params()) saver = tf.train.Saver() saver.save(sess, model_path) x_train_lat = model.get_layer(x_train, 'LATENT') x_test_lat = model.get_layer(x_test, 'LATENT') x_train_lat = sess.run(x_train_lat) x_test_lat = sess.run(x_test_lat) def do_eval_cls(preds, x_set, y_set, x_tar_set, report_key, is_adv=None): acc = model_eval(sess, z, y, preds, z_t, x_set, y_set, x_tar_set, args=eval_params_cls) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) def eval_cls(): do_eval_cls(y_logits, x_test_lat, y_test, x_test_lat, 'clean_train_clean_eval', False) #train_cls(sess, loss_cls, x_train, y_train, evaluate = eval_cls, args = train_params_cls, rng = rng, var_list = cl_model.get_params()) train_cls_lat(sess, loss_cls, x_train_lat, y_train, evaluate=eval_cls, args=train_params_cls, rng=rng, var_list=cl_model.get_params()) saver.save(sess, model_path_cls) #adv_input_y = cl_model.get_layer(adv_inputs, 'LOGITS') #adv_target_y = cl_model.get_layer(adv_input_targets, 'LOGITS') adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" cw_params_batch_size = source_samples * (nb_classes - 1) cw_params = { 'binary_search_steps': 10, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': CW_LEARNING_RATE, 'batch_size': cw_params_batch_size, 'initial_const': 1 } adv = cw.generate_np(adv_inputs, adv_input_targets, **cw_params) #print("shaep of adv: ", np.shape(adv)) recon_orig = model.get_layer(adv_inputs, 'RECON') lat_adv = model.get_layer(adv, 'LATENT') recon_adv = model.get_layer(adv, 'RECON') lat_orig = model.get_layer(x, 'LATENT') lat_orig_recon = model.get_layer(recons, 'LATENT') #pred_adv_recon = cl_model.get_layer(recon_adv, 'LOGITS') pred_adv_recon = cl_model.get_layer(lat_adv, 'LOGITS') #eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} eval_params = {'batch_size': 90} if targeted: noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae( sess, x, x_t, recons, adv_inputs, adv_input_targets, adv, recon_adv, lat_orig, lat_orig_recon, args=eval_params) acc = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) print("noise: ", noise) print("classifier acc: ", acc) recon_adv = sess.run(recon_adv) recon_orig = sess.run(recon_orig) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session #sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: _ = grid_visual(grid_viz_data) _ = grid_visual(grid_viz_data_1) #return report #adversarial training if (adv_train == True): print("starting adversarial training") #sess1 = tf.Session() adv_input_set = [] adv_input_target_set = [] for i in range(20): indices = np.arange(np.shape(x_train)[0]) np.random.shuffle(indices) print("indices: ", indices[1:10]) x_train = x_train[indices] y_train = y_train[indices] idxs = [ np.where(np.argmax(y_train, axis=1) == i)[0][0] for i in range(nb_classes) ] adv_inputs_2 = np.array([[instance] * (nb_classes - 1) for instance in x_train[idxs]], dtype=np.float32) adv_input_targets_2 = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if (id != curr_num): targ.append(x_train[idxs[id]]) adv_input_targets_2.append(targ) adv_input_targets_2 = np.array(adv_input_targets_2) adv_inputs_2 = adv_inputs_2.reshape( (source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_targets_2 = adv_input_targets_2.reshape( (source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_set.append(adv_inputs_2) adv_input_target_set.append(adv_input_targets_2) adv_input_set = np.array(adv_input_set), adv_input_target_set = np.array(adv_input_target_set) print("shape of adv_input_set: ", np.shape(adv_input_set)) print("shape of adv_input_target_set: ", np.shape(adv_input_target_set)) adv_input_set = np.reshape( adv_input_set, (np.shape(adv_input_set)[0] * np.shape(adv_input_set)[1] * np.shape(adv_input_set)[2], np.shape(adv_input_set)[3], np.shape(adv_input_set)[4], np.shape(adv_input_set)[5])) adv_input_target_set = np.reshape(adv_input_target_set, (np.shape(adv_input_target_set)[0] * np.shape(adv_input_target_set)[1], np.shape(adv_input_target_set)[2], np.shape(adv_input_target_set)[3], np.shape(adv_input_target_set)[4])) print("generated adversarial training set") adv_set = cw.generate_np(adv_input_set, adv_input_target_set, **cw_params) x_train_aim = np.append(x_train, adv_input_set, axis=0) x_train_app = np.append(x_train, adv_set, axis=0) model_adv_trained = ModelBasicAE('model_adv_trained', nb_layers, nb_latent_size) recons_2 = model_adv_trained.get_layer(x, 'RECON') loss_2 = SquaredError(model_adv_trained) train_ae(sess, loss_2, x_train_app, x_train_aim, args=train_params, rng=rng, var_list=model_adv_trained.get_params()) saver = tf.train.Saver() saver.save(sess, model_path) cw2 = CarliniWagnerAE(model_adv_trained, cl_model, sess=sess) adv_2 = cw2.generate_np(adv_inputs, adv_input_targets, **cw_params) #print("shaep of adv: ", np.shape(adv)) recon_orig = model_adv_trained.get_layer(adv_inputs, 'RECON') recon_adv = model_adv_trained.get_layer(adv_2, 'RECON') lat_orig = model_adv_trained.get_layer(x, 'LATENT') lat_orig_recon = model_adv_trained.get_layer(recons, 'LATENT') pred_adv_recon = cl_model.get_layer(recon_adv, 'LOGITS') #eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} eval_params = {'batch_size': 90} if targeted: noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae( sess, x, x_t, recons, adv_inputs, adv_input_targets, adv_2, recon_adv, lat_orig, lat_orig_recon, args=eval_params) acc = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) print("noise: ", noise) #print("d1: ", d1) #print("d2: ", d2) #print("d1-d2: ", dist_diff) #print("Avg_dist_lat: ", avg_dist_lat) print("classifier acc: ", acc) recon_adv = sess.run(recon_adv) recon_orig = sess.run(recon_orig) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv_2[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv_2[i * (nb_classes - 1) + j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv_2 - adv_inputs)**2, axis=(1, 2, 3))**.5) print( 'Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: _ = grid_visual(grid_viz_data) _ = grid_visual(grid_viz_data_1) return report #binarization defense if (binarization_defense == True or mean_filtering == True): #adv = sess.run(adv) # print(adv[0]) if (binarization_defense == True): adv[adv > 0.5] = 1.0 adv[adv <= 0.5] = 0.0 else: #radius = 2 #adv_list = [mean(adv[i,:,:,0], disk(radius)) for i in range(0, np.shape(adv)[0])] #adv = np.array(adv_list) #adv = np.expand_dims(adv, axis = 3) adv = uniform_filter(adv, 2) #adv = median_filter(adv, 2) #print("after bin ") #print(adv[0]) recon_orig = model.get_layer(adv_inputs, 'RECON') recon_adv = model.get_layer(adv, 'RECON') lat_adv = model.get_layer(adv, 'LATENT') lat_orig = model.get_layer(x, 'LATENT') lat_orig_recon = model.get_layer(recons, 'LATENT') pred_adv_recon = cl_model.get_layer(lat_adv, 'LOGITS') #eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} eval_params = {'batch_size': 90} if targeted: noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae( sess, x, x_t, recons, adv_inputs, adv_input_targets, adv, recon_adv, lat_orig, lat_orig_recon, args=eval_params) acc1 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) acc2 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_input_y, adv_input_targets, args=eval_params_cls) print("noise: ", noise) print("classifier acc for target class: ", acc1) print("classifier acc for true class: ", acc2) recon_adv = sess.run(recon_adv) recon_orig = sess.run(recon_orig) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j] sess.close() _ = grid_visual(grid_viz_data) _ = grid_visual(grid_viz_data_1)
def main(argv=None): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :return: """ # Disable Keras learning phase since we will be serving through tensorflow keras.layers.core.K.set_learning_phase(0) # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' " "to 'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) print("Created TensorFlow session and set Keras backend.") # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist() print("Loaded MNIST test data.") # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = cnn_model() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model if it does not exist in the train_dir folder saver = tf.train.Saver() save_path = os.path.join(FLAGS.train_dir, FLAGS.filename) if os.path.isfile(save_path): saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename)) else: train_params = { 'nb_epochs': FLAGS.nb_epochs, 'batch_size': FLAGS.batch_size, 'learning_rate': FLAGS.learning_rate } model_train(sess, x, y, preds, X_train, Y_train, args=train_params) saver.save(sess, save_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': FLAGS.batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == 10000, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(FLAGS.source_samples) + ' * ' + str(FLAGS.nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Define the SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, FLAGS.source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, FLAGS.source_samples)) # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(FLAGS.nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( X_test[sample_ind:(sample_ind+1)], (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params = {'theta': 1., 'gamma': 0.1, 'nb_classes': FLAGS.nb_classes, 'clip_min': 0., 'clip_max': 1., 'targets': y, 'y_val': one_hot_target} adv_x = jsma.generate_np(X_test[sample_ind:(sample_ind+1)], **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: if 'figure' not in vars(): figure = pair_visual( np.reshape(X_test[sample_ind:(sample_ind+1)], (FLAGS.img_rows, FLAGS.img_cols)), np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols))) else: figure = pair_visual( np.reshape(X_test[sample_ind:(sample_ind+1)], (FLAGS.img_rows, FLAGS.img_cols)), np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: _ = grid_visual(grid_viz_data)
def main(argv=None): """ CIFAR10 CleverHans tutorial :return: """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # CIFAR10-specific dimensions img_rows = 32 img_cols = 32 channels = 3 nb_classes = 10 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) sess = tf.Session() set_log_level(logging.DEBUG) # Get CIFAR10 test data X_train, Y_train, X_test, Y_test = data_cifar10() # Label smoothing assert Y_train.shape[1] == 10. # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = FLAGS.model_path nb_samples = FLAGS.nb_samples from cnn_models import make_basic_cnn model = make_basic_cnn('fp_', input_shape=(None, img_rows, img_cols, channels), nb_filters=FLAGS.nb_filters) preds = model(x) print("Defined TensorFlow model graph with %d parameters" % model.n_params) rng = np.random.RandomState([2017, 8, 30]) def evaluate(eval_params): # Evaluate the model on legitimate test examples acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) return acc model_load(sess, model_path) print('Restored model from %s' % model_path) eval_params = {'batch_size': FLAGS.batch_size} accuracy = evaluate(eval_params) print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(nb_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, nb_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, nb_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') from cleverhans.attacks import SaliencyMapMethod jsma = SaliencyMapMethod(model, sess=sess) jsma_params = { 'gamma': FLAGS.gamma, 'theta': 1., 'symbolic_impl': True, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in range(0, nb_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, nb_samples)) sample = X_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, channels)), np.reshape(adv_x, (img_rows, img_cols, channels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * nb_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session #replace num_threads = None if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) #with sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = make_basic_picklable_cnn() preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } dataset = tf.data.Dataset.from_tensor_slices( (tf.reshape(x_train, [60000, 28, 28]), y_train)) dataset = dataset.batch(32) val_dataset = tf.data.Dataset.from_tensor_slices( (tf.reshape(x_test, [10000, 28, 28]), y_test)) val_dataset = val_dataset.batch(32) sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) if TRAIN_NEW == 1: with sess.as_default(): train(sess, loss, x_train, y_train, args=train_params, rng=rng) save("test.joblib", model) else: with sess.as_default(): model = load("test.joblib") #changed assert len(model.get_params()) > 0 preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we want to perturb into adversarial examples seed(SEED) for sample_ind in xrange(0, source_samples): img = randint(0, 10000) print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[img:(img + 1)] #sample = x_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax( y_test[img])) #current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) tn = 0 totc = 0 # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Compute number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] diff = np.array(adv_x - sample) #print(np.sum(diff)) diff = np.reshape(diff, (28, 28)) diff = diff * 255 cv2.imwrite("test.png", diff) diff = cv2.imread("test.png") diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY) nieghbors = 0 tc = 0 for i in range(0, 28, 1): for j in range(0, 28, 1): if diff[i, j] > 0: tc = tc + 1 totc = totc + 1 if i > 0 and i < 27 and j > 0 and j < 27: #main grid not edges or corners if diff[i - 1, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j + 1] > 0: nieghbors = nieghbors + 1 else: #corners if i == 0 and j == 0: if diff[i, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j] > 0: nieghbors = nieghbors + 1 if i == 27 and j == 0: if diff[i, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j] > 0: nieghbors = nieghbors + 1 if i == 0 and j == 27: if diff[i, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j] > 0: nieghbors = nieghbors + 1 if i == 27 and j == 27: if diff[i, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j] > 0: nieghbors = nieghbors + 1 #edges if i == 0 and j > 0 and j < 27: #left side if diff[i, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j + 1] > 0: nieghbors = nieghbors + 1 if i == 27 and j > 0 and j < 27: #right side if diff[i, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j + 1] > 0: nieghbors = nieghbors + 1 if j == 0 and i > 0 and i < 27: #top side if diff[i - 1, j] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i, j + 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j + 1] > 0: nieghbors = nieghbors + 1 if j == 27 and i > 0 and i < 27: #bot side if diff[i - 1, j] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j] > 0: nieghbors = nieghbors + 1 if diff[i - 1, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i, j - 1] > 0: nieghbors = nieghbors + 1 if diff[i + 1, j - 1] > 0: nieghbors = nieghbors + 1 # print(tc) # print(nieghbors) tn = tn + nieghbors # if tc > 0: # print(nieghbors/tc) # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb #print(perturbations[target, sample_ind]) print('--------------------------------------') print("average neighbors per modified pixel ", tn / totc) # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.8f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) s = perturbations.shape myPert = np.empty(0) myResults = np.empty(0) for i in range(s[0]): for j in range(s[1]): if perturbations[i][j] > 0: myPert = np.append(myPert, perturbations[i][j]) myResults = np.append(myResults, results[i][j]) min_perturbed = np.min(myPert) max_perturbed = np.max(myPert) s2 = myResults.shape final = np.empty(0) for i in range(s2[0]): if myResults[i] > 0: final = np.append(final, myPert[i]) print('Avg. rate of perturbed features {0:.8f}'.format(percent_perturbed)) print('MIN of perturbed features {0:.8f}'.format(min_perturbed)) print('MAX of perturbed features {0:.8f}'.format(max_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) min_perturb_succ = np.min(final) max_perturb_succ = np.max(final) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.8f}'.format(percent_perturb_succ)) print('Min of perturbed features for successful ' 'adversarial examples {0:.8f}'.format(min_perturb_succ)) print('Max of perturbed features for successful ' 'adversarial examples {0:.8f}'.format(max_perturb_succ)) #Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def mnist_attack(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_filters=64, nb_samples=10, learning_rate=0.001, eps=0.3, attack=0, attack_iterations=100, model_path=None, targeted=False, binary=False, scale=False, rand=False, debug=None, test=False, data_dir=None, delay=0, adv=0, nb_iter=40): """ MNIST tutorial for generic attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param nb_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 nb_classes = 10 # Set TF random seed to improve reproducibility tf.set_random_seed(1237) # Create TF session sess = tf.Session() print("Created TensorFlow session.") if debug: set_log_level(logging.DEBUG) else: set_log_level(logging.WARNING) # for running on sharcnet # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(datadir=data_dir, train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) phase = tf.placeholder(tf.bool, name='phase') # for attempting to break unscaled network. logits_scalar = tf.placeholder_with_default(INIT_T, shape=(), name="logits_temperature") save = False train_from_scratch = False if model_path is not None: if os.path.exists(model_path): # check for existing model in immediate subfolder if any(f.endswith('.meta') for f in os.listdir(model_path)): binary, scale, nb_filters, batch_size, learning_rate, nb_epochs, adv = parse_model_settings( model_path) train_from_scratch = False else: model_path = build_model_save_path(model_path, binary, batch_size, nb_filters, learning_rate, nb_epochs, adv, delay, scale) print(model_path) save = True train_from_scratch = True else: train_from_scratch = True # train from scratch, but don't save since no path given # Define TF model graph if binary: print('binary=True') if scale: print('scale=True') if rand: print('rand=True') from cleverhans_tutorials.tutorial_models import make_scaled_binary_rand_cnn model = make_scaled_binary_rand_cnn( phase, logits_scalar, 'binsc_', input_shape=(None, img_rows, img_cols, channels), nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn model = make_scaled_binary_cnn(phase, logits_scalar, 'binsc_', input_shape=(None, img_rows, img_cols, channels), nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn model = make_basic_binary_cnn(phase, logits_scalar, 'bin_', nb_filters=nb_filters) else: if rand: print('rand=True') from cleverhans_tutorials.tutorial_models import make_scaled_rand_cnn model = make_scaled_rand_cnn(phase, logits_scalar, 'fp_rand', nb_filters=nb_filters) else: from cleverhans_tutorials.tutorial_models import make_basic_cnn model = make_basic_cnn(phase, logits_scalar, 'fp_', nb_filters=nb_filters) preds = model(x, reuse=False) # * logits_scalar print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### rng = np.random.RandomState([2017, 8, 30]) # Train an MNIST model train_params = { 'binary': binary, 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'loss_name': 'train loss', 'filename': 'model', 'reuse_global_step': False, 'train_scope': 'train', 'is_training': True } if adv != 0: if adv == ADVERSARIAL_TRAINING_MADRYETAL: from cleverhans.attacks import MadryEtAl train_attack_params = { 'eps': MAX_EPS, 'eps_iter': 0.01, 'nb_iter': nb_iter } train_attacker = MadryEtAl(model, sess=sess) elif adv == ADVERSARIAL_TRAINING_FGSM: from cleverhans.attacks import FastGradientMethod stddev = int(np.ceil((MAX_EPS * 255) // 2)) train_attack_params = { 'eps': tf.abs( tf.truncated_normal(shape=(batch_size, 1, 1, 1), mean=0, stddev=stddev)) } train_attacker = FastGradientMethod(model, back='tf', sess=sess) # create the adversarial trainer train_attack_params.update({'clip_min': 0., 'clip_max': 1.}) adv_x_train = train_attacker.generate(x, phase, **train_attack_params) preds_adv_train = model.get_probs(adv_x_train) eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.} adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params) preds_adv_eval = model.get_probs(adv_x_eval) # * logits_scalar def evaluate(): # Evaluate the accuracy of the MNIST model on clean test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) if adv != 0: # Accuracy of the adversarially trained model on adversarial # examples acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test, phase=phase, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % acc) acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test, phase=phase, args=eval_params, feed={logits_scalar: ATTACK_T}) print('Test accuracy on adversarial examples (scaled): %0.4f' % acc) if train_from_scratch: if save: train_params.update({'log_dir': model_path}) if adv and delay > 0: train_params.update({'nb_epochs': delay}) # do clean training for 'nb_epochs' or 'delay' epochs if test: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, evaluate=evaluate, args=train_params, save=save, rng=rng) else: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, args=train_params, save=save, rng=rng) # optionally do additional adversarial training if adv: print("Adversarial training for %d epochs" % (nb_epochs - delay)) train_params.update({'nb_epochs': nb_epochs - delay}) train_params.update({'reuse_global_step': True}) if test: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, predictions_adv=preds_adv_train, evaluate=evaluate, args=train_params, save=save, rng=rng) else: model_train(sess, x, y, preds, X_train, Y_train, phase=phase, predictions_adv=preds_adv_train, args=train_params, save=save, rng=rng) else: tf_model_load(sess, model_path) print('Restored model from %s' % model_path) evaluate() # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase, feed={phase: False}, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Build dataset ########################################################################### if viz_enabled: assert nb_samples == nb_classes idxs = [ np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] viz_rows = nb_classes if targeted else 2 # Initialize our array for grid visualization grid_shape = (nb_classes, viz_rows, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') if targeted: from cleverhans.utils import build_targeted_dataset if viz_enabled: from cleverhans.utils import grid_visual adv_inputs, true_labels, adv_ys = build_targeted_dataset( X_test, Y_test, idxs, nb_classes, img_rows, img_cols, channels) else: adv_inputs, true_labels, adv_ys = build_targeted_dataset( X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows, img_cols, channels) else: if viz_enabled: from cleverhans.utils import pair_visual adv_inputs = X_test[idxs] else: adv_inputs = X_test[:nb_samples] ########################################################################### # Craft adversarial examples using generic approach ########################################################################### if targeted: att_batch_size = np.clip(nb_samples * (nb_classes - 1), a_max=MAX_BATCH_SIZE, a_min=1) nb_adv_per_sample = nb_classes - 1 yname = "y_target" else: att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE) nb_adv_per_sample = 1 adv_ys = None yname = "y" print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) + ' adversarial examples') print("This could take some time ...") if attack == ATTACK_CARLINI_WAGNER_L2: print('Attack: CarliniWagnerL2') from cleverhans.attacks import CarliniWagnerL2 attacker = CarliniWagnerL2(model, back='tf', sess=sess) attack_params = { 'binary_search_steps': 1, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': att_batch_size, 'initial_const': 10, } elif attack == ATTACK_JSMA: print('Attack: SaliencyMapMethod') from cleverhans.attacks import SaliencyMapMethod attacker = SaliencyMapMethod(model, back='tf', sess=sess) attack_params = {'theta': 1., 'gamma': 0.1} elif attack == ATTACK_FGSM: print('Attack: FastGradientMethod') from cleverhans.attacks import FastGradientMethod attacker = FastGradientMethod(model, back='tf', sess=sess) attack_params = {'eps': eps} elif attack == ATTACK_MADRYETAL: print('Attack: MadryEtAl') from cleverhans.attacks import MadryEtAl attacker = MadryEtAl(model, back='tf', sess=sess) attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter} elif attack == ATTACK_BASICITER: print('Attack: BasicIterativeMethod') from cleverhans.attacks import BasicIterativeMethod attacker = BasicIterativeMethod(model, back='tf', sess=sess) attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter} else: print("Attack undefined") sys.exit(1) attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.}) adv_np = attacker.generate_np(adv_inputs, phase, **attack_params) ''' name = 'm_fgsm_eps%s_n%s.npy' % (eps, nb_samples) fpath = os.path.join( '/scratch/gallowaa/mnist/adversarial_examples/cleverhans/', name) np.savez(fpath, x=adv_np, y=Y_test[:nb_samples]) ''' ''' adv_x = attacker.generate(x, phase, **attack_params) adv_np, = batch_eval(sess, [x], [adv_x], [adv_inputs], feed={ phase: False}, args=eval_params) ''' eval_params = {'batch_size': att_batch_size} if targeted: print("Evaluating targeted results") adv_accuracy = model_eval(sess, x, y, preds, adv_np, true_labels, phase=phase, args=eval_params) else: print("Evaluating untargeted results") if viz_enabled: adv_accuracy = model_eval(sess, x, y, preds, adv_np, Y_test[idxs], phase=phase, args=eval_params) else: adv_accuracy = model_eval(sess, x, y, preds, adv_np, Y_test[:nb_samples], phase=phase, args=eval_params) if viz_enabled: n = nb_classes - 1 for i in range(nb_classes): if targeted: for j in range(nb_classes): if i != j: if j != 0 and i != n: grid_viz_data[i, j] = adv_np[j * n + i] if j == 0 and i > 0 or i == n and j > 0: grid_viz_data[i, j] = adv_np[j * n + i - 1] else: grid_viz_data[i, j] = adv_inputs[j * n] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv_np[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv_np - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Compute number of modified features (L_0 norm) nb_changed = np.where(adv_np != adv_inputs)[0].shape[0] percent_perturb = np.mean(float(nb_changed) / adv_np.reshape(-1).shape[0]) # Compute the average distortion introduced by the algorithm print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturb)) # Friendly output for pasting into spreadsheet print('{0:.4f}'.format(accuracy)) print('{0:.4f}'.format(adv_accuracy)) print('{0:.4f}'.format(percent_perturbed)) print('{0:.4f}'.format(percent_perturb)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_cw( train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED, ): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST( train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, ) x_train, y_train = mnist.get_set("train") x_test, y_test = mnist.get_set("test") # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN("model1", nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { "nb_epochs": nb_epochs, "batch_size": batch_size, "learning_rate": learning_rate, "filename": os.path.split(model_path)[-1], } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {"batch_size": batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print("Test accuracy on legitimate test examples: {0}".format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else "1" print( "Crafting " + str(source_samples) + " * " + nb_adv_per_sample + " adversarial examples" ) print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype="f") adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32 ) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32, ) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels) ) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes) ) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype="f") adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" if targeted: cw_params_batch_size = source_samples * nb_classes else: cw_params_batch_size = source_samples cw_params = { "binary_search_steps": 1, yname: adv_ys, "max_iterations": attack_iterations, "learning_rate": CW_LEARNING_RATE, "batch_size": cw_params_batch_size, "initial_const": 10, } adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {"batch_size": np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) adv_accuracy = 1 - err else: err = model_eval( sess, x, y, preds, adv, y_test[:source_samples], args=eval_params ) adv_accuracy = 1 - err if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print("--------------------------------------") # Compute the number of adversarial examples that were successfully found print("Avg. rate of successful adv. examples {0:.4f}".format(adv_accuracy)) report.clean_train_adv_eval = 1.0 - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs) ** 2, axis=(1, 2, 3)) ** 0.5) print("Avg. L_2 norm of perturbations {0:.4f}".format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(file, train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] nb_filters = 64 ################ color training initialization #################### color_training_epochs = 5000 color_learning_rate = 0.1 colorCategory = [ [0.0, 0.4], # Black [0.3, 0.7], # Grey [0.6, 1.0] # White ] numColorInput = 1 color_x = tf.placeholder( tf.float32, [None, numColorInput]) # mnist data image of shape 28*28=784 color_y = tf.placeholder( tf.float32, [None, numColorOutput]) # 0-9 digits recognition => 10 classes # Set model weights color_W = tf.Variable(tf.zeros([numColorInput, numColorOutput])) color_b = tf.Variable(tf.zeros([numColorOutput])) color_pred_out = tf.matmul(color_x, color_W) + color_b # Softmax color_cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=color_pred_out, labels=color_y)) # Gradient Descent color_optimizer = tf.train.GradientDescentOptimizer( color_learning_rate).minimize(color_cost) # Test model color_argmax = tf.argmax(color_pred_out, 1) color_correct_prediction = tf.equal(tf.argmax(color_pred_out, 1), tf.argmax(color_y, 1)) # Calculate accuracy color_accuracy = tf.reduce_mean( tf.cast(color_correct_prediction, tf.float32)) # Graph for re-generating the original image into a new image by using trained color model pr_model_x = tf.placeholder( tf.float32, [None, n_input, numColorInput]) # mnist data image of shape 28*28=784 pr_model_W = tf.placeholder(tf.float32, [None, numColorInput, numColorOutput ]) # mnist data image of shape 28*28=784 pr_model_b = tf.placeholder(tf.float32, [None, numColorInput, numColorOutput ]) # mnist data image of shape 28*28=784 pr_model_output = tf.one_hot( tf.argmax((tf.matmul(pr_model_x, pr_model_W) + pr_model_b), 2), numColorOutput) # Merge the random generated output for new image based on the colorCategory randomColorCategory = [] for i in range(len(colorCategory)): tmp = [] tmpRandomColorCategory = my_tf_round( tf.random_uniform(tf.shape(pr_model_x), colorCategory[i][0], colorCategory[i][1], dtype=tf.float32), 2) tmp.append(tmpRandomColorCategory) randomColorCategory.append(tf.concat(tmp, 1)) random_merge = tf.reshape(tf.concat(randomColorCategory, -1), [-1, n_input, numColorOutput]) random_color_set = tf.reduce_sum( tf.multiply(pr_model_output, random_merge), 2) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) x = tf.reshape(random_color_set, shape=(-1, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': os.path.join(*os.path.split(model_path)[:-1]), #'filename': os.path.split(model_path)[-1], 'train_dir': save_dir, 'filename': filename, 'numColorOutput': numColorOutput } with sess.as_default(): if hasattr(tf, "global_variables_initializer"): tf.global_variables_initializer().run() else: warnings.warn("Update your copy of tensorflow; future versions of " "CleverHans may drop support for this version.") sess.run(tf.initialize_all_variables()) ################# color training #################### print("Trying to load pr model from: " + model_path2) if os.path.exists(model_path2 + ".meta"): tf_model_load(sess, model_path2) c_w, c_b = sess.run([color_W, color_b]) print("Load color trained model in training") else: # Training the color for epoch in range(color_training_epochs): outputColorY = [] p1 = np.random.random(100) for i in range(len(p1)): outputOverlapColorY = [] for j in range(len(colorCategory)): if p1[i] >= colorCategory[j][0] and p1[ i] <= colorCategory[j][1]: colorIndexSeq = [] for k in range(len(colorCategory)): if j == k: colorIndexSeq.append(1) else: colorIndexSeq.append(0) outputOverlapColorY.append(colorIndexSeq) #break # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item outputColorY.append(outputOverlapColorY[np.random.randint( 0, len(outputOverlapColorY))]) inputColorX = p1.reshape(100, 1) _, c, c_w, c_b = sess.run( [color_optimizer, color_cost, color_W, color_b], feed_dict={ color_x: inputColorX, color_y: outputColorY }) avg_cost = c # Evaluating color model outputColorY = [] p1 = np.random.random(100) # Generate output for random color inputs (test case) for i in range(len(p1)): for j in range(len(colorCategory)): outputOverlapColorY = [] if p1[i] >= colorCategory[j][0] and p1[ i] <= colorCategory[j][1]: colorIndexSeq = [] for k in range(len(colorCategory)): if j == k: colorIndexSeq.append(1) else: colorIndexSeq.append(0) outputOverlapColorY.append(colorIndexSeq) break # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item outputColorY.append(outputOverlapColorY[np.random.randint( 0, len(outputOverlapColorY))]) inputColorX = p1.reshape(100, 1) # print(random_xs) acc, argmax = sess.run([color_accuracy, color_argmax], feed_dict={ color_x: inputColorX, color_y: outputColorY }) print("Epoch:", '%04d' % (epoch + 1) + "/" + str(color_training_epochs) + ", Cost= " + \ "{:.9f}".format(avg_cost) + ", Training Accuracy= " + \ "{:.5f}".format(acc) + " ") # print(c_w) with tf.device('/CPU:0'): saver = tf.train.Saver(tf.global_variables(), max_to_keep=50) # Since training PR model is fast, we do not have to save multiple sessions for this save_path = os.path.join(save_dir2, filename2) saver.save(sess, save_path) ##################### end of color training ------------------------------ ################# model training #################### rng = np.random.RandomState([2017, 8, 30]) saveFileNum = 50 saveFileNum = 500 # saveFileNum = 1000 model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum)) # check if we've trained before, and if we have, use that pre-trained model print("Trying to load trained model from: " + model_path) if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) print("Load trained model") else: train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng, save=True, c_w=c_w, c_b=c_b, pr_model_x=pr_model_x, random_color_set=random_color_set, pr_model_W=pr_model_W, pr_model_b=pr_model_b) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size, 'numColorOutput': numColorOutput} #accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params, # pred2=preds, c_w=c_w, c_b=c_b, pr_model_x=pr_model_x, random_color_set=random_color_set, # pr_model_W=pr_model_W, pr_model_b=pr_model_b) #assert x_test.shape[0] == test_end - test_start, x_test.shape #print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) #report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] adv_inputs = x_test else: adv_inputs = x_test[:source_samples] adv_inputs = x_test adv_ys = None yname = "y" cw_params = { 'binary_search_steps': 1, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': 10 } adv2 = cw.generate(x, **cw_params) cw_params[yname] = adv_ys adv = None adv = cw.generate_np(adv_inputs, **cw_params) eval_params = { 'batch_size': np.minimum(nb_classes, source_samples), 'numColorOutput': numColorOutput } if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) else: #adv_accuracy = model_eval(sess, x, y, preds, adv, y_test[ # :source_samples], args=eval_params) adv_accuracy = model_eval(sess, x, y, preds, adv, y_test, args=eval_params, pred2=preds, c_w=c_w, c_b=c_b, pr_model_x=pr_model_x, random_color_set=random_color_set, pr_model_W=pr_model_W, pr_model_b=pr_model_b, is_adv=True, ae=adv2) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') print("load save file: ", saveFileNum) # Compute the number of adversarial examples that were successfully found print('Test with adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
def do_cw(): nb_adv_per_sample = str(nb_classes - 1) if FLAGS.targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if FLAGS.viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if FLAGS.targeted: if FLAGS.viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in X_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in X_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape((source_samples * nb_classes, img_rows, img_cols, channels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if FLAGS.viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = X_test[idxs] else: adv_inputs = X_test[:source_samples] adv_ys = None yname = "y" cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': FLAGS.attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if FLAGS.targeted else source_samples, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) if FLAGS.targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_par) else: if FLAGS.viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ idxs], args=eval_par) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ :source_samples], args=eval_par) if FLAGS.viz_enabled: for j in range(nb_classes): if FLAGS.targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format( adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format( percent_perturbed)) # Close TF session # sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_deepfool(train_start=0, train_end=60000, #读60000训练 test_start=0,test_end=10000, #读10000测试 viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=2, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist")): """ MNIST tutorial for Deepfool's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples激活对抗例子 :param nb_epochs: number of epochs to train model(一个epoch指代所有的数据送入网络中完成一次前向计算及反向传播的过程。) :param batch_size: size of training batches :param nb_classes: number of output classes(输出几类) :param source_samples: number of test inputs to attack(测试输入用于攻击的数量) :param learning_rate: learning rate for training(学习率) :param model_path: path to the model file(文件路径) :param attack_iterations: 攻击迭代次数 :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies精确度报告 report = AccuracyReport() # MNIST-specific dimensions图像尺寸28*28*1 img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_picklable_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow(构建训练模型) ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2018, 8, 9]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path+".meta"): tf_model_load(sess, model_path) else: model_train(sess, x, y, preds, X_train, Y_train, args=train_params, save=os.path.exists("models"), rng=rng) print("save success") # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') print("This could take some time ...") # Instantiate a DeepFool attack object deepfool = DeepFool(model, back='tf', sess=sess) idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][1] for i in range(10)] print("idxs:",idxs) # construct adv_inputs grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') print("grid_viz_data",grid_viz_data.shape) adv_inputs = X_test[idxs].reshape([-1,28,28,1]) deepfool_params = {'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': attack_iterations, 'nb_classes': 10, 'clip_min': 0., 'clip_max': 1.} adv = deepfool.generate_np(adv_inputs, **deepfool_params) print("adv success") adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs], args={'batch_size': 10}) for j in range(10): grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1.-adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x, y, x_train, y_train, args=train_params, save=os.path.exists("models"), rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ idxs], args=eval_params) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ :source_samples], args=eval_params) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
def main(argv=None): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :return: """ # Set TF random seed to improve reproducibility tf.set_random_seed(1234) ########################################################################### # Define the dataset and model ########################################################################### # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' " "to 'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) print("Created TensorFlow session and set Keras backend.") # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist() print("Loaded MNIST test data.") # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = cnn_model() predictions = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model if it does not exist in the train_dir folder saver = tf.train.Saver() save_path = os.path.join(FLAGS.train_dir, FLAGS.filename) if os.path.isfile(save_path): saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename)) else: train_params = { 'nb_epochs': FLAGS.nb_epochs, 'batch_size': FLAGS.batch_size, 'learning_rate': FLAGS.learning_rate } model_train(sess, x, y, predictions, X_train, Y_train, args=train_params) saver.save(sess, save_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': FLAGS.batch_size} accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params) assert X_test.shape[0] == 10000, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(FLAGS.source_samples) + ' * ' + str(FLAGS.nb_classes - 1) + ' adversarial examples') # This array indicates whether an adversarial example was found for each # test set sample and target class results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i') # This array contains the fraction of perturbed features for each test set # sample and target class perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='f') # Define the TF graph for the model's Jacobian grads = jacobian_graph(predictions, x, FLAGS.nb_classes) # Initialize our array for grid visualization grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, FLAGS.source_samples): # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(FLAGS.nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( X_test[sample_ind:(sample_ind + 1)], (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) # Loop over all target classes for target in target_classes: print('--------------------------------------') print('Creating adv. example for target class ' + str(target)) # This call runs the Jacobian-based saliency map approach adv_x, res, percent_perturb = jsma(sess, x, predictions, grads, X_test[sample_ind:(sample_ind + 1)], target, theta=1, gamma=0.1, increase=True, back='tf', clip_min=0, clip_max=1) # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: if 'figure' not in vars(): figure = pair_visual( np.reshape(X_test[sample_ind:(sample_ind + 1)], (FLAGS.img_rows, FLAGS.img_cols)), np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols))) else: figure = pair_visual( np.reshape(X_test[sample_ind:(sample_ind + 1)], (FLAGS.img_rows, FLAGS.img_cols)), np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb # Compute the number of adversarial examples that were successfuly found nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate)) # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.2f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.2f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: _ = grid_visual(grid_viz_data)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) model_train(sess, x, y, preds, X_train, Y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, channels)), np.reshape(adv_x, (img_rows, img_cols, channels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Disable Keras learning phase since we will be serving through tensorflow keras.layers.core.K.set_learning_phase(0) # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Image dimensions ordering should follow the TensorFlow convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' " "to 'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) print("Created TensorFlow session and set Keras backend.") # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = cnn_model() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path+".meta"): tf_model_load(sess, model_path) else: model_train(sess, x, y, preds, X_train, Y_train, args=train_params, save=os.path.exists("models")) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object wrap = KerasModelWrapper(model) cw = CarliniWagnerL2(wrap, back='tf', sess=sess) idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)] if targeted: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') one_hot = np.zeros((10, 10)) one_hot[np.arange(10), np.arange(10)] = 1 adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]], dtype=np.float32) adv_inputs = adv_inputs.reshape((100, 28, 28, 1)) adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10)) yname = "y_target" else: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = X_test[idxs] adv_ys = None yname = "y" cw_params = {'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': 100 if targeted else 10, 'initial_const': 10} adv = cw.generate_np(adv_inputs, **cw_params) if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args={'batch_size': 10}) else: adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs], args={'batch_size': 10}) for j in range(10): if targeted: for i in range(10): grid_viz_data[i, j] = adv[i * 10 + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1.-adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report