def attack_single_step(self, x, eta, y):
    """
    Take one signed-gradient step and return the updated perturbation.

    The current adversarial point ``x + eta`` is moved by
    ``self.eps_iter * sign(gradient of the loss)``, optionally clipped to
    ``[self.clip_min, self.clip_max]``, and the resulting perturbation is
    passed through ``clip_eta`` with ``self.ord`` and ``self.eps``.

    :param x: A tensor with the original input.
    :param eta: A tensor the same shape as x that holds the perturbation.
    :param y: A tensor with the target labels or ground-truth labels.
    :return: A tuple ``(eta, loss, loss_vector)``: the new perturbation,
             the loss that was differentiated (negated when
             ``self.targeted`` is truthy) and the loss computed with
             ``mean=False`` (never negated).
    """
    import tensorflow as tf
    from cleverhans.utils_tf import model_loss, clip_eta

    perturbed = x + eta
    probs = self.model.get_probs(perturbed)

    mean_loss = model_loss(y, probs)
    loss_vector = model_loss(y, probs, mean=False)

    # A targeted attack descends the loss towards `y`, hence the sign flip.
    loss = -mean_loss if self.targeted else mean_loss

    (grad,) = tf.gradients(loss, perturbed)
    step = self.eps_iter * tf.sign(grad)
    perturbed = perturbed + step

    # Clip to the valid input range only when both bounds are configured.
    bounds_missing = self.clip_min is None or self.clip_max is None
    if not bounds_missing:
        perturbed = tf.clip_by_value(perturbed, self.clip_min, self.clip_max)

    # Express the result as a perturbation again and constrain it.
    eta = clip_eta(perturbed - x, self.ord, self.eps)
    return eta, loss, loss_vector
def attack_single_step(self, x, eta, y):
    """
    Given the original image and the perturbation computed so far, computes
    a new perturbation.

    The step direction depends on ``self.pgd_update``:

    * ``'sign'``  -- move by ``self.eps_iter * sign(grad)``;
    * ``'plain'`` -- move by ``self.eps_iter * grad / ||grad||_2``, where the
      norm is taken per example over every non-batch axis.

    :param x: A tensor with the original input.
    :param eta: A tensor the same shape as x that holds the perturbation.
    :param y: A tensor with the target labels or ground-truth labels.
    :return: The new perturbation, after optional clipping of the
             adversarial input to ``[self.clip_min, self.clip_max]`` and
             ``clip_eta`` with ``self.ord`` and ``self.eps``.
    :raises ValueError: if ``self.pgd_update`` is neither ``'sign'`` nor
                        ``'plain'``.
    """
    # Fail before any op is added to the graph (and before the import).
    if self.pgd_update not in ('sign', 'plain'):
        raise ValueError('Wrong pgd_update.')

    from cleverhans.utils_tf import model_loss, clip_eta

    adv_x = x + eta
    preds = self.model.get_probs(adv_x)
    loss = model_loss(y, preds)
    if self.targeted:
        # Targeted attacks minimise the loss with respect to the target.
        loss = -loss
    grad, = tf.gradients(loss, adv_x)

    if self.pgd_update == 'sign':
        adv_x = adv_x + self.eps_iter * tf.sign(grad)
    else:
        # Reduce over every non-batch axis; fall back to the historical
        # rank-4 axes when the static rank is unknown.
        ndims = adv_x.get_shape().ndims
        red_ind = list(range(1, ndims)) if ndims is not None else [1, 2, 3]
        grad_norm = tf.sqrt(
            tf.reduce_sum(tf.square(grad), axis=red_ind, keep_dims=True))
        # Lower-bound the norm so an all-zero gradient yields a zero step
        # instead of NaN.
        adv_x = adv_x + self.eps_iter * grad / tf.maximum(grad_norm, 1e-12)

    if self.clip_min is not None and self.clip_max is not None:
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    eta = adv_x - x
    eta = clip_eta(eta, self.ord, self.eps)
    return eta
def __init__(self, modelpath, model_file):
    """
    Load the model and create the label placeholder and loss tensor.

    :param modelpath: forwarded unchanged to ``self.load_model``.
    :param model_file: forwarded unchanged to ``self.load_model``.
    """
    # Fixed input geometry.
    self.image_size_height, self.image_size_width = 224, 224
    self.num_channels = 3

    loaded = self.load_model(modelpath, model_file)
    self.sess, self.model, self.logits = loaded

    # Label placeholder with 4 columns -- presumably one per class;
    # TODO confirm against the loaded model.
    self.y = tf.placeholder(tf.float32, shape=(None, 4))

    # Per-example loss (mean=False) between the labels and the first
    # model output.
    first_output = self.model.outputs[0]
    self.loss = utils_tf.model_loss(self.y, first_output, mean=False)

    # input tensor
    self.x = self.model.inputs[0]
def __init__(self, modelpath):
    """
    Load the model and create the label placeholder and loss tensor.

    :param modelpath: forwarded unchanged to ``self.load_model``.
    """
    # Fixed input geometry.
    self.image_size_height, self.image_size_width = 224, 224
    self.num_channels = 3

    session, input_tensor, output_tensor = self.load_model(modelpath)
    self.sess = session
    self.inputImage = input_tensor
    self.output1 = output_tensor

    # Aliases under the names used for the loss below.
    self.x = self.inputImage
    self.logits = self.output1

    # Label placeholder with 1001 columns -- presumably one per class;
    # TODO confirm against the loaded model.
    self.y = tf.placeholder(tf.float32, shape=(None, 1001))

    # Per-example loss (mean=False) between the labels and the logits.
    self.loss = utils_tf.model_loss(self.y, self.logits, mean=False)
def fg(x, predictions):
    """
    Return the gradient of the model loss with respect to ``x``.

    The labels are derived from ``predictions`` themselves: every entry
    equal to its row maximum is marked, and each row is normalised to sum
    to one (so ties share the weight).

    :param x: the tensor to differentiate with respect to.
    :param predictions: the model output used both to build the labels and
                        to compute the loss.
    :return: the gradient tensor of the per-example loss w.r.t. ``x``.
    """
    # Compute loss
    row_max = tf.reduce_max(predictions, 1, keep_dims=True)
    is_row_max = tf.equal(predictions, row_max)
    labels = tf.to_float(is_row_max)
    labels = labels / tf.reduce_sum(labels, 1, keep_dims=True)
    per_example_loss = utils_tf.model_loss(labels, predictions, mean=False)

    # Define gradient of loss wrt input
    (grad,) = tf.gradients(per_example_loss, x)
    return grad
def fgm_grad(x, preds, y=None, ord=np.inf, clip_min=None, clip_max=None,
             targeted=False):
    """
    Returns the scaled gradient of the fgm attack. Clipping now has to be
    done outside, but this avoids costly recomputation of the gradient.

    :param x: the input tensor the gradient is taken with respect to.
    :param preds: the model output for ``x``.
    :param y: (optional) label tensor. When None, the labels are derived
              from ``preds`` (entries equal to the row maximum) to avoid
              label leaking.
    :param ord: norm used to normalise the gradient: ``np.inf``, 1 or 2.
    :param clip_min: unused here; kept so the signature matches the other
                     fgm helpers (clipping is the caller's job).
    :param clip_max: unused here; see ``clip_min``.
    :param targeted: when truthy the loss is negated before
                     differentiation.
    :return: the normalised gradient tensor (sign of the gradient for
             L-inf, gradient divided by its per-example L1 / L2 norm
             otherwise).
    :raises NotImplementedError: if ``ord`` is not ``np.inf``, 1 or 2.
    """
    # Reject unsupported norms before any op is added to the graph.
    if ord not in (np.inf, 1, 2):
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
        y = tf.stop_gradient(y)
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(y, preds, mean=False)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if ord == np.inf:
        # Take sign of gradient
        normalized_grad = tf.sign(grad)
        # The following line should not change the numerical results.
        # It applies only because `normalized_grad` is the output of
        # a `sign` op, which has zero derivative anyway.
        # It should not be applied for the other norms, where the
        # perturbation has a non-zero derivative.
        normalized_grad = tf.stop_gradient(normalized_grad)
    else:
        # Per-example norm over every non-batch axis. The norm is
        # lower-bounded so an all-zero gradient gives 0 instead of NaN.
        avoid_zero_div = 1e-12
        red_ind = list(range(1, len(x.get_shape())))
        if ord == 1:
            l1_norm = tf.reduce_sum(tf.abs(grad),
                                    reduction_indices=red_ind,
                                    keep_dims=True)
            normalized_grad = grad / tf.maximum(avoid_zero_div, l1_norm)
        else:
            square = tf.reduce_sum(tf.square(grad),
                                   reduction_indices=red_ind,
                                   keep_dims=True)
            normalized_grad = grad / tf.sqrt(
                tf.maximum(avoid_zero_div, square))

    return normalized_grad
def body(i, old_adv_x, old_loss, labels=labels):
    """
    Find example with max loss value amongst batch of perturbations.

    Draws ``deltas_shape`` random perturbations around ``x``, evaluates the
    model on every perturbed input, keeps for each example the sample with
    the largest loss and merges it with the best result found so far.

    Reads ``x``, ``deltas_shape`` and ``self`` from the enclosing scope.

    :param i: current iteration counter.
    :param old_adv_x: best adversarial examples found so far.
    :param old_loss: loss values belonging to ``old_adv_x``.
    :param labels: label tensor, or None to use the model predictions as
                   labels. Defaults to the enclosing scope's ``labels``.
    :return: ``(i + 1, new_adv_x, new_loss)``.
    :raises NotImplementedError: if ``self.ord`` is not ``np.inf``, 1 or 2.
    """
    # Axes holding one sample's values (everything after batch and
    # sample-index) and the shape used to broadcast per-sample scalars.
    sample_axes = list(range(2, len(deltas_shape)))
    per_sample_shape = deltas_shape[:2] + [1] * (len(deltas_shape) - 2)

    deltas = tf.random_uniform(deltas_shape)

    # generate uniform samples from the l^p unit ball interior
    if self.ord == np.inf:
        deltas *= 2. * self.eps
        deltas -= self.eps
    elif self.ord == 1:
        # ref: https://mathoverflow.net/questions/9185/how-to-generate-random-points-in-ell-p-balls pylint: disable=line-too-long
        exp = -tf.log(deltas)
        shift = -tf.log(tf.random_uniform(deltas_shape[:2]))
        norm = tf.reduce_sum(tf.abs(exp), sample_axes)
        scale = tf.reshape(shift + norm, per_sample_shape)
        # NOTE(review): `exp` is non-negative, so only the positive orthant
        # of the L1 ball is sampled -- confirm whether random signs are
        # wanted here.
        deltas = exp / scale
        # Scale the unit-ball sample to radius eps, like the L-inf branch.
        # NOTE(review): inferred from that branch -- confirm.
        deltas *= self.eps
    elif self.ord == 2:
        # ref: https://blogs.sas.com/content/iml/2016/04/06/generate-points-uniformly-in-ball.html pylint: disable=line-too-long
        # Cast to float so that `1. / dims` is a valid float exponent.
        dims = tf.to_float(tf.reduce_prod(deltas_shape[2:]))
        # NOTE(review): the radius is drawn element-wise rather than once
        # per sample -- confirm this is the intended distribution.
        deltas = tf.pow(deltas, 1. / dims)
        normal = tf.random_normal(deltas_shape)
        normal /= tf.sqrt(
            tf.reduce_sum(normal**2, axis=sample_axes, keep_dims=True))
        deltas *= normal
        # Scale the unit-ball sample to radius eps, like the L-inf branch.
        # NOTE(review): inferred from that branch -- confirm.
        deltas *= self.eps
    else:
        raise NotImplementedError('Only L-inf, L1 and L2 norms are '
                                  'currently implemented.')

    adv_x = tf.expand_dims(x, 1) + deltas
    if (self.clip_min is not None) and (self.clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    # Fold the sample axis into the batch axis for the model call, then
    # restore it on the predictions.
    adv_x_r = tf.reshape(adv_x, [-1] + deltas_shape[2:])
    preds = self.model.get_probs(adv_x_r)
    preds_shape = preds.shape.as_list()
    preds = tf.reshape(preds, deltas_shape[:2] + preds_shape[1:])

    if labels is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, -1, keep_dims=True)
        labels = tf.to_float(tf.equal(preds, preds_max))
        labels = tf.stop_gradient(labels)
    else:
        # Repeat the provided labels once per sampled perturbation.
        labels = tf.expand_dims(labels, 1)
        labels = tf.tile(labels, [1, self.num_samples, 1])
    labels = labels / tf.reduce_sum(labels, -1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(labels, preds, mean=False)
    if self.y_target is not None:
        loss = -loss

    # find the maximum loss value
    input_idx = tf.one_hot(tf.argmax(loss, axis=1), self.num_samples, axis=1)
    loss = tf.reduce_sum(loss * input_idx, axis=1)
    input_idx = tf.reshape(input_idx, per_sample_shape)
    adv_x = tf.reduce_sum(adv_x * input_idx, axis=1)

    # Keep the previous best wherever it still has the larger loss.
    condition = tf.greater(old_loss, loss)
    new_loss = tf.where(condition, old_loss, loss)
    new_adv_x = tf.where(condition, old_adv_x, adv_x)
    return i + 1, new_adv_x, new_loss
def main(argv=None):
    """
    Run the noise optimisation and write the final noisy images to disk.

    Builds the attack graph, feeds every image found in
    ``FLAGS.attack_srcdir`` for ``FLAGS.attack_epochs`` optimisation steps,
    prints the adversarial and clean losses after each step and finally
    saves the noisy inputs of the last step under
    ``optimization_output/<FLAGS.checkpoint>/noisy-set``.

    :param argv: unused.
    """
    print("going into setup")
    op, model, sess, pholders, varops = setup_attack_graph()

    data = load_many_images(FLAGS.attack_srcdir)
    num_images = len(data)

    feed_dict = {pholders['image_in']: data,
                 pholders['attack_target']: get_adv_target(nb_inputs=num_images),
                 pholders['noise_mask']: load_norm_mask(),
                 keras.backend.learning_phase(): 0}

    if FLAGS.printability_optimization:
        feed_dict[pholders['printable_colors']] = get_print_triplets(
            FLAGS.printability_tuples)

    # used to save checkpoints after each epoch
    # (only referenced by the disabled block at the end of the epoch loop)
    saver = tf.train.Saver(max_to_keep=50)

    # Build the clean-image prediction once, outside the loop: calling
    # `model(...)` per epoch would add new ops to the graph every time.
    if FLAGS.fullres_input:
        clean_input = tf.image.resize_images(
            pholders['image_in'], (FLAGS.img_rows, FLAGS.img_cols))
    else:
        clean_input = pholders['image_in']
    clean_preds = model(clean_input)

    # debug: sanity check to make sure the model isn't being adjusted
    # i.e. this should stay constant
    clean_model_loss = model_loss(pholders['attack_target'], clean_preds,
                                  mean=True)

    fetches = (op,
               varops['adv_loss'],
               varops['noise_inputs'],
               clean_model_loss,
               clean_preds,
               varops['adv_pred'])
    if FLAGS.fullres_input:
        fetches += (varops['resized_noise_in'],)

    # Stays None when no epoch runs, so the save step can be skipped.
    noisy_in = None

    for i in range(FLAGS.attack_epochs):
        print('Epoch %d' % i)
        sys.stdout.flush()

        results = sess.run(fetches, feed_dict=feed_dict)
        (_, train_loss, noisy_in, clean_loss,
         clean_classes, noisy_classes) = results[:6]
        # Only fetched for full-resolution input.
        rnin = results[6] if FLAGS.fullres_input else None

        print("adversarial loss %.5f model loss on clean img: %.5f"
              % (train_loss, clean_loss))
        sys.stdout.flush()

        if FLAGS.printability_optimization:
            print("noise NPS %.5f"
                  % sess.run(varops['printer_error'], feed_dict=feed_dict))

        # Disabled: misclassification statistics and periodic checkpoints.
        # num_misclassified = 0
        # for j in range(num_images):
        #     clean_classification = np.argmax(clean_classes[j])
        #     noise_classification = np.argmax(noisy_classes[j])
        #     if clean_classification != noise_classification:
        #         num_misclassified += 1
        # proportion_misclassified = float(num_misclassified)/float(num_images)
        # print('percent misclassified images %.1f'%(proportion_misclassified*100.0))
        # if i%FLAGS.save_frequency == 0 or proportion_misclassified > 0.9:
        #     saver.save(sess, os.path.join('optimization_output', FLAGS.checkpoint, 'model', FLAGS.checkpoint), global_step=i)
        #     imsave(os.path.join('optimization_output', FLAGS.checkpoint, "noisy_images", "noisyimg_%s_epoch_%d.png"%(FLAGS.checkpoint,i)), (noisy_in[0]*255).astype(int))
        #     if FLAGS.fullres_input:
        #         imsave(os.path.join('optimization_output', FLAGS.checkpoint, "nimage_downsized_%d.png"%i), rnin[0])
        #     imsave(os.path.join('optimization_output', FLAGS.checkpoint, "noise_downsized_%d.png"%i),sess.run(varops['noise']))

        print()
        ### end of epoch

    sess.close()

    if noisy_in is None:
        # No epoch ran, so there is nothing to save.
        return

    for i in range(num_images):
        imsave(os.path.join('optimization_output', FLAGS.checkpoint,
                            "noisy-set", "%d.png" % (i)),
               (noisy_in[i] * 255).astype(int))
def fgm_range(x, preds, y=None, epsilons=(0.3,), ord=np.inf, clip_min=None,
              clip_max=None, targeted=False):
    """
    This is a slight modification of the fast gradient method to return
    a series of fgm attacks with a set of different epsilons, in order
    to avoid the costly recomputation of the gradient.

    :param x: the input tensor the gradient is taken with respect to.
    :param preds: the model output for ``x``.
    :param y: (optional) label tensor. When None, the labels are derived
              from ``preds`` (entries equal to the row maximum) to avoid
              label leaking.
    :param epsilons: iterable of step sizes; one adversarial tensor is
                     produced per entry, in the same order.
    :param ord: norm used to normalise the gradient: ``np.inf``, 1 or 2.
    :param clip_min: lower bound for the adversarial examples. Clipping is
                     applied only when both bounds are given.
    :param clip_max: upper bound for the adversarial examples.
    :param targeted: when truthy the loss is negated before
                     differentiation.
    :return: a list of adversarial example tensors, one per epsilon.
    :raises NotImplementedError: if ``ord`` is not ``np.inf``, 1 or 2.
    """
    # Reject unsupported norms before any op is added to the graph.
    if ord not in (np.inf, 1, 2):
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
        y = tf.stop_gradient(y)
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(y, preds, mean=False)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if ord == np.inf:
        # Take sign of gradient
        normalized_grad = tf.sign(grad)
        # The following line should not change the numerical results.
        # It applies only because `normalized_grad` is the output of
        # a `sign` op, which has zero derivative anyway.
        # It should not be applied for the other norms, where the
        # perturbation has a non-zero derivative.
        normalized_grad = tf.stop_gradient(normalized_grad)
    else:
        # Per-example norm over every non-batch axis. The norm is
        # lower-bounded so an all-zero gradient gives 0 instead of NaN.
        avoid_zero_div = 1e-12
        red_ind = list(range(1, len(x.get_shape())))
        if ord == 1:
            l1_norm = tf.reduce_sum(tf.abs(grad),
                                    reduction_indices=red_ind,
                                    keep_dims=True)
            normalized_grad = grad / tf.maximum(avoid_zero_div, l1_norm)
        else:
            square = tf.reduce_sum(tf.square(grad),
                                   reduction_indices=red_ind,
                                   keep_dims=True)
            normalized_grad = grad / tf.sqrt(
                tf.maximum(avoid_zero_div, square))

    # Multiply by constant epsilon and add the perturbation to the
    # original example to obtain one adversarial example per epsilon
    adv_xs = [x + eps * normalized_grad for eps in epsilons]

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) and (clip_max is not None):
        # Rebuild the list: clip_by_value returns a new tensor, so the
        # result has to be stored to take effect.
        adv_xs = [tf.clip_by_value(adv_x, clip_min, clip_max)
                  for adv_x in adv_xs]

    return adv_xs
def setup_attack_graph_two_masks():
    '''
    Sets up the attack graph assuming two different masks will be fed in

    The noise variable is the only variable handed to the optimizer (via
    the 'adv_var' collection); the model variables are left untouched and
    are not re-initialized.

    :return: a tuple of (the optimization op, the model, the session,
             the placeholders, the variable and other ops)
    :raises AssertionError: if FLAGS.fullres_input is set, or if
             FLAGS.initial_value_for_noise does not hold exactly three
             comma-separated values.
    '''
    assert not (FLAGS.fullres_input
                ), "High resolution input not supported with two masks"

    # this handles the setup of the Keras model, the initialization of the TF session,
    # and the loading of the model parameters from previously saved values
    model, sess = setup_model_and_sess()

    # at this point, only the model variables exist; we will need this set later
    # in order to tell TF to not initialize those variables again
    model_vars = set(tf.global_variables())

    # will hold the placeholders so that they can be returned
    placeholders = {}

    # will hold the variables and operations defined from now on
    varops = {}

    # set up the placeholders -- these are "input" places to the computation graph that change from run to run
    # these are "filled in" for each session run by using a feed_dict
    placeholders['image_in'] = tf.placeholder(
        tf.float32,
        shape=(None, FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

    # attack_target is the one-hot vector for the class
    # we are trying to mimic when feeding image_in into the network
    placeholders['attack_target'] = tf.placeholder(
        tf.float32, shape=(None, FLAGS.nb_classes))

    # this is the first mask being applied to limit the region of the perturbations
    placeholders['mask1'] = tf.placeholder(
        tf.float32,
        shape=(FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

    # this is the second mask being applied to limit the region of the perturbations
    placeholders['mask2'] = tf.placeholder(
        tf.float32,
        shape=(FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

    # this is the sum of mask1 and mask2
    varops['combined_mask'] = tf.add(placeholders['mask1'],
                                     placeholders['mask2'])

    if FLAGS.printability_optimization:
        ####!!! Assumption: the printable tuples were all expanded to match
        ### the size of the image, so one tuple (x, y, z) gets replicated 32x32 times
        # we will have a different set of printable colors for each mask
        placeholders['printable_colors_region_1'] = tf.placeholder(
            tf.float32, shape=(None, 32, 32, 3))
        placeholders['printable_colors_region_2'] = tf.placeholder(
            tf.float32, shape=(None, 32, 32, 3))

    # the noise variable is what is actually being optimized
    # the values stored in variables are persisted across session runs (but not across program runs, unless saved)
    if FLAGS.initial_value_for_noise != "" and FLAGS.initial_value_for_noise != " ":
        # if a specific color for the initialization has been specified,
        # set the initial value of the noise to that color
        noise_init_color = np.float32(
            FLAGS.initial_value_for_noise.split(",")) / 255.0
        assert noise_init_color.shape == (
            3,
        ), "You must provide 3 comma-separated values or no value for the initial_value_for_noise argument"
        noise_init = np.ndarray(
            [FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels],
            dtype='float32')
        # broadcast the color over every pixel
        noise_init[:, :] = noise_init_color
        varops['noise'] = tf.Variable(
            noise_init,
            name='noise',
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])
    else:
        # Full-resolution input is rejected above, so the noise always has
        # the FLAGS image size (this function defines no local
        # img_rows / img_cols).
        varops['noise'] = tf.Variable(
            tf.random_uniform(
                [FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels],
                0.0, 1.0),
            name='noise',
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])

    if FLAGS.clipping:
        # from here on varops['noise'] is the clipped tensor, not the variable
        varops['noise'] = tf.clip_by_value(varops['noise'],
                                           FLAGS.noise_clip_min,
                                           FLAGS.noise_clip_max)
        varops['noise_mul'] = tf.multiply(varops['combined_mask'],
                                          varops['noise'])
        varops['noise_inputs'] = tf.clip_by_value(
            tf.add(placeholders['image_in'], varops['noise_mul']),
            FLAGS.noisy_input_clip_min, FLAGS.noisy_input_clip_max)
    else:
        varops['noise_mul'] = tf.multiply(varops['combined_mask'],
                                          varops['noise'])
        varops['noise_inputs'] = tf.add(placeholders['image_in'],
                                        varops['noise_mul'])

    varops['adv_pred'] = model(varops['noise_inputs'])

    # Regularization term to control size of perturbation
    if FLAGS.regloss == 'l1':
        varops['reg_loss'] = FLAGS.attack_lambda * l1_norm(
            tf.multiply(varops['combined_mask'], varops['noise']))
    else:
        varops['reg_loss'] = FLAGS.attack_lambda * l2_norm(
            tf.multiply(varops['combined_mask'], varops['noise']))

    # Compares adv predictions to given predictions
    # Default to cross-entropy (as defined in the model_loss cleverhans utility)
    if FLAGS.optimization_loss == 'mse':
        varops['loss'] = l2_loss(placeholders['attack_target'],
                                 varops['adv_pred'])
    else:
        varops['loss'] = model_loss(placeholders['attack_target'],
                                    varops['adv_pred'],
                                    mean=True)

    if FLAGS.printability_optimization:
        # one non-printability score per masked region
        varops['nps1'] = get_nps_op(
            tf.multiply(varops['noise'], placeholders['mask1']),
            placeholders['printable_colors_region_1'])
        varops['nps2'] = get_nps_op(
            tf.multiply(varops['noise'], placeholders['mask2']),
            placeholders['printable_colors_region_2'])
        varops['adv_loss'] = varops['loss'] + varops['reg_loss'] + varops[
            'nps1'] + varops['nps2']
    else:
        varops['adv_loss'] = varops['loss'] + varops['reg_loss']

    # only the variables in the 'adv_var' collection (the noise) are optimized
    op = tf.train.AdamOptimizer(learning_rate=FLAGS.optimization_rate,
                                beta1=FLAGS.adam_beta1,
                                beta2=FLAGS.adam_beta2,
                                epsilon=FLAGS.adam_epsilon).minimize(
                                    varops['adv_loss'],
                                    var_list=tf.get_collection('adv_var'))

    # initialize the noise variable
    # (every global variable created since the model was loaded)
    sess.run(tf.variables_initializer(set(tf.global_variables()) - model_vars))

    return op, model, sess, placeholders, varops
def cifar_tutorial(train_start=0, train_end=49000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64,
                   num_threads=None):
    """
    CIFAR-10 tutorial: train (or restore) a prunable CNN, attack it with
    FGSM, the basic iterative method, Carlini-Wagner L2 and JSMA, then
    prune it iteratively (or restore a pruned model), attack it again and
    optionally repeat once more after applying gradient inhibition.

    When FLAGS.resume is not set, the model is trained and saved and the
    process then exits through ``sys.exit()``; the attacks only run on a
    restored model.

    :param train_start: unused in this function
    :param train_end: unused in this function
    :param test_start: only used to check the size of the test set
    :param test_end: only used to check the size of the test set
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, train on clean examples; if false the
                        function returns the empty report right after
                        loading the data
    :param testing: unused in this function
    :param backprop_through_attack: unused in this function
    :param nb_filters: forwarded to ``make_strong_cnn``
    :param num_threads: if truthy, the session is configured with
                        ``intra_op_parallelism_threads=1``
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        # NOTE(review): the thread count is fixed to 1 whatever the value
        # of num_threads -- confirm this is intended.
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    (X_train, Y_train), (X_test, Y_test) = cifar10.load_data()
    Y_train = np_utils.to_categorical(Y_train, 10)
    Y_test = np_utils.to_categorical(Y_test, 10)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255

    # Label smoothing on the training labels only
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=([None, 10]))

    nb_classes = 10
    source_samples = 10
    img_rows = 32
    img_cols = 32
    channels = 3

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': FLAGS.fgsm_eps, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    prune_factor = FLAGS.prune_factor

    if not clean_train:
        # Everything below needs the model built for clean training.
        return report

    prune_percent = {
        'conv1_w': 5,
        'conv2_w': 5,
        'conv3_w': 5,
        'conv4_w': 5,
        'fc1_w': prune_factor,
        'fc2_w': prune_factor,
        'fc3_w': prune_factor
    }
    #model = make_resnet(x,10,[None,32,32,3],reuse = True,prune_percent = prune_percent)
    model = make_strong_cnn(nb_filters=nb_filters,
                            prune_percent=prune_percent)
    initialize_uninitialized_global_variables(sess)
    preds = model.get_probs(x)
    saver = tf.train.Saver()
    eval_par = {'batch_size': batch_size}

    def fgsm_combo():
        """Report clean, FGSM and iterative-method test accuracy."""
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par)
        print('Test accuracy on legitimate examples: %0.4f\n' % acc)

        fgsm = FastGradientMethod(model, sess=sess)
        #initialize_uninitialized_global_variables(sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print(
            'Test accuracy on adversarial examples generated by fgsm: %0.4f\n'
            % acc)

        bim = BasicIterativeMethod(model, sess=sess)
        adv_x = bim.generate(x)
        preds_adv = model.get_probs(adv_x)
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print(
            'Test accuracy on adversarial examples generated by IterativeMethod: %0.4f\n'
            % acc)

    def evaluate():
        """Record and print the accuracy on legitimate test examples."""
        # Evaluate the accuracy of the CIFAR-10 model on legitimate test
        # examples
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    ckpt_name = './cifar_model.ckpt'

    if not FLAGS.resume:
        model_train(sess, x, y, preds, X_train, Y_train,
                    evaluate=evaluate, args=train_params, rng=rng)
        saver.save(sess, ckpt_name)
    if FLAGS.resume:
        saver = tf.train.import_meta_graph(ckpt_name + '.meta')
        print("loading pretrain model")
        saver.restore(sess, ckpt_name)
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par)
        print('Test accuracy on pretrained model: %0.4f\n' % acc)
    if not FLAGS.resume:
        # A fresh training run stops here; attacks run on resumed models.
        import sys
        sys.exit()

    def do_jsma():
        """Run JSMA towards every other class for the first samples."""
        print('Crafting ' + str(source_samples) + ' * ' +
              str(nb_classes - 1) + ' adversarial examples')

        # Keep track of success (adversarial example classified in target)
        results = np.zeros((nb_classes, source_samples), dtype='i')

        # Rate of perturbed features for each test set example and target class
        perturbations = np.zeros((nb_classes, source_samples), dtype='f')

        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        # Instantiate a SaliencyMapMethod attack object
        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        jsma_params = {
            'theta': 1.,
            'gamma': 0.1,
            'clip_min': 0.,
            'clip_max': 1.,
            'y_target': None
        }

        figure = None
        # Loop over the samples we want to perturb into adversarial examples
        for sample_ind in range(source_samples):
            print('--------------------------------------')
            print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
            sample = X_test[sample_ind:(sample_ind + 1)]

            # We want to find an adversarial example for each possible target class
            # (i.e. all classes that differ from the label given in the dataset)
            current_class = int(np.argmax(Y_test[sample_ind]))
            target_classes = other_classes(nb_classes, current_class)

            # For the grid visualization, keep original images along the diagonal
            grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
                sample, (img_rows, img_cols, channels))

            # Loop over all target classes
            for target in target_classes:
                print('Generating adv. example for target class %i' % target)

                # This call runs the Jacobian-based saliency map approach
                one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
                one_hot_target[0, target] = 1
                jsma_params['y_target'] = one_hot_target
                adv_x = jsma.generate_np(sample, **jsma_params)

                # Check if success was achieved
                res = int(model_argmax(sess, x, preds, adv_x) == target)

                # Compute number of modified features
                adv_x_reshape = adv_x.reshape(-1)
                test_in_reshape = X_test[sample_ind].reshape(-1)
                nb_changed = np.where(
                    adv_x_reshape != test_in_reshape)[0].shape[0]
                percent_perturb = float(nb_changed) / adv_x.reshape(
                    -1).shape[0]

                # Display the original and adversarial images side-by-side
                if FLAGS.viz_enabled:
                    # The images have `channels` colour planes, so the
                    # channel axis must be part of the target shape.
                    figure = pair_visual(
                        np.reshape(sample, (img_rows, img_cols, channels)),
                        np.reshape(adv_x, (img_rows, img_cols, channels)),
                        figure)

                # Add our adversarial example to our grid data
                grid_viz_data[target, current_class, :, :, :] = np.reshape(
                    adv_x, (img_rows, img_cols, channels))

                # Update the arrays for later analysis
                results[target, sample_ind] = res
                perturbations[target, sample_ind] = percent_perturb

        print('--------------------------------------')

        # Compute the number of adversarial examples that were successfully found
        nb_targets_tried = ((nb_classes - 1) * source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print('Avg. rate of successful adv. examples {0:.4f}'.format(
            succ_rate))
        report.clean_train_adv_eval = 1. - succ_rate

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(perturbations)
        print('Avg. rate of perturbed features {0:.4f}'.format(
            percent_perturbed))

        # Compute the average distortion introduced for successful samples only
        percent_perturb_succ = np.mean(perturbations * (results == 1))
        print('Avg. rate of perturbed features for successful '
              'adversarial examples {0:.4f}'.format(percent_perturb_succ))

        if FLAGS.viz_enabled:
            import matplotlib.pyplot as plt
            plt.close(figure)
            _ = grid_visual(grid_viz_data)

        return report

    def do_cw():
        """Run the Carlini-Wagner L2 attack (targeted per FLAGS.targeted)."""
        nb_adv_per_sample = str(nb_classes - 1) if FLAGS.targeted else '1'
        print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
              ' adversarial examples')
        print("This could take some time ...")

        # Instantiate a CW attack object
        cw = CarliniWagnerL2(model, back='tf', sess=sess)

        if FLAGS.viz_enabled:
            assert source_samples == nb_classes
            # index of the first test example of every class
            idxs = [
                np.where(np.argmax(Y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)
            ]
        if FLAGS.targeted:
            if FLAGS.viz_enabled:
                # Initialize our array for grid visualization
                grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                              channels)
                grid_viz_data = np.zeros(grid_shape, dtype='f')

                adv_inputs = np.array([[instance] * nb_classes
                                       for instance in X_test[idxs]],
                                      dtype=np.float32)
            else:
                adv_inputs = np.array(
                    [[instance] * nb_classes
                     for instance in X_test[:source_samples]],
                    dtype=np.float32)

            one_hot = np.zeros((nb_classes, nb_classes))
            one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

            adv_inputs = adv_inputs.reshape((source_samples * nb_classes,
                                             img_rows, img_cols, channels))
            adv_ys = np.array([one_hot] * source_samples,
                              dtype=np.float32).reshape(
                                  (source_samples * nb_classes, nb_classes))
            yname = "y_target"
        else:
            if FLAGS.viz_enabled:
                # Initialize our array for grid visualization
                grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
                grid_viz_data = np.zeros(grid_shape, dtype='f')

                adv_inputs = X_test[idxs]
            else:
                adv_inputs = X_test[:source_samples]

            adv_ys = None
            yname = "y"

        cw_params = {
            'binary_search_steps': 1,
            yname: adv_ys,
            'max_iterations': FLAGS.attack_iterations,
            'learning_rate': 0.1,
            'batch_size': source_samples * nb_classes
            if FLAGS.targeted else source_samples,
            'initial_const': 10
        }

        adv = cw.generate_np(adv_inputs, **cw_params)

        if FLAGS.targeted:
            adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                      args=eval_par)
        else:
            if FLAGS.viz_enabled:
                adv_accuracy = 1 - \
                    model_eval(sess, x, y, preds, adv, Y_test[
                               idxs], args=eval_par)
            else:
                adv_accuracy = 1 - \
                    model_eval(sess, x, y, preds, adv, Y_test[
                               :source_samples], args=eval_par)

        if FLAGS.viz_enabled:
            for j in range(nb_classes):
                if FLAGS.targeted:
                    for i in range(nb_classes):
                        grid_viz_data[i, j] = adv[i * nb_classes + j]
                else:
                    grid_viz_data[j, 0] = adv_inputs[j]
                    grid_viz_data[j, 1] = adv[j]

            print(grid_viz_data.shape)

        print('--------------------------------------')

        # Compute the number of adversarial examples that were successfully found
        print('Avg. rate of successful adv. examples {0:.4f}'.format(
            adv_accuracy))
        report.clean_train_adv_eval = 1. - adv_accuracy

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(
            np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
        print('Avg. L_2 norm of perturbations {0:.4f}'.format(
            percent_perturbed))

        # Close TF session
        # sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if FLAGS.viz_enabled:
            import matplotlib.pyplot as plt
            _ = grid_visual(grid_viz_data)

        return report

    # FGSM predictions used to report adversarial accuracy while pruning
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model.get_probs(adv_x)
    #model.test_mode()
    #initialize_uninitialized_global_variables(sess)
    fgsm_combo()
    do_cw()
    do_jsma()

    if not FLAGS.load_pruned_model:
        print("start iterative pruning")
        for i in range(FLAGS.prune_iterations):
            print("iterative %d" % (i))
            dict_nzidx = model.apply_prune(sess)
            trainer = tf.train.AdamOptimizer(learning_rate)
            preds = model.get_probs(x)
            loss = model_loss(y, preds)
            grads = trainer.compute_gradients(loss)
            grads = model.apply_prune_on_grads(grads, dict_nzidx)
            prune_args = {'trainer': trainer, 'grads': grads}
            train_params = {
                'nb_epochs': FLAGS.retrain_epoch,
                'batch_size': batch_size,
                'learning_rate': FLAGS.retrain_lr
            }
            model_train(sess, x, y, preds, X_train, Y_train,
                        evaluate=evaluate, args=train_params, rng=rng,
                        prune_args=prune_args, retrainindex=i)
            acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                             args=eval_par)
            print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        saver.save(sess, './pruned_cifar_model.ckpt')
    else:
        print("load pruned model")
        saver = tf.train.import_meta_graph(
            './pruned_cifar_model.ckpt.meta')
        saver.restore(sess, './pruned_cifar_model.ckpt')

    print("before applying gradient inhibition\n")
    fgsm_combo()
    do_cw()
    do_jsma()

    if FLAGS.do_inhibition:
        model.inhibition(sess,
                         original_method=FLAGS.use_inhibition_original,
                         inhibition_eps=FLAGS.inhibition_eps)
        print("after applying gradient inhibition\n")
        fgsm_combo()
        do_cw()
        do_jsma()

    return report
def setup_attack_graph():
    '''
    Sets up the attack graph.

    A trainable noise tensor is (optionally clipped,) multiplied by a mask,
    added to the input images and fed through the classifier. The noise is
    optimized with Adam against the sum of a classification loss, a norm
    regularizer on the masked noise and, optionally, a printability term.
    All settings are read from the module-level FLAGS object.

    :return: a tuple of (the optimization op, the model, the session,
             the placeholders dict, the dict of variables and other ops)
    '''
    # The NPS calculation depends on the input having the model's native size,
    # so it stays undefined when the full resolution input is used.
    assert not (
        FLAGS.printability_optimization and FLAGS.fullres_input
    ), "Printability optimization and full resolution input are not set up to work together. Set at least one of them to false."

    # Size of the images fed to the TF graph. Fixing it once here means every
    # op defined below can simply reuse these values.
    if FLAGS.fullres_input:
        # Full-resolution images: noise and mask share this size and are
        # resized down to FLAGS.img_rows x FLAGS.img_cols before the model.
        img_rows = 300
        img_cols = 300
    else:
        # The model's own input size.
        img_rows = FLAGS.img_rows
        img_cols = FLAGS.img_cols
    noise_shape = [img_rows, img_cols, FLAGS.nb_channels]

    # Builds the Keras model, creates the TF session and loads the previously
    # saved model parameters.
    model, sess = setup_model_and_sess()

    # Only the model variables exist at this point; remember them so that the
    # final initializer does not overwrite the loaded weights.
    model_vars = set(tf.global_variables())

    # Placeholders are the per-run inputs of the graph (filled via feed_dict).
    placeholders = {}
    # image_in: the images the noise is added to.
    placeholders['image_in'] = tf.placeholder(
        tf.float32, shape=(None, img_rows, img_cols, FLAGS.nb_channels))
    # attack_target: one-hot vector of the class the attack should produce.
    placeholders['attack_target'] = tf.placeholder(
        tf.float32, shape=(None, FLAGS.nb_classes))
    # noise_mask: limits the region of the image that may be perturbed.
    placeholders['noise_mask'] = tf.placeholder(
        tf.float32, shape=(img_rows, img_cols, FLAGS.nb_channels))
    if FLAGS.printability_optimization:
        # Assumption: every printable tuple (x, y, z) was expanded by the
        # caller to a full image, i.e. replicated over all pixels. The shape
        # follows the configured image size (previously hard-coded to
        # 32x32x3) because it is broadcast against the masked noise below.
        placeholders['printable_colors'] = tf.placeholder(
            tf.float32, shape=(None, img_rows, img_cols, FLAGS.nb_channels))

    # Variables and intermediate ops, addressed by name.
    varops = {}
    adv_collections = [tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var']

    # The noise variable is what is actually being optimized. Its value
    # persists across session runs (but not across program runs).
    if FLAGS.initial_value_for_noise != "" and FLAGS.initial_value_for_noise != " ":
        # A specific "r,g,b" start color was requested (0-255 per channel).
        noise_init_color = np.float32(
            FLAGS.initial_value_for_noise.split(",")) / 255.0
        assert noise_init_color.shape == (
            3,
        ), "You must provide 3 comma-separated values or no value for the initial_value_for_noise argument"
        noise_init = np.empty(noise_shape, dtype='float32')
        noise_init[:, :] = noise_init_color
    else:
        noise_init = tf.random_uniform(noise_shape, 0.0, 1.0)
    varops['noise'] = tf.Variable(noise_init, name='noise',
                                  collections=adv_collections)

    # noise      -> (clipped) noise
    # noise_mul  -> noise restricted to the masked region
    # noise_inputs -> images with the masked noise added (clipped if asked)
    if FLAGS.clipping:
        varops['noise'] = tf.clip_by_value(varops['noise'],
                                           FLAGS.noise_clip_min,
                                           FLAGS.noise_clip_max)
    varops['noise_mul'] = tf.multiply(placeholders['noise_mask'],
                                      varops['noise'])
    noisy_images = tf.add(placeholders['image_in'], varops['noise_mul'])
    if FLAGS.clipping:
        noisy_images = tf.clip_by_value(noisy_images,
                                        FLAGS.noisy_input_clip_min,
                                        FLAGS.noisy_input_clip_max)
    varops['noise_inputs'] = noisy_images

    # adv_pred is the model output for the noisy images. Full-resolution
    # input is first down-scaled to the size the classifier expects.
    if FLAGS.fullres_input:
        varops['resized_noise_in'] = tf.image.resize_images(
            varops['noise_inputs'], (FLAGS.img_rows, FLAGS.img_cols))
        varops['adv_pred'] = model(varops['resized_noise_in'])
    else:
        varops['adv_pred'] = model(varops['noise_inputs'])

    # Regularization term controlling the size of the perturbation; anything
    # other than 'l1' falls back to the L2 norm. The masked noise is the same
    # tensor as noise_mul, so it is reused instead of being rebuilt.
    norm_fn = l1_norm if FLAGS.regloss == 'l1' else l2_norm
    varops['reg_loss'] = FLAGS.attack_lambda * norm_fn(varops['noise_mul'])

    # Compares the adversarial predictions with the attack target.
    # Defaults to cross-entropy (the model_loss cleverhans utility).
    if FLAGS.optimization_loss == 'mse':
        varops['loss'] = l2_loss(placeholders['attack_target'],
                                 varops['adv_pred'])
    else:
        varops['loss'] = model_loss(placeholders['attack_target'],
                                    varops['adv_pred'], mean=True)

    varops['adv_loss'] = varops['loss'] + varops['reg_loss']
    if FLAGS.printability_optimization:
        # Per pixel: distance of the masked noise to each printable color,
        # multiplied over all colors (axis 0), then summed over the image.
        varops['printab_pixel_element_diff'] = tf.squared_difference(
            varops['noise_mul'], placeholders['printable_colors'])
        varops['printab_pixel_diff'] = tf.sqrt(
            tf.reduce_sum(varops['printab_pixel_element_diff'], 3))
        varops['printab_reduce_prod'] = tf.reduce_prod(
            varops['printab_pixel_diff'], 0)
        varops['printer_error'] = tf.reduce_sum(varops['printab_reduce_prod'])
        varops['adv_loss'] = varops['adv_loss'] + varops['printer_error']

    # Only the variables in the 'adv_var' collection (the noise) are trained.
    op = tf.train.AdamOptimizer(
        learning_rate=FLAGS.optimization_rate,
        beta1=FLAGS.adam_beta1,
        beta2=FLAGS.adam_beta2,
        epsilon=FLAGS.adam_epsilon).minimize(
            varops['adv_loss'], var_list=tf.get_collection('adv_var'))

    # Initialize everything created here (noise + optimizer state) while
    # leaving the already-loaded model variables untouched.
    sess.run(tf.variables_initializer(set(tf.global_variables()) - model_vars))

    return op, model, sess, placeholders, varops
def setup_attack_graph():
    '''
    This function sets up an attack graph based on the Robust Physical
    Perturbations optimization algorithm and returns the Tensorflow operation
    to run that graph, along with the model, session, variables, operations,
    and placeholders defined along the way in dictionaries addressed by their
    names. All settings are read from the module-level FLAGS object.

    :return: a tuple of (operation, model, session, placeholders, varops)
    where operation is the optimization operation to run,
    session is the TF session used to run the attack,
    and placeholders and varops are dictionaries holding the placeholders,
    variables, and intermediate TF operations defined
    '''
    assert FLAGS.img_rows, 'img_rows needs to be defined in the parameters'
    assert FLAGS.img_cols, 'img_cols needs to be defined in the parameters'
    assert FLAGS.nb_channels, 'nb_channels needs to be defined in the parameters'
    assert FLAGS.nb_classes, 'nb_classes needs to be defined in the parameters'
    assert FLAGS.model_path, 'model_path needs to be defined in the parameters'
    assert FLAGS.printability_optimization is not None, \
        'printability_optimization needs to be defined in the parameters'
    if FLAGS.printability_optimization:
        assert FLAGS.printability_tuples, \
            'printability_tuples needs to be defined if printability_optimization is True'
    assert FLAGS.regloss, 'regloss needs to be defined in the parameters'
    assert FLAGS.optimization_loss, 'optimization_loss needs to be defined in the params'

    # begin by setting up the session that will be used
    sess = tf.Session()

    # All placeholders go in this dict so they can be used by the caller.
    # They have the size of the *input*; resizing to the model's size (if
    # different) happens further down, just before the model is built.
    placeholders = {}
    input_shape = (FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels)

    placeholders['image_in'] = tf.placeholder(
        tf.float32, shape=(None,) + input_shape, name="noiseattack/image_in")
    # NOTE(review): the op name below is missing a trailing "t"; it is kept
    # as-is because it is a runtime graph name that other code may look up.
    placeholders['attack_target'] = tf.placeholder(
        tf.float32, shape=(None, FLAGS.nb_classes),
        name="noiseattack/attack_targe")
    placeholders['noise_mask'] = tf.placeholder(
        tf.float32, shape=input_shape, name="noiseattack/noise_mask")
    if FLAGS.printability_optimization:
        # Assumption: every printable tuple (x, y, z) was expanded by the
        # caller to the size of the input image (replicated over all pixels).
        placeholders['printable_colors'] = tf.placeholder(
            tf.float32, shape=(None,) + input_shape,
            name="noiseattack/printable_colors")

    # will hold the variables and operations defined from now on
    varops = {}

    # The noise is the only variable created under the "noiseattack" name
    # prefix; that prefix is what separates it from the model variables below.
    varops['noise'] = tf.Variable(
        tf.random_normal(list(input_shape)),
        name='noiseattack/noise',
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])

    # noise:        the (optionally clipped) noise
    # noise_mul:    the noise multiplied by the mask
    # noisy_inputs: the masked noise added to the image (optionally clipped)
    if FLAGS.clipping:
        varops['noise'] = tf.clip_by_value(
            varops['noise'], FLAGS.noise_clip_min, FLAGS.noise_clip_max,
            name="noiseattack/noise_clipped")
    varops['noise_mul'] = tf.multiply(
        placeholders['noise_mask'], varops['noise'],
        name="noiseattack/noise_mul")
    if FLAGS.clipping:
        varops['noisy_inputs'] = tf.clip_by_value(
            tf.add(placeholders['image_in'], varops['noise_mul']),
            FLAGS.noisy_input_clip_min, FLAGS.noisy_input_clip_max,
            name="noiseattack/noisy_inputs")
    else:
        varops['noisy_inputs'] = tf.add(
            placeholders['image_in'], varops['noise_mul'],
            name="noiseattack/noisy_inputs")

    # Bring the noisy input down to the model's input size when they differ.
    if FLAGS.img_rows != FLAGS.input_rows or FLAGS.img_cols != FLAGS.input_cols:
        if FLAGS.resize_method == "convresize":
            assert FLAGS.img_rows == 32 and FLAGS.input_rows == 256, \
                "Convresize only guaranteed to work with input 256 and a 32 model"
            # 8x8 kernel holding a per-channel identity at every position,
            # scaled by 1/64: with stride 8 this averages each 8x8 patch
            # of every channel independently.
            identity = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
            f = [[identity for _ in range(8)] for __ in range(8)]
            f = np.array(f).astype('float32') / (64.0)
            varops['noisy_inputs'] = tf.nn.conv2d(
                varops['noisy_inputs'], tf.constant(f),
                strides=[1, 8, 8, 1], padding='SAME')
        elif FLAGS.resize_method == "avgpool":
            assert FLAGS.input_rows % FLAGS.img_rows == 0, \
                ("Input size should be a multiple of model input size, "
                 "currently input: %d model input: %d"
                 % (FLAGS.input_rows, FLAGS.img_rows))
            # Floor division keeps the pooling window an integer even when
            # true division is in effect.
            s = FLAGS.input_rows // FLAGS.img_rows
            varops['noisy_inputs'] = tf.nn.avg_pool(
                varops['noisy_inputs'],
                ksize=[1, s, s, 1], strides=[1, s, s, 1], padding='SAME')
        else:
            if FLAGS.resize_method == "area":
                resize_met = tf.image.ResizeMethod.AREA
            elif FLAGS.resize_method == "bicubic":
                resize_met = tf.image.ResizeMethod.BICUBIC
            elif FLAGS.resize_method == "bilinear":
                resize_met = tf.image.ResizeMethod.BILINEAR
            elif FLAGS.resize_method == "nearestneighbor":
                resize_met = tf.image.ResizeMethod.NEAREST_NEIGHBOR
            else:
                raise Exception("resize method needs to be one of: area, bicubic, bilinear, nearestneighbor")
            varops['noisy_inputs'] = tf.image.resize_images(
                varops['noisy_inputs'],
                size=(FLAGS.img_rows, FLAGS.img_cols),
                method=resize_met)

    # instantiate the model on top of the noisy inputs
    model = YadavModel(train=False, custom_input=varops['noisy_inputs'])

    # Everything that is not part of the attack belongs to the model.
    # A real list (not a lazy filter object) is required: it is printed,
    # handed to the Saver and used again for the initializer at the end.
    model_vars = [v for v in tf.global_variables()
                  if not str(v.name).startswith("noiseattack")]
    print([v.name for v in model_vars])

    # load the model parameters from the checkpoint
    saver = tf.train.Saver(var_list=model_vars)
    saver.restore(sess, FLAGS.model_path)
    print('Loaded the parameters for the model from ' + str(FLAGS.model_path))

    # adv_pred is the output of the model for an image (or images) with noise
    varops['adv_pred'] = model.labels_pred

    # Regularization term to control size of perturbation.
    # With regloss == "none" no 'reg_loss' entry is created at all.
    if FLAGS.regloss != "none":
        if FLAGS.regloss == 'l1':
            varops['reg_loss'] = FLAGS.attack_lambda * \
                l1_norm(tf.multiply(placeholders['noise_mask'], varops['noise']))
        elif FLAGS.regloss == 'l2':
            varops['reg_loss'] = FLAGS.attack_lambda * \
                l2_norm(tf.multiply(placeholders['noise_mask'], varops['noise']))
        else:
            raise Exception("Regloss may only be none or l1 or l2. Now %s"
                            % FLAGS.regloss)

    # Compares adv predictions to given predictions
    if FLAGS.optimization_loss == 'justmse':
        varops['loss'] = l2_loss(placeholders['attack_target'],
                                 varops['adv_pred'])
    elif FLAGS.optimization_loss == "justcrossentropy":
        varops['loss'] = model_loss(placeholders['attack_target'],
                                    varops['adv_pred'], mean=True)
    elif FLAGS.optimization_loss == "justconfmin":
        varops['loss'] = _confmin_loss(varops['adv_pred'],
                                       FLAGS.true_class, FLAGS.target_class)
    elif FLAGS.optimization_loss == "confminandcrossentropy":
        varops['loss'] = model_loss(placeholders['attack_target'],
                                    varops['adv_pred'], mean=True) + \
            _confmin_loss(varops['adv_pred'],
                          FLAGS.true_class, FLAGS.target_class)
    else:
        raise Exception("Optimization_loss needs to be justmse or "
                        "justcrossentropy or justconfmin or "
                        "confminandcrossentropy. Now %s"
                        % FLAGS.optimization_loss)

    if FLAGS.printability_optimization:
        # Per pixel: distance of the masked noise to each printable color,
        # multiplied over all colors (axis 0), then summed over the image.
        varops['printab_pixel_element_diff'] = tf.squared_difference(
            varops['noise_mul'], placeholders['printable_colors'])
        varops['printab_pixel_diff'] = tf.sqrt(
            tf.reduce_sum(varops['printab_pixel_element_diff'], 3))
        varops['printab_reduce_prod'] = tf.reduce_prod(
            varops['printab_pixel_diff'], 0)
        varops['printer_error'] = tf.reduce_sum(varops['printab_reduce_prod'])

    # Total objective: model loss [+ regularizer] [+ printability error].
    adv_loss = varops['loss']
    if FLAGS.regloss != "none":
        adv_loss = adv_loss + varops['reg_loss']
    if FLAGS.printability_optimization:
        adv_loss = adv_loss + varops['printer_error']
    varops['adv_loss'] = adv_loss

    # Only the variables in the 'adv_var' collection (the noise) are trained.
    optimization_op = tf.train.AdamOptimizer(
        learning_rate=FLAGS.optimization_rate,
        beta1=FLAGS.adam_beta1,
        beta2=FLAGS.adam_beta2,
        epsilon=FLAGS.adam_epsilon).minimize(
            varops['adv_loss'], var_list=tf.get_collection('adv_var'))

    # Initialize everything except the restored model variables
    # (i.e. the noise and the optimizer state).
    sess.run(tf.variables_initializer(
        set(tf.global_variables()) - set(model_vars)))
    print('Initialized the model variables')

    return optimization_op, model, sess, placeholders, varops
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64):
    """
    Adversarially train a basic CNN on notMNIST with FGSM examples.

    The data is read from "notmnist.npz" in the working directory. Every
    training step feeds one clean batch; its FGSM counterpart is generated
    in-graph and the loss is computed over the concatenation of both.
    After each epoch the model is evaluated on clean and on FGSM test
    inputs, and the trained model is saved at the end.

    :param train_start: currently unused (the whole archive is loaded)
    :param train_end: currently unused (the whole archive is loaded)
    :param test_start: index of first test set example; only used to check
                       the size of the loaded test set
    :param test_end: index of last test set example; only used to check
                     the size of the loaded test set
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: currently unused
    :param testing: currently unused
    :param backprop_through_attack: currently unused
    :param nb_filters: number of filters passed to make_basic_cnn
    :return: an AccuracyReport object holding the accuracies measured
             after the last epoch
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get notMNIST data
    with np.load("notmnist.npz") as data:
        X_train, Y_train = data['examples_train'], data['labels_train']
        X_test, Y_test = data['examples_test'], data['labels_test']

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "./"
    model_name = "adv_trained_fgsm_model_mix_data_notmnist"
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([1992, 8, 3])

    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model(x)

    # Create TF session; it is closed in the finally clause below even if
    # graph construction, training or saving fails.
    sess = tf.Session()
    try:
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model(adv_x)

        # Clean and adversarial inputs are stacked and share the labels.
        mixed_x = tf.concat([x, adv_x], 0)
        mixed_y = tf.concat([y, y], 0)
        # The permutation is drawn once, at graph-construction time.
        # NOTE(review): it indexes 2 * batch_size rows, so this assumes
        # every fed training batch has exactly batch_size rows - confirm
        # against batch_indices.
        index_shuffle = list(range(batch_size * 2))
        rng.shuffle(index_shuffle)
        mixed_x = tf.gather(mixed_x, index_shuffle)
        mixed_y = tf.gather(mixed_y, index_shuffle)
        preds_mixed = model(mixed_x)

        loss = model_loss(mixed_y, preds_mixed)
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_step = train_step.minimize(loss)

        tf.global_variables_initializer().run(session=sess)

        eval_params = {'batch_size': batch_size}
        for epoch in range(nb_epochs):
            print('Training for epoch %i/%i' % (epoch, nb_epochs - 1))

            # Compute number of batches
            nb_batches = int(math.ceil(float(len(X_train)) / batch_size))
            assert nb_batches * batch_size >= len(X_train)

            # Indices to shuffle training set
            index_shuf = list(range(len(X_train)))
            rng.shuffle(index_shuf)

            prev = time.time()
            for batch in range(nb_batches):
                print('--------------------------------------')
                print('batch: %i/%i' % (batch + 1, nb_batches))
                start, end = batch_indices(batch, len(X_train), batch_size)
                batch_idx = index_shuf[start:end]
                # Only clean data is fed; the adversarial half of the
                # mixed batch is produced inside the graph.
                feed_dict = {x: X_train[batch_idx], y: Y_train[batch_idx]}
                train_step.run(feed_dict=feed_dict, session=sess)
            cur = time.time()
            _logger.info("Epoch " + str(epoch) + " took " +
                         str(cur - prev) + " seconds")

            # Accuracy on legitimate test inputs
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)
            report.adv_train_clean_eval = acc

            # Accuracy on FGSM examples crafted from the test inputs
            acc2 = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                              args=eval_params)
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on adversarial examples: %0.4f' % acc2)
            report.adv_train_adv_eval = acc2
        print('Training finished.')

        # (Evaluation on pre-generated FGSM / JSMA adversarial test sets
        # used to live here and is currently disabled.)

        save_path = os.path.join(model_path, model_name)
        saver = tf.train.Saver()
        saver.save(sess, save_path)
        _logger.info("Completed model training and saved at: " +
                     str(save_path))
    finally:
        # Close TF session
        sess.close()

    return report
def effective_train_jsma(train_start=0, train_end=20, test_start=0,
                         test_end=10000, viz_enabled=False, nb_epochs=6,
                         batch_size=128, nb_classes=10, source_samples=10,
                         learning_rate=0.001):
    """
    Adversarially train a basic MNIST CNN with JSMA examples.

    For every training sample one targeted JSMA example is crafted per
    wrong class. The model is then trained on the loss
    0.4 * loss(clean) + 0.6 * loss(adversarial), both against the true
    labels. The trained model is saved at the end.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: currently unused
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: currently unused
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with the accuracies measured after
             the last epoch
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    try:
        model_path = "./"
        model_name = "adv_trained_jsma_model_alpha0.4_fortest"
        rng = np.random.RandomState([2017, 8, 30])

        # Define input TF placeholders
        x1 = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # clean data
        x2 = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # adv data
        y = tf.placeholder(tf.float32, shape=(None, 10))  # true labels

        # Initialize the model
        model = make_basic_cnn()
        preds = model(x1)
        preds_adv = model(x2)

        # y_target is filled in per attack below
        jsma_params = {
            'theta': 1.,
            'gamma': 0.1,
            'clip_min': 0.,
            'clip_max': 1.,
            'y_target': None
        }

        # Define loss: weighted mix of clean and adversarial loss
        loss = 0.4 * model_loss(y, preds) + 0.6 * model_loss(y, preds_adv)
        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_step = train_step.minimize(loss)

        def evaluate_2(adv_examples_last_batch, adv_clean_labels_last_batch):
            # Accuracy of adversarially trained model on legitimate
            # test inputs
            eval_params = {'batch_size': batch_size}
            accuracy = model_eval(sess, x1, y, preds, X_test, Y_test,
                                  args=eval_params)
            print('Test accuracy on legitimate examples: %0.4f' % accuracy)
            report.adv_train_clean_eval = accuracy

            # Accuracy of the adversarially trained model on the
            # adversarial examples of the last training batch
            accuracy = model_eval(sess, x2, y, preds_adv,
                                  adv_examples_last_batch,
                                  adv_clean_labels_last_batch,
                                  args=eval_params)
            print('Test accuracy on last batch of adversarial examples: %0.4f'
                  % accuracy)
            report.adv_train_adv_eval = accuracy

        with sess.as_default():
            tf.global_variables_initializer().run()

            for epoch in range(nb_epochs):
                print('Training for epoch %i/%i' % (epoch, nb_epochs - 1))

                # Compute number of batches
                nb_batches = int(math.ceil(float(len(X_train)) / batch_size))
                assert nb_batches * batch_size >= len(X_train)

                # Indices to shuffle training set
                index_shuf = list(range(len(X_train)))
                rng.shuffle(index_shuf)

                prev = time.time()
                for batch in range(nb_batches):
                    # re-instantiate Saliency object with new trained model
                    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
                    print('--------------------------------------')
                    print('batch: %i/%i' % (batch + 1, nb_batches))

                    # Batch contents do not depend on the sample index, so
                    # they are selected once per batch.
                    start, end = batch_indices(batch, len(X_train),
                                               batch_size)
                    batch_idx = index_shuf[start:end]
                    X_this_batch = X_train[batch_idx]
                    Y_this_batch = Y_train[batch_idx]
                    # The batch can hold fewer than batch_size rows when the
                    # training set itself is smaller than batch_size, so the
                    # real length drives the loop and the reshapes below.
                    nb_samples = len(X_this_batch)

                    adv_examples = []        # crafted adversarial inputs
                    adv_clean_labels = []    # their true labels
                    adv_clean_examples = []  # their clean source inputs

                    for sample_ind in range(nb_samples):
                        print('Attacking input %i/%i' %
                              (sample_ind + 1, nb_samples))
                        # keep the leading batch axis: shape (1, 28, 28, 1)
                        sample = X_this_batch[sample_ind:(sample_ind + 1)]

                        # We want an adversarial example for each possible
                        # target class (i.e. all classes that differ from
                        # the label given in the dataset)
                        current_class = int(
                            np.argmax(Y_this_batch[sample_ind]))
                        target_classes = other_classes(nb_classes,
                                                       current_class)
                        print('Current class is ', current_class)

                        # Loop over all target classes
                        for target in target_classes:
                            print('Generating adv. example for target class %i'
                                  % target)

                            # One-hot encoding of the (wrong) target class
                            one_hot_target = np.zeros((1, nb_classes),
                                                      dtype=np.float32)
                            one_hot_target[0, target] = 1
                            jsma_params['y_target'] = one_hot_target

                            # Runs the Jacobian-based saliency map approach;
                            # returns a numpy array (1, 28, 28, 1)
                            adv_x = jsma.generate_np(sample, **jsma_params)

                            # Each adversarial example is stored together
                            # with its clean source and true label,
                            # whether or not the attack succeeded.
                            adv_examples.append(adv_x)
                            adv_clean_labels.append(Y_this_batch[sample_ind])
                            adv_clean_examples.append(sample)

                    # nb_samples * (nb_classes - 1) examples for this batch
                    adv_examples = np.reshape(adv_examples,
                                              (-1, 28, 28, 1))
                    adv_clean_examples = np.reshape(adv_clean_examples,
                                                    (-1, 28, 28, 1))
                    feed_dict = {
                        x1: adv_clean_examples,
                        x2: adv_examples,
                        y: adv_clean_labels
                    }
                    train_step.run(feed_dict=feed_dict)

                cur = time.time()
                _logger.info("Epoch " + str(epoch) + " took " +
                             str(cur - prev) + " seconds")
                evaluate_2(adv_examples, adv_clean_labels)
            print('Training finished.')

            # report on clean test data
            preds_test = model(x1)
            eval_par = {'batch_size': 10}
            acc_clean = model_eval(sess, x1, y, preds_test, X_test, Y_test,
                                   args=eval_par)
            print('Test accuracy on legitimate examples: %0.4f\n' % acc_clean)

            # (Evaluation on pre-generated FGSM / JSMA adversarial test
            # sets used to live here and is currently disabled.)

            save_path = os.path.join(model_path, model_name)
            saver = tf.train.Saver()
            saver.save(sess, save_path)
            _logger.info("Completed model training and saved at: " +
                         str(save_path))
    finally:
        # Close TF session
        sess.close()

    return report
def main(argv=None):
    """
    Run the noise-optimization attack configured through FLAGS.

    Builds the attack graph, loads every ".png" image found in
    FLAGS.attack_srcdir, and runs the optimization op for
    FLAGS.attack_epochs epochs. Each epoch prints the losses and the share
    of images pushed to FLAGS.target_class, and saves a checkpoint whenever
    the result improved.

    :param argv: unused; present for the app-runner calling convention
    :raises ValueError: if FLAGS.attack_srcdir contains no ".png" image
    """
    with tf.device(FLAGS.device):
        print("Parameters")
        flag_values = FLAGS.__dict__["__flags"]
        for k in sorted(flag_values.keys()):
            print("%s %s" % (k, flag_values[k]))

        op, model_obj, sess, pholders, varops = setup_attack_graph()
        model = varops['adv_pred']

        # A real list is needed: its length is taken and it is fed twice
        # per epoch.
        data = [
            preprocess_yadav(read_img(os.path.join(FLAGS.attack_srcdir, fname)))
            for fname in os.listdir(FLAGS.attack_srcdir)
            if fname.endswith(".png")
        ]
        num_images = len(data)
        if num_images == 0:
            raise ValueError("No .png images found in %s"
                             % FLAGS.attack_srcdir)

        feed_dict = {
            pholders['image_in']: data,
            pholders['attack_target']: get_adv_target(nb_inputs=num_images),
            pholders['noise_mask']: read_img(FLAGS.attack_mask) / 255.0,
            model_obj.keep_prob: 1.0
        }
        if FLAGS.printability_optimization:
            feed_dict[pholders['printable_colors']] = get_print_triplets()

        # used to save checkpoints after each epoch
        saver = tf.train.Saver(max_to_keep=5)

        # Same loss as the attack's model loss, but evaluated with an
        # all-zero mask below, i.e. on the images without noise.
        clean_model_loss = model_loss(pholders['attack_target'],
                                      varops['adv_pred'], mean=True)

        latest_misrate = FLAGS.min_rate_to_save
        latest_loss = 10000

        for i in range(FLAGS.attack_epochs):
            # All progress output of one epoch goes on a single line, the
            # pieces separated by one space; it is flushed early because
            # the optimization step below can take a while.
            sys.stdout.write('Epoch %d' % i + ' ')
            sys.stdout.flush()

            # One optimization step, plus the values needed for reporting.
            _, train_loss, mod_loss, noisy_in, noisy_classes = sess.run(
                (op,
                 varops['adv_loss'],
                 varops['loss'],
                 varops['noisy_inputs'],
                 varops['adv_pred']),
                feed_dict=feed_dict)

            # 'reg_loss' only exists in the graph when a regularizer is used.
            if FLAGS.regloss != "none":
                reg_loss = sess.run(varops['reg_loss'], feed_dict=feed_dict)
            else:
                reg_loss = 0

            # Zero mask => no noise reaches the model: clean predictions.
            clean_loss, clean_classes = sess.run(
                (clean_model_loss, model),
                feed_dict={
                    pholders['image_in']: data,
                    pholders['attack_target']:
                        get_adv_target(nb_inputs=num_images),
                    pholders['noise_mask']: np.zeros([
                        FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels
                    ]),
                    model_obj.keep_prob: 1.0
                })

            sys.stdout.write(
                "adversarial loss %.5f reg loss %.5f model loss %.5f model loss on clean img: %.5f"
                % (train_loss, reg_loss, mod_loss, clean_loss) + ' ')
            sys.stdout.flush()

            if FLAGS.printability_optimization:
                sys.stdout.write(
                    "noise NPS %.5f" % sess.run(varops['printer_error'],
                                                feed_dict=feed_dict) + ' ')

            # An image counts as misclassified when the noise changed its
            # predicted class AND the new class is the attack target.
            num_misclassified = 0
            for clean_row, noisy_row in zip(clean_classes, noisy_classes):
                clean_classification = np.argmax(clean_row)
                noise_classification = np.argmax(noisy_row)
                if clean_classification != noise_classification and \
                        noise_classification == FLAGS.target_class:
                    num_misclassified += 1

            proportion_misclassified = float(num_misclassified) / float(
                num_images)
            sys.stdout.write('percent misclassified images %.1f'
                             % (proportion_misclassified * 100.0) + '\n')

            # Checkpoint when the misclassification rate improved, when it
            # tied with a lower loss, or (for octagon masks) whenever the
            # loss went down.
            if proportion_misclassified > latest_misrate or \
                    (proportion_misclassified == latest_misrate
                     and train_loss < latest_loss) \
                    or ("octagon" in FLAGS.attack_mask
                        and train_loss < latest_loss):
                latest_misrate = proportion_misclassified
                latest_loss = train_loss
                saver.save(sess,
                           os.path.join('optimization_output',
                                        FLAGS.checkpoint, 'model',
                                        FLAGS.checkpoint),
                           global_step=i)

            if FLAGS.save_all_noisy_images:
                # Only the first noisy image is written.
                # NOTE(review): "+ 0.5" presumably undoes a [-0.5, 0.5]
                # shift applied by preprocess_yadav - confirm.
                write_img(
                    os.path.join(
                        'optimization_output', FLAGS.checkpoint,
                        "noisy_images", "noisyimg_%s_epoch_%d.png" %
                        (FLAGS.checkpoint, i)),
                    ((noisy_in[0] + 0.5) * 255).astype(int))
def fgm(x, preds, y=None, eps=0.3, ord=np.inf, clip_min=None, clip_max=None):
    """
    TensorFlow implementation of the Fast Gradient Method, with the input
    gradient smoothed by a depthwise convolution before the step is taken.

    :param x: the input placeholder
    :param preds: the model's output tensor
    :param y: (optional) A placeholder for the model labels. Only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :return: a tensor for the adversarial example
    :raises NotImplementedError: if ``ord`` is not np.inf, 1 or 2.
    """
    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
        # Ties for the maximum share the probability mass equally.
        y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss (one value per example; tf.gradients sums them)
    loss = utils_tf.model_loss(y, preds, mean=False)

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    # Smooth the gradient with the kernel returned by gkern(7, 2).
    # NOTE(review): the kernel is stacked exactly three times, so this
    # assumes x has 3 channels in NHWC layout and that gkern returns a 2-D
    # array -- confirm against gkern and the callers.
    kernel = gkern(7, 2).astype(np.float32)
    stack_kernel = np.stack([kernel, kernel, kernel]).swapaxes(2, 0)
    # depthwise_conv2d expects [height, width, in_channels, multiplier].
    stack_kernel = np.expand_dims(stack_kernel, 3)
    grad = tf.nn.depthwise_conv2d(
        grad, stack_kernel, strides=[1, 1, 1, 1], padding='SAME')

    # Lower bound for the L1/L2 denominators: an all-zero gradient would
    # otherwise turn the whole adversarial example into NaN.
    avoid_zero_div = 1e-12

    if ord == np.inf:
        # Take sign of gradient
        signed_grad = tf.sign(grad)
    elif ord == 1:
        # Normalize over every axis except the batch axis.
        reduc_ind = list(range(1, len(x.get_shape())))
        abs_sum = tf.reduce_sum(
            tf.abs(grad), reduction_indices=reduc_ind, keep_dims=True)
        signed_grad = grad / tf.maximum(avoid_zero_div, abs_sum)
    elif ord == 2:
        reduc_ind = list(range(1, len(x.get_shape())))
        square_sum = tf.reduce_sum(
            tf.square(grad), reduction_indices=reduc_ind, keep_dims=True)
        signed_grad = grad / tf.sqrt(tf.maximum(avoid_zero_div, square_sum))
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    # Multiply by constant epsilon
    scaled_signed_grad = eps * signed_grad

    # Add perturbation to original example to obtain adversarial example
    adv_x = tf.stop_gradient(x + scaled_signed_grad)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) and (clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    return adv_x
def body(i, old_adv_x, old_loss, labels=labels):
    """
    Find example with max loss value amongst batch of perturbations.

    Draws one batch of random perturbations, evaluates the model on every
    perturbed copy of ``x``, keeps the highest-loss copy per input, and
    merges it with the best example carried over from earlier iterations.

    Closes over ``self``, ``x`` and ``deltas_shape`` from the enclosing
    scope. ``deltas_shape`` is treated as a Python list; judging by the
    tile/reshape calls it is ``[batch, num_samples] + input dims``
    (TODO confirm against the enclosing function).

    :param i: loop counter.
    :param old_adv_x: best adversarial examples found so far.
    :param old_loss: loss values belonging to ``old_adv_x``.
    :param labels: labels per input, or None to use the model predictions
                   as labels (avoids label leaking).
    :return: tuple ``(i + 1, new_adv_x, new_loss)``.
    :raises NotImplementedError: if ``self.ord`` is not np.inf, 1 or 2.
    """
    num_dims = len(deltas_shape)
    # Axes that make up one perturbation (all but batch and sample axes).
    sample_axes = list(range(2, num_dims))
    # Shape that broadcasts one scalar per perturbation over those axes.
    per_sample_shape = deltas_shape[:2] + [1] * (num_dims - 2)

    deltas = tf.random_uniform(deltas_shape)

    # generate uniform samples from the l^p unit ball interior
    if self.ord == np.inf:
        deltas *= 2. * self.eps
        deltas -= self.eps
    elif self.ord == 1:
        # ref: https://mathoverflow.net/questions/9185/how-to-generate-random-points-in-ell-p-balls  pylint: disable=line-too-long
        # NOTE(review): samples are all non-negative and are not scaled by
        # self.eps, unlike the L-inf branch -- confirm this is intended.
        exp = -tf.log(deltas)
        shift = -tf.log(tf.random_uniform(deltas_shape[:2]))
        # Reduce over every perturbation axis so the result has one entry
        # per (batch, sample) pair and matches `shift`.
        norm = tf.reduce_sum(tf.abs(exp), sample_axes)
        scale = tf.reshape(shift + norm, per_sample_shape)
        deltas = exp / scale
    elif self.ord == 2:
        # ref: https://blogs.sas.com/content/iml/2016/04/06/generate-points-uniformly-in-ball.html  pylint: disable=line-too-long
        # NOTE(review): the radius is drawn per element rather than per
        # perturbation and the result is not scaled by self.eps -- confirm.
        # Plain Python float: dividing by an int32 tensor would not build.
        dims = float(np.prod(deltas_shape[2:]))
        deltas = tf.pow(deltas, 1. / dims)
        # random_normal takes a shape, not a tensor.
        normal = tf.random_normal(deltas_shape)
        normal_norm = tf.sqrt(tf.reduce_sum(normal**2, axis=sample_axes))
        normal /= tf.reshape(normal_norm, per_sample_shape)
        deltas *= normal
    else:
        raise NotImplementedError('Only L-inf, L1 and L2 norms are '
                                  'currently implemented.')

    adv_x = tf.expand_dims(x, 1) + deltas
    if (self.clip_min is not None) and (self.clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    # Fold the sample axis into the batch axis for the model call, then
    # unfold the predictions again.
    adv_x_r = tf.reshape(adv_x, [-1] + deltas_shape[2:])
    preds = self.model.get_probs(adv_x_r)
    preds_shape = preds.shape.as_list()
    preds = tf.reshape(preds, deltas_shape[:2] + preds_shape[1:])

    if labels is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, -1, keep_dims=True)
        labels = tf.to_float(tf.equal(preds, preds_max))
        labels = tf.stop_gradient(labels)
        labels = labels / tf.reduce_sum(labels, -1, keep_dims=True)
    else:
        # Repeat the given labels once per sampled perturbation. This must
        # stay in the else branch: expanding first would crash on None and
        # make the fallback above unreachable.
        labels = tf.expand_dims(labels, 1)
        labels = tf.tile(labels, [1, self.num_samples, 1])

    # Compute loss
    loss = utils_tf.model_loss(labels, preds, mean=False)
    if self.y_target is not None:
        # Targeted attack: a lower loss on the target is better.
        loss = -loss

    # find the maximum loss value
    input_idx = tf.one_hot(tf.argmax(loss, axis=1), self.num_samples, axis=1)
    loss = tf.reduce_sum(loss * input_idx, axis=1)
    input_idx = tf.reshape(input_idx, per_sample_shape)
    adv_x = tf.reduce_sum(adv_x * input_idx, axis=1)

    # Keep the previous best wherever it still has the larger loss.
    condition = tf.greater(old_loss, loss)
    new_loss = tf.where(condition, old_loss, loss)
    new_adv_x = tf.where(condition, old_adv_x, adv_x)

    return i + 1, new_adv_x, new_loss
# Build the classifier from the image/class dimensions given by the flags
# and print its layer summary.
model = cnn_model(
    img_rows=FLAGS.img_rows,
    img_cols=FLAGS.img_cols,
    channels=FLAGS.nb_channels,
    nb_classes=FLAGS.nb_classes)
model.summary()

img_rows = FLAGS.img_rows
img_cols = FLAGS.img_cols

# Input placeholders, collected in a dict so that they can be returned.
placeholders = {
    'image_in': tf.placeholder(
        tf.float32, shape=(None, img_rows, img_cols, FLAGS.nb_channels)),
    'True_labels': tf.placeholder(
        tf.float32, shape=(None, FLAGS.nb_classes)),
}

# Tensors and operations derived from the placeholders.
varops = {'pred': model(placeholders['image_in'])}
varops['loss'] = model_loss(
    placeholders['True_labels'], varops['pred'], mean=True)

# Learning phase 0 is fed alongside the images and labels.
feed_dict = {
    placeholders['image_in']: imgs,
    placeholders['True_labels']: labels,
    keras.backend.learning_phase(): 0,
}

# Create TF session and set as Keras backend session
sess = tf.Session()
keras.backend.set_session(sess)
print("Created TensorFlow session and set Keras backend.")

# Adadelta is the active optimizer; the Adam and Adagrad alternatives are
# kept below, disabled.
#op = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False,name='Adam').minimize(varops['loss'])
op = tf.train.AdadeltaOptimizer(
    learning_rate=0.1,
    rho=0.95,
    epsilon=1e-08,
    use_locking=False,
    name='Adadelta').minimize(varops['loss'])
#op = tf.train.AdagradOptimizer(learning_rate=0.01, initial_accumulator_value=0.1, use_locking=False,name='Adagrad').minimize(varops['loss'])