def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" if targeted: cw_params_batch_size = source_samples * nb_classes else: cw_params_batch_size = source_samples cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': CW_LEARNING_RATE, 'batch_size': cw_params_batch_size, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) adv_accuracy = 1 - err else: err = model_eval(sess, x, y, preds, adv, y_test[:source_samples], args=eval_params) adv_accuracy = 1 - err if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: _ = grid_visual(grid_viz_data) return report
def train_zero_knowledge_gandef_model(train_start=0, train_end=60000, test_start=0, test_end=10000, smoke_test=True, save=False, testing=False, backprop_through_attack=False, num_threads=None): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param train_batch_size: size of training batches :param test_batch_size: size of testing batches :param learning_rate: learning rate for training :param save: if true, the final model will be saved :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_cifar10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) if smoke_test: X_train, Y_train, X_test, Y_test = X_train[: 256], Y_train[: 256], X_test[: 256], Y_test[: 256] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3)) y_soft = tf.placeholder(tf.float32, shape=(None, 10)) # Adversarial training print("Start adversarial training") zero_knowledge_gandef_model = make_zero_knowledge_gandef_model( name="model_zero_knowledge_gandef") aug_x_1 = cifar10_augment(x) aug_x_2 = gaussian_augment(aug_x_1) preds_clean = zero_knowledge_gandef_model(aug_x_1) preds_aug = zero_knowledge_gandef_model(aug_x_2) def cross_entropy(truth, preds, mean=True): # Get the logits operator op = preds.op if op.type == "Softmax": logits, = op.inputs else: logits = preds # Calculate cross entropy loss out = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=truth) # Take average loss and return if mean: out = tf.reduce_mean(out) return out def sigmoid_entropy(truth, preds, mean=True): # Get the logits operator op = preds.op if op.type == "Softmax": logits, = op.inputs else: logits = preds # Calculate cross entropy loss out = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=truth) # Take average loss and return if mean: out = tf.reduce_mean(out) return out # Prepare training settings rng = np.random.RandomState([2017, 8, 30]) # Prepare optimizer learning_rate = 1e-4 clf_opt = tf.train.AdamOptimizer(learning_rate) dic_opt = tf.train.AdamOptimizer(learning_rate * 10) # Prepare train params train_params = { 'nb_epochs': 350, 'batch_size': 128, 'weight_decay': 0.001, 'trade_off': 2, 'pretrain_epochs': 0, 'inner_epochs': 1 } # Perform and evaluate adversarial training gan_train_v2(sess, x, y_soft, preds_clean, X_train, Y_train, loss_func=[cross_entropy, sigmoid_entropy], optimizer=[clf_opt, dic_opt], predictions_adv=preds_aug, evaluate=None, args=train_params, rng=rng, var_list=zero_knowledge_gandef_model.get_gan_params()) # Evaluate the accuracy of the MNIST model on Clean examples preds_clean = zero_knowledge_gandef_model(x) eval_params = { 'batch_size': 128, 'use_dic': False, 'is_clean': True, 'reject_threshold': 0.5 } clean_acc = confident_model_eval(sess, x, y_soft, preds_clean, X_test, Y_test, args=eval_params) print('Test accuracy on Clean test examples: %0.4f\n' % clean_acc) report.adv_train_clean_eval = clean_acc # Evaluate the accuracy of the MNIST model on FGSM examples fgsm_params = {'eps': float(8 / 255) * 2., 'clip_min': -1., 'clip_max': 1.} fgsm_att = FastGradientMethod(zero_knowledge_gandef_model, sess=sess) fgsm_adv = fgsm_att.generate(x, **fgsm_params) preds_fgsm_adv = zero_knowledge_gandef_model(fgsm_adv) eval_params = { 'batch_size': 128, 'use_dic': False, 'is_clean': False, 'reject_threshold': 0.5 } fgsm_acc = confident_model_eval(sess, x, y_soft, preds_fgsm_adv, X_test, Y_test, args=eval_params) print('Test accuracy on FGSM test examples: %0.4f\n' % fgsm_acc) report.adv_train_adv_eval = fgsm_acc # Evaluate the accuracy of the MNIST model on BIM examples bim_params = { 'eps': float(8 / 255) * 2., 'eps_iter': float(2 / 255) * 2., 'clip_min': -1., 'clip_max': 1. } bim_att = BasicIterativeMethod(zero_knowledge_gandef_model, sess=sess) bim_adv = bim_att.generate(x, **bim_params) preds_bim_adv = zero_knowledge_gandef_model(bim_adv) eval_params = { 'batch_size': 128, 'use_dic': False, 'is_clean': False, 'reject_threshold': 0.5 } bim_acc = confident_model_eval(sess, x, y_soft, preds_bim_adv, X_test, Y_test, args=eval_params) print('Test accuracy on BIM test examples: %0.4f\n' % bim_acc) report.adv_train_adv_eval = bim_acc # Evaluate the accuracy of the MNIST model on PGD examples pgd_params = { 'eps': float(8 / 255) * 2., 'eps_iter': float(2 / 255) * 2., 'nb_iter': 7, 'clip_min': -1., 'clip_max': 1., 'rand_init': True } pgd_att = MadryEtAl(zero_knowledge_gandef_model, sess=sess) pgd_adv = pgd_att.generate(x, **bim_params) preds_pgd_adv = zero_knowledge_gandef_model(pgd_adv) eval_params = { 'batch_size': 128, 'use_dic': False, 'is_clean': False, 'reject_threshold': 0.5 } pgd_acc = confident_model_eval(sess, x, y_soft, preds_pgd_adv, X_test, Y_test, args=eval_params) print('Test accuracy on PGD test examples: %0.4f\n' % pgd_acc) report.adv_train_adv_eval = pgd_acc # Save model if save: model_path = "models/zero_knowledge_gandef" vars_to_save = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model_zero_knowledge_gandef*') assert len(vars_to_save) > 0 saver = tf.train.Saver(var_list=vars_to_save) saver.save(sess, model_path) print('Model saved\n') else: print('Model not saved\n')
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64, num_threads=None): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) nb_classes = 10 source_samples = 10 img_rows = 28 img_cols = 28 channels = 1 model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': FLAGS.nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } fgsm_params = {'eps': FLAGS.fgsm_eps, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) prune_factor = FLAGS.prune_factor eval_par = {'batch_size': batch_size} if clean_train: prune_percent = { 'conv1_w': 5, 'conv2_w': 5, 'conv3_w': 5, 'conv4_w': 5, 'fc1_w': prune_factor, 'fc2_w': prune_factor, 'fc3_w': prune_factor } model = make_basic_cnn(nb_filters=nb_filters, prune_percent=prune_percent) initialize_uninitialized_global_variables(sess) preds = model.get_probs(x) saver = tf.train.Saver() def fgsm_combo(): acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) print('Test accuracy on legitimate examples: %0.4f\n' % acc) fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print( 'Test accuracy on adversarial examples generated by fgsm: %0.4f\n' % acc) bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x) preds_adv = model.get_probs(adv_x) acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print( 'Test accuracy on adversarial examples generated by IterativeMethod: %0.4f\n' % acc) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) ckpt_name = './mnist_model.ckpt' if not FLAGS.resume: model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) saver.save(sess, ckpt_name) if FLAGS.resume: saver = tf.train.import_meta_graph(ckpt_name + '.meta') print("loading pretrain model") saver.restore(sess, ckpt_name) acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) print('Test accuracy on pretrained model: %0.4f\n' % acc) if not FLAGS.resume: import sys sys.exit() def do_jsma(): print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where( adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape( -1).shape[0] # Display the original and adversarial images side-by-side if FLAGS.viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols)), np.reshape(adv_x, (img_rows, img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format( succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format( percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) if FLAGS.viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report def do_cw(): nb_adv_per_sample = str(nb_classes - 1) if FLAGS.targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if FLAGS.viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if FLAGS.targeted: if FLAGS.viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in X_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in X_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, 1)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if FLAGS.viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = X_test[idxs] else: adv_inputs = X_test[:source_samples] adv_ys = None yname = "y" cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': FLAGS.attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if FLAGS.targeted else source_samples, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) if FLAGS.targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_par) else: if FLAGS.viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ idxs], args=eval_par) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, Y_test[ :source_samples], args=eval_par) if FLAGS.viz_enabled: for j in range(nb_classes): if FLAGS.targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format( adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format( percent_perturbed)) # Close TF session # sess.close() # Finally, block & display a grid of all the adversarial examples if FLAGS.viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report print("before pruning and gradient inhibition\n") fgsm_combo() do_cw() do_jsma() preds = model.get_probs(x) loss = model_loss(y, preds) if not FLAGS.load_pruned_model: print("start iterative pruning") for i in range(FLAGS.prune_iterations): print("iterative %d" % (i)) start = time.time() dict_nzidx = model.apply_prune(sess) trainer = tf.train.AdamOptimizer(learning_rate) grads = trainer.compute_gradients(loss) grads = model.apply_prune_on_grads(grads, dict_nzidx) end = time.time() print('until grad compute elpased %f' % (end - start)) prune_args = {'trainer': trainer, 'grads': grads} train_params = { 'nb_epochs': FLAGS.retrain_epoch, 'batch_size': batch_size, 'learning_rate': FLAGS.retrain_lr } start = time.time() model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng, prune_args=prune_args, retrainindex=i) end = time.time() print('model_train function takes %f' % (end - start)) eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) saver.save(sess, './pruned_mnist_model.ckpt') else: print("loading pruned model") saver = tf.train.import_meta_graph( './pruned_mnist_model.ckpt.meta') saver.restore(sess, './pruned_mnist_model.ckpt') print("before applying gradient inhibition") fgsm_combo() print("before gradient inhibition, doing c&w") do_cw() do_jsma() if FLAGS.do_inhibition: model.inhibition(sess, original_method=FLAGS.use_inhibition_original, inhibition_eps=FLAGS.inhibition_eps) fgsm_combo() do_cw() do_jsma()
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, train_dir="/tmp", filename="mnist.ckpt", load_model=False, testing=False): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data # X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, # train_end=train_end, # test_start=test_start, # test_end=test_end) # # Use label smoothing # assert Y_train.shape[1] == 10 # label_smooth = .1 # Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # # Define input TF placeholder # x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) # y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = cnn_model() print(type(model)) print(model) # preds = model(x) # print("Defined TensorFlow model graph.") # def evaluate(): # # Evaluate the accuracy of the MNIST model on legitimate test examples # eval_params = {'batch_size': batch_size} # acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) # report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape # print('Test accuracy on legitimate examples: %0.4f' % acc) # # Train an MNIST model # train_params = { # 'nb_epochs': nb_epochs, # 'batch_size': batch_size, # 'learning_rate': learning_rate, # 'train_dir': train_dir, # 'filename': filename # } # ckpt = tf.train.get_checkpoint_state(train_dir) # ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path # rng = np.random.RandomState([2017, 8, 30]) # if load_model and ckpt_path: # saver = tf.train.Saver() # saver.restore(sess, ckpt_path) # print("Model loaded from: {}".format(ckpt_path)) # evaluate() # else: # print("Model was not loaded, training from scratch.") # model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, # args=train_params, save=True, rng=rng) # # Calculate training error # if testing: # eval_params = {'batch_size': batch_size} # acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params) # report.train_clean_train_clean_eval = acc # # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph wrap = KerasModelWrapper(model) print(wrap) print(type(wrap))
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR, filename=FILENAME, load_model=LOAD_MODEL, testing=False, label_smoothing=0.1, adversarial_training = ADVERSARIAL_TRAINING, save_model=SAVE_MODEL): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session os.environ["CUDA_VISIBLE_DEVICES"] = '0' # only use No.0 GPU config = tf.ConfigProto() config.allow_soft_placement=True config.gpu_options.allow_growth = True sess = tf.Session(config=config) keras.backend.set_session(sess) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = fc_modelC(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = CrossEntropy(wrap, smoothing=label_smoothing) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng) if save_model: saver = tf.train.Saver(max_to_keep=1) saver.save(sess, '{}/mnist.ckpt'.format(train_dir), global_step=NB_EPOCHS) print("model has been saved") # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.} print(fgsm_params) adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} start_time = time.time() acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f' % acc) end_time = time.time() print("FGSM attack time is {}\n".format(end_time-start_time)) report.clean_train_adv_eval = acc gc.collect()
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR, filename=FILENAME, load_model=LOAD_MODEL, testing=True, label_smoothing=0.1): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ tf.keras.backend.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if keras.backend.image_data_format() != 'channels_last': raise NotImplementedError("this tutorial requires keras to be configured to channels_last format") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() eval_params = {'batch_size': batch_size} def eval(xx): acc = model_eval(sess, x, y, preds, xx, y_test, args=eval_params) print(acc) eval(x_test) model = load_modell(str("./autoencoder_fg.h5")) x_test_decoded = model.predict(x_test) eval(x_test_decoded) _, fg_test = pickle.load(open("/data/mnist/fg.pkl")) fg_test = fg_test.astype('float32') / 255. fg_test = np.mean(fg_test, axis=3) fg_test = np.expand_dims(fg_test, axis=3) fg_test_decoded = model.predict(fg_test) eval(fg_test) eval(fg_test_decoded)
), 'svhn': ( '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' ) } # this is the name of the scope of the author(s) Resnet34 graph. If the user wants to just load our network parameters # and maybe later even use our scores.npy outputs (it takes a long time to compute yourself...), he/she must use # these strings. Otherwise, any string is OK. We provide here as default the scope names we used. ARCH_NAME = {'cifar10': 'model1', 'cifar100': 'model_cifar_100', 'svhn': 'model_svhn'} weight_decay = 0.0004 LABEL_SMOOTHING = {'cifar10': 0.1, 'cifar100': 0.01, 'svhn': 0.1} # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility superseed = 123456789 rand_gen = np.random.RandomState(superseed) tf.set_random_seed(superseed) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session config_args = dict(allow_soft_placement=True) sess = tf.Session(config=tf.ConfigProto(**config_args)) # get records from training if FLAGS.checkpoint_dir != '':
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
# tf.app.run() train_start=0 train_end=60000 test_start=0 test_end=10000 nb_epochs=6 batch_size=128 learning_rate=0.001 train_dir="/tmp" filename="mnist.ckpt" testing=False # keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session()
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=NB_FILTERS, num_threads=None, attack_string=None): """ MNIST cleverhans tutorial :param train_start: index of first training set example. :param train_end: index of last training set example. :param test_start: index of first test set example. :param test_end: index of last test set example. :param nb_epochs: number of epochs to train model. :param batch_size: size of training batches. :param learning_rate: learning rate for training. :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate. :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param nb_filters: number of filters in the CNN used for training. :param num_threads: number of threads used for running the process. :param attack_string: attack name for crafting adversarial attacks and adversarial training, in string format. :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Train an MNIST model model_path = "models/mnist" train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } # Initialize the attack object attack_class = attack_selection(attack_string) attack_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2018, 6, 18]) if clean_train: model = ModelBasicCNNTFE(nb_filters=nb_filters) def evaluate_clean(): """Evaluate the accuracy of the MNIST model on legitimate test examples """ eval_params = {'batch_size': batch_size} acc = model_eval(model, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) train(model, X_train, Y_train, evaluate=evaluate_clean, args=train_params, rng=rng, var_list=model.get_params()) if testing: # Calculate training error eval_params = {'batch_size': batch_size} acc = model_eval(model, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} attack = attack_class(model) acc = model_eval(model, X_test, Y_test, args=eval_par, attack=attack, attack_args=attack_params) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(model, X_train, Y_train, args=eval_par, attack=attack, attack_args=attack_params) print('Train accuracy on adversarial examples: %0.4f\n' % acc) report.train_clean_train_adv_eval = acc # Clear the previous Variables for var in model.get_params(): var = None attack = None print("Repeating the process, using adversarial training") model_adv_train = ModelBasicCNNTFE(nb_filters=nb_filters) attack = attack_class(model_adv_train) def evaluate_adv(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(model_adv_train, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(model_adv_train, X_test, Y_test, args=eval_params, attack=attack, attack_args=attack_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training train(model_adv_train, X_train, Y_train, evaluate=evaluate_adv, args=train_params, rng=rng, var_list=model_adv_train.get_params(), attack=attack, attack_args=attack_params) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(model_adv_train, X_train, Y_train, args=eval_params, attack=None, attack_args=None) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(model_adv_train, X_train, Y_train, args=eval_params, attack=attack, attack_args=attack_params) report.train_adv_train_adv_eval = accuracy return report
def mnist_tutorial( train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1, ): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data mnist = MNIST( train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, ) x_train, y_train = mnist.get_set("train") x_test, y_test = mnist.get_set("test") # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { "nb_epochs": nb_epochs, "batch_size": batch_size, "learning_rate": learning_rate, } eval_params = {"batch_size": batch_size} fgsm_params = {"eps": 0.3, "clip_min": 0.0, "clip_max": 1.0} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): """ Run the evaluation and print the results. """ acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = "adversarial" else: report_text = "legitimate" if report_text: print("Test accuracy on %s examples: %0.4f" % (report_text, acc)) if clean_train: model = make_basic_picklable_cnn() # Tag the model so that when it is saved to disk, future scripts will # be able to tell what data it was trained on model.dataset_factory = mnist.get_factory() preds = model.get_logits(x) assert len(model.get_params()) > 0 loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): """ Run evaluation for the naively trained model on clean examples. """ do_eval(preds, x_test, y_test, "clean_train_clean_eval", False) train( sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params(), ) with sess.as_default(): save("clean_model.joblib", model) print("Now that the model has been saved, you can evaluate it in a" " separate process using `evaluate_pickled_model.py`. " "You should get exactly the same result for both clean and " "adversarial accuracy as you get within this program.") # Calculate training error if testing: do_eval(preds, x_train, y_train, "train_clean_train_clean_eval") # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, "clean_train_adv_eval", True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, "train_clean_train_adv_eval") print("Repeating the process, using adversarial training") # Create a new model and train it to be robust to FastGradientMethod model2 = make_basic_picklable_cnn() # Tag the model so that when it is saved to disk, future scripts will # be able to tell what data it was trained on model2.dataset_factory = mnist.get_factory() fgsm2 = FastGradientMethod(model2, sess=sess) def attack(x): """Return an adversarial example near clean example `x`""" return fgsm2.generate(x, **fgsm_params) loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate_adv(): """ Evaluate the adversarially trained model. """ # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, "adv_train_clean_eval", False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, "adv_train_adv_eval", True) # Perform and evaluate adversarial training train( sess, loss2, x_train, y_train, evaluate=evaluate_adv, args=train_params, rng=rng, var_list=model2.get_params(), ) with sess.as_default(): save("adv_model.joblib", model2) print( "Now that the model has been saved, you can evaluate it in a " "separate process using " "`python evaluate_pickled_model.py adv_model.joblib`. " "You should get exactly the same result for both clean and " "adversarial accuracy as you get within this program." " You can also move beyond the tutorials directory and run the " " real `compute_accuracy.py` script (make sure cleverhans/scripts " "is in your PATH) to see that this FGSM-trained " "model is actually not very robust---it's just a model that trains " " quickly so the tutorial does not take a long time") # Calculate training errors if testing: do_eval(preds2, x_train, y_train, "train_adv_train_clean_eval") do_eval(preds2_adv, x_train, y_train, "train_adv_train_adv_eval") return report
def mnist_tutorial(nb_epochs=6, batch_size=128, train_end=-1, test_end=-1, learning_rate=0.001): """ MNIST cleverhans tutorial :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Train a pytorch MNIST model torch_model = PytorchMnistModel() if torch.cuda.is_available(): torch_model = torch_model.cuda() report = AccuracyReport() train_loader = torch.utils.data.DataLoader( datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()), batch_size=batch_size, shuffle=True) test_loader = torch.utils.data.DataLoader( datasets.MNIST('data', train=False, transform=transforms.ToTensor()), batch_size=batch_size) # Truncate the datasets so that our test run more quickly train_loader.dataset.train_data = train_loader.dataset.train_data[ :train_end] test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end] # Train our model optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate) train_loss = [] total = 0 correct = 0 step = 0 for epoch in range(nb_epochs): for xs, ys in train_loader: xs, ys = Variable(xs), Variable(ys) if torch.cuda.is_available(): xs, ys = xs.cuda(), ys.cuda() optimizer.zero_grad() preds = torch_model(xs) loss = F.nll_loss(preds, ys) loss.backward() # calc gradients train_loss.append(loss.data.item()) optimizer.step() # update gradients preds_np = preds.data.cpu().numpy() correct += (np.argmax(preds_np, axis=1) == ys).sum() total += len(xs) step += 1 if total % 1000 == 0: acc = float(correct) / total print('[%s] Training accuracy: %.2f%%' % (step, acc * 100)) total = 0 correct = 0 # Evaluate on clean data total = 0 correct = 0 for xs, ys in test_loader: xs, ys = Variable(xs), Variable(ys) if torch.cuda.is_available(): xs, ys = xs.cuda(), ys.cuda() preds = torch_model(xs) preds_np = preds.data.cpu().numpy() correct += (np.argmax(preds_np, axis=1) == ys).sum() total += len(xs) acc = float(correct) / total report.clean_train_clean_eval = acc print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100)) # We use tf for evaluation on adversarial data sess = tf.Session() x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,)) # Convert pytorch model to a tf_model and wrap it in cleverhans tf_model_fn = convert_pytorch_model_to_tf(torch_model) cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits') # Create an FGSM attack fgsm_op = FastGradientMethod(cleverhans_model, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x_op = fgsm_op.generate(x_op, **fgsm_params) adv_preds_op = tf_model_fn(adv_x_op) # Run an evaluation of our model against fgsm total = 0 correct = 0 for xs, ys in test_loader: adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs}) correct += (np.argmax(adv_preds, axis=1) == ys).sum() total += len(xs) acc = float(correct) / total print('Adv accuracy: {:.3f}'.format(acc * 100)) report.clean_train_adv_eval = acc return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, train_dir="train_dir", filename="mnist.ckpt", load_model=False, testing=False, label_smoothing=True): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] if label_smoothing: label_smooth = .1 y_train = y_train.clip(label_smooth / (nb_classes-1), 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = LossCrossEntropy(wrap, smoothing=0.1) train(sess, loss, x, y, x_train, y_train, evaluate=evaluate, args=train_params, save=True, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculating train error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_train, y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) wrap_2 = KerasModelWrapper(model_2) preds_2 = model_2(x) fgsm2 = FastGradientMethod(wrap_2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) preds_2_adv = model_2(attack(x)) loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_test, y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, x_test, y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2, args=train_params, save=False, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_train, y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, x_train, y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def mnist_ae(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, num_threads=None, label_smoothing=0.1): report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) rng = np.random.RandomState() source_samples = 10 # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get CIFAR10 data data = CIFAR10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) dataset_size = data.x_train.shape[0] dataset_train = data.to_tensorflow()[0] dataset_train = dataset_train.map( lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) dataset_train = dataset_train.batch(batch_size) dataset_train = dataset_train.prefetch(16) x_train, y_train = data.get_set('train') x_test, y_test = data.get_set('test') nb_latent_size = 100 # Get MNIST test data # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] print("img_Rows, img_cols, nchannels: ", img_rows, img_cols, nchannels) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) x_t = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) y_t = tf.placeholder( tf.float32, shape=(None, nb_classes)) #z = tf.placeholder(tf.float32, shape = (None, nb_latent_size)) #z_t = tf.placeholder(tf.float32, shape = (None, nb_latent_size)) ''' save_dir= 'models' model_name = 'cifar10_AE.h5' model_path_ae = os.path.join(save_dir, model_name) ''' #model_ae= ae_model(x, img_rows=img_rows, img_cols=img_cols, # channels=nchannels) #recon = model_ae(x) #print("recon: ",recon) wrap_ae = ModelVAE('wrap_ae') recon = wrap_ae.get_layer(x,'RECON') print("Defined TensorFlow model graph.") def evaluate_ae(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': 128} noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(sess, x, x_t,recon, x_train, x_train, args=eval_params) print("reconstruction distance: ", d1) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': train_dir_ae, #'filename': filename } rng = np.random.RandomState([2017, 8, 30]) #if not os.path.exists(train_dir_ae): # os.mkdir(train_dir_ae) #ckpt = tf.train.get_checkpoint_state(train_dir_ae) #print(train_dir_ae, ckpt) #ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path if clean_train_vae==True: print("Training VAE") loss = vae_loss(wrap_ae) train_ae(sess, loss, x_train, x_train, tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), evaluate=evaluate_ae, args=train_params, rng=rng, var_list = wrap_ae.get_params()) saver = tf.train.Saver() saver.save(sess, "train_dir/model_vae_fgsm.ckpt") print("saved model") else: print("Loading VAE") saver = tf.train.Saver() #print(ckpt_path) saver.restore(sess, "train_dir/model_vae.ckpt") evaluate_ae() if(train_further): train_params = { 'nb_epochs': 10, 'batch_size': batch_size, 'learning_rate': 0.0002, } #training with the saved model as starting point loss = SquaredError(wrap_ae) train_ae(sess, loss, x_train, x_train, optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), evaluate=evaluate_ae, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_vae_fgsm.ckpt") evaluate_ae() print("Model loaded and trained for more epochs") num_classes = 10 ''' save_dir= 'models' model_name = 'cifar10_CNN.h5' model_path_cls = os.path.join(save_dir, model_name) ''' cl_model = cnn_cl_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds_cl = cl_model(x) def do_eval_cls(preds, x_set, y_set, x_tar_set,report_key, is_adv = None): acc = model_eval(sess, x, y, preds, x_t, x_set, y_set, x_tar_set, args=eval_params_cls) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_cl,x_t, x_test, y_test, x_test,args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) train_params = { 'nb_epochs': 100, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': train_dir_cl, #'filename': filename } rng = np.random.RandomState([2017, 8, 30]) wrap_cl = KerasModelWrapper(cl_model) if clean_train_cl == True: train_params = { 'nb_epochs': 5, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': train_dir_cl, #'filename': filename } print("Training CNN Classifier") ''' datagen = ImageDataGenerator( rotation_range=15, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, ) datagen.fit(x_train) ''' loss_cl = CrossEntropy(wrap_cl, smoothing=label_smoothing) #for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size = 128): # train(sess, loss_cl, x_batch, y_batch, tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), evaluate=evaluate, # args=train_params, rng=rng) train(sess, loss_cl, x_train, y_train, evaluate=evaluate, optimizer = tf.train.RMSPropOptimizer(learning_rate = 0.0001, decay = 1e-6), args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_cnn_cl.ckpt") print("saved model at ", "train_dir/model_cnn_cl_fgsm.ckpt") else: print("Loading CNN Classifier") saver = tf.train.Saver() #print(ckpt_path) saver.restore(sess, "train_dir/model_cnn_cl.ckpt") evaluate() if(train_further): train_params = { 'nb_epochs': 10, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir_cl, 'filename': filename } loss_cl = CrossEntropy(wrap_cl, smoothing=label_smoothing) train(sess, loss_cl, x_train, y_train, evaluate=evaluate, optimizer = tf.train.RMSPropOptimizer(learning_rate = 0.0001, decay = 1e-6), args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_cl_fgsm.ckpt") print("Model loaded and trained further") evaluate() ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object #cw = CarliniWagnerAE(wrap_ae,wrap_cl, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') grid_viz_data_1 = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * (nb_classes-1) for instance in x_test[idxs]], dtype=np.float32) #adv_input_y = np.array([[instance]*(nb_classes-1) for instance in y_test[idxs]]) adv_input_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes-1): targ.append(y_test[idxs[curr_num]]) adv_input_y.append(targ) adv_input_y = np.array(adv_input_y) adv_target_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if(id!=curr_num): targ.append(y_test[idxs[id]]) adv_target_y.append(targ) adv_target_y = np.array(adv_target_y) #print("adv_input_y: \n", adv_input_y) #print("adv_target_y: \n", adv_target_y) adv_input_targets = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if(id!=curr_num): targ.append(x_test[idxs[id]]) adv_input_targets.append(targ) adv_input_targets = np.array(adv_input_targets) adv_inputs = adv_inputs.reshape( (source_samples * (nb_classes-1), img_rows, img_cols, nchannels)) adv_input_targets = adv_input_targets.reshape( (source_samples * (nb_classes-1), img_rows, img_cols, nchannels)) adv_input_y = adv_input_y.reshape(source_samples*(nb_classes-1), 10) adv_target_y = adv_target_y.reshape(source_samples*(nb_classes-1), 10) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" fgsm_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1. } fgsm = FastGradientMethodAe(wrap_ae, sess=sess) adv = fgsm.generate(x,x_t, **fgsm_params) adv = sess.run(adv, {x: adv_inputs, x_t: adv_input_targets}) recon_orig = wrap_ae.get_layer(x, 'RECON') recon_orig = sess.run(recon_orig, feed_dict = {x: adv_inputs}) recon_adv = wrap_ae.get_layer(x, 'RECON') recon_adv = sess.run(recon_adv, feed_dict = {x: adv}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x:recon_adv}) #scores1 = cl_model.evaluate(recon_adv, adv_input_y, verbose=1) #scores2 = cl_model.evaluate(recon_adv, adv_target_y, verbose = 1) #acc_1 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) #acc_2 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_input_y, adv_input_targets, args=eval_params_cls) shape = np.shape(adv_inputs) noise = np.sum(np.square(adv-adv_inputs))/(np.shape(adv)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Finally, block & display a grid of all the adversarial examples if viz_enabled: plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig1') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig2') if adversarial_training: print("starting adversarial training") index_shuf = list(range(len(x_train))) x_train_target = x_train[index_shuf] y_train_target = y_train[index_shuf] # Randomly repeat a few training examples each epoch to avoid # having a too-small batch ''' while len(index_shuf) % batch_size != 0: index_shuf.append(rng.randint(len(x_train))) nb_batches = len(index_shuf) // batch_size rng.shuffle(index_shuf) # Shuffling here versus inside the loop doesn't seem to affect # timing very much, but shuffling here makes the code slightly # easier to read ''' print("len of x_train_target and x_train: ", len(x_train_target), len(x_train)) for ind in range (0, len(x_train)): r_ind = -1 while(np.argmax(y_train_target[ind])==np.argmax(y_train[ind])): r_ind = rng.randint(0,len(x_train)) y_train_target[ind] = y_train[r_ind] if r_ind>-1: x_train_target[ind] = x_train[r_ind] wrap_ae2 = ModelVAE('wrap_ae2') fgsm2 = FastGradientMethodAe(wrap_ae2, sess=sess) adv2 = fgsm.generate(x,x_t, **fgsm_params) adv_set = sess.run(adv2, {x: x_train, x_t: x_train_target}) x_train_aim = np.append(x_train, x_train, axis = 0) x_train_app = np.append(x_train, adv_set, axis = 0) loss2 = vae_loss(wrap_ae2) train_params = { 'nb_epochs': 5, 'batch_size': batch_size, 'learning_rate': learning_rate} train_ae(sess, loss2, x_train_app, x_train_aim, tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), args=train_params, rng=rng, var_list = wrap_ae2.get_params()) evaluate_ae() adv3 = fgsm2.generate(x, x_t, **fgsm_params) adv3 = sess.run(adv3, {x: adv_inputs, x_t: adv_input_targets}) recon_orig2 = wrap_ae2.get_layer(x, 'RECON') recon_orig2 = sess.run(recon_orig2, feed_dict = {x: adv_inputs}) recon_adv2 = wrap_ae2.get_layer(x, 'RECON') recon_adv2 = sess.run(recon_adv2, feed_dict = {x: adv3}) pred_adv_recon2 = wrap_cl.get_logits(x) pred_adv_recon2 = sess.run(pred_adv_recon2, {x:recon_adv2}) shape = np.shape(adv_inputs) noise = np.sum(np.square(adv3-adv_inputs))/(np.shape(adv3)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv2-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv2-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon2, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon2, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig2[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv2[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv3[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv2[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv3[i*(nb_classes-1)+j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Finally, block & display a grid of all the adversarial examples if viz_enabled: plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fgsm_adv_fig1') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fgsm_adv_fig2') #return report if binarization: print("----------------") print("BINARIZATION") adv[adv>0.5] = 1.0 adv[adv<=0.5] = 0.0 recon_orig = wrap_ae.get_layer(x, 'RECON') recon_adv = wrap_ae.get_layer(x, 'RECON') #pred_adv = wrap_cl.get_logits(x) recon_orig = sess.run(recon_orig, {x: adv_inputs}) recon_adv = sess.run(recon_adv, {x: adv}) #pred_adv = sess.run(pred_adv, {x: recon_adv}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x:recon_adv}) eval_params = {'batch_size': 90} if targeted: noise = np.sum(np.square(adv-adv_inputs))/(np.shape(adv)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy* num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig1_bin') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data_1[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig2_bin') if(mean_filtering ==True): print("----------------") print("MEAN FILTERING") adv = uniform_filter(adv, 2) recon_orig = wrap_ae.get_layer(x, 'RECON') recon_adv = wrap_ae.get_layer(x, 'RECON') pred_adv_recon = wrap_cl.get_logits(x) recon_orig = sess.run(recon_orig, {x: adv_inputs}) recon_adv = sess.run(recon_adv, {x: adv}) pred_adv_recon = sess.run(pred_adv_recon, {x: recon_adv}) eval_params = {'batch_size': 90} noise = np.sum(np.square(adv-adv_inputs))/(np.shape(adv)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy* num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig1_mean') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data_1[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig2_mean')
def helper_run_multi_gpu_madryetal(self, extra_flags=None): """ Compare the single GPU performance to multiGPU performance. """ # Run the trainers on a dataset of reduced size flags = { "train_start": 0, "train_end": 5000, "test_start": 0, "test_end": 333, "nb_epochs": 5, "testing": True, } # Run the multi-gpu trainer for adversarial training flags.update( { "batch_size": 128, "adam_lrn": 0.001, "dataset": "mnist", "only_adv_train": False, "eval_iters": 1, "fast_tests": True, "save_dir": None, "save_steps": 10000, "attack_nb_iter_train": 10, "sync_step": None, "adv_train": True, "save": False, "model_type": "basic", "attack_type_test": "MadryEtAl_y", } ) if extra_flags is not None: flags.update(extra_flags) # Run the multi-gpu trainer for adversarial training using 2 gpus # trainer_multigpu by default sets `allow_soft_placement=True` flags.update( {"ngpu": 2, "attack_type_train": "MadryEtAl_y_multigpu", "sync_step": 1} ) HParams = namedtuple("HParams", flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, "runner"): report_dict = run_trainer(hparams) report_m = AccuracyReport() report_m.train_adv_train_clean_eval = report_dict["train"] report_m.adv_train_clean_eval = report_dict["test"] report_m.adv_train_adv_eval = report_dict["MadryEtAl_y"] flags.update({"ngpu": 1, "attack_type_train": "MadryEtAl_y"}) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, "runner"): report_dict = run_trainer(hparams) report_s = AccuracyReport() report_s.train_adv_train_clean_eval = report_dict["train"] report_s.adv_train_clean_eval = report_dict["test"] report_s.adv_train_adv_eval = report_dict["MadryEtAl_y"] self.assertClose( report_s.train_adv_train_clean_eval, report_m.train_adv_train_clean_eval, atol=5e-2, ) self.assertClose( report_s.adv_train_clean_eval, report_m.adv_train_clean_eval, atol=2e-2 ) self.assertClose( report_s.adv_train_adv_eval, report_m.adv_train_adv_eval, atol=5e-2 )
def SNNL_example( train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, nb_filters=NB_FILTERS, SNNL_factor=SNNL_FACTOR, output_dir=OUTPUT_DIR, ): """ A simple model trained to minimize Cross Entropy and Maximize Soft Nearest Neighbor Loss at each internal layer. This outputs a TSNE of the sign of the adversarial gradients of a trained model. A model with a negative SNNL_factor will show little or no class clusters, while a model with a 0 SNNL_factor will have class clusters in the adversarial gradient direction. :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param SNNL_factor: multiplier for Soft Nearest Neighbor Loss :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST data mnist = MNIST( train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end, ) x_train, y_train = mnist.get_set("train") x_test, y_test = mnist.get_set("test") # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { "nb_epochs": nb_epochs, "batch_size": batch_size, "learning_rate": learning_rate, } eval_params = {"batch_size": batch_size} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) print("Test accuracy on legitimate examples: %0.4f" % (acc)) model = ModelBasicCNN("model", nb_classes, nb_filters) preds = model.get_logits(x) cross_entropy_loss = CrossEntropy(model) if not SNNL_factor: loss = cross_entropy_loss else: loss = SNNLCrossEntropy(model, factor=SNNL_factor, optimize_temperature=False) def evaluate(): do_eval(preds, x_test, y_test, "clean_train_clean_eval") train( sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params(), ) do_eval(preds, x_train, y_train, "train_clean_train_clean_eval") def imscatter(points, images, ax=None, zoom=1, cmap="hot"): if ax is None: ax = plt.gca() artists = [] i = 0 if not isinstance(cmap, list): cmap = [cmap] * len(points) for x0, y0 in points: transformed = (images[i] - np.min(images[i])) / ( np.max(images[i]) - np.min(images[i]) ) im = OffsetImage(transformed[:, :, 0], zoom=zoom, cmap=cmap[i]) ab = AnnotationBbox(im, (x0, y0), xycoords="data", frameon=False) artists.append(ax.add_artist(ab)) i += 1 ax.update_datalim(np.column_stack(np.transpose(points))) ax.autoscale() ax.get_xaxis().set_ticks([]) ax.get_yaxis().set_ticks([]) return artists adv_grads = tf.sign(tf.gradients(cross_entropy_loss.fprop(x, y), x)) feed_dict = {x: x_test[:batch_size], y: y_test[:batch_size]} adv_grads_val = sess.run(adv_grads, feed_dict=feed_dict) adv_grads_val = np.reshape(adv_grads_val, (batch_size, img_rows * img_cols)) X_embedded = TSNE(n_components=2, verbose=0).fit_transform(adv_grads_val) plt.figure(num=None, figsize=(50, 50), dpi=40, facecolor="w", edgecolor="k") plt.title( "TSNE of Sign of Adv Gradients, SNNLCrossEntropy Model, factor:" + str(FLAGS.SNNL_factor), fontsize=42, ) imscatter(X_embedded, x_test[:batch_size], zoom=2, cmap="Purples") plt.savefig( output_dir + "adversarial_gradients_SNNL_factor_" + str(SNNL_factor) + ".png" )
def test_run_single_gpu_fgsm(self): """ Test the basic single GPU performance by comparing to the FGSM tutorial. """ from cleverhans_tutorials import mnist_tutorial_tf # Run the MNIST tutorial on a dataset of reduced size flags = { "train_start": 0, "train_end": 5000, "test_start": 0, "test_end": 333, "nb_epochs": 5, "testing": True, } report = mnist_tutorial_tf.mnist_tutorial(**flags) # Run the multi-gpu trainer for clean training flags.update( { "batch_size": 128, "adam_lrn": 0.001, "dataset": "mnist", "only_adv_train": False, "eval_iters": 1, "ngpu": 1, "fast_tests": False, "attack_type_train": "", "save_dir": None, "save_steps": 10000, "attack_nb_iter_train": None, "save": False, "model_type": "basic", "attack_type_test": "FGSM", } ) flags.update({"adv_train": False}) HParams = namedtuple("HParams", flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, "runner"): report_dict = run_trainer(hparams) report_2 = AccuracyReport() report_2.train_clean_train_clean_eval = report_dict["train"] report_2.clean_train_clean_eval = report_dict["test"] report_2.clean_train_adv_eval = report_dict["FGSM"] # Run the multi-gpu trainer for adversarial training flags.update({"adv_train": True, "attack_type_train": "FGSM"}) HParams = namedtuple("HParams", flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, "runner"): report_dict = run_trainer(hparams) report_2.train_adv_train_clean_eval = report_dict["train"] report_2.adv_train_clean_eval = report_dict["test"] report_2.adv_train_adv_eval = report_dict["FGSM"] self.assertClose( report.train_clean_train_clean_eval, report_2.train_clean_train_clean_eval, atol=5e-2, ) self.assertClose( report.clean_train_clean_eval, report_2.clean_train_clean_eval, atol=2e-2 ) self.assertClose( report.clean_train_adv_eval, report_2.clean_train_adv_eval, atol=5e-2 ) self.assertClose( report.train_adv_train_clean_eval, report_2.train_adv_train_clean_eval, atol=1e-1, ) self.assertClose( report.adv_train_clean_eval, report_2.adv_train_clean_eval, atol=2e-2 ) self.assertClose( report.adv_train_adv_eval, report_2.adv_train_adv_eval, atol=1e-1 )
def test_run_single_gpu_fgsm(self): """ Test the basic single GPU performance by comparing to the FGSM tutorial. """ from cleverhans_tutorials import mnist_tutorial_tf # Run the MNIST tutorial on a dataset of reduced size flags = {'train_start': 0, 'train_end': 5000, 'test_start': 0, 'test_end': 333, 'nb_epochs': 5, 'testing': True} report = mnist_tutorial_tf.mnist_tutorial(**flags) # Run the multi-gpu trainer for clean training flags.update({'batch_size': 128, 'adam_lrn': 0.001, 'dataset': 'mnist', 'only_adv_train': False, 'eval_iters': 1, 'ngpu': 1, 'fast_tests': False, 'attack_type_train': '', 'save_dir': None, 'save_steps': 10000, 'attack_nb_iter_train': None, 'save': False, 'model_type': 'basic', 'attack_type_test': 'FGSM'}) flags.update({'adv_train': False}) HParams = namedtuple('HParams', flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_2 = AccuracyReport() report_2.train_clean_train_clean_eval = report_dict['train'] report_2.clean_train_clean_eval = report_dict['test'] report_2.clean_train_adv_eval = report_dict['FGSM'] # Run the multi-gpu trainer for adversarial training flags.update({'adv_train': True, 'attack_type_train': 'FGSM', }) HParams = namedtuple('HParams', flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_2.train_adv_train_clean_eval = report_dict['train'] report_2.adv_train_clean_eval = report_dict['test'] report_2.adv_train_adv_eval = report_dict['FGSM'] self.assertClose(report.train_clean_train_clean_eval, report_2.train_clean_train_clean_eval, atol=5e-2) self.assertClose(report.clean_train_clean_eval, report_2.clean_train_clean_eval, atol=2e-2) self.assertClose(report.clean_train_adv_eval, report_2.clean_train_adv_eval, atol=5e-2) self.assertClose(report.train_adv_train_clean_eval, report_2.train_adv_train_clean_eval, atol=1e-1) self.assertClose(report.adv_train_clean_eval, report_2.adv_train_clean_eval, atol=2e-2) self.assertClose(report.adv_train_adv_eval, report_2.adv_train_adv_eval, atol=1e-1)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10. label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_cnn() preds = model.get_probs(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params) # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(model, sess=sess) fgsm_params = {'eps': 0.3} adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = make_basic_cnn() preds_2 = model_2(x) fgsm2 = FastGradientMethod(model_2, sess=sess) preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params)) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params) return report
from cleverhans.utils_keras import KerasModelWrapper from cleverhans.utils_tf import model_eval from lib.utils import load_gtsrb from parameters import * from small_net import * from stn.conv_model import conv_model_no_color_adjust import logging import os os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = "1" keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if K.image_dim_ordering() != 'tf': K.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() K.set_session(sess) set_log_level(logging.DEBUG)
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, source_samples=10, learning_rate=0.001, attack_iterations=100, model_path=os.path.join("models", "mnist"), targeted=True): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': os.path.join(*os.path.split(model_path)[:-1]), 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x, y, x_train, y_train, args=train_params, save=os.path.exists("models"), rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, back='tf', sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" cw_params = {'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': 0.1, 'batch_size': source_samples * nb_classes if targeted else source_samples, 'initial_const': 10} adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval( sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ idxs], args=eval_params) else: adv_accuracy = 1 - \ model_eval(sess, x, y, preds, adv, y_test[ :source_samples], args=eval_params) if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt _ = grid_visual(grid_viz_data) return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR, filename=FILENAME, load_model=LOAD_MODEL, testing=True, label_smoothing=0.1): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ tf.keras.backend.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if keras.backend.image_data_format() != 'channels_last': raise NotImplementedError("this tutorial requires keras to be configured to channels_last format") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) [x_adv_train, x_adv_test] = pickle.load(open(PKLDATA+'.pkl')) x_train, y_train = mnist.get_set('train') x_train = x_adv_train x_test, y_test = mnist.get_set('test') x_test = x_adv_test # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc print('Test accuracy on adversarial examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = CrossEntropy(wrap, smoothing=label_smoothing) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph filter_x = binary_filter(x) batch = 1000 def convert(x_input): x_output = None for i in tqdm(range(int(len(x_input) / batch))): tmp = sess.run(filter_x, feed_dict={x: x_input[i * batch:(i + 1) * batch]}) if x_output is None: x_output = tmp else: x_output = np.concatenate((x_output, tmp)) return x_output x_filter_test = convert(x_test) x_filter_train = convert(x_train) def evaluate_adv(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_filter_test, y_test, args=eval_params) report.clean_train_clean_eval = acc print('Test accuracy on filtered examples: %0.4f' % acc) evaluate_adv()
def helper_run_multi_gpu_madryetal(self, extra_flags=None): """ Compare the single GPU performance to multiGPU performance. """ # Run the trainers on a dataset of reduced size flags = { 'train_start': 0, 'train_end': 5000, 'test_start': 0, 'test_end': 333, 'nb_epochs': 5, 'testing': True } # Run the multi-gpu trainer for adversarial training flags.update({ 'batch_size': 128, 'adam_lrn': 0.001, 'dataset': 'mnist', 'only_adv_train': False, 'eval_iters': 1, 'fast_tests': True, 'save_dir': None, 'save_steps': 10000, 'attack_nb_iter_train': 10, 'sync_step': None, 'adv_train': True, 'save': False, 'model_type': 'basic', 'attack_type_test': 'MadryEtAl_y' }) if extra_flags is not None: flags.update(extra_flags) # Run the multi-gpu trainer for adversarial training using 2 gpus # trainer_multigpu by default sets `allow_soft_placement=True` flags.update({ 'ngpu': 2, 'attack_type_train': 'MadryEtAl_y_multigpu', 'sync_step': 1 }) HParams = namedtuple('HParams', flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.compat.v1.set_random_seed(42) with tf.compat.v1.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_m = AccuracyReport() report_m.train_adv_train_clean_eval = report_dict['train'] report_m.adv_train_clean_eval = report_dict['test'] report_m.adv_train_adv_eval = report_dict['MadryEtAl_y'] flags.update({'ngpu': 1, 'attack_type_train': 'MadryEtAl_y'}) hparams = HParams(**flags) np.random.seed(42) tf.compat.v1.set_random_seed(42) with tf.compat.v1.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_s = AccuracyReport() report_s.train_adv_train_clean_eval = report_dict['train'] report_s.adv_train_clean_eval = report_dict['test'] report_s.adv_train_adv_eval = report_dict['MadryEtAl_y'] self.assertClose(report_s.train_adv_train_clean_eval, report_m.train_adv_train_clean_eval, atol=5e-2) self.assertClose(report_s.adv_train_clean_eval, report_m.adv_train_clean_eval, atol=2e-2) self.assertClose(report_s.adv_train_adv_eval, report_m.adv_train_adv_eval, atol=5e-2)
def cifar10_cw_recon(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, model_path_cls=MODEL_PATH, targeted=TARGETED, num_threads=None, label_smoothing=0.1, nb_filters=NB_FILTERS, filename=FILENAME, train_dir_ae=TRAIN_DIR_AE, train_dir_cl=TRAIN_DIR_CL): # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) rng = np.random.RandomState() # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get CIFAR10 data data = CIFAR10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) dataset_size = data.x_train.shape[0] dataset_train = data.to_tensorflow()[0] dataset_train = dataset_train.map( lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) dataset_train = dataset_train.batch(batch_size) dataset_train = dataset_train.prefetch(16) x_train, y_train = data.get_set('train') x_test, y_test = data.get_set('test') nb_latent_size = 100 # Get MNIST test data # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] print("img_Rows, img_cols, nchannels: ", img_rows, img_cols, nchannels) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) x_t = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) y_t = tf.placeholder(tf.float32, shape=(None, nb_classes)) #model_vae= vae_model(x, img_rows=img_rows, img_cols=img_cols, # channels=nchannels) wrap_vae = ModelVAE('wrap_vae') recon = wrap_vae.get_layer(x, 'RECON') #print("recon: ",recon) print("Defined TensorFlow model graph.") def evaluate_ae(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': 128} noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae( sess, x, x_t, recon, x_train, x_train, args=eval_params) print("reconstruction distance: ", d1) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir_ae, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir_ae): os.mkdir(train_dir_ae) #ckpt = tf.train.get_checkpoint_state(train_dir_ae) #print(train_dir_ae, ckpt) #ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path #wrap_vae = KerasModelWrapper(model_vae) latent_dim = 20 intermediate_dim = 128 #train_ae(sess, global_loss, x_train, x_train, evaluate = evaluate_ae, args = train_params, rng = rng, var_list=wrap_vae.get_params()) if clean_train_vae == True: print("Training VAE") loss = vae_loss(wrap_vae) train_ae(sess, loss, x_train, x_train, evaluate=evaluate_ae, args=train_params, rng=rng, var_list=wrap_vae.get_params()) saver = tf.train.Saver() saver.save(sess, "train_dir/model_vae.ckpt") print("saved model") else: print("Loading VAE") saver = tf.train.Saver() #print(ckpt_path) saver.restore(sess, "train_dir/model_vae.ckpt") evaluate_ae() if (train_further): train_params = { 'nb_epochs': 10, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir_ae, 'filename': filename } #training with the saved model as starting point loss = SquaredError(wrap_vae) train_ae(sess, loss, x_train, x_train, evaluate=evaluate_vae, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_ae_final.ckpt") evaluate_ae() print("Model loaded and trained for more epochs") num_classes = 10 ''' save_dir= 'models' model_name = 'cifar10_CNN.h5' model_path_cls = os.path.join(save_dir, model_name) ''' cl_model = cnn_cl_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds_cl = cl_model(x) def do_eval_cls(preds, x_set, y_set, x_tar_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_t, x_set, y_set, x_tar_set, args=eval_params_cls) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_cl, x_t, x_test, y_test, x_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) train_params = { 'nb_epochs': 3, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir_cl, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir_cl): os.mkdir(train_dir_cl) #ckpt = tf.train.get_checkpoint_state(train_dir_cl) #print(train_dir_cl, ckpt) #ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap_cl = KerasModelWrapper(cl_model) if clean_train_cl == True: print("Training CNN Classifier") loss_cl = CrossEntropy(wrap_cl, smoothing=label_smoothing) train(sess, loss_cl, x_train, y_train, evaluate=evaluate, optimizer=tf.train.RMSPropOptimizer(learning_rate=0.0001, decay=1e-6), args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_cnn_cl_vae.ckpt") print("saved model at ", "train_dir/model_cnn_cl.ckpt") else: print("Loading CNN Classifier") saver = tf.train.Saver() #print(ckpt_path) saver.restore(sess, "train_dir/model_cnn_cl_vae.ckpt") print("Model loaded") evaluate() # Score trained model. ''' scores = cl_model.evaluate(x_test, y_test, verbose=1) print('Test loss:', scores[0]) print('Test accuracy:', scores[1]) cl_model_wrap = KerasModelWrapper(cl_model) ` ''' ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerAE(wrap_vae, wrap_cl, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') grid_viz_data_1 = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * (nb_classes - 1) for instance in x_test[idxs]], dtype=np.float32) #adv_input_y = np.array([[instance]*(nb_classes-1) for instance in y_test[idxs]]) adv_input_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes - 1): targ.append(y_test[idxs[curr_num]]) adv_input_y.append(targ) adv_input_y = np.array(adv_input_y) adv_target_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if (id != curr_num): targ.append(y_test[idxs[id]]) adv_target_y.append(targ) adv_target_y = np.array(adv_target_y) #print("adv_input_y: \n", adv_input_y) #print("adv_target_y: \n", adv_target_y) adv_input_targets = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if (id != curr_num): targ.append(x_test[idxs[id]]) adv_input_targets.append(targ) adv_input_targets = np.array(adv_input_targets) adv_inputs = adv_inputs.reshape((source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_targets = adv_input_targets.reshape( (source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_y = adv_input_y.reshape( source_samples * (nb_classes - 1), 10) adv_target_y = adv_target_y.reshape( source_samples * (nb_classes - 1), 10) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" cw_params_batch_size = source_samples * (nb_classes - 1) cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': CW_LEARNING_RATE, 'batch_size': cw_params_batch_size, 'initial_const': 1 } adv = cw.generate_np(adv_inputs, adv_input_targets, **cw_params) #adv = sess.run(adv) #print("layer names: \n", wrap_vae.get_layer_names()) recon_orig = wrap_vae.get_layer(x, 'RECON') recon_orig = sess.run(recon_orig, feed_dict={x: adv_inputs}) recon_adv = wrap_vae.get_layer(x, 'RECON') recon_adv = sess.run(recon_adv, feed_dict={x: adv}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x: recon_adv}) #scores1 = cl_model.evaluate(recon_adv, adv_input_y, verbose=1) #scores2 = cl_model.evaluate(recon_adv, adv_target_y, verbose = 1) #acc_1 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) #acc_2 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_input_y, adv_input_targets, args=eval_params_cls) shape = np.shape(adv_inputs) noise = np.sum(np.square(adv - adv_inputs)) / (np.shape(adv)[0]) noise = pow(noise, 0.5) d1 = np.sum(np.square(recon_adv - adv_inputs)) / (np.shape(adv_inputs)[0]) d2 = np.sum( np.square(recon_adv - adv_input_targets)) / (np.shape(adv_inputs)[0]) acc_1 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax(adv_target_y, axis=-1)) ) / (np.shape(adv_target_y)[0]) acc_2 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax(adv_input_y, axis=-1)) ) / (np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Finally, block & display a grid of all the adversarial examples if viz_enabled: plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fig1') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fig2') #return report #adversarial training if (adv_train == True): print("starting adversarial training") #sess1 = tf.Session() adv_input_set = [] adv_input_target_set = [] for i in range(20): indices = np.arange(np.shape(x_train)[0]) np.random.shuffle(indices) print("indices: ", indices[1:10]) x_train = x_train[indices] y_train = y_train[indices] idxs = [ np.where(np.argmax(y_train, axis=1) == i)[0][0] for i in range(nb_classes) ] adv_inputs_2 = np.array([[instance] * (nb_classes - 1) for instance in x_train[idxs]], dtype=np.float32) adv_input_targets_2 = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if (id != curr_num): targ.append(x_train[idxs[id]]) adv_input_targets_2.append(targ) adv_input_targets_2 = np.array(adv_input_targets_2) adv_inputs_2 = adv_inputs_2.reshape( (source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_targets_2 = adv_input_targets_2.reshape( (source_samples * (nb_classes - 1), img_rows, img_cols, nchannels)) adv_input_set.append(adv_inputs_2) adv_input_target_set.append(adv_input_targets_2) adv_input_set = np.array(adv_input_set), adv_input_target_set = np.array(adv_input_target_set) print("shape of adv_input_set: ", np.shape(adv_input_set)) print("shape of adv_input_target_set: ", np.shape(adv_input_target_set)) adv_input_set = np.reshape( adv_input_set, (np.shape(adv_input_set)[0] * np.shape(adv_input_set)[1] * np.shape(adv_input_set)[2], np.shape(adv_input_set)[3], np.shape(adv_input_set)[4], np.shape(adv_input_set)[5])) adv_input_target_set = np.reshape(adv_input_target_set, (np.shape(adv_input_target_set)[0] * np.shape(adv_input_target_set)[1], np.shape(adv_input_target_set)[2], np.shape(adv_input_target_set)[3], np.shape(adv_input_target_set)[4])) print("generated adversarial training set") adv_set = cw.generate_np(adv_input_set, adv_input_target_set, **cw_params) x_train_aim = np.append(x_train, adv_input_set, axis=0) x_train_app = np.append(x_train, adv_set, axis=0) #model_name = 'cifar10_AE_adv.h5' #model_path_ae = os.path.join(save_dir, model_name) model_ae_adv = ae_model(x, img_rows=img_rows, img_cols=img_cols, channels=nchannels) recon = model_ae_adv(x) wrap_vae_adv = KerasModelWrapper(model_ae_adv) #print("recon: ",recon) #print("Defined TensorFlow model graph.") print("Training Adversarial AE") loss = SquaredError(wrap_vae_adv) train_ae(sess, loss_2, x_train_app, x_train_aim, evaluate=evaluate_ae, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_ae_adv.ckpt") print("saved model") cw2 = CarliniWagnerAE(wrap_vae_adv, wrap_cl, sess=sess) adv_2 = cw2.generate_np(adv_inputs, adv_input_targets, **cw_params) recon_adv = wrap_vae_adv.get_layer(x, 'RECON') recon_orig = wrap_vae_adv.get_layer(x, 'RECON') recon_adv = sess.run(recon_adv, {x: adv_2}) recon_orig = sess.run(recon_orig, {x: adv_inputs}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x: recon_adv}) if targeted: noise = reduce_sum(tf.square(adv_inputs - adv_2), list(range(1, len(shape)))) print("noise: ", noise) pred_adv_recon = cl_model.get_layer(recon_adv) #scores1 = cl_model.evaluate(recon_adv, adv_input_y, verbose=1) #scores2 = cl_model.eval_params(recon_adv, adv_target_y, verbose = 1) #acc_1 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) #acc_2 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_input_y, adv_input_targets, args=eval_params_cls) noise = np.sum(np.square(adv - adv_inputs)) / (np.shape(adv)[0]) noise = pow(noise, 0.5) d1 = np.sum( np.square(recon_adv - adv_inputs)) / (np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv - adv_input_targets)) / (np.shape(adv_inputs)[0]) acc_1 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax( adv_target_y, axis=-1))) / (np.shape(adv_target_y)[0]) acc_2 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax( adv_input_y, axis=-1))) / (np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv_2[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv_2[i * (nb_classes - 1) + j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv_2 - adv_inputs)**2, axis=(1, 2, 3))**.5) print( 'Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session #sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: #_ = grid_visual(grid_viz_data) #_ = grid_visual(grid_viz_data_1) plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fig1_vae_adv_trained') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fig2_vae_adv_trained') #return report #binarization defense #if(binarization_defense == True or mean_filtering==True): if (binarization_defense == True): print("BINARIZATION") print("---------------------------") adv[adv > 0.5] = 1.0 adv[adv <= 0.5] = 0.0 recon_orig = wrap_vae.get_layer(x, 'RECON') recon_adv = wrap_vae.get_layer(x, 'RECON') #pred_adv = wrap_cl.get_logits(x) recon_orig = sess.run(recon_orig, {x: adv_inputs}) recon_adv = sess.run(recon_adv, {x: adv}) #pred_adv = sess.run(pred_adv, {x: recon_adv}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x: recon_adv}) eval_params = {'batch_size': 90} if targeted: noise = np.sum(np.square(adv - adv_inputs)) / (np.shape(adv)[0]) noise = pow(noise, 0.5) d1 = np.sum( np.square(recon_adv - adv_inputs)) / (np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv - adv_input_targets)) / ( np.shape(adv_inputs)[0]) acc_1 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax( adv_target_y, axis=-1))) / (np.shape(adv_target_y)[0]) acc_2 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax( adv_input_y, axis=-1))) / (np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j] plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fig1_bin') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data_1[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fig2_bin') if (mean_filtering == True): print("MEAN FILTERING") print("---------------------------") adv = uniform_filter(adv, 2) recon_orig = wrap_vae.get_layer(x, 'RECON') recon_adv = wrap_vae.get_layer(x, 'RECON') pred_adv_recon = wrap_cl.get_logits(x) recon_orig = sess.run(recon_orig, {x: adv_inputs}) recon_adv = sess.run(recon_adv, {x: adv}) pred_adv_recon = sess.run(pred_adv_recon, {x: recon_adv}) eval_params = {'batch_size': 90} noise = np.sum(np.square(adv - adv_inputs)) / (np.shape(adv)[0]) noise = pow(noise, 0.5) d1 = np.sum( np.square(recon_adv - adv_inputs)) / (np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv - adv_input_targets)) / (np.shape(adv_inputs)[0]) acc_1 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax( adv_target_y, axis=-1))) / (np.shape(adv_target_y)[0]) acc_2 = (sum( np.argmax(pred_adv_recon, axis=-1) == np.argmax( adv_input_y, axis=-1))) / (np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if (i == j): grid_viz_data[i, j] = recon_orig[curr_class * 9] grid_viz_data_1[i, j] = adv_inputs[curr_class * 9] curr_class = curr_class + 1 else: if (j > i): grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j - 1] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j - 1] else: grid_viz_data[i, j] = recon_adv[i * (nb_classes - 1) + j] grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) + j] plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fig1_mean') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data_1[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fig2_mean')
def test_run_single_gpu_fgsm(self): """ Test the basic single GPU performance by comparing to the FGSM tutorial. """ from cleverhans_tutorials import mnist_tutorial_tf # Run the MNIST tutorial on a dataset of reduced size flags = { 'train_start': 0, 'train_end': 5000, 'test_start': 0, 'test_end': 333, 'nb_epochs': 5, 'testing': True } report = mnist_tutorial_tf.mnist_tutorial(**flags) # Run the multi-gpu trainer for clean training flags.update({ 'batch_size': 128, 'adam_lrn': 0.001, 'dataset': 'mnist', 'only_adv_train': False, 'eval_iters': 1, 'ngpu': 1, 'fast_tests': False, 'attack_type_train': '', 'save_dir': None, 'save_steps': 10000, 'attack_nb_iter_train': None, 'save': False, 'model_type': 'basic', 'attack_type_test': 'FGSM' }) flags.update({'adv_train': False}) HParams = namedtuple('HParams', flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.compat.v1.set_random_seed(42) with tf.compat.v1.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_2 = AccuracyReport() report_2.train_clean_train_clean_eval = report_dict['train'] report_2.clean_train_clean_eval = report_dict['test'] report_2.clean_train_adv_eval = report_dict['FGSM'] # Run the multi-gpu trainer for adversarial training flags.update({'adv_train': True, 'attack_type_train': 'FGSM'}) HParams = namedtuple('HParams', flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.compat.v1.set_random_seed(42) with tf.compat.v1.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_2.train_adv_train_clean_eval = report_dict['train'] report_2.adv_train_clean_eval = report_dict['test'] report_2.adv_train_adv_eval = report_dict['FGSM'] self.assertClose(report.train_clean_train_clean_eval, report_2.train_clean_train_clean_eval, atol=5e-2) self.assertClose(report.clean_train_clean_eval, report_2.clean_train_clean_eval, atol=2e-2) self.assertClose(report.clean_train_adv_eval, report_2.clean_train_adv_eval, atol=5e-2) self.assertClose(report.train_adv_train_clean_eval, report_2.train_adv_train_clean_eval, atol=1e-1) self.assertClose(report.adv_train_clean_eval, report_2.adv_train_clean_eval, atol=2e-2) self.assertClose(report.adv_train_adv_eval, report_2.adv_train_adv_eval, atol=1e-1)
def JSMA_FGSM(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) source_samples = batch_size # Use label smoothing # Hopefully this doesn't screw up JSMA... assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_par = {'batch_size': batch_size} rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) print("#####Starting attacks on clean model#####") ################################################################# #Clean test against JSMA jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x = jsma.generate(x, **jsma_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on JSMA adversarial examples: %0.4f' % acc) ################################################################ #Clean test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on FGSM adversarial examples: %0.4f' % acc) ################################################################ #Clean test against BIM bim_params = { 'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1. } bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x, **bim_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on BIM adversarial examples: %0.4f' % acc) ################################################################ #Clean test against EN en_params = { 'binary_search_steps': 1, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10 } en = ElasticNetMethod(model, back='tf', sess=sess) adv_x = en.generate(x, **en_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on EN adversarial examples: %0.4f' % acc) ################################################################ #Clean test against DF deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 50, 'clip_min': 0., 'clip_max': 1. } deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on DF adversarial examples: %0.4f' % acc) ################################################################ #Clean test against VAT vat_params = { 'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1. } vat = VirtualAdversarialMethod(model, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Clean test accuracy on VAT adversarial examples: %0.4f\n' % acc) ################################################################ print("Repeating the process, using adversarial training\n") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) ################################################################# #Adversarial test against JSMA jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } jsma = SaliencyMapMethod(model, back='tf', sess=sess) adv_x = jsma.generate(x, **jsma_params) preds_adv_jsma = model.get_probs(adv_x) ################################################################ #Adversarial test against FGSM fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv_fgsm = model.get_probs(adv_x) ################################################################ #Adversarial test against BIM bim_params = { 'eps': 0.3, 'eps_iter': 0.01, 'nb_iter': 100, 'clip_min': 0., 'clip_max': 1. } bim = BasicIterativeMethod(model, sess=sess) adv_x = bim.generate(x, **bim_params) preds_adv_bim = model.get_probs(adv_x) ################################################################ #Adversarial test against EN en_params = { 'binary_search_steps': 5, #'y': None, 'max_iterations': 100, 'learning_rate': 0.1, 'batch_size': source_samples, 'initial_const': 10 } en = ElasticNetMethod(model, back='tf', sess=sess) adv_x = en.generate(x, **en_params) preds_adv_en = model.get_probs(adv_x) ################################################################ #Adversarial test against DF deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 200, 'clip_min': 0., 'clip_max': 1. } deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv_df = model.get_probs(adv_x) ################################################################ #Adversarial test against VAT vat_params = { 'eps': 2.0, 'num_iterations': 1, 'xi': 1e-6, 'clip_min': 0., 'clip_max': 1. } vat = VirtualAdversarialMethod(model, sess=sess) adv_x = vat.generate(x, **vat_params) preds_adv_vat = model.get_probs(adv_x) ################################################################ print("#####Evaluate trained model#####") def evaluate_2(): # Evaluate the accuracy of the MNIST model on JSMA adversarial examples acc = model_eval(sess, x, y, preds_adv_jsma, X_test, Y_test, args=eval_par) print('Test accuracy on JSMA adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on FGSM adversarial examples acc = model_eval(sess, x, y, preds_adv_fgsm, X_test, Y_test, args=eval_par) print('Test accuracy on FGSM adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on BIM adversarial examples acc = model_eval(sess, x, y, preds_adv_bim, X_test, Y_test, args=eval_par) print('Test accuracy on BIM adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on EN adversarial examples acc = model_eval(sess, x, y, preds_adv_en, X_test, Y_test, args=eval_par) print('Test accuracy on EN adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on DF adversarial examples acc = model_eval(sess, x, y, preds_adv_df, X_test, Y_test, args=eval_par) print('Test accuracy on DF adversarial examples: %0.4f' % acc) # Evaluate the accuracy of the MNIST model on VAT adversarial examples acc = model_eval(sess, x, y, preds_adv_vat, X_test, Y_test, args=eval_par) print('Test accuracy on VAT adversarial examples: %0.4f\n' % acc) preds_2_adv = [ preds_adv_jsma, preds_adv_fgsm # ,preds_adv_bim # ,preds_adv_en # ,preds_adv_df ] model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, rng=rng)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR, filename=FILENAME, load_model=LOAD_MODEL, testing=False, label_smoothing=0.1, save_model=SAVE_MODEL, attack_method=ATTACK_METHOD, model_type=MODEL_TYPE): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session os.environ["CUDA_VISIBLE_DEVICES"] = '1' # only use No.0 GPU config = tf.ConfigProto() config.allow_soft_placement = True config.gpu_options.allow_growth = True sess = tf.Session(config=config) keras.backend.set_session(sess) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph the_model = modelA if model_type == 'a': the_model = modelA elif model_type == 'b': the_model = modelB elif model_type == 'c': the_model = modelC else: exit('the model type must be a or b or c.') model = the_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_dir = train_dir + '/' + model_type train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = CrossEntropy(wrap, smoothing=label_smoothing) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng) if save_model: saver = tf.train.Saver(max_to_keep=1) saver.save(sess, '{}/mnist.ckpt'.format(train_dir), global_step=NB_EPOCHS) keras.models.save_model( model, '{}/{}_mnist.h5'.format(train_dir, model_type)) print("model has been saved") # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Basic Iterative Method (BIM) attack object and graph if attack_method == 'fgsm': att_method = FastGradientMethod(wrap, sess=sess) att_method_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.} elif attack_method == 'bim': att_method = BasicIterativeMethod(wrap, sess=sess) att_method_params = { 'eps': 0.2, 'eps_iter': 0.06, 'nb_iter': 10, 'clip_min': 0., 'clip_max': 1. } elif attack_method == 'mifgsm': att_method = MomentumIterativeMethod(wrap, sess=sess) att_method_params = { 'eps': 0.2, 'eps_iter': 0.08, 'nb_iter': 10, 'decay_factor': 0.4, 'clip_min': 0., 'clip_max': 1. } else: exit("the attack method must be fgsm,bim,mifgsm") print(att_method_params) adv_x = att_method.generate(x, **att_method_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} start_time = time.time() acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f' % acc) end_time = time.time() print("{} attack time is {}\n".format(attack_method, end_time - start_time)) report.clean_train_adv_eval = acc #save_acc = np.array(save_acc) #record = pd.DataFrame(save_acc,columns=["decay","acc"]) #record.to_csv("result/mnist_fc_decay__change.csv",index=False) tmp_train = sess.run(adv_x, feed_dict={ x: x_test[0:1000] }).reshape(1000, 28, 28) for i in range(1, 10): tmp_data = sess.run(adv_x, feed_dict={ x: x_test[i * 1000:(i + 1) * 1000] }).reshape(1000, 28, 28) tmp_train = np.concatenate([tmp_train, tmp_data]) np.save("{}_{}_test_adv.npy".format(attack_method, model_type), tmp_train) gc.collect()
def run_adversarial(num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, train_end=-1, test_end=-1): """ MNIST cleverhans tutorial :param num_epochs: number of epochs to train model :param batch_size: size of training batches :return: an AccuracyReport object """ # Train a pytorch MNIST model torch_model = LeNetCNN() if torch.cuda.is_available(): torch_model = torch_model.cuda() report = AccuracyReport() train_loader = torch.utils.data.DataLoader( datasets.MNIST('data', train=True, download=True, transform=transforms.ToTensor()), batch_size=batch_size, shuffle=True) test_loader = torch.utils.data.DataLoader( datasets.MNIST('data', train=False, transform=transforms.ToTensor()), batch_size=batch_size) # Train our model optimizer = optim.Adam(torch_model.parameters()) train_loss = [] total = 0 correct = 0 step = 0 for _epoch in range(num_epochs): for xs, ys in train_loader: xs, ys = Variable(xs), Variable(ys) if torch.cuda.is_available(): xs, ys = xs.cuda(), ys.cuda() optimizer.zero_grad() preds = torch_model(xs) loss = F.nll_loss(preds, ys) loss.backward() # calc gradients train_loss.append(loss.data.item()) optimizer.step() # update gradients preds_np = preds.cpu().detach().numpy() correct += (np.argmax(preds_np, axis=1) == ys.cpu().detach().numpy()).sum() total += train_loader.batch_size step += 1 if total % 1000 == 0: acc = float(correct) / total print('[%s] Training accuracy: %.2f%%' % (step, acc * 100)) total = 0 correct = 0 # Evaluate on clean data total = 0 correct = 0 for xs, ys in test_loader: xs, ys = Variable(xs), Variable(ys) if torch.cuda.is_available(): xs, ys = xs.cuda(), ys.cuda() preds = torch_model(xs) preds_np = preds.cpu().detach().numpy() correct += (np.argmax(preds_np, axis=1) == ys.cpu().detach().numpy()).sum() total += len(xs) acc = float(correct) / total report.clean_train_clean_eval = acc print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100)) # We use tf for evaluation on adversarial data sess = tf.Session() x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,)) # Convert pytorch model to a tf_model and wrap it in cleverhans tf_model_fn = convert_pytorch_model_to_tf(torch_model) cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits') # Create an FGSM attack fgsm_op = FastGradientMethod(cleverhans_model, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x_op = fgsm_op.generate(x_op, **fgsm_params) adv_preds_op = tf_model_fn(adv_x_op) # Run an evaluation of our model against fgsm total = 0 correct = 0 for xs, ys in test_loader: adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs}) correct += (np.argmax(adv_preds, axis=1) == ys.cpu().detach().numpy()).sum() total += test_loader.batch_size acc = float(correct) / total print('Adv accuracy: {:.3f}'.format(acc * 100)) report.clean_train_adv_eval = acc return report
def cifar10_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED, noise_output=NOISE_OUTPUT): """ CIFAR10 tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get Fashion MNIST test data fashion = keras.datasets.fashion_mnist (x_train, y_train), (x_test, y_test) = fashion.load_data() # cifar10 = CIFAR10(train_start=train_start, train_end=train_end, # test_start=test_start, test_end=test_end) # x_train, y_train = cifar10.get_set('train') # x_test, y_test = cifar10.get_set('test') x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) y_train = np_utils.to_categorical(y_train, 10) y_test = np_utils.to_categorical(y_test, 10) x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 # Obtain Image Parameters img_rows, img_cols = x_train.shape[1:3] nchannels = 1 nb_classes = 10 # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an CIFAR10 model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the CIFAR10 model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" if targeted: cw_params_batch_size = source_samples * nb_classes else: cw_params_batch_size = source_samples cw_params = {'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': CW_LEARNING_RATE, 'batch_size': cw_params_batch_size, 'initial_const': 10} adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval( sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) adv_accuracy = 1 - err else: err = model_eval(sess, x, y, preds, adv, y_test[:source_samples], args=eval_params) adv_accuracy = 1 - err if viz_enabled: for i in range(nb_classes): if noise_output: image = adv[i * nb_classes] - adv_inputs[i * nb_classes] else: image = adv[i * nb_classes] grid_viz_data[i, 0] = image print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() def save_visual(data, path): """ Modified version of cleverhans.plot.pyplot """ figure = plt.figure() # figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = data.shape[0] num_rows = data.shape[1] num_channels = data.shape[4] for y in range(num_rows): for x in range(num_cols): figure.add_subplot(num_rows, num_cols, (x + 1) + (y * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(data[x, y, :, :, 0], cmap='gray') else: plt.imshow(data[x, y, :, :, :]) # Draw the plot and return plt.savefig(path) return figure # Finally, block & display a grid of all the adversarial examples if viz_enabled: # _ = grid_visual(grid_viz_data) # cleverhans_image.save("output", grid_viz_data) if noise_output: image_name = "output/cw_fashion_mnist_noise.png" else: image_name = "output/cw_fashion_mnist.png" _ = save_visual(grid_viz_data, image_name) return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = LossCrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x, y, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') # Create a new model and train it to be robust to FastGradientMethod model2 = ModelBasicCNN('model2', nb_classes, nb_filters) fgsm2 = FastGradientMethod(model2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) loss2 = LossCrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2, args=train_params, rng=rng, var_list=model2.get_params()) # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, train_dir="/tmp", filename="mnist.ckpt", load_model=False, testing=False): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = cnn_model() preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } ckpt = tf.train.get_checkpoint_state(train_dir) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path rng = np.random.RandomState([2017, 8, 30]) if load_model and ckpt_path: saver = tf.train.Saver() saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, save=True, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph wrap = KerasModelWrapper(model) fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3} adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculating train error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = cnn_model() preds_2 = model_2(x) wrap_2 = KerasModelWrapper(model_2) fgsm2 = FastGradientMethod(wrap_2, sess=sess) preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params)) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, save=False, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1, adversarial_training=ADVERSARIAL_TRAINING): """ CIFAR10 cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :param adversarial_training: True means using adversarial training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: # put data on cpu and gpu both config_args = dict(allow_soft_placement=True) sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get CIFAR10 data data = CIFAR10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) dataset_size = data.x_train.shape[0] dataset_train = data.to_tensorflow()[0] dataset_train = dataset_train.map( lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) dataset_train = dataset_train.batch(batch_size) dataset_train = dataset_train.prefetch(16) x_train, y_train = data.get_set('train') x_test, y_test = data.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_test.shape[1:4] nb_classes = y_test.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} mifgsm_params = { 'eps': 0.5, 'clip_min': 0., 'eps_iter': 0.002, 'nb_iter': 10, 'clip_max': 1., 'ord': np.inf } rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelAllConvolutional('model1', nb_classes, nb_filters, input_shape=[32, 32, 3]) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) """ when training, evaluating can be happened """ train(sess, loss, None, None, dataset_train=dataset_train, dataset_size=dataset_size, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # save model model_path = sys.path[0] + '/simple_cifar10.ckpt' saver = tf.train.Saver(max_to_keep=1) saver.save(sess, model_path) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Basic Iterative Method (BIM) attack object and # graph mifgsm = MomentumIterativeMethod(model, sess=sess) for i in range(20): adv_x = mifgsm.generate(x, **mifgsm_params) preds_adv = model.get_logits(adv_x) print("eps:%0.2f" % (mifgsm_params["eps_iter"] * mifgsm_params['nb_iter'])) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) mifgsm_params['eps_iter'] = mifgsm_params['eps_iter'] + 0.002 # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') if not adversarial_training: return report # Create a new model and train it to be robust to BasicIterativeMethod model2 = ModelAllConvolutional('model2', nb_classes, nb_filters, input_shape=[32, 32, 3]) bim2 = MomentumIterativeMethod(model2, sess=sess) def attack(x): return bim2.generate(x, **mifgsm_params) # add attack to loss loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the attacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, None, None, dataset_train=dataset_train, dataset_size=dataset_size, evaluate=evaluate2, args=train_params, rng=rng, var_list=model2.get_params()) # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR, filename=FILENAME, load_model=LOAD_MODEL, testing=False, label_smoothing=0.1): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data x_train, y_train, x_test, y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = CrossEntropy(wrap, smoothing=label_smoothing) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculating train error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_train, y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) wrap_2 = KerasModelWrapper(model_2) preds_2 = model_2(x) fgsm2 = FastGradientMethod(wrap_2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) preds_2_adv = model_2(attack(x)) loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_test, y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, x_test, y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training train(sess, loss_2, x_train, y_train, evaluate=evaluate_2, args=train_params, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_train, y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, x_train, y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=True, nb_epochs=6, batch_size=128, nb_classes=10, source_samples=10, learning_rate=0.001): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # MNIST-specific dimensions img_rows = 28 img_cols = 28 channels = 1 # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = make_basic_cnn() preds = model(x) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) model_train(sess, x, y, preds, X_train, Y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, back='tf', sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = X_test[sample_ind:(sample_ind+1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(Y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, channels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols)), np.reshape(adv_x, (img_rows, img_cols)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, channels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def baseline_deepfool(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, clean_train=True, testing=False, backprop_through_attack=False, nb_filters=64): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param clean_train: if true, train on clean examples :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session sess = tf.Session() # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) model_path = "models/mnist" # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } deepfool_params = { 'nb_candidate': 10, 'overshoot': 0.02, 'max_iter': 50, 'clip_min': 0., 'clip_max': 1. } rng = np.random.RandomState([2017, 8, 30]) if clean_train: model = make_basic_cnn(nb_filters=nb_filters) preds = model.get_probs(x) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test # examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # # HERE already trained model, thus we need a new one (model_2) model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph deepfool = DeepFool(model, sess=sess) adv_x = deepfool.generate(x, **deepfool_params) preds_adv = model.get_probs(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on DeepFool adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculate training error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = make_basic_cnn(nb_filters=nb_filters) preds_2 = model_2(x) deepfool2 = DeepFool(model_2, sess=sess) adv_x_2 = deepfool2.generate(x, **deepfool_params) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x_2 = tf.stop_gradient(adv_x_2) preds_2_adv = model_2(adv_x_2) # # let's generate DeepFool examples # # let's generate FGSM examples # fgsm = FastGradientMethod(model_2, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x_fgsm = fgsm.generate(x, **fgsm_params) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x_fgsm = tf.stop_gradient(adv_x_fgsm) preds_2_fgsm = model_2(adv_x_fgsm) # DON'T WANT TO TRAIN on FGSM adv examples yet def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on FGSM adversarial examples accuracy = model_eval(sess, x, y, preds_2_fgsm, X_test, Y_test, args=eval_params) print('Test accuracy on FGSM adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Accuracy of the DeepFool adv trained model on DeepFool examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test, args=eval_params) print('Test accuracy on DeepFool adversarial examples: %0.4f' % accuracy) # Perform and evaluate adversarial training model_train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=6, batch_size=128, learning_rate=0.001, train_dir="/tmp", filename="mnist.ckpt", load_model=False, testing=False): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :return: an AccuracyReport object """ keras.layers.core.K.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if not hasattr(backend, "tf"): raise RuntimeError("This tutorial requires keras to be configured" " to use the TensorFlow backend.") # Image dimensions ordering should follow the Theano convention if keras.backend.image_dim_ordering() != 'tf': keras.backend.set_image_dim_ordering('tf') print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to " "'th', temporarily setting to 'tf'") # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) # Use label smoothing assert Y_train.shape[1] == 10 label_smooth = .1 Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = cnn_model() preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params) report.clean_train_clean_eval = acc assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } ckpt = tf.train.get_checkpoint_state(train_dir) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path rng = np.random.RandomState([2017, 8, 30]) if load_model and ckpt_path: saver = tf.train.Saver() saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate, args=train_params, save=True) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph wrap = KerasModelWrapper(model) fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3} adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculating train error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, X_train, Y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = cnn_model() preds_2 = model_2(x) wrap_2 = KerasModelWrapper(model_2) fgsm2 = FastGradientMethod(wrap_2, sess=sess) preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params)) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training train(sess, x, y, preds_2, X_train, Y_train, predictions_adv=preds_2_adv, evaluate=evaluate_2, args=train_params, save=False) # Get a random slice of the data for linear extrapolation plots random_idx = np.random.randint(0, X_train.shape[0]) X_slice = X_train[random_idx] Y_slice = Y_train[random_idx] # Plot the linear extrapolation plot for clean model log_prob_adv_array = get_logits_over_interval( sess, wrap, X_slice, fgsm_params) linear_extrapolation_plot(log_prob_adv_array, Y_slice, 'lep_clean.png') # Plot the linear extrapolation plot for adv model log_prob_adv_array = get_logits_over_interval( sess, wrap_2, X_slice, fgsm_params) linear_extrapolation_plot(log_prob_adv_array, Y_slice, 'lep_adv.png') # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def helper_run_multi_gpu_madryetal(self, extra_flags=None): """ Compare the single GPU performance to multiGPU performance. """ # Run the trainers on a dataset of reduced size flags = {'train_start': 0, 'train_end': 5000, 'test_start': 0, 'test_end': 333, 'nb_epochs': 5, 'testing': True} # Run the multi-gpu trainer for adversarial training flags.update({'batch_size': 128, 'adam_lrn': 0.001, 'dataset': 'mnist', 'only_adv_train': False, 'eval_iters': 1, 'fast_tests': True, 'save_dir': None, 'save_steps': 10000, 'attack_nb_iter_train': 10, 'sync_step': None, 'adv_train': True, 'save': False, 'model_type': 'basic', 'attack_type_test': 'MadryEtAl_y', }) if extra_flags is not None: flags.update(extra_flags) # Run the multi-gpu trainer for adversarial training using 2 gpus # trainer_multigpu by default sets `allow_soft_placement=True` flags.update({'ngpu': 2, 'attack_type_train': 'MadryEtAl_y_multigpu', 'sync_step': 1}) HParams = namedtuple('HParams', flags.keys()) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_m = AccuracyReport() report_m.train_adv_train_clean_eval = report_dict['train'] report_m.adv_train_clean_eval = report_dict['test'] report_m.adv_train_adv_eval = report_dict['MadryEtAl_y'] flags.update({'ngpu': 1, 'attack_type_train': 'MadryEtAl_y', }) hparams = HParams(**flags) np.random.seed(42) tf.set_random_seed(42) with tf.variable_scope(None, 'runner'): report_dict = run_trainer(hparams) report_s = AccuracyReport() report_s.train_adv_train_clean_eval = report_dict['train'] report_s.adv_train_clean_eval = report_dict['test'] report_s.adv_train_adv_eval = report_dict['MadryEtAl_y'] self.assertClose(report_s.train_adv_train_clean_eval, report_m.train_adv_train_clean_eval, atol=5e-2) self.assertClose(report_s.adv_train_clean_eval, report_m.adv_train_clean_eval, atol=2e-2) self.assertClose(report_s.adv_train_adv_eval, report_m.adv_train_adv_eval, atol=5e-2)