def main(argv):
    """Evaluate a restored Madry MNIST checkpoint on adversarial test inputs.

    Restores the most recent checkpoint found in ``FLAGS.checkpoint_dir``,
    builds either an FGSM or a BIM attack graph on top of the model, and
    prints the elapsed evaluation time and the accuracy on the perturbed
    MNIST test set.

    :param argv: command-line arguments; not read by this function.
    :raises ValueError: when no checkpoint is found, or when
        ``FLAGS.attack_type`` is neither ``'fgsm'`` nor ``'bim'``.
    """
    ckpt_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if ckpt_path is None:
        raise ValueError("Couldn't find latest checkpoint in " +
                         FLAGS.checkpoint_dir)

    X_train, Y_train, X_test, Y_test = data_mnist(
        train_start=0, train_end=60000, test_start=0, test_end=10000)
    assert Y_train.shape[1] == 10

    # NOTE: for compatibility with Madry Lab downloadable checkpoints,
    # we cannot enclose this in a scope or do anything else that would
    # change the automatic naming of the variables.
    model = MadryMNIST()

    # The flat placeholder is not fed anywhere below; it is still created so
    # the graph contains the same ops, in the same order, as before.
    x_input = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    y = tf.placeholder(tf.float32, shape=[None, 10])

    # Attack name -> (attack class, keyword arguments for ``generate``).
    supported_attacks = {
        'fgsm': (FastGradientMethod,
                 {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}),
        'bim': (BasicIterativeMethod,
                {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                 'nb_iter': 50, 'eps_iter': .01}),
    }
    if FLAGS.attack_type not in supported_attacks:
        raise ValueError(FLAGS.attack_type)
    attack_cls, attack_kwargs = supported_attacks[FLAGS.attack_type]
    adv_x = attack_cls(model).generate(x_image, **attack_kwargs)
    preds_adv = model.get_probs(adv_x)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, ckpt_path)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        started = time.time()
        acc = model_eval(sess, x_image, y, preds_adv, X_test, Y_test,
                         args={'batch_size': FLAGS.batch_size})
        finished = time.time()
        print("Took", finished - started, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
def get_logits_over_interval(sess, model, x_data, fgsm_params,
                             min_epsilon=-10., max_epsilon=10.,
                             num_points=21):
    """Get logits when the input is perturbed in an interval in adv direction.

    Args:
        sess: Tf session
        model: Model for which we wish to get logits.
        x_data: Numpy array corresponding to single data.
                point of shape [height, width, channels].
        fgsm_params: Parameters for generating adversarial examples.
        min_epsilon: Minimum value of epsilon over the interval.
        max_epsilon: Maximum value of epsilon over the interval.
        num_points: Number of points used to interpolate.

    Returns:
        Numpy array containing the logits the model produces for the
        `num_points` perturbed copies of `x_data`.

    Raises:
        ValueError if min_epsilon is larger than max_epsilon.
    """
    # Validate first: a bad interval should fail before TensorFlow is
    # imported or any op is added to the graph.
    if min_epsilon > max_epsilon:
        raise ValueError('Minimum epsilon is greater than maximum epsilon')

    # Get the height, width and number of channels
    height = x_data.shape[0]
    width = x_data.shape[1]
    channels = x_data.shape[2]

    # Add a leading batch axis of size 1 to match the placeholder below.
    x_data = np.expand_dims(x_data, axis=0)
    import tensorflow as tf
    from cleverhans.attacks import FastGradientMethod

    # Define the data placeholder
    x = tf.placeholder(dtype=tf.float32,
                       shape=[1, height, width, channels],
                       name='x')
    # Define adv_x
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)

    # NOTE(review): the normalization runs along axis 0, which is the
    # size-1 batch axis of `x` -- confirm this is the intended axis.
    eta = tf.nn.l2_normalize(adv_x - x, dim=0)
    # One epsilon per interpolation point, shaped to broadcast against `x`.
    epsilon = tf.reshape(tf.lin_space(float(min_epsilon),
                                      float(max_epsilon),
                                      num_points),
                         (num_points, 1, 1, 1))
    lin_batch = x + epsilon * eta
    logits = model.get_logits(lin_batch)
    with sess.as_default():
        log_prob_adv_array = sess.run(logits,
                                      feed_dict={x: x_data})
    return log_prob_adv_array
def baseline_deepfool(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, nb_epochs=6, batch_size=128,
                      learning_rate=0.001,
                      clean_train=True,
                      testing=False,
                      backprop_through_attack=False,
                      nb_filters=64):
    """
    MNIST cleverhans tutorial, using DeepFool as the training-time attack.

    When `clean_train` is true, a first model is trained on clean examples
    and evaluated on clean and DeepFool test inputs. A second model is then
    always built and trained with DeepFool adversarial predictions passed to
    `model_train`; during that training it is evaluated on clean, FGSM and
    DeepFool test inputs.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: value forwarded to `make_basic_cnn` for both models
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    # NOTE(review): the session is never closed inside this function.
    sess = tf.Session()

    # Get MNIST train and test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing: clip the training labels into
    # [label_smooth / 9, 1 - label_smooth].
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # NOTE(review): model_path is assigned but not read anywhere below.
    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    # Keyword arguments used for every DeepFool `generate` call below.
    deepfool_params = {
        'nb_candidate': 10,
        'overshoot': 0.02,
        'max_iter': 50,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        # This model is trained here on clean data; the adversarially
        # trained model built further down is a separate one (model_2).
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the DeepFool attack object and graph
        deepfool = DeepFool(model, sess=sess)
        adv_x = deepfool.generate(x, **deepfool_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on DeepFool adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train,
                             Y_train, args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    deepfool2 = DeepFool(model_2, sess=sess)
    adv_x_2 = deepfool2.generate(x, **deepfool_params)
    if not backprop_through_attack:
        # Stop gradients from flowing back through the construction of the
        # DeepFool examples.
        # For some attacks, backpropagating through the attack increases the
        # cost of training, but gives the defender the ability to anticipate
        # how the attacker will change their strategy in response to updates
        # to the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    # FGSM examples against model_2. They are used for evaluation only;
    # they are not passed to model_train.
    fgsm = FastGradientMethod(model_2, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    if not backprop_through_attack:
        # Stop gradients from flowing back through the construction of the
        # FGSM examples as well.
        adv_x_fgsm = tf.stop_gradient(adv_x_fgsm)
    preds_2_fgsm = model_2(adv_x_fgsm)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on FGSM adversarial
        # examples; this is the value stored in report.adv_train_adv_eval.
        accuracy = model_eval(sess, x, y, preds_2_fgsm, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on FGSM adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

        # Accuracy of the DeepFool adv trained model on DeepFool examples.
        # This value is only printed, not stored in the report.
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on DeepFool adversarial examples: %0.4f' % accuracy)

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(nb_epochs=6, batch_size=128, train_end=-1, test_end=-1,
                   learning_rate=0.001):
    """
    MNIST cleverhans tutorial: train a PyTorch model, then attack it with
    FGSM through a TensorFlow wrapper.

    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param train_end: upper slice bound applied to the training images
                      (the default of -1 drops only the last example)
    :param test_end: upper slice bound applied to the test images
                     (the default of -1 drops only the last example)
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with `clean_train_clean_eval` and
             `clean_train_adv_eval` set to accuracies as fractions
    """

    def to_numpy(tensor):
        # Labels must be host-side NumPy arrays before they are compared
        # with NumPy predictions; a raw (possibly CUDA) tensor is not safe
        # to compare against an ndarray.
        return tensor.data.cpu().numpy()

    # Train a pytorch MNIST model
    torch_model = PytorchMnistModel()
    if torch.cuda.is_available():
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
        batch_size=batch_size)

    # Truncate the datasets so that our test run more quickly
    train_loader.dataset.train_data = train_loader.dataset.train_data[
        :train_end]
    test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end]

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)
    train_loss = []

    total = 0
    correct = 0
    step = 0
    for epoch in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if torch.cuda.is_available():
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()  # calc gradients
            train_loss.append(loss.data.item())
            optimizer.step()  # update parameters

            preds_np = to_numpy(preds)
            correct += (np.argmax(preds_np, axis=1) == to_numpy(ys)).sum()
            total += len(xs)
            step += 1
            # The running counters are reported and reset only when the
            # number of examples seen is an exact multiple of 1000.
            if total % 1000 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if torch.cuda.is_available():
            xs, ys = xs.cuda(), ys.cuda()

        preds = torch_model(xs)
        preds_np = to_numpy(preds)

        correct += (np.argmax(preds_np, axis=1) == to_numpy(ys)).sum()
        total += len(xs)

    acc = float(correct) / total
    report.clean_train_clean_eval = acc
    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # We use tf for evaluation on adversarial data. The session is scoped so
    # that it is closed once the adversarial accuracy has been computed.
    with tf.Session() as sess:
        x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,))

        # Convert pytorch model to a tf_model and wrap it in cleverhans
        tf_model_fn = convert_pytorch_model_to_tf(torch_model)
        cleverhans_model = CallableModelWrapper(tf_model_fn,
                                                output_layer='logits')

        # Create an FGSM attack
        fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
        fgsm_params = {'eps': 0.3,
                       'clip_min': 0.,
                       'clip_max': 1.}
        adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
        adv_preds_op = tf_model_fn(adv_x_op)

        # Run an evaluation of our model against fgsm
        total = 0
        correct = 0
        for xs, ys in test_loader:
            adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
            correct += (np.argmax(adv_preds, axis=1) == to_numpy(ys)).sum()
            total += len(xs)

    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc
    return report
def blackbox(gan, rec_data_path=None, batch_size=128,
             learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
             nb_epochs_s=10, lmbda=0.1, online_training=False,
             train_on_recs=False, test_on_dev=False,
             defense_type='none'):
    """MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697

    Trains a black-box model and a substitute model, crafts FGSM examples on
    the substitute, and evaluates the black-box model on them, optionally
    after passing them through the GAN reconstructor.

    Args:
        gan: GAN object. Its `sess` and `dataset_name` attributes are read
            here, and it is passed to `get_cached_gan_data`, `prep_bbox`
            and `get_reconstructor`.
            NOTE(review): `gan.dataset_name` is dereferenced
            unconditionally, so `gan=None` fails even though the debug
            branch checks `gan is not None`.
        rec_data_path: Not referenced in the function body.
        batch_size: Batch size used for training, reconstruction and
            evaluation.
        learning_rate: Learning rate passed to `prep_bbox` and `train_sub`.
        nb_epochs: Number of epochs passed to `prep_bbox`.
        holdout: Number of leading test examples reserved as the
            substitute's training set; they are removed from the test set.
        data_aug: Passed through to `train_sub`.
        nb_epochs_s: Passed through to `train_sub`.
        lmbda: Passed through to `train_sub`.
        online_training: Not referenced in the function body.
        train_on_recs: Not referenced in the function body.
        test_on_dev: Passed through to `get_cached_gan_data`.
        defense_type: Type of defense against blackbox attacks. The value
            'defense_gan' together with a truthy `gan` reuses `gan.sess`
            and evaluates on reconstructions; any value containing
            'adv_tr' sets `adv_training=True` for `prep_bbox`.

    Returns:
        a dictionary with the keys:
             * 'acc_adv': black-box accuracy on substitute-crafted
                adversarial examples (on their reconstructions when the GAN
                defense is active)
             * 'acc_rec': accuracy on reconstructed clean images, or 0 when
                the GAN defense is not active
             * 'roc_info_adv', 'roc_info_rec': third values returned by
                `model_eval_gan`, or None when the GAN defense is not active
    """
    FLAGS = flags.FLAGS

    # Set logging level to see debug information.
    set_log_level(logging.WARNING)

    # Dictionary used to keep track and return key accuracies.
    # NOTE(review): it is filled in below but is not part of the return
    # value.
    accuracies = {}

    # Create TF session.
    adv_training = False
    if defense_type:
        if defense_type == 'defense_gan' and gan:
            # Reuse the GAN's own session.
            sess = gan.sess
            gan_defense_flag = True
        else:
            gan_defense_flag = False
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)

        if 'adv_tr' in defense_type:
            adv_training = True
    else:
        gan_defense_flag = False
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    train_images, train_labels, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev, orig_data_flag=True)

    # Per-example input shape and number of label columns.
    x_shape, classes = list(train_images.shape[1:]), train_labels.shape[1]
    nb_classes = classes

    # Maps the architecture letter given in FLAGS to a model constructor.
    type_to_models = {
        'A': model_a, 'B': model_b, 'C': model_c, 'D': model_d, 'E': model_e,
        'F': model_f, 'Q': model_q, 'Y': model_y, 'Z': model_z
    }

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        bb_model = type_to_models[FLAGS.bb_model](
            input_shape=[None] + x_shape, nb_classes=train_labels.shape[1],
        )
    with tf.variable_scope("Substitute", reuse=tf.AUTO_REUSE):
        sub_model = type_to_models[FLAGS.sub_model](
            input_shape=[None] + x_shape, nb_classes=train_labels.shape[1],
        )

    if FLAGS.debug:
        # Debug runs use only the first 20 batches of training data.
        train_images = train_images[:20 * batch_size]
        train_labels = train_labels[:20 * batch_size]
        debug_dir = os.path.join('debug', 'blackbox', FLAGS.debug_dir)
        ensure_dir(debug_dir)
        x_debug_test = test_images[:batch_size]

    # Initialize substitute training set reserved for adversary
    images_sub = test_images[:holdout]
    labels_sub = np.argmax(test_labels[:holdout], axis=1)

    print(labels_sub)

    # Redefine test set as remaining samples unavailable to adversaries
    if FLAGS.num_tests > 0:
        test_images = test_images[:FLAGS.num_tests]
        test_labels = test_labels[:FLAGS.num_tests]

    test_images = test_images[holdout:]
    test_labels = test_labels[holdout:]

    # Define input and output TF placeholders

    # When the first entry of image_dim is 3, move it to the last position.
    if FLAGS.image_dim[0] == 3:
        FLAGS.image_dim = [
            FLAGS.image_dim[1], FLAGS.image_dim[2], FLAGS.image_dim[0]
        ]

    images_tensor = tf.placeholder(tf.float32, shape=[None] + x_shape)
    labels_tensor = tf.placeholder(tf.float32, shape=(None, classes))

    rng = np.random.RandomState([11, 24, 1990])

    train_images_bb, train_labels_bb, test_images_bb, test_labels_bb = \
        train_images, train_labels, test_images, \
        test_labels

    cur_gan = gan

    if FLAGS.debug:
        train_images_bb = train_images_bb[:20 * batch_size]
        train_labels_bb = train_labels_bb[:20 * batch_size]

    # Prepare the black_box model.
    prep_bbox_out = prep_bbox(sess, images_tensor, labels_tensor,
                              train_images_bb, train_labels_bb,
                              test_images_bb, test_labels_bb, nb_epochs,
                              batch_size, learning_rate, rng=rng,
                              gan=cur_gan, adv_training=adv_training,
                              cnn_arch=bb_model)

    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    reconstructor = get_reconstructor(gan)
    recon_tensors, _ = reconstructor.reconstruct(images_tensor,
                                                 batch_size=batch_size,
                                                 reconstructor_id=2)

    # The black-box logits handed to train_sub are computed on the
    # reconstructed inputs.
    model_sub, preds_sub = train_sub(sess, images_tensor, labels_tensor,
                                     model.get_logits(recon_tensors),
                                     images_sub, labels_sub, nb_classes,
                                     nb_epochs_s, batch_size, learning_rate,
                                     data_aug, lmbda, rng=rng,
                                     substitute_model=sub_model,
                                     dataset_name=gan.dataset_name)

    # The substitute's accuracy is not measured here; a 0 is recorded.
    accuracies['sub'] = 0

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    eps = attack_config_dict[gan.dataset_name]['eps']
    min_val = attack_config_dict[gan.dataset_name]['clip_min']

    fgsm_par = {'eps': eps, 'ord': np.inf, 'clip_min': min_val,
                'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute.
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(images_tensor, **fgsm_par)

    if FLAGS.debug and gan is not None:
        # To see some qualitative results.
        recon_tensors, _ = reconstructor.reconstruct(x_adv_sub,
                                                     batch_size=batch_size,
                                                     reconstructor_id=2)
        x_rec_orig, _ = reconstructor.reconstruct(images_tensor,
                                                  batch_size=batch_size,
                                                  reconstructor_id=3)

        x_adv_sub_val = sess.run(x_adv_sub,
                                 feed_dict={images_tensor: x_debug_test})
        x_rec_debug_val = sess.run(recon_tensors,
                                   feed_dict={images_tensor: x_debug_test})
        x_rec_orig_val = sess.run(x_rec_orig,
                                  feed_dict={images_tensor: x_debug_test})
        #sess.run(tf.local_variables_initializer())
        #x_rec_debug_val, x_rec_orig_val = sess.run([reconstructed_tensors, x_rec_orig], feed_dict={images_tensor: x_debug_test})

        # Save adversarial, reconstructed-adversarial, original and
        # reconstructed-original images into the debug directory.
        save_images_files(x_adv_sub_val, output_dir=debug_dir,
                          postfix='adv')

        postfix = 'gen_rec'
        save_images_files(x_rec_debug_val, output_dir=debug_dir,
                          postfix=postfix)
        save_images_files(x_debug_test, output_dir=debug_dir,
                          postfix='orig')
        save_images_files(x_rec_orig_val, output_dir=debug_dir,
                          postfix='orig_rec')

    if gan_defense_flag:
        # Average over every axis except the batch axis.
        num_dims = len(images_tensor.get_shape())
        avg_inds = list(range(1, num_dims))

        recons_adv, zs = reconstructor.reconstruct(x_adv_sub,
                                                   batch_size=batch_size)

        # Per-example mean squared difference between the adversarial
        # inputs and their reconstructions, and squared norm of `zs`.
        diff_op = tf.reduce_mean(tf.square(x_adv_sub - recons_adv),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        acc_adv, diffs_mean, roc_info_adv = model_eval_gan(
            sess, images_tensor, labels_tensor,
            predictions=model.get_logits(recons_adv),
            test_images=test_images, test_labels=test_labels,
            args=eval_params, diff_op=diff_op,
            z_norm=z_norm, recons_adv=recons_adv, adv_x=x_adv_sub,
            debug=False)

        # reconstruction on clean images
        recons_clean, zs = reconstructor.reconstruct(images_tensor,
                                                     batch_size=batch_size)

        diff_op = tf.reduce_mean(tf.square(images_tensor - recons_clean),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        # NOTE(review): unlike the call above, the predictions are passed
        # positionally here, followed by an extra positional None; confirm
        # against the signature of model_eval_gan.
        acc_rec, diffs_mean_rec, roc_info_rec = model_eval_gan(
            sess, images_tensor, labels_tensor,
            model.get_logits(recons_clean), None,
            test_images=test_images, test_labels=test_labels,
            args=eval_params, diff_op=diff_op,
            z_norm=z_norm, recons_adv=recons_clean, adv_x=images_tensor,
            debug=False)

        print('Evaluation accuracy with reconstruction: {}'.format(acc_rec))
        print('Test accuracy of oracle on cleaned images : {}'.format(acc_adv))

        return {
            'acc_adv': acc_adv,
            'acc_rec': acc_rec,
            'roc_info_adv': roc_info_adv,
            'roc_info_rec': roc_info_rec
        }
    else:
        # No GAN defense: evaluate the black-box model directly on the
        # adversarial examples.
        acc_adv = model_eval(sess, images_tensor, labels_tensor,
                             model.get_logits(x_adv_sub), test_images,
                             test_labels, args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(acc_adv))
        return {
            'acc_adv': acc_adv,
            'acc_rec': 0,
            'roc_info_adv': None,
            'roc_info_rec': None
        }
model_dict = {} model_dict[0] = model model_dict[1] = model_target Xdata_dict = {} Xdata_dict[0] = X_freq_s Xdata_dict[1] = X_freq_t indices = get_indices(model_dict, Xdata_dict, y_test, len(model_dict)) indices = np.random.choice(indices, 1000, replace=False) #FGSM print("FGSM") fgsm_params = {'eps': 0.03, 'clip_min': 0., 'clip_max': 1.} fgsm_attack = FastGradientMethod(wrap, sess=sess) X_adv = np.zeros((len(indices), 224, 224, 3)) for i in range(0, len(indices), 100): X_adv[i:(i + 100)] = fgsm_attack.generate_np(X_test[indices[i:(i + 100)]], **fgsm_params) print("metrics") print(metrics(model, X_adv, X_test, y_test, indices)) print(metrics(model_target, X_adv, X_test, y_test, indices)) X_adv_freq = filt(X_adv, type_freq_t, lim_freq_t1) print(metrics(model_target, X_adv_freq, X_test, y_test, indices)) #MIM Diverse print("MIM-DIVERSE") mim_params = { 'eps': 0.03, 'eps_iter': 0.01,
class TestFastGradientMethod(CleverHansTest):
    """Tests of FastGradientMethod.generate_np on a fixed two-layer model."""

    def setUp(self):
        super(TestFastGradientMethod, self).setUp()
        import tensorflow as tf

        # The world's simplest neural network
        def my_model(x):
            W1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
            h1 = tf.nn.sigmoid(tf.matmul(x, W1))
            W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
            res = tf.nn.softmax(tf.matmul(h1, W2))
            return res

        self.sess = tf.Session()
        self.model = my_model
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def help_generate_np_gives_adversarial_example(self, ord):
        """Check perturbation size and label flips for the given norm.

        :param ord: norm order passed to the attack; np.inf, 1 and 2 are
                    the values for which the perturbation size is measured.
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=.5, ord=ord,
                                        clip_min=-5, clip_max=5)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1)**.5
        self.assertClose(delta, 0.5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        # More than half of the predicted labels must have changed.
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        # np.inf is used because the np.infty alias no longer exists in
        # NumPy 2.0.
        self.help_generate_np_gives_adversarial_example(np.inf)

    def test_generate_np_gives_adversarial_example_l1(self):
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        self.help_generate_np_gives_adversarial_example(2)

    def test_targeted_generate_np_gives_adversarial_example(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)
        # randint's upper bound is exclusive, so (0, 2) draws labels from
        # {0, 1} like the deprecated random_integers(0, 1, 100) did.
        random_labs = np.random.randint(0, 2, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        x_adv = self.attack.generate_np(x_val, eps=.5, ord=np.inf,
                                        clip_min=-5, clip_max=5,
                                        y_target=random_labs_one_hot)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        # Over 70% of the examples must be classified as their target.
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                            clip_min=-5.0, clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                        clip_min=-0.2, clip_max=0.1)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)

    def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
        import tensorflow as tf

        x_val = np.random.rand(1, 2)
        x_val = np.array(x_val, dtype=np.float32)

        self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                                clip_max=-5.0, clip_min=-5.0,
                                xi=1e-6)

        old_grads = tf.gradients

        def fn(*x, **y):
            raise RuntimeError()

        # Any attempt to build gradients again now raises, so the second
        # call only succeeds if the graph from the first call is reused.
        tf.gradients = fn
        try:
            self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                                    clip_max=-4.0, clip_min=-4.0,
                                    xi=1e-5)
        finally:
            # Always undo the monkeypatch so a failure here cannot break
            # tf.gradients for the tests that run afterwards.
            tf.gradients = old_grads
#Load classifier model whose gradients will be used to create adversarial examples keras_model = load_model('classifiers/fc-784-100-100-10-defender-model.h5') atkr_clfr = load_model('classifiers/fc-784-200-200-100-10-attacker-model.h5') backend.set_learning_phase(False) data_train_shuffle = data_train data_train1 = data_train_shuffle[0:30000, 0:784] data_train2 = data_train_shuffle[30000:60000, 0:784] #Create adversarial examples on testing data sess = backend.get_session() eta1 = 0.25 eta2 = 0.50 wrap = KerasModelWrapper(keras_model) fgsm = FastGradientMethod(wrap, sess=sess) adv_train_x1 = fgsm.generate_np(data_train1, eps=eta1, clip_min=0., clip_max=1.) adv_train_x2 = fgsm.generate_np(data_train2, eps=eta2, clip_min=0., clip_max=1.) adv_train_x = np.vstack([adv_train_x1, adv_train_x2]) adv_test_x = fgsm.generate_np(data_test, eps=eta1, clip_min=0., clip_max=1.) #Total datasets data_total_train = np.vstack([data_train, adv_train_x]) data_total_test = np.vstack([data_test, adv_test_x])
def main(argv):
    """Evaluate a restored Madry CIFAR-10 checkpoint under attack.

    Restores the newest checkpoint in ``FLAGS.checkpoint_dir`` and reports
    accuracy on adversarial versions of the first ``FLAGS.nb_samples``
    CIFAR-10 evaluation images. ``FLAGS.attack_type`` selects the attack
    (``'cwl2'``, ``'fgsm'`` or ``'pgd'``); with ``FLAGS.sweep`` set, the
    evaluation is repeated for eps = 1, 2, ..., 16.

    :param argv: command-line arguments; not read by this function.
    :raises ValueError: if ``FLAGS.attack_type`` is not a supported attack.
    """
    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        # Pixel values are clipped to the 0-255 range.
        attack_params = {'batch_size': FLAGS.batch_size,
                         'clip_min': 0., 'clip_max': 255.}

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({'binary_search_steps': 1,
                                  'max_iterations': 100,
                                  'learning_rate': 0.1,
                                  'initial_const': 10,
                                  'batch_size': 10
                                  })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)
            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)
            else:
                # Fail here with a clear error instead of a NameError on
                # `attacker` further down.
                raise ValueError(FLAGS.attack_type)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            # Start the clock once so the reported duration covers the
            # whole sweep rather than only its last epsilon.
            t1 = time.time()
            for e in epsilons:
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess, x, y, preds_adv, X_test[
                    :nb_samples], Y_test[:nb_samples], args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess, x, y, preds_adv, X_test[
                :nb_samples], Y_test[:nb_samples], args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
class TestFastGradientMethod(CleverHansTest):
    """Tests of FastGradientMethod against SimpleModel."""

    def setUp(self):
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def generate_adversarial_examples_np(self, ord, eps, **kwargs):
        """Attack 100 random 2-D points and measure the perturbation.

        :param ord: norm order passed to the attack; np.inf, 1 and 2 are
                    the values for which the perturbation size is measured.
        :param eps: attack strength passed to `generate_np`.
        :param kwargs: extra keyword arguments forwarded to `generate_np`.
        :return: tuple (x_val, x_adv, delta) where delta holds the
                 per-example `ord`-norm of x_adv - x_val.
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord,
                                        clip_min=-5, clip_max=5, **kwargs)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1) ** .5

        return x_val, x_adv, delta

    def help_generate_np_gives_adversarial_example(self, ord, eps=.5,
                                                   **kwargs):
        """Check perturbation size and label flips for the given norm."""
        x_val, x_adv, delta = self.generate_adversarial_examples_np(ord, eps,
                                                                    **kwargs)
        self.assertClose(delta, eps)
        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        # More than half of the predicted labels must have changed.
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_invalid_input(self):
        # The inputs (-1) lie below clip_min=0, which is expected to
        # surface as an InvalidArgumentError.
        x_val = -np.ones((2, 2), dtype='float32')
        with self.assertRaises(tf.errors.InvalidArgumentError) as context:
            self.attack.generate_np(x_val, eps=1., clip_min=0., clip_max=1.)
        self.assertTrue(context.exception)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        # np.inf is used because the np.infty alias no longer exists in
        # NumPy 2.0.
        self.help_generate_np_gives_adversarial_example(np.inf)

    def test_generate_np_gives_adversarial_example_l1(self):
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        self.help_generate_np_gives_adversarial_example(2)

    def test_generate_respects_dtype(self):
        self.attack = FastGradientMethod(self.model, sess=self.sess,
                                         dtypestr='float64')
        x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
        x_adv = self.attack.generate(x)
        self.assertEqual(x_adv.dtype, tf.float64)

    def test_targeted_generate_np_gives_adversarial_example(self):
        # randint's upper bound is exclusive, so (0, 2) draws labels from
        # {0, 1} like the deprecated random_integers(0, 1, 100) did.
        random_labs = np.random.randint(0, 2, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        _, x_adv, delta = self.generate_adversarial_examples_np(
            eps=.5, ord=np.inf, y_target=random_labs_one_hot)

        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        # Over 70% of the examples must be classified as their target.
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                            clip_min=-5.0, clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        # sanity_checks is disabled for this call; the inputs lie partly
        # outside the [-0.2, 0.1] clipping range.
        x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                        clip_min=-0.2, clip_max=0.1,
                                        sanity_checks=False)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)
max_learning_rate - min_learning_rate) * np.exp(-i / decay_speed) if i % 500 == 0 or i == 50000: counter += 1 # Saves generated images samples = sess.run(GXsigmoid_test, feed_dict={ GY: all_classes, BS: num_classes }) fig = plot_generator(samples) plt.savefig(folder_out + "gen_" + str(i).zfill(6) + '.png', bbox_inches='tight') plt.close(fig) attack_fgsm = FastGradientMethod(model_classifier, sess=sess) adv_x_np = attack_fgsm.generate_np(x_test, **fgsm_params) fig = plot_generator(adv_x_np[:num_classes]) plt.savefig(folder_out + "adv_" + str(i).zfill(6) + '.png', bbox_inches='tight') plt.close(fig) accu_test, c_loss_test, sigmoid_test, softmax_test, sum_c = sess.run( [ accuracy, c_loss, max_output_sigmoid_test, max_output_softmax_test, c_sum ], { X: x_test, Y_: y_test }) writer.add_summary(sum_c, i)
def train(cifar10_data, epochs, L, learning_rate, scale3, Delta2, epsilon2, eps2_ratio, alpha, perturbFM, fgsm_eps, total_eps, logfile): logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , epsilon \t %d \n"%(fgsm_eps, learning_rate, alpha, total_eps)) """Train CIFAR-10 for a number of steps.""" with tf.Graph().as_default(): global_step = tf.Variable(0, trainable=False) eps_benign = 1/(1+eps2_ratio)*(epsilon2) eps_adv = eps2_ratio/(1+eps2_ratio)*(epsilon2) # Parameters Declarification #with tf.variable_scope('conv1') as scope: kernel1 = _variable_with_weight_decay('kernel1', shape=[4, 4, 3, 128], stddev=np.sqrt(2.0/(5*5*256))/math.ceil(5 / 2), wd=0.0, collect=[AECODER_VARIABLES]) biases1 = _bias_on_cpu('biases1', [128], tf.constant_initializer(0.0), collect=[AECODER_VARIABLES]) shape = kernel1.get_shape().as_list() w_t = tf.reshape(kernel1, [-1, shape[-1]]) w = tf.transpose(w_t) sing_vals = tf.svd(w, compute_uv=False) sensitivity = tf.reduce_max(sing_vals) gamma = 2*Delta2/(L*sensitivity) #2*3*(14*14 + 2)*16/(L*sensitivity) #with tf.variable_scope('conv2') as scope: kernel2 = _variable_with_weight_decay('kernel2', shape=[5, 5, 128, 128], stddev=np.sqrt(2.0/(5*5*256))/math.ceil(5 / 2), wd=0.0, collect=[CONV_VARIABLES]) biases2 = _bias_on_cpu('biases2', [128], tf.constant_initializer(0.1), collect=[CONV_VARIABLES]) #with tf.variable_scope('conv3') as scope: kernel3 = _variable_with_weight_decay('kernel3', shape=[5, 5, 256, 256], stddev=np.sqrt(2.0/(5*5*256))/math.ceil(5 / 2), wd=0.0, collect=[CONV_VARIABLES]) biases3 = _bias_on_cpu('biases3', [256], tf.constant_initializer(0.1), collect=[CONV_VARIABLES]) #with tf.variable_scope('local4') as scope: kernel4 = _variable_with_weight_decay('kernel4', shape=[int(image_size/4)**2*256, hk], stddev=0.04, wd=0.004, collect=[CONV_VARIABLES]) biases4 = _bias_on_cpu('biases4', [hk], tf.constant_initializer(0.1), collect=[CONV_VARIABLES]) #with tf.variable_scope('local5') as scope: kernel5 = 
_variable_with_weight_decay('kernel5', [hk, 10], stddev=np.sqrt(2.0/(int(image_size/4)**2*256))/math.ceil(5 / 2), wd=0.0, collect=[CONV_VARIABLES]) biases5 = _bias_on_cpu('biases5', [10], tf.constant_initializer(0.1), collect=[CONV_VARIABLES]) #scale2 = tf.Variable(tf.ones([hk])) #beta2 = tf.Variable(tf.zeros([hk])) params = [kernel1, biases1, kernel2, biases2, kernel3, biases3, kernel4, biases4, kernel5, biases5] ######## # Build a Graph that computes the logits predictions from the # inference model. FM_h = tf.placeholder(tf.float32, [None, 14, 14, 128]); noise = tf.placeholder(tf.float32, [None, image_size, image_size, 3]); adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 3]); x = tf.placeholder(tf.float32, [None,image_size,image_size,3]); adv_x = tf.placeholder(tf.float32, [None,image_size,image_size,3]); # Auto-Encoder # Enc_Layer2 = EncLayer(inpt=adv_x, n_filter_in = 3, n_filter_out = 128, filter_size = 3, W=kernel1, b=biases1, activation=tf.nn.relu) pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_x)[0], Delta = Delta2, epsilon = epsilon2, batch_size = L, learning_rate= learning_rate, W = kernel1, b = biases1, perturbFMx = adv_noise, perturbFM_h = FM_h) Enc_Layer3 = EncLayer(inpt=x, n_filter_in = 3, n_filter_out = 128, filter_size = 3, W=kernel1, b=biases1, activation=tf.nn.relu) pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x)[0], Delta = Delta2, epsilon = epsilon2, batch_size = L, learning_rate= learning_rate, W = kernel1, b = biases1, perturbFMx = noise, perturbFM_h = FM_h) cost = tf.reduce_sum((Enc_Layer2.cost + Enc_Layer3.cost)/2.0); ### x_image = x + noise; y_conv = inference(x_image, FM_h, params); softmax_y_conv = tf.nn.softmax(y_conv) y_ = tf.placeholder(tf.float32, [None, 10]); adv_x += adv_noise y_adv_conv = inference(adv_x, FM_h, params) adv_y_ = tf.placeholder(tf.float32, [None, 10]); # Calculate loss. 
Apply Taylor Expansion for the output layer perturbW = perturbFM*params[8] loss = cifar10.TaylorExp(y_conv, y_, y_adv_conv, adv_y_, L, alpha, perturbW) # Build a Graph that trains the model with one batch of examples and # updates the model parameters. #pretrain_step = tf.train.AdamOptimizer(1e-4).minimize(pretrain_adv, global_step=global_step, var_list=[kernel1, biases1]); pretrain_var_list = tf.get_collection(AECODER_VARIABLES) train_var_list = tf.get_collection(CONV_VARIABLES) #print(pretrain_var_list) #print(train_var_list) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): pretrain_step = tf.train.AdamOptimizer(learning_rate).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list); train_op = cifar10.train(loss, global_step, learning_rate, _var_list= train_var_list) sess = tf.Session(config=tf.ConfigProto(log_device_placement=False)) sess.run(kernel1.initializer) dp_epsilon=1.0 _gamma = sess.run(gamma) _gamma_x = Delta2/L epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x) print(epsilon2_update/_gamma + epsilon2_update/_gamma_x) print(epsilon2_update) delta_r = fgsm_eps*(image_size**2); _sensitivityW = sess.run(sensitivity) delta_h = _sensitivityW*(14**2) #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon) #dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon) dp_mult = (Delta2*dp_epsilon) / (L*epsilon2_update * (delta_h / 2 + delta_r)) dynamic_eps = tf.placeholder(tf.float32); """y_test = inference(x, FM_h, params) softmax_y = tf.nn.softmax(y_test); c_x_adv = fgsm(x, softmax_y, eps=dynamic_eps/3, clip_min=-1.0, clip_max=1.0) x_adv = tf.reshape(c_x_adv, [L, image_size, image_size, 3])""" attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 'mim':True, 
'spsa':False, 'cwl2':False, 'madry':True, 'stm':False} ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', params=params, image_size=image_size, adv_noise = adv_noise) # define each attack method's tensor mu_alpha = tf.placeholder(tf.float32, [1]); attack_tensor_dict = {} # FastGradientMethod if attack_switch['fgsm']: print('creating attack tensor of FastGradientMethod') fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess) #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now attack_tensor_dict['fgsm'] = x_adv_test_fgsm # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init) # default: eps_iter=0.05, nb_iter=10 if attack_switch['ifgsm']: print('creating attack tensor of BasicIterativeMethod') ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/3, nb_iter=3, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm # MomentumIterativeMethod # default: eps_iter=0.06, nb_iter=10 if attack_switch['mim']: print('creating attack tensor of MomentumIterativeMethod') mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/3, nb_iter=3, decay_factor=1.0, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['mim'] = x_adv_test_mim # MadryEtAl (Projected Grdient with random init, same as rand+fgsm) # default: eps_iter=0.01, nb_iter=40 if attack_switch['madry']: print('creating attack tensor of 
MadryEtAl') madry_obj = MadryEtAl(model=ch_model_probs, sess=sess) #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/3, nb_iter=3, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['madry'] = x_adv_test_madry #====================== attack ========================= #adv_logits, _ = inference(c_x_adv + W_conv1Noise, perturbFM, params) # Create a saver. saver = tf.train.Saver(tf.all_variables()) # Build an initialization operation to run below. init = tf.initialize_all_variables() sess.run(init) # Start the queue runners. #tf.train.start_queue_runners(sess=sess) summary_writer = tf.summary.FileWriter(os.getcwd() + dirCheckpoint, sess.graph) # load the most recent models _global_step = 0 ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir) if ckpt and ckpt.model_checkpoint_path: print(ckpt.model_checkpoint_path); saver.restore(sess, ckpt.model_checkpoint_path) _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) else: print('No checkpoint file found') T = int(int(math.ceil(D/L))*epochs + 1) # number of steps step_for_epoch = int(math.ceil(D/L)); #number of steps for one epoch perturbH_test = np.random.laplace(0.0, 0, 14*14*128) perturbH_test = np.reshape(perturbH_test, [-1, 14, 14, 128]); #W_conv1Noise = np.random.laplace(0.0, Delta2/(L*epsilon2), 32 * 32 * 3).astype(np.float32) #W_conv1Noise = np.reshape(_W_conv1Noise, [32, 32, 3]) perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*128) perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 128]); #_W_adv = np.random.laplace(0.0, 0, 32 * 32 * 3).astype(np.float32) #_W_adv = np.reshape(_W_adv, [32, 32, 3]) #_perturbFM_h_adv = np.random.laplace(0.0, 0, 10*10*128) #_perturbFM_h_adv = np.reshape(_perturbFM_h_adv, [10, 10, 128]); test_size = len(cifar10_data.test.images) #beta = redistributeNoise(os.getcwd() + 
'/LRP_0_25_v12.txt') #BenignLNoise = generateIdLMNoise(image_size, Delta2, eps_benign, L) #generateNoise(image_size, Delta2, eps_benign, L, beta); #AdvLnoise = generateIdLMNoise(image_size, Delta2, eps_adv, L) Noise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L) #generateNoise(image_size, Delta2, eps_adv, L, beta); Noise_test = generateIdLMNoise(image_size, 0, epsilon2_update, L) #generateNoise(image_size, 0, 2*epsilon2, test_size, beta); emsemble_L = int(L/3) preT_epochs = 100 pre_T = int(int(math.ceil(D/L))*preT_epochs + 1); """logfile.write("pretrain: \n") for step in range(_global_step, _global_step + pre_T): d_eps = random.random()*0.5; batch = cifar10_data.train.next_batch(L); #Get a random batch. adv_images = sess.run(x_adv, feed_dict = {x: batch[0], dynamic_eps: d_eps, FM_h: perturbH_test}) for iter in range(0, 2): adv_images = sess.run(x_adv, feed_dict = {x: adv_images, dynamic_eps: d_eps, FM_h: perturbH_test}) #sess.run(pretrain_step, feed_dict = {x: batch[0], noise: AdvLnoise, FM_h: perturbFM_h}); batch = cifar10_data.train.next_batch(L); sess.run(pretrain_step, feed_dict = {x: np.append(batch[0], adv_images, axis = 0), noise: Noise, FM_h: perturbFM_h}); if step % int(25*step_for_epoch) == 0: cost_value = sess.run(cost, feed_dict={x: cifar10_data.test.images, noise: Noise_test, FM_h: perturbH_test})/(test_size*128) logfile.write("step \t %d \t %g \n"%(step, cost_value)) print(cost_value) print('pre_train finished')""" _global_step = 0 for step in xrange(_global_step, _global_step + T): start_time = time.time() d_eps = random.random()*0.5; batch = cifar10_data.train.next_batch(emsemble_L); #Get a random batch. 
y_adv_batch = batch[1] """adv_images = sess.run(x_adv, feed_dict = {x: batch[0], dynamic_eps: d_eps, FM_h: perturbH_test}) for iter in range(0, 2): adv_images = sess.run(x_adv, feed_dict = {x: adv_images, dynamic_eps: d_eps, FM_h: perturbH_test})""" adv_images_ifgsm = sess.run(attack_tensor_dict['ifgsm'], feed_dict ={x:batch[0], adv_noise: Noise, mu_alpha:[d_eps]}) batch = cifar10_data.train.next_batch(emsemble_L); y_adv_batch = np.append(y_adv_batch, batch[1], axis = 0) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict ={x:batch[0], adv_noise: Noise, mu_alpha:[d_eps]}) batch = cifar10_data.train.next_batch(emsemble_L); y_adv_batch = np.append(y_adv_batch, batch[1], axis = 0) adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict ={x:batch[0], adv_noise: Noise, mu_alpha:[d_eps]}) adv_images = np.append(np.append(adv_images_ifgsm, adv_images_mim, axis = 0),adv_images_madry, axis = 0) batch = cifar10_data.train.next_batch(L); #Get a random batch. sess.run(pretrain_step, feed_dict = {x: batch[0], adv_x: adv_images, adv_noise: Noise_test, noise: Noise, FM_h: perturbFM_h}); _, loss_value = sess.run([train_op, loss], feed_dict = {x: batch[0], y_: batch[1], adv_x: adv_images, adv_y_: y_adv_batch, noise: Noise, adv_noise: Noise_test, FM_h: perturbFM_h}) duration = time.time() - start_time assert not np.isnan(loss_value), 'Model diverged with loss = NaN' # report the result periodically if step % (50*step_for_epoch) == 0 and step >= (300*step_for_epoch): '''predictions_form_argmax = np.zeros([test_size, 10]) softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: cifar10_data.test.images, noise: Noise_test, FM_h: perturbH_test}) argmax_predictions = np.argmax(softmax_predictions, axis=1) """for n_draws in range(0, 2000): _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2, L) _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*128) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 128]);""" for j in 
range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 2000; """softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: cifar10_data.test.images, noise: _BenignLNoise, FM_h: _perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1)""" final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(cifar10_data.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / dp_mult is_robust.append(robustness_from_argmax >= fgsm_eps) acc = np.sum(is_correct)*1.0/test_size robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_utility = np.sum(is_robust)*1.0/test_size log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(step, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility)''' #===================adv samples===================== log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(step, total_eps) """adv_images_dict = {} for atk in attack_switch.keys(): if attack_switch[atk]: adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict ={x:cifar10_data.test.images}) print("Done with the generating of Adversarial samples")""" #===================adv samples===================== adv_acc_dict = {} robust_adv_acc_dict = {} robust_adv_utility_dict = {} test_bach_size = 5000 for atk in attack_switch.keys(): print(atk) if atk not in adv_acc_dict: adv_acc_dict[atk] = -1 robust_adv_acc_dict[atk] = -1 robust_adv_utility_dict[atk] = -1 if attack_switch[atk]: test_bach = cifar10_data.test.next_batch(test_bach_size) adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict ={x:test_bach[0], adv_noise: Noise_test, mu_alpha:[fgsm_eps]}) print("Done adversarial examples") ### 
PixelDP Robustness ### predictions_form_argmax = np.zeros([test_bach_size, 10]) softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: Noise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 1000): _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*128) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 128]); if n_draws == 500: print("n_draws = 500") for j in range(test_bach_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: (_BenignLNoise/10 + Noise), FM_h: perturbFM_h}) * sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: Noise, FM_h: (_perturbFM_h/10 + perturbFM_h)}) #softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h}) * sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: Noise, FM_h: (_perturbFM_h)}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_bach_size): is_correct.append(np.argmax(test_bach[1][j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=dp_epsilon, dp_delta=0.05, dp_mechanism='laplace') / dp_mult is_robust.append(robustness_from_argmax >= fgsm_eps) adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_bach_size robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_bach_size ############################## for atk in attack_switch.keys(): if attack_switch[atk]: # added robust prediction log_str += " {}: {:.4f} {:.4f} {:.4f} 
{:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk] * robust_adv_utility_dict[atk]) print(log_str) logfile.write(log_str + '\n') # Save the model checkpoint periodically. if step % (10*step_for_epoch) == 0 and (step > _global_step): num_examples_per_step = L examples_per_sec = num_examples_per_step / duration sec_per_batch = float(duration) format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f ' 'sec/batch)') print (format_str % (datetime.now(), step, loss_value, examples_per_sec, sec_per_batch)) """if step % (50*step_for_epoch) == 0 and (step >= 900*step_for_epoch):
def evaluate_model(filepath,
                   train_start=0, train_end=60000, test_start=0,
                   test_end=10000, batch_size=128,
                   testing=False, num_threads=None):
    """
    Run evaluation on a saved model.

    Loads the model stored at `filepath` and prints its accuracy on the
    MNIST test split, first on clean inputs and then on FGSM adversarial
    inputs (eps=0.3, values clipped to [0, 1]).

    :param filepath: path to model to evaluate
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param batch_size: size of evaluation batches
    :param testing: not read by this function
    :param num_threads: if truthy, the session is restricted to a single
        intra-op thread; the numeric value itself is not forwarded
    :return: None, the accuracies are printed
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.INFO)

    # Create TF session
    # NOTE(review): the thread count is hard-coded to 1 whenever num_threads
    # is truthy -- confirm whether num_threads should be passed through.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Nothing created here is returned to the caller, so the session can be
    # released on every exit path instead of being leaked.
    try:
        # Get MNIST data (only the test split is evaluated below)
        x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                      train_end=train_end,
                                                      test_start=test_start,
                                                      test_end=test_end)

        # Use Image Parameters
        img_rows, img_cols, nchannels = x_train.shape[1:4]
        nb_classes = y_train.shape[1]

        # Define input TF placeholder
        x = tf.placeholder(tf.float32,
                           shape=(None, img_rows, img_cols, nchannels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        eval_params = {'batch_size': batch_size}
        fgsm_params = {
            'eps': 0.3,
            'clip_min': 0.,
            'clip_max': 1.
        }

        def do_eval(preds, x_set, y_set, report_key, is_adv=None):
            # report_key is accepted for signature parity but is not used.
            acc = model_eval(sess, x, y, preds, x_set, y_set,
                             args=eval_params)
            if is_adv is None:
                report_text = None
            elif is_adv:
                report_text = 'adversarial'
            else:
                report_text = 'legitimate'
            if report_text:
                print('Test accuracy on %s examples: %0.4f'
                      % (report_text, acc))

        # The model is loaded with this session installed as the default.
        with sess.as_default():
            model = load(filepath)
        assert len(model.get_params()) > 0

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)
        preds = model.get_logits(x)

        # Evaluate the accuracy of the model on clean and then on
        # adversarial test examples
        do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False)
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
    finally:
        sess.close()
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: overwritten below with the label width of the
        training set, so the passed value has no effect
    :param batch_size: batch size forwarded to training and evaluation
    :param learning_rate: learning rate forwarded to both training helpers
    :param nb_epochs: epoch count forwarded to `prep_bbox`
    :param holdout: number of leading test examples reserved for the
        adversary's substitute training set
    :param data_aug: forwarded to `train_sub`
    :param nb_epochs_s: forwarded to `train_sub`
    :param lmbda: forwarded to `train_sub`
    :param aug_batch_size: forwarded to `train_sub`
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. The call is made outside an `assert` so that
    # it still runs under `python -O`, which strips assert statements.
    if not setup_tutorial():
        raise AssertionError("setup_tutorial() reported failure")

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
def test_generate_respects_dtype(self):
    """Verify that a float64-configured attack produces a float64 tensor."""
    float64_attack = FastGradientMethod(
        self.model,
        sess=self.sess,
        dtypestr='float64',
    )
    # The attack is stored on the test case, replacing any earlier one.
    self.attack = float64_attack

    inputs = tf.placeholder(dtype=tf.float64, shape=(100, 2))
    adversarial = float64_attack.generate(inputs)

    self.assertEqual(adversarial.dtype, tf.float64)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64, num_threads=None,
                   label_smoothing=True):
    """
    MNIST cleverhans tutorial

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, first train and evaluate a model on clean
        examples only, before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
        to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
        example construction process during adversarial training.
    :param nb_filters: filter count passed to both ModelBasicCNN instances
    :param num_threads: if truthy, the session is restricted to a single
        intra-op thread; the numeric value itself is not forwarded
    :param label_smoothing: if truthy, training labels are clipped to
        [0.1 / (nb_classes - 1), 0.9] before training
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes-1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])
    # The session configured above is reused for everything below. A second
    # `tf.Session()` used to be created here, which discarded the thread
    # configuration and leaked the first session.

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        # Stores the accuracy on `report` under `report_key`, and prints it
        # when is_adv is True or False (None keeps the evaluation silent).
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=0.1)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = LossCrossEntropy(model2, smoothing=0.1, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1):
    """
    CleverHans Keras tutorial.

    Despite the function name, the data is loaded through the `CIFAR10`
    dataset class. A Keras CNN is trained (or restored from a checkpoint),
    evaluated on clean and FGSM adversarial inputs, and then a second CNN
    is adversarially trained and evaluated the same way.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, accuracies on the training set are computed too
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get the train and test splits from the CIFAR10 dataset object
    cifar10 = CIFAR10(train_start=train_start, train_end=train_end,
                      test_start=test_start, test_end=test_end)
    x_train, y_train = cifar10.get_set('train')
    x_test, y_test = cifar10.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the first model on legitimate test
        # examples and record it on the report
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        # assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Parameters for training the first model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    # Create the checkpoint directory if it does not exist yet
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    # False (not None) marks "no checkpoint available" for the test below
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    # Restore from the checkpoint only when asked to AND one was found;
    # otherwise train from scratch
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the first model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Build a second, independent model and its own FGSM attack
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        # Reuses the fgsm_params defined for the first model's attack
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x_train, y_train, evaluate=evaluate_2,
          args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    Measure how well scalar quantization detects FGSM adversarial examples.

    An MNIST CNN is trained (or restored from ``train_dir``), FGSM
    adversarial examples are generated for the first test images, and for
    each quantization interval size the clean and adversarial images are
    passed through ``scalarQuantization``. A sample is flagged when the
    filter changes the predicted label. Only samples that the model
    classifies correctly and that FGSM actually fools are counted:

    * TP:  filtering changed the label of the adversarial example
    * TTP: a TP whose filtered label equals the true class
    * FN:  filtering left the adversarial label unchanged
    * FP:  filtering changed the label of the clean example

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: accepted for interface compatibility; not used here
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session (GPU memory grows on demand) and hand it to Keras
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    def ratio(numerator, denominator):
        # True division on both Python 2 and 3; NaN when the ratio is
        # undefined instead of raising ZeroDivisionError.
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model, or restore it from the latest checkpoint
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True, rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    # Bound every later index by what was actually generated, so a short
    # test slice (test_end - test_start < 4500) does not overrun the arrays.
    num_adv = adv_x.shape[0]
    num_eval = min(1000, num_adv)

    # NOTE(review): the return value is discarded, so normalization()
    # presumably rescales each slice in place -- confirm against the helper.
    for i in range(num_adv):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')

    stats_format = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d'
    intervals = [128, 85, 64, 51, 43, 37, 32, 28, 26]
    for intervalIndex, interval_size in enumerate(intervals):
        startTime = time.time()
        print('NBinterval = ', intervalIndex + 2, '; interval size = ',
              interval_size)
        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0
        TP = 0
        FN = 0
        FP = 0

        for i in range(num_eval):
            sample_shape = X_test[i:(i + 1)].shape
            current_class = int(np.argmax(Y_test[i]))

            # Skip samples the model already gets wrong
            currentXLabel = model_argmax(sess, x, predictions,
                                         X_test[i:(i + 1)])
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            # Skip samples on which the attack failed
            currentAdvXLabel = model_argmax(sess, x, predictions,
                                            adv_x[i:(i + 1)])
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            # Quantize the clean image and re-classify it
            currentX = np.reshape(X_test[i:(i + 1)], (28, 28))
            currentX = scalarQuantization(currentX, interval_size)
            currentX = np.reshape(currentX, sample_shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            # Quantize the adversarial image and re-classify it
            currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
            currentAdvX = scalarQuantization(currentAdvX, interval_size)
            currentAdvX = np.reshape(currentAdvX, sample_shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            # Progress line every 1000 processed indices
            if (i + 1) % 1000 == 0:
                print(stats_format % (test_number,
                                      original_classified_wrong_number,
                                      disturbed_failure_number,
                                      TP, FN, FP, TTP))

        print(stats_format % (test_number,
                              original_classified_wrong_number,
                              disturbed_failure_number,
                              TP, FN, FP, TTP))

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')

        Recall = ratio(TP, TP + FN)
        Precision = ratio(TP, TP + FP)
        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
def test_attacks(batch_size=128, source_samples=10,
                 model_path=os.path.join("models", "mnist"), targeted=True):
    """
    Test many attacks on MNIST with deep Bayes classifier.

    The classifier and the attack are selected from the command line:
    ``sys.argv[1]`` is the model name ('bayes', 'cnn' or 'wgan'),
    ``sys.argv[2]`` the attack name ('fgsm', 'bim', 'mim', 'jsma', 'vat',
    'cw', 'elastic', 'deepfool' or 'madry'), ``sys.argv[3]`` the integer K
    (read for 'bayes' and 'wgan') and ``sys.argv[4]`` the integer T (read
    for 'wgan'). Adversarial images are plotted under ``figs/`` and the
    results are pickled under ``results/``.

    :param batch_size: size of the batches fed to the model and the attack;
                       capped at source_samples
    :param source_samples: number of test inputs to attack
    :param model_path: path to the model file (not read by this function)
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    :raises ValueError: if the model name or attack name is not supported
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session. The config has to be passed to the session,
    # otherwise the allow_growth setting is silently ignored.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    from cleverhans.utils_mnist import data_mnist
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    img_rows, img_cols, channels = X_train[0].shape
    nb_classes = Y_train.shape[1]

    # Define input TF placeholders; the batch dimension is fixed, so every
    # feed below must contain exactly batch_size rows.
    batch_size = min(batch_size, source_samples)
    x = tf.placeholder(tf.float32,
                       shape=(batch_size, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(batch_size, nb_classes))

    # Define TF model graph
    model_name = str(sys.argv[1])
    if model_name == 'bayes':
        from load_bayes_classifier import BayesModel
        conv = True
        checkpoint = 0
        K = int(sys.argv[3])
        use_mean = True
        model = BayesModel(sess, 'mnist', conv, K, checkpoint=checkpoint,
                           attack_snapshot=False, use_mean=use_mean)
        if use_mean:
            model_name = 'bayes_mean_mlp'
        else:
            model_name = 'bayes_K%d' % K
    elif model_name == 'cnn':
        from load_cnn_classifier import CNNModel
        model = CNNModel(sess, 'mnist')
    elif model_name == 'wgan':
        from load_wgan_classifier import WGANModel
        conv = True
        checkpoint = 0
        K = int(sys.argv[3])
        T = int(sys.argv[4])
        model = WGANModel(sess, 'mnist', conv, K, T, checkpoint=checkpoint)
        model_name = 'wgan_K%d_T%d' % (K, T)
    else:
        # Fail with a clear message instead of a NameError on `model`
        raise ValueError('Unknown model name: %s' % model_name)

    preds = model.predict(x, softmax=True)  # output probabilities
    print("Defined TensorFlow model graph.")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Craft adversarial examples
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Make adv inputs and labels for the attack
    if targeted:
        # Every source image is repeated once per class and paired with each
        # one-hot target in turn.
        adv_inputs = np.array([[instance] * nb_classes
                               for instance in X_test[:source_samples]],
                              dtype=np.float32)
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, channels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
    else:
        adv_inputs = X_test[:source_samples]
        adv_ys = Y_test[:source_samples]

    # Instantiate an attack object.
    # attack name -> (cleverhans.attacks class, attack_config factory,
    #                 whether the attack is given softmax probabilities)
    attack_specs = {
        'fgsm': ('FastGradientMethod', 'config_fgsm', True),
        'bim': ('BasicIterativeMethod', 'config_bim', True),
        'mim': ('MomentumIterativeMethod', 'config_mim', True),
        'jsma': ('SaliencyMapMethod', 'config_jsma', True),
        'vat': ('VirtualAdversarialMethod', 'config_vat', False),
        'cw': ('CarliniWagnerL2', 'config_cw', False),
        'elastic': ('ElasticNetMethod', 'config_elastic', False),
        'deepfool': ('DeepFool', 'config_deepfool', False),
        'madry': ('MadryEtAl', 'config_madry', True),
    }
    attack_method = str(sys.argv[2])
    if attack_method not in attack_specs:
        # Fail with a clear message instead of a NameError on `attack`
        raise ValueError('Unknown attack method: %s' % attack_method)
    attack_class_name, config_name, use_softmax = attack_specs[attack_method]

    from cleverhans import attacks as cleverhans_attacks
    import attack_config

    def model_fn(inputs):
        return model.predict(inputs, softmax=use_softmax)

    attack_class = getattr(cleverhans_attacks, attack_class_name)
    attack = attack_class(model_fn, sess=sess)
    attack_params = getattr(attack_config, config_name)(targeted, adv_ys)
    attack_params['batch_size'] = batch_size
    print('batchsize', batch_size)

    # Perform the attack!
    # NOTE(review): only whole batches are processed; if the number of
    # inputs is not a multiple of batch_size the trailing inputs are
    # dropped and the comparisons against adv_ys / adv_inputs below will
    # not line up -- confirm callers always pass compatible sizes.
    n_batch = adv_inputs.shape[0] // batch_size
    adv = []
    for i in range(n_batch):
        adv_batch = adv_inputs[i * batch_size:(i + 1) * batch_size]
        adv.append(attack.generate_np(adv_batch, **attack_params))
    adv = np.concatenate(adv, axis=0)

    # Classify the adversarial examples five times, printing the predicted
    # labels of each pass; the last pass is the one that is scored.
    for _ in range(5):
        y_adv = []
        for i in range(n_batch):
            adv_batch = adv[i * batch_size:(i + 1) * batch_size]
            y_adv.append(sess.run(preds, {x: adv_batch}))
        y_adv = np.concatenate(y_adv, axis=0)
        print('--------------------------------------')
        for i in range(10):
            print(np.argmax(y_adv[i * 10:(i + 1) * 10], 1))

    correct_pred = np.asarray(np.argmax(y_adv, 1) == np.argmax(adv_ys, 1),
                              dtype='f')
    adv_accuracy = np.mean(correct_pred)

    if not targeted:
        # For untargeted attacks adv_ys holds the true labels, so the
        # success rate is the complement of the accuracy.
        adv_accuracy = 1. - adv_accuracy

    print('--------------------------------------')

    print(np.argmax(adv_ys[:10], 1))
    print(np.argmax(y_adv[:10], 1))
    for i in range(5):
        # The placeholder only accepts batch_size rows, so feed exactly one
        # batch (a fixed 100-row slice fails unless batch_size == 100).
        tmp = sess.run(preds, {x: adv[:batch_size]})
        print(np.argmax(tmp[:10], 1))

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    mean_l2_perturbation = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(
        mean_l2_perturbation))

    # Close TF session
    sess.close()

    # Shared file stem for the figures and the pickled results
    filename = model_name + '_' + attack_method
    if targeted:
        filename = filename + '_targeted'
    else:
        filename = filename + '_untargeted'

    # visualisation
    vis_adv = True
    if vis_adv:
        N_vis = 100
        sys.path.append('../../utils')
        from visualisation import plot_images
        if channels == 1:
            shape = (img_rows, img_cols)
        else:
            shape = (img_rows, img_cols, channels)
        path = 'figs/'
        plot_images(adv_inputs[:N_vis], shape, path, filename + '_data')
        plot_images(adv[:N_vis], shape, path, filename + '_adv')

    save_result = True
    if save_result:
        path = 'results/'
        if targeted:
            y_input = adv_ys
        else:
            y_input = Y_test[:source_samples]
        results = [adv_inputs, y_input, adv, y_adv]
        import pickle
        # Binary mode is required by pickle on Python 3, and the context
        # manager guarantees the file is flushed and closed.
        with open(path + filename + '.pkl', 'wb') as result_file:
            pickle.dump(results, result_file)
        print("results saved at %s.pkl" % filename)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial.

    Trains (or restores) a CNN on MNIST, measures its accuracy on clean and
    FGSM-perturbed test data, then builds a second CNN, trains it
    adversarially and measures the same two accuracies again.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True to restore the latest checkpoint in train_dir
                       (when one exists) instead of training
    :param testing: if true, accuracies on the training set are recorded too
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # All accuracies measured below are collected on this object
    report = AccuracyReport()

    # Seed TensorFlow so that runs are more reproducible
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    dim_ordering = keras.backend.image_dim_ordering()
    if dim_ordering != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One session, shared with the Keras backend
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load the requested slices of MNIST
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Label smoothing: clip the one-hot training labels away from 0 and 1
    assert Y_train.shape[1] == 10
    label_smooth = .1
    smooth_low = label_smooth / 9.
    smooth_high = 1. - label_smooth
    Y_train = Y_train.clip(smooth_low, smooth_high)

    # Input and label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # First model: plain training
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def batch_args():
        # Fresh evaluation-argument dict for every use
        return {'batch_size': batch_size}

    def evaluate():
        # Clean test accuracy of the first model
        clean_acc = model_eval(sess, x, y, preds, X_test, Y_test,
                               args=batch_args())
        report.clean_train_clean_eval = clean_acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Training configuration shared by both training runs
    train_params = dict(nb_epochs=nb_epochs,
                        batch_size=batch_size,
                        learning_rate=learning_rate,
                        train_dir=train_dir,
                        filename=filename)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt is None:
        ckpt_path = False
    else:
        ckpt_path = ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])

    if not (load_model and ckpt_path):
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, save=True, rng=rng)
    else:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()

    # Clean accuracy on the training set
    if testing:
        train_acc = model_eval(sess, x, y, preds, X_train, Y_train,
                               args=batch_args())
        report.train_clean_train_clean_eval = train_acc

    # Build the FGSM attack graph against the first model
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Treat the adversarial input as a constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Adversarial test accuracy of the first model
    adv_acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=batch_args())
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    # Adversarial accuracy on the training set
    if testing:
        adv_train_acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                                   args=batch_args())
        report.train_clean_train_adv_eval = adv_train_acc

    print("Repeating the process, using adversarial training")

    # Second model: adversarial training
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        shared_args = batch_args()

        # Second model on clean test inputs
        acc_clean = model_eval(sess, x, y, preds_2, X_test, Y_test,
                               args=shared_args)
        print('Test accuracy on legitimate examples: %0.4f' % acc_clean)
        report.adv_train_clean_eval = acc_clean

        # Second model on adversarial test inputs
        acc_adv = model_eval(sess, x, y, preds_2_adv, X_test,
                             Y_test, args=shared_args)
        print('Test accuracy on adversarial examples: %0.4f' % acc_adv)
        report.adv_train_adv_eval = acc_adv

    # Adversarial training, evaluated through evaluate_2
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, save=False, rng=rng)

    # Training-set accuracies of the second model
    if testing:
        shared_args = batch_args()
        report.train_adv_train_clean_eval = model_eval(
            sess, x, y, preds_2, X_train, Y_train, args=shared_args)
        report.train_adv_train_adv_eval = model_eval(
            sess, x, y, preds_2_adv, X_train, Y_train, args=shared_args)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial, extended to evaluate cross-shaped filters as
    detectors of FGSM adversarial examples.

    An MNIST CNN is trained (or restored from ``train_dir``), FGSM
    adversarial examples are generated for the first test images, and for
    each cross kernel (sizes 3, 5, 7, 9) the clean and adversarial images
    are passed through ``diamondAndCrossFilterOperations``. A sample is
    flagged when the filter changes the predicted label. Only samples that
    the model classifies correctly and that FGSM actually fools are counted:

    * TP:  filtering changed the label of the adversarial example
    * TTP: a TP whose filtered label equals the true class
    * FN:  filtering left the adversarial label unchanged
    * FP:  filtering changed the label of the clean example

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: accepted for interface compatibility; not used here
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session (GPU memory grows on demand) and set it as the
    # Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    def ratio(numerator, denominator):
        # True division on both Python 2 and 3; NaN when the ratio is
        # undefined instead of raising ZeroDivisionError.
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model, or restore it from the latest checkpoint
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True, rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    # Bound every later index by what was actually generated, so a short
    # test slice (test_end - test_start < 4500) does not overrun the arrays.
    num_adv = adv_x.shape[0]

    # NOTE(review): the return value is discarded, so normalization()
    # presumably rescales each slice in place -- confirm against the helper.
    for i in range(num_adv):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')

    stats_format = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d'

    # Cross filter test, kernel size: 3, 5, 7, 9
    for kernelSize in range(3, 10, 2):
        startTime = time.time()
        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0
        TP = 0
        FN = 0
        FP = 0

        # Offsets passed to the filter: half the kernel width from each
        # border of the 28x28 image
        start = (kernelSize - 1) // 2
        end = 28 - start

        # Cross-shaped kernel: ones on the middle row and middle column
        cross = np.zeros((kernelSize, kernelSize), dtype=int)
        cross[start, :] = 1
        cross[:, start] = 1
        # Number of ones in the cross: 5, 9, 13, 17 for sizes 3, 5, 7, 9
        coefficient = 2 * kernelSize - 1

        print('cross filter')
        print(cross)

        for i in range(num_adv):
            sample_shape = X_test[i:(i + 1)].shape
            current_class = int(np.argmax(Y_test[i]))

            # Skip samples the model already gets wrong
            currentXLabel = model_argmax(sess, x, predictions,
                                         X_test[i:(i + 1)])
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            # Skip samples on which the attack failed
            currentAdvXLabel = model_argmax(sess, x, predictions,
                                            adv_x[i:(i + 1)])
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            # Filter the clean image and re-classify it
            currentX = np.reshape(X_test[i:(i + 1)], (28, 28))
            currentX = diamondAndCrossFilterOperations(
                currentX, cross, start, end, coefficient)
            currentX = np.reshape(currentX, sample_shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            # Filter the adversarial image and re-classify it
            currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
            currentAdvX = diamondAndCrossFilterOperations(
                currentAdvX, cross, start, end, coefficient)
            currentAdvX = np.reshape(currentAdvX, sample_shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            # Progress line every 1000 processed indices
            if (i + 1) % 1000 == 0:
                print(stats_format % (test_number,
                                      original_classified_wrong_number,
                                      disturbed_failure_number,
                                      TP, FN, FP, TTP))

        print(stats_format % (test_number,
                              original_classified_wrong_number,
                              disturbed_failure_number,
                              TP, FN, FP, TTP))

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')

        Recall = ratio(TP, TP + FN)
        Precision = ratio(TP, TP + FP)
        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial

    Trains a CNN on CIFAR10 and prints its accuracy on clean and FGSM test
    inputs, then trains a second CNN adversarially and prints the same
    accuracies for it. Hyper-parameters are read from FLAGS.
    :return:
    """
    # Seed TensorFlow so that runs are more reproducible
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    dim_ordering = keras.backend.image_dim_ordering()
    if dim_ordering != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One session, shared with the Keras backend
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load CIFAR10
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Label smoothing: clip the one-hot training labels away from 0 and 1
    assert Y_train.shape[1] == 10.
    label_smooth = .1
    smooth_low = label_smooth / 9.
    smooth_high = 1. - label_smooth
    Y_train = Y_train.clip(smooth_low, smooth_high)

    # Input and label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Both models are built with the same input geometry
    cifar_shape = {'img_rows': 32, 'img_cols': 32, 'channels': 3}

    # First model: plain training
    model = cnn_model(**cifar_shape)
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Clean test accuracy of the first model
        clean_args = {'batch_size': FLAGS.batch_size}
        clean_acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                               args=clean_args)
        assert X_test.shape[0] == 10000, X_test.shape
        message = 'Test accuracy on legitimate test examples: '
        print(message + str(clean_acc))

    # Training configuration shared by both training runs
    train_params = dict(nb_epochs=FLAGS.nb_epochs,
                        batch_size=FLAGS.batch_size,
                        learning_rate=FLAGS.learning_rate)
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft FGSM adversarial versions of the whole test set
    fgsm = FastGradientMethod(model)
    adv_x = fgsm.generate(x, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    adv_batches = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    X_test_adv, = adv_batches
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # First model on the adversarial test set
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")

    # Second model: adversarial training
    model_2 = cnn_model(**cifar_shape)
    predictions_2 = model_2(x)
    fgsm_2 = FastGradientMethod(model_2)
    adv_x_2 = fgsm_2.generate(x, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        shared_args = {'batch_size': FLAGS.batch_size}

        # Second model on clean test inputs
        acc_clean = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                               args=shared_args)
        print('Test accuracy on legitimate test examples: ' + str(acc_clean))

        # Second model on adversarial test inputs
        acc_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                             Y_test, args=shared_args)
        print('Test accuracy on adversarial examples: ' + str(acc_adv))

    # Adversarial training, evaluated through evaluate_2
    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv, evaluate=evaluate_2,
                args=train_params)

    # Final adversarial accuracy of the adversarially trained model
    accuracy = model_eval(sess, x, y, predictions_2_adv, X_test, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))
def main(argv):
    """
    Attack a restored CIFAR10 model and print its adversarial accuracy.

    The latest checkpoint in ``FLAGS.checkpoint_dir`` is restored into the
    network returned by ``make_madry_wresnet``, the attack named by
    ``FLAGS.attack_type`` ('cwl2', 'fgsm' or 'pgd') is run on the first
    ``FLAGS.nb_samples`` evaluation images from ``FLAGS.dataset_dir``, and
    the resulting accuracy is printed. With ``FLAGS.sweep`` set, the attack
    is repeated for eps = 1, 2, ..., 16.

    :param argv: command-line arguments (not read; FLAGS is used instead)
    :raises ValueError: if FLAGS.attack_type is not a supported attack
    """
    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        # Clipping bounds are given on the 0-255 pixel scale
        attack_params = {
            'batch_size': FLAGS.batch_size,
            'clip_min': 0.,
            'clip_max': 255.
        }

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'batch_size': 10
            })
        elif FLAGS.attack_type == 'fgsm':
            # eps in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            from cleverhans.attacks import FastGradientMethod
            attacker = FastGradientMethod(model, sess=sess)
        elif FLAGS.attack_type == 'pgd':
            # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            attack_params.update({'eps_iter': 2, 'nb_iter': 20})
            from cleverhans.attacks import MadryEtAl
            attacker = MadryEtAl(model, sess=sess)
        else:
            # Fail with a clear error instead of a NameError on `attacker`
            raise ValueError(FLAGS.attack_type)

        eval_par = {'batch_size': FLAGS.batch_size}

        # Time the whole evaluation, including every step of a sweep
        t1 = time.time()
        if FLAGS.sweep:
            # NOTE(review): the sweep always sets 'eps', which is only
            # configured above for fgsm/pgd -- confirm cwl2 tolerates it.
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            for e in epsilons:
                attack_params.update({'eps': e})
                # A new attack graph is built for every epsilon
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples],
                                 Y_test[:nb_samples], args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples],
                             Y_test[:nb_samples], args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
def main(_):
    """Train a CIFAR-10 network on clean and FGSM batches, then test it.

    Builds the graph (inputs, ``deepnn`` model, cross-entropy loss, Adam
    optimiser, accuracy metrics and TensorBoard summaries), then runs
    ``FLAGS.max_steps`` iterations. Each iteration takes one optimiser step
    on a clean training batch and one on its FGSM counterpart. Validation
    summaries are written every ``FLAGS.log_frequency`` steps and a
    checkpoint is saved every ``FLAGS.save_model`` steps and at the last
    step. Finally the clean and adversarial accuracy over the test set are
    printed.

    :param _: unused positional argument
    """
    tf.reset_default_graph()

    # Import data
    cifar = cf.cifar10(batchSize=FLAGS.batch_size, downloadDir=FLAGS.data_dir)
    cifar.preprocess()

    with tf.variable_scope('inputs'):
        # Flattened input images
        x = tf.placeholder(
            tf.float32,
            [None, FLAGS.img_width * FLAGS.img_height * FLAGS.img_channels])
        # One-hot labels
        y_ = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
        # Whether model is training
        train = tf.placeholder(tf.bool, [])

        # Reshape to use within a convolutional neural net. Last dimension
        # is for 'features' - it would be 1 for a grayscale image, 3 for an
        # RGB image, 4 for RGBA, etc.
        x_image = tf.reshape(
            x, [-1, FLAGS.img_width, FLAGS.img_height, FLAGS.img_channels])

    with tf.variable_scope('model'):
        # Build the graph for the deep net
        y_conv = deepnn(x_image, train)
        # Wrapper handed to cleverhans so the attack can rebuild the logits
        model = CallableModelWrapper(lambda _x: deepnn(_x, train), 'logits')

    # Loss function - softmax cross entropy
    with tf.variable_scope('x_entropy'):
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

    # Adam optimiser driven by an exponentially decayed learning rate.
    # NOTE(review): the step variable given to exponential_decay is not
    # passed to minimize(), so nothing visible here increments it and the
    # rate appears to stay constant - confirm whether decay is intended.
    decayed_learning_rate = tf.train.exponential_decay(
        FLAGS.learning_rate, tf.Variable(0, trainable=False), 1000,
        FLAGS.learning_rate_decay)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimiser = tf.train.AdamOptimizer(
            decayed_learning_rate, name="Adam").minimize(cross_entropy)

    # Prediction accuracy. Both metrics read the same logits; which images
    # they score depends on whether `x` or `x_image` is fed.
    accuracy, acc_op = tf.metrics.accuracy(
        labels=tf.argmax(y_, axis=1), predictions=tf.argmax(y_conv, axis=1))
    adv_accuracy, adv_acc_op = tf.metrics.accuracy(
        labels=tf.argmax(y_, axis=1), predictions=tf.argmax(y_conv, axis=1))

    # summaries for TensorBoard visualisation
    loss_summary = tf.summary.scalar('Loss', cross_entropy)
    adv_loss_summary = tf.summary.scalar('Adversarial Loss', cross_entropy)
    acc_summary = tf.summary.scalar('Accuracy', accuracy)
    adv_acc_summary = tf.summary.scalar('Adv Accuracy', adv_accuracy)
    image_summary = tf.summary.image('Test Images', x_image)

    # saver for checkpoints
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    with tf.Session() as sess:
        with tf.variable_scope('model', reuse=True):
            fgsm = FastGradientMethod(model, sess=sess)
            adv_image_op = fgsm.generate(x_image, eps=FLAGS.fgsm_eps,
                                         clip_min=0.0, clip_max=1.0)

        summary_writer_train = tf.summary.FileWriter(
            run_log_dir + '_train', sess.graph, flush_secs=5)
        summary_writer_validation = tf.summary.FileWriter(
            run_log_dir + '_validate', sess.graph, flush_secs=5)
        summary_writer_adversarial = tf.summary.FileWriter(
            run_log_dir + '_adversarial', sess.graph, flush_secs=5)
        summary_writer_images = tf.summary.FileWriter(
            run_log_dir + '_images', sess.graph, flush_secs=5)
        summary_writer_images_adversarial = tf.summary.FileWriter(
            run_log_dir + '_images_adversarial', sess.graph, flush_secs=5)
        writers = (summary_writer_train, summary_writer_validation,
                   summary_writer_adversarial, summary_writer_images,
                   summary_writer_images_adversarial)

        try:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            # Training and validation
            for step in range(FLAGS.max_steps):
                # Training: Backpropagation using train set
                (train_images, train_labels) = cifar.getTrainBatch()
                (test_images, test_labels) = cifar.getTestBatch()

                # Clean step; the adversarial batch is fetched in the same run
                _, summary_str, train_images_adv = sess.run(
                    [optimiser, loss_summary, adv_image_op],
                    feed_dict={x: train_images, y_: train_labels,
                               train: True})

                # Adversarial step: feed the perturbed images as `x_image`
                _, summary_str_adv = sess.run(
                    [optimiser, adv_loss_summary],
                    feed_dict={x_image: train_images_adv, y_: train_labels,
                               train: True})

                if step % (FLAGS.log_frequency + 1) == 0:
                    summary_writer_train.add_summary(summary_str, step)
                    summary_writer_train.add_summary(summary_str_adv, step)

                # Validation: Monitoring accuracy using validation set
                if step % FLAGS.log_frequency == 0:
                    # Distinct local names keep the fetched values from
                    # rebinding the `accuracy`/`adv_accuracy` tensors.
                    (val_accuracy, val_summary_str, image_str,
                     test_images_adv) = sess.run(
                         [acc_op, acc_summary, image_summary, adv_image_op],
                         feed_dict={x: test_images, y_: test_labels,
                                    train: False})

                    _, adv_summary_str, adv_image_str = sess.run(
                        [adv_acc_op, adv_acc_summary, image_summary],
                        feed_dict={x_image: test_images_adv,
                                   y_: test_labels, train: False})

                    print('step %d, accuracy on validation batch: %g'
                          % (step, val_accuracy))
                    summary_writer_validation.add_summary(
                        val_summary_str, step)
                    # Record the step so image events are not all written
                    # without a global step.
                    summary_writer_images.add_summary(image_str, step)
                    summary_writer_adversarial.add_summary(
                        adv_summary_str, step)
                    summary_writer_images_adversarial.add_summary(
                        adv_image_str, step)

                # Save the model checkpoint periodically.
                if (step % FLAGS.save_model == 0
                        or (step + 1) == FLAGS.max_steps):
                    checkpoint_path = os.path.join(run_log_dir + '_train',
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)

            # Testing

            # resetting the internal batch indexes
            cifar.reset()
            evaluated_images = 0
            test_accuracy = 0
            adv_test_accuracy = 0
            batch_count = 0

            # NOTE(review): tf.metrics.accuracy keeps running totals in
            # local variables that are initialised once above and not reset
            # here, so the values fetched below appear to include the
            # earlier validation batches - confirm whether a reset is wanted.

            # don't loop back when we reach the end of the test set
            while evaluated_images != cifar.nTestSamples:
                (testImages, testLabels) = cifar.getTestBatch(
                    allowSmallerBatches=True)
                test_accuracy_temp, _, adv_images = sess.run(
                    [acc_op, acc_summary, adv_image_op],
                    feed_dict={x: testImages, y_: testLabels, train: False})
                adv_test_accuracy_temp = sess.run(
                    adv_acc_op,
                    feed_dict={x_image: adv_images, y_: testLabels,
                               train: False})

                batch_count = batch_count + 1
                test_accuracy = test_accuracy + test_accuracy_temp
                adv_test_accuracy = adv_test_accuracy + adv_test_accuracy_temp
                evaluated_images = evaluated_images + testLabels.shape[0]

            test_accuracy = test_accuracy / batch_count
            adv_test_accuracy = adv_test_accuracy / batch_count
            print('test set: accuracy on test set: %0.3f' % test_accuracy)
            print('test set: accuracy on adversarial test set: %0.3f'
                  % adv_test_accuracy)
        finally:
            # Flush and release the event files even if training fails.
            for writer in writers:
                writer.close()
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.1):
    """
    Train a Keras MNIST CNN, attack it with FGSM, then repeat the process
    with adversarial training, recording each test accuracy in the report.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Accumulates the accuracies measured below and is returned at the end
    report = AccuracyReport()

    # Fixed TF seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Force the 'tf' image dimension ordering when Keras is set otherwise
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One TF session, shared with the Keras backend
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load the requested slices of MNIST
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Label smoothing: clip the one-hot training labels
    assert Y_train.shape[1] == 10.
    label_smooth = .1
    smooth_low = label_smooth / 9.
    smooth_high = 1. - label_smooth
    Y_train = Y_train.clip(smooth_low, smooth_high)

    # Input and label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    def _test_accuracy(predictions):
        # Accuracy of `predictions` on the test set, with fresh eval args
        return model_eval(sess, x, y, predictions, X_test, Y_test,
                          args={'batch_size': batch_size})

    # First model: plain training
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Clean test accuracy of the plainly trained model
        clean_acc = _test_accuracy(preds)
        report.clean_train_clean_eval = clean_acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    train_params = dict(nb_epochs=nb_epochs,
                        batch_size=batch_size,
                        learning_rate=learning_rate)
    model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                args=train_params)

    # FGSM attack graph against the first model
    fgsm = FastGradientMethod(model, sess=sess)
    fgsm_params = {'eps': 0.3}
    preds_adv = model(fgsm.generate(x, **fgsm_params))

    # Accuracy of the first model on adversarial examples
    adv_acc = _test_accuracy(preds_adv)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    print("Repeating the process, using adversarial training")

    # Second model: adversarial training against its own FGSM examples
    model_2 = cnn_model()
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Clean test accuracy of the adversarially trained model
        clean_acc_2 = _test_accuracy(preds_2)
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc_2)
        report.adv_train_clean_eval = clean_acc_2

        # Adversarial test accuracy of the adversarially trained model
        adv_acc_2 = _test_accuracy(preds_2_adv)
        print('Test accuracy on adversarial examples: %0.4f' % adv_acc_2)
        report.adv_train_adv_eval = adv_acc_2

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params)

    return report
def test():
    """Restore the RDPCNN model and run the clean and robustness reports.

    Builds the model on a fresh default graph, loads the weights named by
    ``FLAGS.CNN_CKPT_RESTORE_NAME``, builds FGSM, iterative FGSM, momentum
    iterative and Madry adversarial-example tensors, and passes them to
    ``test_info`` and ``robust_info``.
    """
    tf.reset_default_graph()
    g = tf.get_default_graph()

    with g.as_default():
        # Placeholder nodes.
        image_shape = [None, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS,
                       FLAGS.NUM_CHANNELS]
        images_holder = tf.placeholder(tf.float32, image_shape)
        label_holder = tf.placeholder(tf.float32, [None, FLAGS.NUM_CLASSES])
        is_training = tf.placeholder(tf.bool, ())

        # model
        model = model_mnist.RDPCNN(images_holder, label_holder,
                                   FLAGS.INPUT_SIGMA, is_training)
        # Retained from the original ("for adv examples"); the results are
        # not used below.
        model_loss = model.loss()
        model_acc = model.cnn_accuracy

        def inference(x):
            # First output of the CNN prediction: logits
            return model.cnn.prediction(x)[0]

        def inference_prob(x):
            # Second output of the CNN prediction: probabilities
            return model.cnn.prediction(x)[1]

        graph_dict = {
            "images_holder": images_holder,
            "label_holder": label_holder,
            "is_training": is_training,
        }

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config, graph=g) as sess:
            sess.run(tf.global_variables_initializer())
            # load model
            model.tf_load(sess, name=FLAGS.CNN_CKPT_RESTORE_NAME)

            # adv test
            ch_model_logits = CallableModelWrapper(callable_fn=inference,
                                                   output_layer='logits')
            ch_model_probs = CallableModelWrapper(callable_fn=inference_prob,
                                                  output_layer='probs')

            # (key, attack class, attack-specific keyword arguments); the
            # attacks are built in this order, one after another.
            attack_specs = [
                # FastGradientMethod
                ("fgsm", FastGradientMethod, {}),
                # Iterative FGSM (BasicIterativeMethod, no random init)
                ("ifgsm", BasicIterativeMethod,
                 {"eps_iter": FLAGS.ATTACK_SIZE / 10, "nb_iter": 10}),
                # MomentumIterativeMethod
                ("mim", MomentumIterativeMethod,
                 {"eps_iter": FLAGS.ATTACK_SIZE / 10, "nb_iter": 10,
                  "decay_factor": 1.0}),
                # MadryEtAl (projected gradient with random init)
                ("madry", MadryEtAl,
                 {"eps_iter": FLAGS.ATTACK_SIZE / 10, "nb_iter": 10}),
            ]
            x_advs = {}
            for key, attack_cls, extra_kwargs in attack_specs:
                attack_obj = attack_cls(model=ch_model_probs, sess=sess)
                x_advs[key] = attack_obj.generate(x=images_holder,
                                                  eps=FLAGS.ATTACK_SIZE,
                                                  clip_min=0.0, clip_max=1.0,
                                                  **extra_kwargs)
            graph_dict["x_advs"] = x_advs

            print("\nTest")
            # Local runs are limited to two batches
            total_test_batch = 2 if FLAGS.local else None
            dp_info = np.load(FLAGS.DP_INFO_NPY, allow_pickle=True).item()
            test_info(sess, model, None, graph_dict, dp_info,
                      FLAGS.TEST_LOG_FILENAME, total_batch=total_test_batch)
            robust_info(sess, model, graph_dict, FLAGS.ROBUST_LOG_FILENAME)
class TestFastGradientMethod(CleverHansTest):
    """Unit tests for ``FastGradientMethod`` run against ``SimpleModel``."""

    def setUp(self):
        """Create a session, a model and the attack under test."""
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def generate_adversarial_examples_np(self, ord, eps, **kwargs):
        """Attack 100 random 2-D points and measure each perturbation.

        :param ord: norm of the attack; one of ``np.inf``, 1 or 2
        :param eps: attack step size
        :param kwargs: extra keyword arguments passed to ``generate_np``
        :return: tuple ``(x_val, x_adv, delta)`` where ``delta`` holds the
                 per-example ``ord``-norm of ``x_adv - x_val``
        :raises ValueError: if ``ord`` is not one of the supported norms
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord,
                                        clip_min=-5, clip_max=5, **kwargs)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1)**.5
        else:
            # Previously `delta` was left unbound for any other norm.
            raise ValueError("Unsupported ord: %r" % (ord,))

        return x_val, x_adv, delta

    def help_generate_np_gives_adversarial_example(self, ord, eps=.5,
                                                   **kwargs):
        """Check the perturbation size and that most labels change."""
        x_val, x_adv, delta = self.generate_adversarial_examples_np(
            ord, eps, **kwargs)
        self.assertClose(delta, eps)
        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        """The L-infinity attack yields adversarial examples."""
        # np.inf instead of the np.infty alias, which newer NumPy drops
        self.help_generate_np_gives_adversarial_example(np.inf)

    def test_generate_np_gives_adversarial_example_l1(self):
        """The L1 attack yields adversarial examples."""
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        """The L2 attack yields adversarial examples."""
        self.help_generate_np_gives_adversarial_example(2)

    def test_generate_respects_dtype(self):
        """A float64 input produces a float64 adversarial tensor."""
        x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
        x_adv = self.attack.generate(x)
        self.assertEqual(x_adv.dtype, tf.float64)

    def test_targeted_generate_np_gives_adversarial_example(self):
        """A targeted attack moves most examples to the target label."""
        # Labels drawn from {0, 1}; randint's upper bound is exclusive,
        # unlike the deprecated random_integers(0, 1, 100).
        random_labs = np.random.randint(0, 2, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        _, x_adv, delta = self.generate_adversarial_examples_np(
            eps=.5, ord=np.inf, y_target=random_labs_one_hot)

        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        """``generate_np`` honours a different ``eps`` on each call."""
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                            clip_min=-5.0, clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        """Adversarial examples are clipped to [clip_min, clip_max]."""
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                        clip_min=-0.2, clip_max=0.1)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)

    def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
        """A second ``generate_np`` call must not call ``tf.gradients``."""
        x_val = np.random.rand(1, 2)
        x_val = np.array(x_val, dtype=np.float32)

        self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                                clip_max=-5.0, clip_min=-5.0,
                                xi=1e-6)

        old_grads = tf.gradients

        def fn(*x, **y):
            raise RuntimeError()

        tf.gradients = fn
        try:
            self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                                    clip_max=-4.0, clip_min=-4.0,
                                    xi=1e-5)
        finally:
            # Always undo the monkeypatch so a failure here cannot break
            # tests that run afterwards.
            tf.gradients = old_grads
def attack_classifier(sess, x, y, model, x_test, y_test,
                      attack_method="fgsm", target=None, batch_size=128):
    """Generate adversarial versions of ``x_test`` with a cleverhans attack.

    :param sess: TF session used to build and run the attack
    :param x: input placeholder fed with slices of ``x_test``
    :param y: label placeholder fed with slices of ``y_test``
    :param model: model handed to the attack constructor
    :param x_test: array of inputs to attack
    :param y_test: array of labels matching ``x_test``
    :param attack_method: key selecting the attack (see ``recipes`` below)
    :param target: class index for a targeted attack, or None
    :param batch_size: number of examples per ``sess.run`` call
    :return: the adversarial batches concatenated along axis 0
    :raises ValueError: if ``attack_method`` is not recognised
    """
    tf.set_random_seed(1822)
    set_log_level(logging.DEBUG)

    # How each attack treats `target`
    optional, forbidden, required = 'optional', 'forbidden', 'required'

    # attack_method -> (cleverhans class name, parameters, target policy)
    recipes = {
        "fgsm": ("FastGradientMethod",
                 {'eps': 8 / 255, 'clip_min': 0., 'clip_max': 1.},
                 optional),
        "basic_iterative": ("BasicIterativeMethod",
                            {'eps': 8 / 255, 'eps_iter': 1 / 255,
                             'nb_iter': 10, 'clip_min': 0., 'clip_max': 1.},
                            optional),
        "momentum_iterative": ("MomentumIterativeMethod",
                               {'eps': 8 / 255, 'eps_iter': 1 / 255,
                                'nb_iter': 10, 'clip_min': 0.,
                                'clip_max': 1.},
                               optional),
        "saliency": ("SaliencyMapMethod",
                     {'theta': 8 / 255, 'gamma': 0.1, 'clip_min': 0.,
                      'clip_max': 1.},
                     forbidden),
        "virtual": ("VirtualAdversarialMethod",
                    {'eps': 8 / 255, 'num_iterations': 10, 'xi': 1e-6,
                     'clip_min': 0., 'clip_max': 1.},
                    forbidden),
        "cw": ("CarliniWagnerL2",
               {"confidence": 0, "batch_size": 128, "learning_rate": 1e-4,
                "binary_search_steps": 10, "max_iterations": 1000,
                "abort_early": True, "initial_const": 1e-2, "clip_min": 0,
                "clip_max": 1},
               optional),
        "elastic_net": ("ElasticNetMethod",
                        {"fista": "FISTA", "beta": 0.1,
                         "decision_rule": "EN", "confidence": 0,
                         "batch_size": 128, "learning_rate": 1e-4,
                         "binary_search_steps": 10, "max_iterations": 1000,
                         "abort_early": True, "initial_const": 1e-2,
                         "clip_min": 0, "clip_max": 1},
                        optional),
        "deepfool": ("DeepFool",
                     {"nb_candidate": 10, "overshoot": 1e-3, "max_iter": 100,
                      "nb_classes": 10, "clip_min": 0, "clip_max": 1},
                     forbidden),
        "lbfgs": ("LBFGS",
                  {'batch_size': 128, "binary_search_steps": 10,
                   "max_iterations": 1000, "initial_const": 1e-2,
                   'clip_min': 0., 'clip_max': 1.},
                  required),
        "madry": ("MadryEtAl",
                  {'eps': 8 / 255, 'eps_iter': 1 / 255, 'nb_iter': 10,
                   'ord': np.inf, 'clip_min': 0., 'clip_max': 1.},
                  optional),
        "SPSA": ("SPSA",
                 {'epsilon': 1 / 255, 'num_steps': 10, 'is_targeted': False,
                  'early_stop_loss_threshold': None, 'learning_rate': 0.01,
                  'delta': 0.01, 'batch_size': 128, 'spsa_iters': 1,
                  'is_debug': False},
                 optional),
    }

    if attack_method not in recipes:
        raise ValueError("Can not recognize this attack method")
    class_name, params, target_policy = recipes[attack_method]

    # Imported lazily, as before, only once a known attack was requested
    import cleverhans.attacks as attack_lib
    attack_cls = getattr(attack_lib, class_name)

    if target_policy == forbidden:
        assert target is None
    else:
        if target_policy == required:
            assert target is not None
        if target is not None:
            # One-hot row of the target class repeated batch_size times
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
            if "is_targeted" in params:
                params["is_targeted"] = True

    method = attack_cls(model, sess=sess)
    adv_x = method.generate(x, **params)

    # NOTE(review): floor division means a trailing partial batch is never
    # attacked, so the result can have fewer rows than x_test (and an input
    # smaller than batch_size yields no batches at all) - confirm intent.
    num_batch = x_test.shape[0] // batch_size
    adv_imgs = []
    for start in range(0, num_batch * batch_size, batch_size):
        stop = start + batch_size
        adv_imgs.append(
            sess.run(adv_x, feed_dict={x: x_test[start:stop],
                                       y: y_test[start:stop]}))
    return np.concatenate(adv_imgs, axis=0)
def setUp(self):
    """Create the session, model and FGSM attack shared by the tests."""
    super(TestFastGradientMethod, self).setUp()

    session = tf.Session()
    simple_model = SimpleModel()

    self.sess = session
    self.model = simple_model
    self.attack = FastGradientMethod(simple_model, sess=session)
def dknn_tutorial():
    """Train an MNIST CNN, wrap it in a DkNN and evaluate it.

    The DkNN is calibrated on the first ``FLAGS.nb_cali`` test examples and
    evaluated on the remaining test examples and on their FGSM adversarial
    counterparts; a reliability diagram is written to ``/tmp`` for each.

    :return: True
    """
    # Load MNIST.
    mnist = MNIST()
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Image and label dimensions.
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Layers whose activations feed the DkNN.
    layers = ['ReLU1', 'ReLU3', 'ReLU5', 'logits']

    with tf.Session() as sess:
        with tf.variable_scope('dknn'):
            # Input and label placeholders.
            x = tf.placeholder(tf.float32,
                               shape=(None, img_rows, img_cols, nchannels))
            y = tf.placeholder(tf.float32, shape=(None, nb_classes))

            # Model, its logits and its training loss.
            model = make_basic_picklable_cnn()
            preds = model.get_logits(x)
            loss = CrossEntropy(model, smoothing=0.)

            def evaluate():
                # Accuracy on the full test set, reported during training.
                acc = model_eval(sess, x, y, preds, x_test, y_test,
                                 args={'batch_size': FLAGS.batch_size})
                print('Test accuracy on test examples: %0.4f' % acc)

            # Train the model.
            train(sess, loss, x_train, y_train, evaluate=evaluate,
                  args={
                      'nb_epochs': FLAGS.nb_epochs,
                      'batch_size': FLAGS.batch_size,
                      'learning_rate': FLAGS.lr
                  },
                  var_list=model.get_params())

            def get_activations(data):
                # Map each layer of interest to its flattened activations
                # on `data`.
                activations = {}
                for layer_name in layers:
                    flat = tf.layers.flatten(model.get_layer(x, layer_name))
                    evaluated = batch_eval(
                        sess, [x], [flat], [data],
                        args={'batch_size': FLAGS.batch_size})
                    activations[layer_name] = evaluated[0]
                return activations

            # Hold out the head of the test set as calibration data.
            nb_cali = FLAGS.nb_cali
            train_labels = np.argmax(y_train, axis=1)
            cali_data = x_test[:nb_cali]
            cali_labels = np.argmax(y_test[:nb_cali], axis=1)
            test_data = x_test[nb_cali:]
            eval_labels = np.argmax(y_test[nb_cali:], axis=1)

            # Wrap the model into a DkNNModel and calibrate it.
            dknn = DkNNModel(FLAGS.neighbors, layers, get_activations,
                             x_train, train_labels, nb_classes, scope='dknn')
            dknn.calibrate(cali_data, cali_labels)

            # FGSM adversarial examples for the held-in test data.
            fgsm = FastGradientMethod(model, sess=sess)
            adv_tensor = fgsm.generate(x, eps=.25, clip_min=0., clip_max=1.)
            adv = sess.run(adv_tensor, feed_dict={x: test_data})

            # Score the DkNN on clean and on adversarial test data.
            for fname, data_in in (('test', test_data), ('adv', adv)):
                dknn_preds = dknn.fprop_np(data_in)
                print(dknn_preds.shape)
                print(np.mean(np.argmax(dknn_preds, axis=1) == eval_labels))
                plot_reliability_diagram(dknn_preds, eval_labels,
                                         '/tmp/dknn_' + fname + '.pdf')

    return True
probs = output.op.inputs[0] return probs eps = 2.0 * 16.0 / 255.0 batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3] num_classes = 1001 tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): x_input = tf.placeholder(tf.float32, shape=batch_shape) model = InceptionModel(num_classes) fgsm = FastGradientMethod(model) x_adv = fgsm.generate(x_input, eps=eps, clip_min=-1., clip_max=1.) saver = tf.train.Saver(slim.get_model_variables()) session_creator = tf.train.ChiefSessionCreator( scaffold=tf.train.Scaffold(saver=saver), checkpoint_filename_with_path=FLAGS.checkpoint_path, master=FLAGS.master) with tf.train.MonitoredSession(session_creator=session_creator) as sess: for filenames, images in load_images( "../input/nips-2017-adversarial-learning-development-set/images/", batch_shape): adv_images = sess.run(x_adv, feed_dict={x_input: images}) save_images(adv_images, filenames, "")
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=True):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model; created (including
                      missing parent directories) if it does not exist
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, training-set accuracies are also recorded
    :param label_smoothing: if true, clip the one-hot training labels to
                            [0.1 / (nb_classes - 1), 0.9]
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes - 1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        # makedirs (not mkdir) so a nested train_dir such as "runs/mnist"
        # works when its parent does not exist yet.
        os.makedirs(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        # NOTE(review): smoothing=0.1 is passed here regardless of the
        # `label_smoothing` flag handled above - confirm this is intended.
        loss = LossCrossEntropy(wrap, smoothing=0.1)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train, y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        # FGSM against the second model, reused by the loss and evaluation
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS, num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: filter count passed to ModelBasicCNN
    :param num_threads: if set, used as the session's
                        intra_op_parallelism_threads; otherwise the
                        TensorFlow default configuration is used
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        # Honour the requested thread count; it was previously ignored in
        # favour of a hard-coded 1.
        config_args = dict(intra_op_parallelism_threads=num_threads)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        # Store the accuracy of `preds` on (x_set, y_set) under
        # `report_key`; print it only when `is_adv` is True or False.
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def mdt(model, data_dir, checkpoint_dir,
        train_dir='./tmp/cifar10_train',
        adversarial_dir='./tmp/cifar10_adv',
        batch_size=128,
        data_aug=False,
        data_norm=True):
    """
    Train a CIFAR-10 model when no training directory exists yet, then
    evaluate it on clean inputs and on FGSM adversarial examples.

    :param model: callable mapping an image tensor to predictions; it is
                  also wrapped by FastGradientMethod and must provide
                  ``get_probs``
    :param data_dir: data directory handed to the CIFAR-10 input helpers
    :param checkpoint_dir: directory passed to ``checkpoint_load``
    :param train_dir: training is run only when this path does not exist
    :param adversarial_dir: not used by this function
    :param batch_size: forwarded to the training input pipeline and to
                       ``train_process``
    :param data_aug: forwarded to ``mdt_cifar10_input.inputs``
    :param data_norm: forwarded to ``mdt_cifar10_input.inputs``
    :return: False when ``checkpoint_load`` reports failure, otherwise None
    """
    # Train the model only if no previous training directory is present
    if not tf.gfile.Exists(train_dir):
        # Set input and get logits
        images, labels = mdt_cifar10_input.inputs(False, data_dir, batch_size,
                                                  data_aug, data_norm)
        labels = tf.cast(labels, tf.int64)
        logits = model(images)
        loss = stand_loss(logits, labels)
        # The label tensor is `labels`; the previous `label` was an
        # undefined name and raised NameError as soon as training ran.
        train_process(model, loss, images, labels, train_dir, batch_size)

    # Define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Get predict tensor
    pred = model(x)

    sess = tf.Session()

    if not checkpoint_load(sess, checkpoint_dir):
        return False

    # Fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # Create one-hot labels
    one_hot_Y = to_categorical(Y, nb_classes)

    # Create mode feeds. NOTE(review): train_feed is not used below; the
    # call is kept in case mode_feed has side effects -- confirm.
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    # FGSM on inputs clipped to [0, 255]
    fgsm_params = {'eps': 1,
                   'clip_min': 0.,
                   'clip_max': 255.}
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model.get_probs(adv_x)

    # Evaluate overall and per-class accuracy on clean inputs
    class_accuracy, accuracy = model_eval_each_class(
        sess, x, y, pred, nb_classes, X, one_hot_Y, feed=eval_feed,
        args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))

    for i in range(nb_classes):
        print('class {0} accuracy: {1}'.format(i, class_accuracy[i]))

    # Evaluate the model's accuracy on FGSM adversarial examples
    fgsm_accuracy = model_eval(sess, x, y, preds_adv, X, one_hot_Y,
                               feed=eval_feed, args={'batch_size': 128})
    print('model fgsm_accuracy: {0}'.format(fgsm_accuracy))

    # Time the generation of one batch of adversarial examples. Images and
    # labels are sliced together so both placeholders get 128 rows.
    X = X[:128]
    Y = one_hot_Y[:128]
    adv_feed = {x: X, y: Y}
    adv_feed.update(eval_feed)
    sta = time.time()
    adv_X_ = sess.run(adv_x, feed_dict=adv_feed)
    end = time.time()
    duration = end - sta
    print('finished in {0} seconds'.format(duration))

    # Distances are computed on inputs rescaled from [0, 255] to [0, 1]
    l2_dis = calculate_l2_dis(X/255, adv_X_/255)
    print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, epsilon=0.3):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697,
    evaluated over a sweep of FGSM perturbation sizes.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: number of classes, forwarded to train_sub
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: forwarded to prep_bbox and train_sub
    :param nb_epochs: forwarded to prep_bbox
    :param holdout: number of leading test examples given to the adversary
                    as the initial substitute training set
    :param data_aug: forwarded to train_sub
    :param nb_epochs_s: forwarded to train_sub
    :param lmbda: forwarded to train_sub
    :param epsilon: step of the sweep; the attack is run with
                    eps = epsilon * i for i in range(20)
    :return: a dictionary with:
             * 'bbox': black-box model accuracy on test set
             * 'sub': substitute model accuracy on test set
             * 'bbox_on_sub_adv_ex' + str(eps): black-box model accuracy on
               adversarial examples transferred from the substitute model,
               one entry per eps of the sweep
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Debug switch: when True, misclassified adversarial examples are
    # displayed with matplotlib for every epsilon of the sweep.
    pyp = False

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng=rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, rng=rng)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc

    def find_error(glb, mdl):
        """Return [sample, true label, clean input, predicted class, index]
        for every sample of glb that mdl classifies differently from
        the corresponding Y_test entry."""
        temparray = []
        for i, sample in enumerate(glb):
            prd = np.argmax(mdl.predict(np.array([sample])))
            if prd != np.argmax(Y_test[i]):
                temparray.append([sample, Y_test[i], X_test[i], prd, i])
        return temparray

    # The attack object does not depend on epsilon, so it is built once and
    # only the generated graph changes from one sweep step to the next.
    fgsm = FastGradientMethod(model_sub, sess=sess)

    for epstep in [epsilon * i for i in range(20)]:
        # Parameters of the Fast Gradient Sign Method (FGSM) attack
        fgsm_par = {
            'eps': epstep,
            'ord': np.inf,
            'clip_min': 0.,
            'clip_max': 1.
        }

        # Craft adversarial examples using the substitute
        x_adv_sub = fgsm.generate(x, **fgsm_par)

        # Evaluate the accuracy of the "black-box" model on adversarial
        # examples
        accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test, Y_test,
                              args=eval_params)
        print(
            'Test accuracy of oracle on BB Adversarial Samples with epsilon = %s : '
            % epstep + str(accuracy))

        if pyp:
            x_adv_np = fgsm.generate_np(X_test[0:200], **fgsm_par)
            y_adv_np = find_error(x_adv_np, keras_global_model)
            # Imported lazily so matplotlib is only needed in debug mode
            from matplotlib import pyplot as plt
            plt.rc('figure', figsize=(12.0, 12.0))
            # Every misclassified example is shown; the earlier
            # range(len(...) - 1) silently dropped the last one.
            for entry in y_adv_np:
                print(
                    str(entry[3]) + "predit, et le reel etait : " +
                    str(np.argmax(entry[1])))
                # entry[3] already is the predicted class index; taking
                # argmax of that scalar always produced the label '0'.
                plt.imshow(entry[0].reshape((28, 28)),
                           cmap="gray",
                           label=str(entry[3]))
                plt.pause(1)
                print('---')

        accuracies['bbox_on_sub_adv_ex' + str(epstep)] = accuracy

    return accuracies
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, aug_batch_size=512):
    """
    Run the MNIST black-box attack tutorial (arxiv.org/abs/1602.02697).

    A black-box model is prepared, a substitute model is trained from a
    small held-out part of the test set, and FGSM examples crafted on the
    substitute are evaluated against the black-box model.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: overwritten with the label width of the loaded data
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: forwarded to prep_bbox and train_sub
    :param nb_epochs: forwarded to prep_bbox
    :param holdout: number of leading test examples reserved for the
                    adversary
    :param data_aug: forwarded to train_sub
    :param nb_epochs_s: forwarded to train_sub
    :param lmbda: forwarded to train_sub
    :param aug_batch_size: forwarded to train_sub
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    # Verbose logging for the whole tutorial
    set_log_level(logging.DEBUG)

    # Tutorial prerequisites must be satisfied before anything is built
    assert setup_tutorial()

    sess = tf.Session()

    # Load MNIST
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # The first `holdout` test examples go to the adversary (labels as class
    # indices); the rest stays as the test set.
    X_sub = x_test[:holdout]
    Y_sub = np.argmax(y_test[:holdout], axis=1)
    x_test, y_test = x_test[holdout:], y_test[holdout:]

    # Image geometry and class count are read from the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Input and label placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Fixed seed so the tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # The black-box model is simulated locally; a remote labeling API could
    # take its place.
    print("Preparing the black-box model.")
    model, bbox_preds, bbox_accuracy = prep_bbox(
        sess, x, y, x_train, y_train, x_test, y_test, nb_epochs, batch_size,
        learning_rate, rng, nb_classes, img_rows, img_cols, nchannels)

    # Substitute training, method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    model_sub, preds_sub = train_sub(
        sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s,
        batch_size, learning_rate, data_aug, lmbda, aug_batch_size, rng,
        img_rows, img_cols, nchannels)

    # Substitute accuracy on clean test examples
    batch_args = {'batch_size': batch_size}
    sub_accuracy = model_eval(sess, x, y, preds_sub, x_test, y_test,
                              args=batch_args)

    # FGSM examples crafted on the substitute ...
    attack_settings = {'eps': 0.3, 'ord': np.inf,
                       'clip_min': 0., 'clip_max': 1.}
    attack = FastGradientMethod(model_sub, sess=sess)
    x_adv_sub = attack.generate(x, **attack_settings)

    # ... and scored against the "black-box" model
    transfer_accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                                   x_test, y_test,
                                   args={'batch_size': batch_size})
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(transfer_accuracy))

    return {
        'bbox': bbox_accuracy,
        'sub': sub_accuracy,
        'bbox_on_sub_adv_ex': transfer_accuracy,
    }
class TestFastGradientMethod(CleverHansTest):
    """Tests for FastGradientMethod run against a SimpleModel."""

    def setUp(self):
        """Create a fresh session, model and attack for every test."""
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def generate_adversarial_examples_np(self, ord, eps, **kwargs):
        """
        Attack 100 random 2-d points and measure the perturbation size.

        :param ord: norm of the attack, one of np.inf, 1 or 2
        :param eps: perturbation budget handed to the attack
        :param kwargs: extra keyword arguments forwarded to generate_np
        :return: tuple (clean inputs, adversarial inputs, per-example norm
                 of the perturbation under ``ord``)
        :raises ValueError: if ``ord`` is not one of the supported norms
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord,
                                        clip_min=-5, clip_max=5, **kwargs)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1)**.5
        else:
            # Without this branch `delta` would be unbound below
            raise ValueError("Unsupported norm order: %r" % (ord,))

        return x_val, x_adv, delta

    def help_generate_np_gives_adversarial_example(self, ord, eps=.5,
                                                   **kwargs):
        """Check that the perturbation has norm eps and that fewer than
        half of the predicted labels survive the attack."""
        x_val, x_adv, delta = self.generate_adversarial_examples_np(
            ord, eps, **kwargs)
        self.assertClose(delta, eps)
        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        # np.inf instead of the np.infty alias, which NumPy 2.0 removed
        self.help_generate_np_gives_adversarial_example(np.inf)

    def test_generate_np_gives_adversarial_example_l1(self):
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        self.help_generate_np_gives_adversarial_example(2)

    def test_generate_respects_dtype(self):
        """The symbolic output must keep the float64 dtype of its input."""
        self.attack = FastGradientMethod(self.model, sess=self.sess,
                                         dtypestr='float64')
        x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
        x_adv = self.attack.generate(x)
        self.assertEqual(x_adv.dtype, tf.float64)

    def test_targeted_generate_np_gives_adversarial_example(self):
        """A targeted attack must reach the target label on more than 70%
        of the examples."""
        # randint's upper bound is exclusive, so (0, 2) draws labels from
        # {0, 1} exactly like the deprecated random_integers(0, 1, 100).
        random_labs = np.random.randint(0, 2, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        _, x_adv, delta = self.generate_adversarial_examples_np(
            eps=.5, ord=np.inf, y_target=random_labs_one_hot)

        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        """The same attack object must honour a new eps on every call."""
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                            clip_min=-5.0, clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        """Outputs must span exactly the [clip_min, clip_max] range."""
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                        clip_min=-0.2, clip_max=0.1)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)

    def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
        """A second generate_np call that only changes eps, clip bounds or
        xi must not need tf.gradients again."""
        x_val = np.random.rand(1, 2)
        x_val = np.array(x_val, dtype=np.float32)

        self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                                clip_max=-5.0, clip_min=-5.0,
                                xi=1e-6)

        old_grads = tf.gradients

        def fn(*x, **y):
            raise RuntimeError()

        # Any gradient computation during the second call now raises
        tf.gradients = fn
        try:
            self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                                    clip_max=-4.0, clip_min=-4.0,
                                    xi=1e-5)
        finally:
            # Always undo the monkeypatch; otherwise a failure here would
            # leave tf.gradients broken for every later test.
            tf.gradients = old_grads
[dae_47.layers[2].output]) get_hidden_layer_output_64 = K.function([dae_64.layers[0].input], [dae_64.layers[2].output]) get_hidden_layer_output_80 = K.function([dae_80.layers[0].input], [dae_80.layers[2].output]) get_hidden_layer_output_94 = K.function([dae_94.layers[0].input], [dae_94.layers[2].output]) get_hidden_layer_output_157 = K.function([dae_157.layers[0].input], [dae_157.layers[2].output]) trad_ae_output = K.function([trad_ae.layers[0].input], [trad_ae.layers[2].output]) #Create adversarial examples on testing data sess = backend.get_session() wrap = KerasModelWrapper(attacker_classifier) fgsm = FastGradientMethod(wrap, sess=sess) adv_test_x = fgsm.generate_np(data_test, eps=eta, clip_min=0., clip_max=1.) wrap_adv_trn = KerasModelWrapper(attack_classifier_adv_trn) fgsm_adv_trn = FastGradientMethod(wrap_adv_trn, sess=sess) adv_test_x_adv_trn = fgsm_adv_trn.generate_np(data_test, eps=eta, clip_min=0., clip_max=1.) #Evaluate on clean data scores = fc_classifier.evaluate(data_test, labels_test) print("Accuracy of clean data without any defense") print("Accuracy: %.2f%%" % (scores[1] * 100)) #Evaluate model after attacking data with no defense
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial using a Keras model.

    A model is trained (or restored from a checkpoint), attacked with FGSM,
    then a second model is trained adversarially; linear extrapolation
    plots are written for both models.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, accuracies on the training set are also
                    recorded in the report
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Collects (and finally returns) the key accuracies
    report = AccuracyReport()

    # Fixed TF seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Force the 'tf' image dimension ordering when something else is set
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One TF session shared with the Keras backend
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load MNIST
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Label smoothing: clip the one-hot training labels into [0.1/9, 0.9]
    assert Y_train.shape[1] == 10
    label_smooth = .1
    smooth_low = label_smooth / 9.
    smooth_high = 1. - label_smooth
    Y_train = Y_train.clip(smooth_low, smooth_high)

    # Input and label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # First model and its predictions
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def accuracy_of(pred_tensor, images, labels):
        # Accuracy of `pred_tensor` on (images, labels) with the
        # tutorial's batch size
        return model_eval(sess, x, y, pred_tensor, images, labels,
                          args={'batch_size': batch_size})

    def evaluate():
        # Accuracy of the first model on legitimate test examples
        clean_acc = accuracy_of(preds, X_test, Y_test)
        report.clean_train_clean_eval = clean_acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Settings shared by both training runs
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Look for an existing checkpoint in train_dir
    ckpt = tf.train.get_checkpoint_state(train_dir)
    if ckpt is None:
        ckpt_path = False
    else:
        ckpt_path = ckpt.model_checkpoint_path

    # NOTE(review): created but not passed to train() in this function
    rng = np.random.RandomState([2017, 8, 30])

    if not (load_model and ckpt_path):
        print("Model was not loaded, training from scratch.")
        train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
              args=train_params, save=True)
    else:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()

    # Accuracy of the first model on its own training data
    if testing:
        report.train_clean_train_clean_eval = accuracy_of(
            preds, X_train, Y_train)

    # FGSM attack on the first model
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Treat the attack as a constant: no gradient flows through it
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Accuracy of the first model on adversarial test examples
    adv_acc = accuracy_of(preds_adv, X_test, Y_test)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    # Same measurement on the training data
    if testing:
        report.train_clean_train_adv_eval = accuracy_of(
            preds_adv, X_train, Y_train)

    print("Repeating the process, using adversarial training")
    # Second model, attacked through its own wrapper
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Adversarially trained model on legitimate test inputs
        legit_acc = accuracy_of(preds_2, X_test, Y_test)
        print('Test accuracy on legitimate examples: %0.4f' % legit_acc)
        report.adv_train_clean_eval = legit_acc

        # Adversarially trained model on adversarial examples
        robust_acc = accuracy_of(preds_2_adv, X_test, Y_test)
        print('Test accuracy on adversarial examples: %0.4f' % robust_acc)
        report.adv_train_adv_eval = robust_acc

    # Adversarial training of the second model
    train(sess, x, y, preds_2, X_train, Y_train,
          predictions_adv=preds_2_adv, evaluate=evaluate_2,
          args=train_params, save=False)

    # One randomly chosen training example drives both extrapolation plots
    random_idx = np.random.randint(0, X_train.shape[0])
    X_slice = X_train[random_idx]
    Y_slice = Y_train[random_idx]

    # Linear extrapolation plot for the clean model
    clean_logits = get_logits_over_interval(sess, wrap, X_slice, fgsm_params)
    linear_extrapolation_plot(clean_logits, Y_slice, 'lep_clean.png')

    # Linear extrapolation plot for the adversarially trained model
    adv_logits = get_logits_over_interval(sess, wrap_2, X_slice, fgsm_params)
    linear_extrapolation_plot(adv_logits, Y_slice, 'lep_adv.png')

    # Accuracies of the second model on the training data
    if testing:
        report.train_adv_train_clean_eval = accuracy_of(
            preds_2, X_train, Y_train)
        report.train_adv_train_adv_eval = accuracy_of(
            preds_2_adv, X_train, Y_train)

    return report
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.1, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1):
    """
    Run the MNIST black-box attack tutorial (arxiv.org/abs/1602.02697)
    with Keras models.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: number of classes, forwarded to train_sub
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: forwarded to prep_bbox and train_sub
    :param nb_epochs: forwarded to prep_bbox
    :param holdout: number of leading test examples reserved for the
                    adversary
    :param data_aug: forwarded to train_sub
    :param nb_epochs_s: forwarded to train_sub
    :param lmbda: forwarded to train_sub
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Tutorial prerequisites must be satisfied before anything is built
    assert setup_tutorial()

    # One TF session shared with the Keras backend
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load MNIST
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # The first `holdout` test examples go to the adversary (labels as class
    # indices); the rest stays as the test set.
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)
    X_test, Y_test = X_test[holdout:], Y_test[holdout:]

    # Input and label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # The black-box model is simulated locally; a remote labeling API could
    # take its place.
    print("Preparing the black-box model.")
    model, bbox_preds, bbox_accuracy = prep_bbox(
        sess, x, y, X_train, Y_train, X_test, Y_test, nb_epochs, batch_size,
        learning_rate)

    # Substitute training, method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    model_sub, preds_sub = train_sub(
        sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s,
        batch_size, learning_rate, data_aug, lmbda)

    # Substitute accuracy on clean test examples
    sub_accuracy = model_eval(sess, x, y, preds_sub, X_test, Y_test,
                              args={'batch_size': batch_size})

    # FGSM examples crafted on the substitute ...
    attack_settings = {'eps': 0.3, 'ord': np.inf,
                       'clip_min': 0., 'clip_max': 1.}
    attack = FastGradientMethod(model_sub, sess=sess)
    x_adv_sub = attack.generate(x, **attack_settings)

    # ... and scored against the "black-box" model
    transfer_accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test,
                                   Y_test, args={'batch_size': batch_size})
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(transfer_accuracy))

    return {
        'bbox': bbox_accuracy,
        'sub': sub_accuracy,
        'bbox_on_sub_adv_ex': transfer_accuracy,
    }