def eval_multi(self, inc_epoch=True):
    """
    Run the evaluation on clean data and on multiple attacks.

    Accuracy is measured on every 100th training example and on the whole
    test set. When `self.epoch` is a multiple of `self.hparams.eval_iters`,
    each attack named in `self.attack_type_test` is evaluated as well
    through `self.eval_advs`.

    :param inc_epoch: if True, `self.epoch` is incremented by one after
                      the evaluation.
    :return: a dict with the accuracies under the keys 'train', 'test'
             and the name of every attack that was evaluated.
    """
    sess = self.sess
    preds = self.preds
    x = self.x_pre
    y = self.y
    X_test = self.X_test
    Y_test = self.Y_test
    writer = self.writer

    # Start a fresh summary for this evaluation round; it is written out
    # below (presumably log_value fills it in -- TODO confirm).
    self.summary = tf.Summary()
    report = {}

    # Evaluate on a subsample of the train set
    subsample_factor = 100
    acc_train = model_eval(sess, x, y, preds,
                           self.X_train[::subsample_factor],
                           self.Y_train[::subsample_factor],
                           args=self.eval_params)
    self.log_value('train_accuracy_subsampled', acc_train,
                   'Clean accuracy, subsampled train')
    report['train'] = acc_train

    # Evaluate on the test set
    acc = model_eval(sess, x, y, preds, X_test, Y_test,
                     args=self.eval_params)
    self.log_value('test_accuracy_natural', acc,
                   'Clean accuracy, natural test')
    report['test'] = acc

    # Evaluate against adversarial attacks
    if self.epoch % self.hparams.eval_iters == 0:
        for att_type in self.attack_type_test:
            # Only the prediction tensor of the (adv_x, preds_adv) pair is
            # needed here.
            _, preds_adv = self.attacks[att_type]
            report[att_type] = self.eval_advs(x, y, preds_adv, X_test,
                                              Y_test, att_type)

    if writer:
        writer.add_summary(self.summary, self.epoch)

        # Add examples of adversarial examples to the summary
        if self.epoch % 20 == 0 and self.sum_op is not None:
            sm_val = sess.run(self.sum_op,
                              feed_dict={x: X_test[:self.batch_size],
                                         y: Y_test[:self.batch_size]})
            # Tag the summary with the epoch, like the scalar summary
            # above, so that successive evaluations can be told apart.
            writer.add_summary(sm_val, self.epoch)

    if inc_epoch:
        self.epoch += 1

    return report
def evaluate_2():
    """
    Measure the adversarially trained model on the test set, first on
    legitimate inputs and then on adversarial examples, printing both
    accuracies and storing them in `report`.
    """
    params = {'batch_size': batch_size}

    # Legitimate test inputs
    clean_acc = model_eval(sess, x, y, preds_2, X_test, Y_test, args=params)
    print('Test accuracy on legitimate examples: %0.4f' % clean_acc)
    report.adv_train_clean_eval = clean_acc

    # Adversarial examples
    adv_acc = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                         args=params)
    print('Test accuracy on adversarial examples: %0.4f' % adv_acc)
    report.adv_train_adv_eval = adv_acc
def evaluate_2():
    """
    Print the test accuracy of the adversarially trained CIFAR10 model on
    legitimate test examples and on adversarial examples.
    """
    params = {'batch_size': FLAGS.batch_size}

    # Legitimate test examples
    clean_acc = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                           args=params)
    print('Test accuracy on legitimate test examples: ' + str(clean_acc))

    # Adversarial examples
    adv_acc = model_eval(sess, x, y, predictions_2_adv, X_test, Y_test,
                         args=params)
    print('Test accuracy on adversarial examples: ' + str(adv_acc))
def evaluate():
    """
    Evaluate the MNIST model on legitimate test examples, store the
    accuracy in `report.clean_train_clean_eval` and print it.
    """
    test_acc = model_eval(sess, x, y, preds, X_test, Y_test,
                          args={'batch_size': batch_size})
    report.clean_train_clean_eval = test_acc

    # Sanity check: the test set must have exactly the requested size
    expected_size = test_end - test_start
    assert X_test.shape[0] == expected_size, X_test.shape

    print('Test accuracy on legitimate examples: %0.4f' % test_acc)
def evaluate():
    """
    Evaluate the CIFAR10 model on legitimate test examples and print the
    resulting accuracy.
    """
    test_acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args={'batch_size': FLAGS.batch_size})

    # Sanity check: the test set is expected to hold 10000 examples
    n_test = X_test.shape[0]
    assert n_test == 10000, X_test.shape

    print('Test accuracy on legitimate test examples: ' + str(test_acc))
def main(argv):
    """
    Restore the latest checkpoint from `FLAGS.checkpoint_dir` into a
    MadryMNIST model and print its accuracy on adversarial MNIST test
    examples crafted with the attack selected by `FLAGS.attack_type`
    ('fgsm' or 'bim').

    :param argv: command line arguments (unused)
    :raises ValueError: if no checkpoint is found or the attack type is
                        neither 'fgsm' nor 'bim'
    """
    checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if checkpoint is None:
        raise ValueError("Couldn't find latest checkpoint in " +
                         FLAGS.checkpoint_dir)

    train_start, train_end = 0, 60000
    test_start, test_end = 0, 10000
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    assert Y_train.shape[1] == 10

    # NOTE: for compatibility with Madry Lab downloadable checkpoints,
    # we cannot enclose this in a scope or do anything else that would
    # change the automatic naming of the variables.
    model = MadryMNIST()

    x_input = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    y = tf.placeholder(tf.float32, shape=[None, 10])

    # Build the requested attack on the image-shaped placeholder
    attack_type = FLAGS.attack_type
    if attack_type == 'fgsm':
        attack = FastGradientMethod(model)
        attack_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    elif attack_type == 'bim':
        attack = BasicIterativeMethod(model)
        attack_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                         'nb_iter': 50, 'eps_iter': .01}
    else:
        raise ValueError(attack_type)
    adv_x = attack.generate(x_image, **attack_params)
    preds_adv = model.get_probs(adv_x)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, checkpoint)

        # Time the evaluation of the model on adversarial examples
        eval_par = {'batch_size': FLAGS.batch_size}
        started = time.time()
        acc = model_eval(sess, x_image, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        finished = time.time()
        print("Took", finished - started, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
def do_eval(preds, x_set, y_set, report_key, is_adv=None):
    """
    Evaluate `preds` on the given data and record the accuracy.

    :param preds: symbolic predictions to evaluate
    :param x_set: inputs to evaluate on
    :param y_set: labels matching `x_set`
    :param report_key: name of the `report` attribute that receives the
                       accuracy
    :param is_adv: None to stay silent; otherwise a truthy value prints
                   the accuracy as 'adversarial' and a falsy one as
                   'legitimate'
    """
    accuracy = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
    setattr(report, report_key, accuracy)

    # Nothing is printed unless the caller says which kind of data it was
    if is_adv is None:
        return

    kind = 'adversarial' if is_adv else 'legitimate'
    print('Test accuracy on %s examples: %0.4f' % (kind, accuracy))
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
              nb_epochs, batch_size, learning_rate,
              rng, nb_classes=10, img_rows=28, img_cols=28, nchannels=1):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.
    :param sess: the TF session
    :param x: the input placeholder for MNIST
    :param y: the output placeholder for MNIST
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :param nb_classes: number of output classes of the oracle
    :param img_rows: accepted but not used by this function
    :param img_cols: accepted but not used by this function
    :param nchannels: accepted but not used by this function
    :return: a tuple (wrapped model, logits tensor for `x`, test accuracy)
    """
    # Keras-based TF model graph for the black-box model, wrapped in a
    # KerasModelWrapper
    keras_model = cnn_model(nb_filters=64, nb_classes=nb_classes)
    model = KerasModelWrapper(keras_model, nb_classes)
    loss = LossCrossEntropy(model, smoothing=0.1)
    predictions = model.get_logits(x)
    print("Defined TensorFlow model graph.")

    # Train an MNIST model
    train(sess, loss, x, y, X_train, Y_train,
          args={'nb_epochs': nb_epochs,
                'batch_size': batch_size,
                'learning_rate': learning_rate},
          rng=rng)

    # Print out the accuracy on legitimate data
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args={'batch_size': batch_size})
    print('Test accuracy of black-box on legitimate test examples: '
          + str(accuracy))

    return model, predictions, accuracy
def eval_advs(self, x, y, preds_adv, X_test, Y_test, att_type):
    """
    Evaluate the accuracy of the model on adversarial examples

    Only a whole number of batches is evaluated: the test set is truncated
    to the largest multiple of `self.batch_size`. When
    `self.hparams.fast_tests` is set, at most 10 batches are used.

    :param x: symbolic input to model.
    :param y: symbolic variable for the label.
    :param preds_adv: symbolic variable for the prediction on an
                      adversarial example.
    :param X_test: NumPy array of test set inputs.
    :param Y_test: NumPy array of test set labels.
    :param att_type: name of the attack.
    :return: the accuracy on the adversarial examples.
    """
    batch_size = self.batch_size
    end = (len(X_test) // batch_size) * batch_size

    if self.hparams.fast_tests:
        # Cap at 10 batches, but never go past the whole-batch boundary
        # computed above when the test set is smaller than that.
        end = min(end, 10 * batch_size)

    acc = model_eval(self.sess, x, y, preds_adv, X_test[:end],
                     Y_test[:end], args=self.eval_params)
    self.log_value('test_accuracy_%s' % att_type, acc,
                   'Test accuracy on adversarial examples')
    return acc
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: ignored; the number of classes is taken from the
                       shape of the training labels
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: learning rate for the oracle and the substitute
    :param nb_epochs: number of epochs to train the black-box model
    :param holdout: number of leading test examples reserved for training
                    the substitute; they are removed from the test set
    :param data_aug: forwarded to `train_sub`
    :param nb_epochs_s: forwarded to `train_sub`
    :param lmbda: forwarded to `train_sub`
    :param aug_batch_size: forwarded to `train_sub`
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. The call is kept outside of the assert
    # statement so that it still runs when assertions are disabled
    # (python -O).
    setup_ok = setup_tutorial()
    assert setup_ok

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols,
                              nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda,
                              aug_batch_size, rng, img_rows, img_cols,
                              nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    accuracies['sub'] = model_eval(sess, x, y, preds_sub, x_test, y_test,
                                   args=eval_params)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, attack="fgsm", targeted=False):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: forwarded to `train_sub`
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: learning rate for the oracle and the substitute
    :param nb_epochs: number of epochs to train the black-box model
    :param holdout: number of leading test examples reserved for training
                    the substitute; they are removed from the test set
    :param data_aug: forwarded to `train_sub`
    :param nb_epochs_s: forwarded to `train_sub`
    :param lmbda: forwarded to `train_sub`
    :param attack: "fgsm" runs the Fast Gradient Sign Method; any other
                   value runs Carlini and Wagner's L2 attack
    :param targeted: if True and the Carlini-Wagner attack is used, every
                     test example is attacked once for each of its 9 wrong
                     classes. Ignored for FGSM.
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
             * attack success rates and accuracies restricted to the test
               examples the black-box model classified correctly
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. The call is kept outside of the assert
    # statement so that it still runs when assertions are disabled
    # (python -O).
    setup_ok = setup_tutorial()
    assert setup_ok

    # Create TF session and set as Keras backend session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries,
    # limited to the first FLAGS.n_attack of them
    X_test = X_test[holdout:][:FLAGS.n_attack]
    Y_test = Y_test[holdout:][:FLAGS.n_attack]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    time_start = time.time()
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda)
    model_sub, preds_sub = train_sub_out
    time_end = time.time()
    print("Substitue model training time:", time_end - time_start)

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    print('substitution model accuracy:', acc)

    # Find the test examples the black-box model labels correctly
    original_predict = batch_eval(sess, [x], [bbox_preds], [X_test],
                                  args=eval_params)[0]
    original_class = np.argmax(original_predict, axis=1)
    true_class = np.argmax(Y_test, axis=1)
    mask = true_class == original_class
    print(np.sum(mask), "out of", mask.size, "are correct labeled,",
          len(X_test[mask]))

    # Both attacks are crafted against the wrapped substitute model
    wrap = KerasModelWrapper(model_sub)
    use_fgsm = attack == "fgsm"

    # Inputs/labels fed to the attack; replaced below for targeted C&W
    adv_inputs = X_test
    ori_labels = Y_test
    # Target class of every attacked input; stays None unless a targeted
    # Carlini-Wagner attack is prepared
    targeted_class = None

    if use_fgsm:
        attacker_params = {'eps': 0.4, 'ord': np.inf,
                           'clip_min': 0., 'clip_max': 1.}
        attacker = FastGradientMethod(wrap, sess=sess)
        x_adv_sub = attacker.generate(x, **attacker_params)
        print("Running FGSM attack...")
    else:
        print("Running Carlini and Wagner's L2 attack...")
        attacker = CarliniWagnerL2(wrap, back='tf', sess=sess)
        attacker_params = {'binary_search_steps': 9,
                           'max_iterations': 2000,
                           'abort_early': True,
                           'learning_rate': 0.01,
                           'batch_size': 1,
                           'initial_const': 0.01,
                           'confidence': 20}
        if targeted:
            # Generate targeted labels, 9 for each test example: every
            # class except the original image label
            one_hot = np.eye(10)
            adv_ys = []
            targeted_class = []
            for true_label in np.argmax(Y_test, axis=1):
                for candidate in range(10):
                    if candidate == true_label:
                        continue
                    adv_ys.append(one_hot[candidate])
                    targeted_class.append(candidate)
            attacker_params['y_target'] = np.array(adv_ys, dtype=np.float32)
            targeted_class = np.array(targeted_class)

            # Duplicate the inputs, the labels and the mask 9 times so
            # they line up with the targets
            adv_inputs = np.repeat(X_test, 9, axis=0).astype(np.float32)
            ori_labels = np.repeat(Y_test, 9, axis=0)
            mask = np.repeat(mask, 9)

    if use_fgsm:
        # Evaluate the accuracy of the "black-box" model on adversarial
        # examples (symbolic graph)
        accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs,
                              ori_labels, args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex'] = accuracy

    time_start = time.time()
    # Evaluate the accuracy of the "black-box" model on adversarial
    # examples generated as NumPy arrays
    x_adv_sub_np = attacker.generate_np(adv_inputs, **attacker_params)
    accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_np, ori_labels,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (NP): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy
    time_end = time.time()
    print('Attack time:', time_end - time_start)

    # Evaluate the attack success rates on the black-box model
    bbox_adv_predict = batch_eval(sess, [x], [bbox_preds], [x_adv_sub_np],
                                  args=eval_params)[0]
    bbox_adv_class = np.argmax(bbox_adv_predict, axis=1)
    true_class = np.argmax(ori_labels, axis=1)
    untargeted_success = np.mean(bbox_adv_class != true_class)
    print('Untargeted attack success rate:', untargeted_success)
    accuracies['untargeted_success'] = untargeted_success
    # Targets only exist for a targeted Carlini-Wagner run; FGSM with
    # targeted=True has none, so there is no targeted rate to report.
    if targeted and targeted_class is not None:
        targeted_success = np.mean(bbox_adv_class == targeted_class)
        print('Targeted attack success rate:', targeted_success)
        accuracies['targeted_success'] = targeted_success

    if not use_fgsm:
        # Compute the L2 perturbations of generated adversarial examples
        percent_perturbed = np.sum((x_adv_sub_np - adv_inputs)**2,
                                   axis=(1, 2, 3))**.5
        # When computing the mean, remove the failed attacks (no
        # perturbation) first
        print('Avg. L_2 norm of perturbations {0:.4f}'.format(
            np.mean(percent_perturbed[percent_perturbed > 1e-8])))

    # Evaluate the accuracy of the "black-box" model on the clean inputs
    # it originally classified correctly
    accuracy = model_eval(sess, x, y, bbox_preds, adv_inputs[mask],
                          ori_labels[mask], args=eval_params)
    print('Test accuracy of excluding originally incorrect labels '
          '(should be 1.0): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc_ori'] = accuracy

    if use_fgsm:
        # Evaluate the accuracy of the "black-box" model on adversarial
        # examples (excluding originally incorrect labels, symbolic graph)
        accuracy = model_eval(sess, x, y, model(x_adv_sub),
                              adv_inputs[mask], ori_labels[mask],
                              args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute (excluding originally incorrect '
              'labels): ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    # Evaluate the accuracy of the "black-box" model on adversarial
    # examples (excluding originally incorrect labels, NumPy arrays)
    x_adv_sub_mask_np = x_adv_sub_np[mask]
    accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_mask_np,
                          ori_labels[mask], args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (excluding originally incorrect labels, '
          'NP): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    return accuracies
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: forwarded to `train_sub`
    :param batch_size: batch size used for training and evaluation
    :param learning_rate: learning rate for the oracle and the substitute
    :param nb_epochs: number of epochs to train the black-box model
    :param holdout: number of leading test examples reserved for training
                    the substitute; they are removed from the test set
    :param data_aug: forwarded to `train_sub`
    :param nb_epochs_s: forwarded to `train_sub`
    :param lmbda: forwarded to `train_sub`
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
             * substitute model accuracy on those adversarial examples
    """
    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. The call is kept outside of the assert
    # statement so that it still runs when assertions are disabled
    # (python -O).
    setup_ok = setup_tutorial()
    assert setup_ok

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    logger.info("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate, rng=rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    logger.info("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, rng=rng)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    accuracies['sub'] = model_eval(sess, x, y, preds_sub, X_test, Y_test,
                                   args=eval_params)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {
        'eps': FLAGS.eps,
        'ord': np.inf,
        'clip_min': 0.,
        'clip_max': 1.
    }
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test, Y_test,
                          args=eval_params)
    logger.info('Test accuracy of oracle on adversarial examples generated '
                'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    # Evaluate the accuracy of the substitute model itself on the same
    # adversarial examples
    accuracy = model_eval(sess, x, y, model_sub(x_adv_sub), X_test, Y_test,
                          args=eval_params)
    logger.info(
        'Test accuracy of substitute on adversarial examples generated '
        'using the substitute: ' + str(accuracy))
    accuracies['sub_on_sub_adv_ex'] = accuracy

    return accuracies
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=True, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial

    Trains (or restores) a Keras CNN, crafts Basic Iterative Method
    adversarial examples for the whole train and test sets and pickles
    them to "./bim.pkl" as a list [adversarial train, adversarial test] of
    3-channel uint8 arrays.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, the accuracy on the training set is
                    calculated as well
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    eval_params = {'batch_size': batch_size}

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        # makedirs also creates missing parent directories
        os.makedirs(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)

    # Calculate the accuracy on the training set
    if testing:
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Basic Iterative Method (BIM) attack object and graph
    bim = BasicIterativeMethod(wrap, sess=sess)
    bim_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.,
        'nb_iter': 50,
        'eps_iter': .01
    }
    adv_x = bim.generate(x, **bim_params)

    batch = 1000

    def craft_adversarial(inputs):
        # Run the attack over `inputs` in chunks of `batch` examples. The
        # final partial chunk is included so that the result has exactly
        # one adversarial example per input.
        chunks = [sess.run(adv_x, feed_dict={x: inputs[start:start + batch]})
                  for start in tqdm(range(0, len(inputs), batch))]
        return np.concatenate(chunks)

    x_adv_test = craft_adversarial(x_test)
    x_adv_train = craft_adversarial(x_train)

    def evaluate_adv():
        # Evaluate the accuracy of the MNIST model on the adversarial test
        # examples
        acc = model_eval(sess, x, y, preds, x_adv_test, y_test,
                         args=eval_params)
        report.clean_train_adv_eval = acc
        print('Test accuracy on adversarial examples: %0.4f' % acc)

    evaluate_adv()

    # Replicate the channel axis 3 times and rescale to 8-bit values
    x_adv_train = (np.repeat(x_adv_train, 3, 3) * 255).astype('uint8')
    x_adv_test = (np.repeat(x_adv_test, 3, 3) * 255).astype('uint8')

    save_list = [x_adv_train, x_adv_test]
    print(x_adv_train[0])
    # The context manager makes sure the file is flushed and closed
    with open("./bim.pkl", 'wb') as pickle_file:
        pickle.dump(save_list, pickle_file)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1,
                   save_model=SAVE_MODEL, attack_method=ATTACK_METHOD,
                   model_type=MODEL_TYPE):
    """
    MNIST CleverHans tutorial

    Trains (or restores) a model on the MNIST training set extended with
    the adversarial examples stored in 'mifgsm_c_train_adv.npy', then
    measures its test accuracy under the selected attack.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model; the checkpoints
                      go to the sub-directory '<model_type>/mifgsm_c'
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, the accuracy on the (extended) training set
                    is calculated as well
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :param save_model: if true, the model is saved after training
    :param attack_method: 'fgsm', 'bim' or 'mifgsm'
    :param model_type: 'a', 'b' or 'c', selecting modelA, modelB or modelC
    :return: an AccuracyReport object
    :raises SystemExit: if `model_type` or `attack_method` is not one of
                        the supported values
    :raises ValueError: if the stored adversarial examples do not match
                        the training set
    """
    # Validate the selectors before any expensive work so that a typo does
    # not surface only after the model has been trained.
    model_builders = {'a': modelA, 'b': modelB, 'c': modelC}
    if model_type not in model_builders:
        raise SystemExit('the model type must be a or b or c.')

    attack_specs = {
        'fgsm': (FastGradientMethod,
                 {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}),
        'bim': (BasicIterativeMethod,
                {'eps': 0.2, 'eps_iter': 0.06, 'nb_iter': 10,
                 'clip_min': 0., 'clip_max': 1.}),
        'mifgsm': (MomentumIterativeMethod,
                   {'eps': 0.2, 'eps_iter': 0.08, 'nb_iter': 10,
                    'decay_factor': 0.4, 'clip_min': 0., 'clip_max': 1.}),
    }
    if attack_method not in attack_specs:
        raise SystemExit("the attack method must be fgsm,bim,mifgsm")

    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    # Only the GPU with index 1 is made visible to TensorFlow
    os.environ["CUDA_VISIBLE_DEVICES"] = '1'
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Extend the training set with pre-computed adversarial examples. They
    # reuse the training labels, so there must be exactly one adversarial
    # example per training example.
    my_adv = np.load('mifgsm_c_train_adv.npy').reshape(
        (-1,) + x_train.shape[1:])
    if len(my_adv) != len(x_train):
        raise ValueError(
            "mifgsm_c_train_adv.npy holds %d examples but the training "
            "set holds %d" % (len(my_adv), len(x_train)))
    x_train = np.concatenate([x_train, my_adv])
    y_train = np.concatenate([y_train, y_train])

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = model_builders[model_type](img_rows=img_rows, img_cols=img_cols,
                                       channels=nchannels, nb_filters=64,
                                       nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    eval_params = {'batch_size': batch_size}

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_dir = train_dir + '/' + model_type + '/' + 'mifgsm_c'
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        # The directory is nested, so the missing parents have to be
        # created as well; os.mkdir would fail on them.
        os.makedirs(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)
        # Only a freshly trained model is saved; a restored one is
        # already on disk.
        if save_model:
            saver = tf.train.Saver(max_to_keep=1)
            # Number the checkpoint with the epochs actually requested
            saver.save(sess, '{}/mnist.ckpt'.format(train_dir),
                       global_step=nb_epochs)
            print("model has been saved")

    # Calculate the accuracy on the training set (which includes the
    # adversarial examples added above)
    if testing:
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the selected attack object and graph
    attack_class, att_method_params = attack_specs[attack_method]
    att_method = attack_class(wrap, sess=sess)
    print(att_method_params)
    adv_x = att_method.generate(x, **att_method_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    start_time = time.time()
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_params)
    print('Test accuracy on adversarial examples: %0.4f' % acc)
    end_time = time.time()
    print("{} attack time is {}\n".format(attack_method,
                                          end_time - start_time))
    report.clean_train_adv_eval = acc

    gc.collect()

    return report
def adv_net_exp(data_dir, adv_dir,
                target_model_dir='./tmp/cifar10_train_adv_encoder',
                clip_norm=1.5):
    """Evaluate a target model against a trained adversarial encoder.

    Restores the target model from ``target_model_dir`` and the
    ``adv_encoder`` variables from ``adv_dir``, then prints the value
    returned by ``model_eval`` for three inputs: the unmodified data, the
    encoder's output images, and the data shifted by one fixed perturbation.

    :param data_dir: CIFAR-10 data directory handed to the data helpers
    :param adv_dir: checkpoint directory holding the "adv_encoder" variables
    :param target_model_dir: checkpoint directory of the target model
    :param clip_norm: forwarded unchanged to ``adv_train_net``
    :return: False when a checkpoint cannot be loaded, otherwise None
    """
    sess = tf.Session()
    try:
        # Dataset format
        img_rows = 32
        img_cols = 32
        channels = 3
        nb_classes = 10

        # Fetch data and build one-hot labels
        cifar10_data.maybe_download_and_return_python(data_dir)
        X, Y = mdt_cifar10_input.numpy_input(True, data_dir)
        one_hot_Y = to_categorical(Y, nb_classes)

        # Input placeholders
        x = tf.placeholder(tf.float32,
                           shape=(None, img_rows, img_cols, channels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        model = make_vgg16_clipRelu_model(name='vgg16_clipRelu_eval_mode',
                                          eval_mode=True)
        eval_feed = mode_feed(sess, False)

        # Prediction tensor of the target model on the raw input
        pred = model(x)
        if not checkpoint_load(sess, target_model_dir):
            return False

        # Accuracy of the target model on unmodified inputs
        accuracy = model_eval(sess, x, y, pred, X, one_hot_Y,
                              feed=eval_feed, args={'batch_size': 128})
        print('model accuracy: {0}'.format(accuracy))

        dis_loss, output_images = adv_train_net(x, clip_norm)
        logits = model(output_images)

        # Restore only the adversarial encoder's variables.
        # get_checkpoint_state returns None when adv_dir has no checkpoint;
        # report that the same way as a failed target-model load instead of
        # crashing on `None.model_checkpoint_path`.
        ckpt = tf.train.get_checkpoint_state(adv_dir)
        if ckpt is None:
            print('no checkpoint found in {0}'.format(adv_dir))
            return False
        adv_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          "adv_encoder")
        saver = tf.train.Saver(adv_variables)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Accuracy of the target model on the encoder's output images
        accuracy = model_eval(sess, x, y, logits, X, one_hot_Y,
                              feed=eval_feed, args={'batch_size': 128})
        print('transfer rate: {0}'.format(accuracy))

        # "Universal" perturbation: the difference observed on the sample at
        # index 1 is added to every input.
        # NOTE(review): adv_generate receives None where the sibling code
        # passes a mode feed -- confirm this is intended.
        adv_imgs = adv_generate(sess, output_images, x, X, None, 128)
        mean_dif = adv_imgs[1] - X[1]
        print('mean dif\'s size: {0}'.format(mean_dif.shape))
        universal_adv_X = X + mean_dif

        # Accuracy of the target model on the universally perturbed inputs
        accuracy = model_eval(sess, x, y, pred, universal_adv_X, one_hot_Y,
                              feed=eval_feed, args={'batch_size': 128})
        print('universal adv transfer rate: {0}'.format(accuracy))
    finally:
        # The session is local to this function; release it on every path.
        sess.close()
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, epsilon=0.3):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697

    The oracle is evaluated on FGSM examples crafted on the substitute for
    20 perturbation sizes: ``epsilon * i`` for ``i`` in ``range(20)``.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: forwarded to ``train_sub``
    :param batch_size: batch size for training and evaluation
    :param learning_rate: forwarded to ``prep_bbox`` and ``train_sub``
    :param nb_epochs: forwarded to ``prep_bbox``
    :param holdout: number of leading test samples reserved for the
                    substitute; they are removed from the test set
    :param data_aug: forwarded to ``train_sub``
    :param nb_epochs_s: forwarded to ``train_sub``
    :param lmbda: forwarded to ``train_sub``
    :param epsilon: spacing between the FGSM ``eps`` values that are swept
    :return: a dictionary with:
             * 'bbox': black-box model accuracy on test set
             * 'sub': substitute model accuracy on test set
             * 'bbox_on_sub_adv_ex' + str(eps): black-box model accuracy on
               adversarial examples transferred from the substitute model,
               one entry per swept eps
    """
    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Switch for the optional matplotlib inspection of misclassified samples
    pyp = False

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. The call is kept out of an `assert` statement
    # so that it still runs (and is still checked) under `python -O`.
    if not setup_tutorial():
        raise AssertionError('setup_tutorial() failed')

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate, rng=rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, rng=rng)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc

    def find_error(glb, mdl):
        """Collect the samples of ``glb`` that ``mdl`` labels differently
        from the test set.

        Each entry is [sample, one-hot label, clean test input,
        predicted class, index].
        """
        temparray = []
        for i, sample in enumerate(glb):
            prd = np.argmax(mdl.predict(np.array([sample])))
            if prd != np.argmax(Y_test[i]):
                temparray.append([sample, Y_test[i], X_test[i], prd, i])
        return temparray

    # The attack object does not depend on eps, so build it once instead of
    # once per sweep step.
    fgsm = FastGradientMethod(model_sub, sess=sess)

    for epstep in [epsilon * i for i in range(20)]:
        # Parameters of the Fast Gradient Sign Method (FGSM) for this step
        fgsm_par = {
            'eps': epstep,
            'ord': np.inf,
            'clip_min': 0.,
            'clip_max': 1.
        }

        # Craft adversarial examples using the substitute
        x_adv_sub = fgsm.generate(x, **fgsm_par)

        # Evaluate the accuracy of the "black-box" model on adversarial
        # examples
        accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test, Y_test,
                              args=eval_params)
        print(
            'Test accuracy of oracle on BB Adversarial Samples with epsilon = %s : '
            % epstep + str(accuracy))

        if pyp:
            x_adv_np = fgsm.generate_np(X_test[0:200], **fgsm_par)
            # keras_global_model is not defined in this function; it has to
            # exist at module level for this branch to work.
            y_adv_np = find_error(x_adv_np, keras_global_model)
            from matplotlib import pyplot as plt
            plt.rc('figure', figsize=(12.0, 12.0))
            # NOTE(review): the last misclassified sample is skipped, as in
            # the original `range(len(y_adv_np) - 1)` -- confirm intent.
            for adv_img, true_label, _, predicted, _ in y_adv_np[:-1]:
                print(
                    str(predicted) + "predit, et le reel etait : " +
                    str(np.argmax(true_label)))
                # `predicted` is already a class index; taking argmax of it
                # (as before) always yielded 0.
                plt.imshow(adv_img.reshape((28, 28)), cmap="gray",
                           label=str(predicted))
                plt.pause(1)
                # NOTE(review): separator placement reconstructed from a
                # flattened source -- confirm it belongs inside this loop.
                print('---')

        accuracies['bbox_on_sub_adv_ex' + str(epstep)] = accuracy

    return accuracies
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.1):
    """
    MNIST CleverHans tutorial: train a model, measure it on clean and FGSM
    inputs, then do the same for a second, adversarially trained model.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Every accuracy measured below is stored on this object and returned
    report = AccuracyReport()

    # Fixed TF seed for reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Force the 'tf' image dimension ordering if keras is configured otherwise
    ordering_is_tf = keras.backend.image_dim_ordering() == 'tf'
    if not ordering_is_tf:
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One session shared by TF and Keras
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load the requested slices of MNIST
    train_images, train_labels, test_images, test_labels = data_mnist(
        train_start=train_start, train_end=train_end,
        test_start=test_start, test_end=test_end)

    # Label smoothing by clipping the one-hot training labels
    assert train_labels.shape[1] == 10.
    label_smooth = .1
    smooth_low = label_smooth / 9.
    smooth_high = 1. - label_smooth
    train_labels = train_labels.clip(smooth_low, smooth_high)

    # Input placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    def test_accuracy(predictions):
        # Builds a fresh args dict per call, as every original call site did
        return model_eval(sess, x, y, predictions, test_images, test_labels,
                          args={'batch_size': batch_size})

    # First model and its prediction tensor
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Clean test accuracy of the first model
        clean_acc = test_accuracy(preds)
        report.clean_train_clean_eval = clean_acc
        assert test_images.shape[0] == test_end - test_start, \
            test_images.shape
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Plain training of the first model
    train_params = dict(nb_epochs=nb_epochs,
                        batch_size=batch_size,
                        learning_rate=learning_rate)
    model_train(sess, x, y, preds, train_images, train_labels,
                evaluate=evaluate, args=train_params)

    # FGSM graph built on the first model
    fgsm_params = {'eps': 0.3}
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model(adv_x)

    # Accuracy of the first model on its own adversarial examples
    adv_acc = test_accuracy(preds_adv)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    print("Repeating the process, using adversarial training")

    # Second model, its clean predictions and its FGSM predictions
    model_2 = cnn_model()
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Second model on clean test inputs
        clean_acc_2 = test_accuracy(preds_2)
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc_2)
        report.adv_train_clean_eval = clean_acc_2

        # Second model on adversarial test inputs
        adv_acc_2 = test_accuracy(preds_2_adv)
        print('Test accuracy on adversarial examples: %0.4f' % adv_acc_2)
        report.adv_train_adv_eval = adv_acc_2

    # Adversarial training of the second model
    model_train(sess, x, y, preds_2, train_images, train_labels,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    Attack saved 'mlp', 'cnn' and 'hrnn' MNIST models with Projected
    Gradient Descent and print a table of the resulting accuracies.

    One PGD attack is built per loaded model, and every model is evaluated
    on the examples produced by every attack. The models are loaded from
    ``'{}{}'.format(model_type, model_name)``; ``model_type`` is not a
    parameter and must be defined at module level.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: unused by this implementation
    :param nb_epochs: unused by this implementation
    :param batch_size: batch size for the clean-accuracy evaluation
    :param nb_classes: unused by this implementation
    :param source_samples: unused by this implementation
    :param learning_rate: unused by this implementation
    :param attack_iterations: unused by this implementation
    :param model_path: unused by this implementation
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object (no field is filled in here)
    :raises SystemExit: with status 1 when a saved model cannot be loaded
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    K.set_session(sess)
    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    K.set_learning_phase(1)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    models = {}
    preds = {}
    for model_name in ['mlp', 'cnn', 'hrnn']:
        try:
            print('[DEBUG] Loading model.')
            models[model_name] = load_model('{}{}'.format(
                model_type, model_name))
        except Exception:
            # Narrowed from a bare `except:` so that Ctrl-C and SystemExit
            # are no longer reported as a missing model.
            print(
                '[ERROR] Adversarially Trained models not found! Train and save strengthened models first. Then, run this.'
            )
            # Same effect as exit(1), without relying on the `site` builtin
            raise SystemExit(1)
        preds[model_name] = models[model_name](x)

    # Evaluate the accuracy of the adversarially trained MNIST models on
    # legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy_test = ''
    attacks = {}

    # Make computation graphs for the attacks
    for model_name in models:
        accuracy = model_eval(sess, x, y, preds[model_name], X_test, Y_test,
                              args=eval_params)
        accuracy_test += '{} {}\n'.format(model_name, accuracy)

        # Instantiate a PGD attack object, keyed by the model's first letter
        wrap = KerasModelWrapper(models[model_name])
        attacks['$PGD_{}$'.format(model_name[0])] = ProjectedGradientDescent(
            wrap, sess=sess)

    # Index of the first test sample of each of the 10 classes
    test_classes = np.argmax(Y_test, axis=1)
    idxs = [np.where(test_classes == i)[0][0] for i in range(10)]

    if targeted:
        # Every selected sample is repeated 10 times and paired with each of
        # the 10 one-hot targets, giving 100 (input, target) pairs.
        one_hot = np.eye(10)
        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10,
                          dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    attack_params = {'eps': 0.3, yname: adv_ys, 'eps_iter': 0.05}

    table_header = '{}model '.format(model_type)
    accuracy_attack = ''
    for model_name in models:
        accuracy_attack += '{} '.format(model_name)

        # For each model, apply all attacks
        for attack_name in attacks:
            print('[DEBUG] Attacking {} using {}.'.format(
                model_name, attack_name))

            # Entered once per attack: extends the table header with its name
            if attack_name not in table_header:
                table_header += '{} '.format(attack_name)

            adv = attacks[attack_name].generate_np(adv_inputs,
                                                   **attack_params)

            # Targeted runs are scored against the target labels, untargeted
            # runs against the true labels of the selected samples.
            if targeted:
                adv_accuracy = model_eval(sess, x, y, preds[model_name], adv,
                                          adv_ys, args={'batch_size': 10})
            else:
                adv_accuracy = model_eval(sess, x, y, preds[model_name], adv,
                                          Y_test[idxs],
                                          args={'batch_size': 10})
            accuracy_attack += '{} '.format(adv_accuracy * 100)

        # Move on to attack the next model
        accuracy_attack += '\n'

    print(table_header)
    print(accuracy_attack)
    print(accuracy_test)

    # Close TF session
    sess.close()

    return report
def main(argv):
    """Attack a checkpointed CIFAR-10 model and print its adversarial accuracy.

    Configuration is read from ``FLAGS``: ``checkpoint_dir``,
    ``dataset_dir``, ``nb_samples``, ``batch_size``, ``attack_type``
    ('cwl2', 'fgsm' or 'pgd') and ``sweep``. With ``sweep`` set, the attack
    is repeated for eps = 1..16.

    :param argv: unused
    :raises ValueError: when ``FLAGS.attack_type`` is not a supported attack
    """
    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {
            'batch_size': FLAGS.batch_size,
            'clip_min': 0.,
            'clip_max': 255.
        }

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'batch_size': 10
            })
        elif FLAGS.attack_type in ('fgsm', 'pgd'):
            # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)
            else:
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)
        else:
            # Previously fell through with `attacker` unbound and failed
            # later with a NameError.
            raise ValueError(
                "unsupported attack_type {!r}; expected 'cwl2', 'fgsm' or "
                "'pgd'".format(FLAGS.attack_type))

        eval_par = {'batch_size': FLAGS.batch_size}

        # Started once so that the reported duration covers the whole run;
        # resetting it per epsilon timed only the last sweep step.
        t1 = time.time()
        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            for e in epsilons:
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples],
                                 Y_test[:nb_samples], args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples],
                             Y_test[:nb_samples], args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)

        print("Took", t2 - t1, "seconds")
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Marks the (target, sample) pairs that were actually attacked, so the
    # averages below ignore the cells that are never written.
    attempted = np.zeros((nb_classes, source_samples), dtype=bool)

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb
            attempted[target, sample_ind] = True

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Average distortion over the attacks that were run. Averaging the whole
    # array also counted the never-attacked (true class, sample) cells.
    percent_perturbed = np.mean(perturbations[attempted])
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Average distortion over successful attacks only. The previous
    # `np.mean(perturbations * (results == 1))` still divided by every cell.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = np.mean(perturbations[succeeded])
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def adv_net_exp(data_dir, checkpoint_dir, train_mode,
                train_dir='./tmp/cifar10_train_adv_encoder', batch_size=128,
                data_aug=False, clip_norm=1.5, target=0, lr=0.0001):
    """Train or evaluate a targeted adversarial encoder against VGG16.

    With ``train_mode`` set, the "adv_encoder" variables are trained with
    Adam for 100 epochs; the variables found in the ``checkpoint_dir``
    checkpoint are restored first. Otherwise the encoder saved in
    ``train_dir`` is loaded, evaluated with ``model_eval`` against labels
    that are all ``target``, and one adversarial image is written to disk.

    :param data_dir: CIFAR-10 data directory
    :param checkpoint_dir: checkpoint directory restored before training
    :param train_mode: True to train the encoder, False to evaluate it
    :param train_dir: directory where the encoder is saved / loaded
    :param batch_size: batch size for training and evaluation
    :param data_aug: forwarded to ``mdt_cifar10_input.inputs``
    :param clip_norm: forwarded to ``adv_target_net``
    :param target: class index the adversarial examples should be assigned
    :param lr: learning rate of the Adam optimizer
    :return: False when a required checkpoint cannot be loaded, else None
    """
    sess = tf.Session()
    model = make_vgg16_model(name='vgg16_eval_mode', eval_mode=True)

    # Mode feeds; train_feed is not used below but the call is kept in case
    # mode_feed has side effects.
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    if train_mode:
        # Input pipeline; the dataset labels are replaced by the target class
        data_norm = False
        images, labels = mdt_cifar10_input.inputs(False, data_dir,
                                                  batch_size, data_aug,
                                                  data_norm)
        labels = tf.constant(target, dtype=tf.int64, shape=(batch_size,))

        dis_loss, output_images = adv_target_net(images, clip_norm)
        logits = model(output_images)

        # Attack settings. `targeted` used to be assigned to `target`,
        # shadowing the parameter of the same name.
        c = 1
        confidence = 0
        targeted = True

        loss = adv_loss(dis_loss, logits, labels, targeted, confidence, c)
        global_step = tf.train.get_or_create_global_step()

        # Training settings. `lr` comes from the caller; it used to be
        # overwritten here with a hard-coded 0.0001, which silently ignored
        # the parameter (its default is that same value).
        nb_epochs = 100
        tf.summary.scalar('learning_rate', lr)
        opt = tf.train.AdamOptimizer(lr)

        # Only the encoder's variables are trained
        adv_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          "adv_encoder")
        train_op = create_train_op(loss, global_step, adv_variables, opt)

        sess.run(tf.global_variables_initializer())

        # Restore every graph variable that also exists in the checkpoint
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt is None:
            # get_checkpoint_state returns None for a directory without a
            # checkpoint; fail the same way the evaluation branch does.
            print('no checkpoint found in {0}'.format(checkpoint_dir))
            sess.close()
            return False
        saved_names = {name for name, _ in
                       tf.train.list_variables(ckpt.model_checkpoint_path)}
        restore_map = {variable.op.name: variable
                       for variable in tf.global_variables()
                       if variable.op.name in saved_names}
        saver = tf.train.Saver(restore_map)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Re-initialize the global step after the restore
        sess.run(global_step.initializer)

        train_adv_encoder(sess, logits, loss, labels, train_op, train_dir,
                          batch_size, eval_feed, nb_epochs)
        sess.close()
    else:
        # Dataset format
        img_rows = 32
        img_cols = 32
        channels = 3
        nb_classes = 10

        # Fetch data; every label is replaced by the target class
        cifar10_data.maybe_download_and_return_python(data_dir)
        X, Y = mdt_cifar10_input.numpy_input(True, data_dir)
        Y = np.zeros_like(Y)
        Y[:] = target

        # Input placeholders
        x = tf.placeholder(tf.float32,
                           shape=(None, img_rows, img_cols, channels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        dis_loss, output_images = adv_target_net(x, clip_norm)
        logits = model(output_images)

        # Restore the trained model
        if not checkpoint_load(sess, train_dir):
            return False

        one_hot_Y = to_categorical(Y, nb_classes)

        # Evaluated against the all-target labels built above
        accuracy = model_eval(sess, x, y, logits, X, one_hot_Y,
                              feed=eval_feed,
                              args={'batch_size': batch_size})
        print('model accuracy: {0}'.format(accuracy))

        sta_time = time.time()
        adv_imgs = adv_generate(sess, output_images, x, X, eval_feed,
                                batch_size)
        end_time = time.time()
        duration = end_time - sta_time
        print('adv crafting time: {0}'.format(duration))

        # L2 distance between inputs and adversarial images, both / 255
        l2_dis = calculate_l2_dis(X/255, adv_imgs/255)
        print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))

        adv_imgs = np.around(adv_imgs).astype(int)
        compare_show(X[16], adv_imgs[16])

        # `import matplotlib` alone does not load the `image` submodule
        import matplotlib.image
        # NOTE(review): the file name uses FLAGS.target, not the `target`
        # argument -- confirm the two always agree.
        matplotlib.image.imsave(
            'i_{0}_target_{1}.png'.format(FLAGS.i, FLAGS.target),
            adv_imgs[16])
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=True):
    """
    MNIST CleverHans tutorial

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model; created (including
                      missing parents) when it does not exist
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, training-set accuracies are also recorded
    :param label_smoothing: if true, clip the one-hot training labels to
                            [0.1 / (nb_classes - 1), 0.9]
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # NOTE(review): the labels are clipped here and both LossCrossEntropy
    # objects below are also given smoothing=0.1 -- confirm that applying
    # smoothing twice is intended.
    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes-1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])

    # makedirs also creates missing parent directories; os.mkdir raised for
    # a nested train_dir such as "runs/mnist".
    if not os.path.isdir(train_dir):
        os.makedirs(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    # get_checkpoint_state returns None when train_dir holds no checkpoint
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = LossCrossEntropy(wrap, smoothing=0.1)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train,
                         args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train, y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")

    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        # FGSM on the second model, with the parameters used above
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
# Optional sanity check: print what eval_simple_cnn returns for each detector
check_cnn = True
if check_cnn:
    y_test_cat = np.argmax(y_test, axis=1)
    # (detector, second argument of eval_simple_cnn), in print order
    _cnn_checks = (
        (detect_3, [1]),
        (detect_0, [0, 1, 2, 3, 4, 5]),
        (detect_S, [14]),
        (detect_T, [14]),
        (detect_O, [14]),
        (detect_P, [14]),
    )
    for _detector, _check_arg in _cnn_checks:
        print(eval_simple_cnn(_detector, _check_arg, X_test, y_test_cat))

# Clean test accuracy of the classifier `clf`
wrap_clf = KerasModelWrapper(clf)
preds = clf(x)
eval_par = dict(batch_size=128)
acc = model_eval(sess, x, y, preds, X_test, y_test, args=eval_par)
print('Test accuracy on legitimate test examples: {0}'.format(acc))
report.clean_train_clean_eval = acc

# Disabled FGSM evaluation, kept for reference:
# fgsm = FastGradientMethod(wrap_clf, sess=sess)
# fgsm_params = {'eps': 0.1,
#                'clip_min': 0.,
#                'clip_max': 1.}
# adv_x = fgsm.generate(x, **fgsm_params)
# # Consider the attack to be constant
# adv_x = tf.stop_gradient(adv_x)
# preds_adv = clf(adv_x)
# # Evaluate the accuracy of the MNIST model on adversarial examples
# acc = model_eval(sess, x, y, preds_adv, X_test, y_test, args=eval_par)
# print('Test accuracy on adversarial examples: %0.4f\n' % acc)
def evaluate():
    """Score `preds` on the clean test set, record and print the accuracy.

    Relies on names from the enclosing scope (`sess`, `x`, `y`, `preds`,
    `x_test`, `y_test`, `batch_size`, `report`). The result is stored in
    `report.clean_train_clean_eval`; nothing is returned.
    """
    clean_acc = model_eval(sess, x, y, preds, x_test, y_test,
                           args={'batch_size': batch_size})
    report.clean_train_clean_eval = clean_acc
    print('Test accuracy on legitimate examples: %0.4f' % clean_acc)
def tutorial(train_start=0, train_end=60000, test_start=0,
             test_end=10000, nb_epochs=6, batch_size=128,
             learning_rate=0.001, train_dir="/tmp",
             filename="mnist.ckpt", load_model=False,
             testing=False):
    """
    CleverHans FGSM tutorial run on CIFAR-10 data.

    Trains (or restores) a CNN, measures its accuracy on clean and on FGSM
    inputs, then builds a second CNN, trains it adversarially and measures
    the same accuracies again. As a side effect the FGSM versions of the
    first 100 training images are saved with
    ``np.save("adv_image_FGM_cifar10", ...)``.

    :param train_start: not referenced in this function
    :param train_end: not referenced in this function
    :param test_start: only used in the test-set size assertion
    :param test_end: only used in the test-set size assertion
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training and evaluation batches
    :param learning_rate: learning rate for training
    :param train_dir: directory storing the saved model
    :param filename: filename to save model under
    :param load_model: True to restore a checkpoint from ``train_dir`` when
                       one exists, False to train from scratch
    :param testing: if true, accuracies on the training set are computed too
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Collects the key accuracies; returned at the end
    report = AccuracyReport()

    # Fix the TF seed so that runs are more reproducible
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One session shared by TF and Keras
    sess = tf.Session()
    keras.backend.set_session(sess)

    # CIFAR-10 data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Label smoothing on the training targets
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Input / label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # First model graph
    model = cnn_model_cifar10(img_rows=32, img_cols=32, channels=3)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def accuracy_of(predictions, inputs, labels):
        # Every evaluation in this tutorial uses the same session,
        # placeholders and batch size; only these three things vary.
        return model_eval(sess, x, y, predictions, inputs, labels,
                          args={'batch_size': batch_size})

    def evaluate():
        # Clean test accuracy of the first model
        clean_acc = accuracy_of(preds, X_test, Y_test)
        report.clean_train_clean_eval = clean_acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Train (or restore) the first model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = ckpt.model_checkpoint_path if ckpt is not None else False

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, save=True, rng=rng)

    # Accuracy on the training set
    if testing:
        report.train_clean_train_clean_eval = accuracy_of(
            preds, X_train, Y_train)

    # FGSM attack object and graph for the first model
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.1,
                   'clip_min': 0.,
                   'clip_max': 1.}

    # Dump adversarial versions of the first 100 training images to disk
    with sess.as_default():
        adv_x3 = fgsm.generate(x[:100], **fgsm_params)
        adv_image = adv_x3.eval(
            feed_dict={x: X_train[:100], y: Y_train[:100]})
        print("adv_image:", adv_image.shape)
        np.save("adv_image_FGM_cifar10", adv_image[:100])

    # Treat the attack as a constant (no gradient flows through it)
    adv_x = tf.stop_gradient(fgsm.generate(x, **fgsm_params))
    preds_adv = model(adv_x)

    # First model on adversarial test inputs
    adv_acc = accuracy_of(preds_adv, X_test, Y_test)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    # Same measurement on the training set
    if testing:
        report.train_clean_train_adv_eval = accuracy_of(
            preds_adv, X_train, Y_train)

    print("Repeating the process, using adversarial training")
    # Second model graph, with its own FGSM attack
    model_2 = cnn_model_cifar10(img_rows=32, img_cols=32, channels=3)
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Second model on clean test inputs
        clean_acc_2 = accuracy_of(preds_2, X_test, Y_test)
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc_2)
        report.adv_train_clean_eval = clean_acc_2

        # Second model on adversarial test inputs
        adv_acc_2 = accuracy_of(preds_2_adv, X_test, Y_test)
        print('Test accuracy on adversarial examples: %0.4f' % adv_acc_2)
        report.adv_train_adv_eval = adv_acc_2

    # Adversarial training of the second model
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, save=False, rng=rng)

    # Training-set accuracies of the second model
    if testing:
        report.train_adv_train_clean_eval = accuracy_of(
            preds_2, X_train, Y_train)
        report.train_adv_train_adv_eval = accuracy_of(
            preds_2_adv, X_train, Y_train)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial.

    Trains a CNN on CIFAR-10 and prints its accuracy on clean and on FGSM
    (eps=0.3) test inputs, then builds a second CNN, trains it
    adversarially and prints the same accuracies for it. Batch size, epoch
    count and learning rate are read from ``FLAGS``, which is defined
    outside this function.

    :param argv: not referenced in this function
    :return: None
    """
    # Fix the TF seed so that runs are more reproducible
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One session shared by TF and Keras
    sess = tf.Session()
    keras.backend.set_session(sess)

    # CIFAR10 data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Label smoothing on the training targets
    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Input / label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # First model graph
    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    clean_preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # First model on clean test inputs
        clean_acc = model_eval(sess, x, y, clean_preds, X_test, Y_test,
                               args={'batch_size': FLAGS.batch_size})
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(clean_acc))

    # Train the first model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, clean_preds, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Materialise FGSM adversarial examples for the whole test set
    fgsm = FastGradientMethod(model)
    adv_x = fgsm.generate(x, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    [X_test_adv] = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # First model on the adversarial test set
    adv_acc = model_eval(sess, x, y, clean_preds, X_test_adv, Y_test,
                         args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(adv_acc))

    print("Repeating the process, using adversarial training")
    # Second model graph, with its own FGSM attack
    model_2 = cnn_model(img_rows=32, img_cols=32, channels=3)
    preds_2 = model_2(x)
    fgsm_2 = FastGradientMethod(model_2)
    adv_x_2 = fgsm_2.generate(x, eps=0.3)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        batch_args = {'batch_size': FLAGS.batch_size}

        # Second model on clean test inputs
        clean_acc_2 = model_eval(sess, x, y, preds_2, X_test, Y_test,
                                 args=batch_args)
        print('Test accuracy on legitimate test examples: '
              + str(clean_acc_2))

        # Second model on adversarial test inputs
        adv_acc_2 = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                               args=batch_args)
        print('Test accuracy on adversarial examples: ' + str(adv_acc_2))

    # Adversarial training of the second model
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params)

    # Final adversarial accuracy of the second model
    final_adv_acc = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                               args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(final_adv_acc))
def whitebox(gan, rec_data_path=None, batch_size=128, learning_rate=0.001,
             nb_epochs=10, eps=0.3, online_training=False,
             test_on_dev=True, attack_type='fgsm', defense_type='gan',
             num_tests=-1, num_train=-1):
    """Evaluate `gan.model` under a white-box attack.

    Based on MNIST tutorial from cleverhans.

    Most switches are read from `tf.flags.FLAGS` (`defense_type`,
    `attack_type`, `model`, `train_on_recs`, `same_init`, `fgsm_eps_tr`)
    and not from the same-named arguments; see the per-argument notes.
    The classifier training code is disabled (kept as string literals), so
    `gan.model` is evaluated as it is.

    Args:
        gan: A `GAN` model. Its `sess`, `model`, `dataset_name`, `rec_rr`
            and `latent_dim` attributes are read.
        rec_data_path: A string to the directory. Only checked by an
            assertion when `FLAGS.train_on_recs` is set.
        batch_size: The size of the batch.
        learning_rate: The learning rate for training the target models.
            Only stored in `train_params`, which the disabled training
            code would consume.
        nb_epochs: Number of epochs for training the target model. Only
            stored in `train_params` as well.
        eps: The epsilon of the attack.
        online_training: Training Defense-GAN with online reconstruction.
            The faster but less accurate way is to reconstruct the dataset
            once and use it to train the target models with:
            `python train.py --cfg PATH_TO_MODEL --save_recs`
        test_on_dev: Forwarded to `get_cached_gan_data`.
        attack_type: Only compared against `None` (early return after the
            clean evaluation). The attack itself is chosen from
            `FLAGS.attack_type` (`fgsm`, `rand+fgsm`, `cw`, or a `bpda`
            variant).
        defense_type: Not read in this function; `FLAGS.defense_type` is
            used instead.
        num_tests: If positive, keep only the first `num_tests` test
            examples.
        num_train: If positive, keep only the first `num_train` training
            examples.

    Returns:
        `(acc, 0, None)` with the clean accuracy when `attack_type` is
        `None`; `(acc_adv, 0, roc_info)` when `FLAGS.defense_type`
        contains `defense_gan`; `(acc_adv, 0, None)` otherwise.
    """

    FLAGS = tf.flags.FLAGS

    # Set logging level to see debug information.
    set_log_level(logging.WARNING)

    if 'defense_gan' in FLAGS.defense_type:
        assert gan is not None

    # Create TF session.
    if 'defense_gan' in FLAGS.defense_type:
        sess = gan.sess
        if FLAGS.train_on_recs:
            assert rec_data_path is not None or online_training
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    train_images, train_labels, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev)

    # Keep the first test split for `evaluate`; the second call below
    # (orig_data_flag=True) provides the test data used everywhere else.
    rec_test_images = test_images
    rec_test_labels = test_labels

    _, _, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev, orig_data_flag=True)

    x_shape = [None] + list(train_images.shape[1:])
    images_pl = tf.placeholder(tf.float32,
                               shape=[None] + list(train_images.shape[1:]))
    # Second image-shaped placeholder, fed through `alter=` in the BPDA path.
    alters_pl = tf.placeholder(tf.float32,
                               shape=[None] + list(train_images.shape[1:]))
    labels_pl = tf.placeholder(tf.float32,
                               shape=[None] + [train_labels.shape[1]])

    if num_tests > 0:
        test_images = test_images[:num_tests]
        rec_test_images = rec_test_images[:num_tests]
        test_labels = test_labels[:num_tests]

    if num_train > 0:
        train_images = train_images[:num_train]
        train_labels = train_labels[:num_train]

    # GAN defense flag.
    models = {
        'A': model_a, 'B': model_b, 'C': model_c,
        'D': model_d, 'E': model_e, 'F': model_f
    }
    # NOTE(review): `model` is built but never used below; every prediction
    # comes from `gan.model`. Left in place because constructing it may add
    # variables to the graph -- confirm before removing.
    model = models[FLAGS.model](input_shape=x_shape,
                                nb_classes=train_labels.shape[1])

    preds = gan.model.get_probs(images_pl)

    report = AccuracyReport()

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test
        # examples.
        eval_params = {'batch_size': batch_size}
        acc = model_eval(
            sess, images_pl, labels_pl, preds, rec_test_images,
            rec_test_labels, args=eval_params,
            feed={K.learning_phase(): 0})
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
    }

    rng = np.random.RandomState([11, 24, 1990])
    tf.set_random_seed(11241990)

    preds_adv = None
    if FLAGS.defense_type == 'adv_tr':
        attack_params = {'eps': FLAGS.fgsm_eps_tr,
                         'clip_min': 0.,
                         'clip_max': 1.}
        if gan:
            if gan.dataset_name == 'celeba':
                attack_params['clip_min'] = -1.0

        attack_obj = FastGradientMethod(gan.model, sess=sess)
        adv_x_tr = attack_obj.generate(images_pl, **attack_params)
        adv_x_tr = tf.stop_gradient(adv_x_tr)
        preds_adv = gan.model(adv_x_tr)

    # Disabled: train (or restore) the classifier and checkpoint it.
    """classifier_folder = os.path.join(FLAGS.model_folder, FLAGS.model+'/')
    saver = tf.train.Saver()
    if os.path.isfile(os.path.join(classifier_folder,'classifier.ckpt.index')):
        #load model
        saver.restore(sess, os.path.join(classifier_folder, 'classifier.ckpt'))
    else:
        os.mkdir(classifier_folder)
        model_train(sess, images_pl, labels_pl, preds, train_images, train_labels,
                    args=train_params, rng=rng, predictions_adv=preds_adv,
                    init_all=False, feed={K.learning_phase(): 1},
                    evaluate=evaluate)
        #save model
        saver.save(sess, os.path.join(classifier_folder, 'classifier.ckpt'))"""

    # Accuracy on the clean test set.
    eval_params = {'batch_size': batch_size}
    acc = model_eval(
        sess, images_pl, labels_pl, preds, test_images, test_labels,
        args=eval_params,
        feed={K.learning_phase(): 0},
    )
    print('[#] Accuracy on clean examples {}'.format(acc))
    # Disabled: write the clean accuracy next to the classifier checkpoint.
    """with open(os.path.join(classifier_folder, 'accuracy.txt'), 'w') as f:
        f.write('Test accuracy = {}'.format(acc))"""
    if attack_type is None:
        return acc, 0, None

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and
    # graph.
    if 'defense_gan' in FLAGS.defense_type:
        z_init_val = None
        if FLAGS.same_init:
            z_init_val = tf.constant(
                np.random.randn(batch_size * gan.rec_rr,
                                gan.latent_dim).astype(np.float32))

        if 'bpda' in FLAGS.attack_type:
            # BPDA attacks go through an explicit reconstruction layer.
            recon_layer = ReconstructionLayer(gan, z_init_val, x_shape,
                                              batch_size)
        else:
            gan.model.add_rec_model(gan, z_init_val, batch_size)

    # NOTE(review): placement reconstructed from whitespace-collapsed source;
    # this value is overwritten before any live use.
    irecon_adv_x = tf.zeros([1])

    min_val = 0.0
    if gan:
        if gan.dataset_name == 'celeba':
            min_val = -1.0

    if 'rand' in FLAGS.attack_type:
        # NOTE(review): `args` is not assigned anywhere in this function;
        # unless a module-level `args` exists this branch raises NameError.
        # Presumably a FLAGS value was intended -- confirm.
        test_images = np.clip(
            test_images
            + args.alpha * np.sign(np.random.randn(*test_images.shape)),
            min_val, 1.0)
        eps -= args.alpha

    if 'bpda' in FLAGS.attack_type:
        # The digit in the attack name selects the underlying attack.
        if '1' in FLAGS.attack_type:
            attack_obj = MadryEtAl(gan.model, sess=sess)
        elif '2' in FLAGS.attack_type:
            attack_obj = FastGradientMethod(gan.model, sess=sess)
        elif '3' in FLAGS.attack_type:
            attack_obj = MomentumIterativeMethod(gan.model, sess=sess)

        if 'defense_gan' in FLAGS.defense_type:
            # 2
            recon_images_pl = recon_layer.fprop(images_pl)
        else:
            recon_images_pl = images_pl

        attack_params = {'eps': eps, 'ord': np.inf,
                         'clip_min': min_val, 'clip_max': 1.}
        # Attack the reconstruction, then apply the resulting perturbation
        # to the original input.
        adv_x = attack_obj.generate(
            recon_images_pl, **attack_params) - recon_images_pl + images_pl
        # adv_x = recon_layer.fprop(irecon_adv_x)
    else:
        if 'fgsm' in FLAGS.attack_type:
            attack_params = {'eps': eps, 'ord': np.inf,
                             'clip_min': min_val, 'clip_max': 1.}
            attack_obj = FastGradientMethod(gan.model, sess=sess)
        elif FLAGS.attack_type == 'cw':
            attack_obj = CarliniWagnerL2(gan.model, back='tf', sess=sess)
            attack_iterations = 10
            attack_params = {
                'binary_search_steps': 1,
                'max_iterations': attack_iterations,
                'learning_rate': 10.0,
                'batch_size': batch_size,
                'initial_const': 100
            }
        try:
            adv_x = attack_obj.generate(images_pl, **attack_params)
        except Exception:
            # Best-effort fall-back kept from the original: if the attack
            # graph cannot be built (this includes the NameError raised when
            # no branch above defined `attack_obj`), evaluate on the clean
            # input. Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            print('none')
            adv_x = images_pl

    eval_par = {'batch_size': batch_size}
    if 'defense_gan' in FLAGS.defense_type:
        preds_adv = gan.model.get_probs(adv_x)
        if 'bpda' in FLAGS.attack_type:
            # For BPDA, predictions are taken on the reconstruction of
            # whatever is fed into `alters_pl`.
            irecon_adv_x = recon_layer.fprop(alters_pl)
            preds_adv = gan.model.get_probs(irecon_adv_x)

        # Mean squared perturbation per example (all non-batch axes).
        num_dims = len(images_pl.get_shape())
        avg_inds = list(range(1, num_dims))
        diff_op = tf.reduce_mean(tf.square(adv_x - images_pl), axis=avg_inds)
        start = time.time()
        acc_adv, roc_info = model_eval_gan(
            sess, images_pl, labels_pl, preds_adv, None,
            test_images=test_images, test_labels=test_labels,
            args=eval_par, feed={K.learning_phase(): 0},
            diff_op=diff_op, attack=('bpda' in FLAGS.attack_type),
            alter=alters_pl, adv_samples=adv_x)
        # Wall-clock seconds spent in the adversarial evaluation.
        print(time.time() - start)

        # Disabled: dump sample images for visual inspection.
        # if 'bpda' in FLAGS.attack_type:
        #     sess.run(tf.local_variables_initializer())
        #     listimg = sess.run([images_pl,recon_images_pl,adv_x], \
        #         feed_dict={images_pl: test_images[:batch_size],labels_pl:test_labels[:batch_size]})
        #     sess.run(tf.local_variables_initializer())
        #     listimg += sess.run([irecon_adv_x], feed_dict={alters_pl: listimg[2]})
        #     for j in range(len(listimg)):
        #         samples = listimg[j]
        #         tflib.save_images.save_images(
        #             samples.reshape((len(samples), 28, 28)),
        #             os.path.join('images_saved_36', 'samples_{}_{}_{}.png'.format(FLAGS.model, FLAGS.fgsm_eps, j))
        #         )
        # elif FLAGS.attack_type == 'cw':
        #     idx = np.random.permutation(len(test_images))[:batch_size]
        #     listimg = sess.run([images_pl,adv_x], \
        #         feed_dict={images_pl: test_images[idx],labels_pl:test_labels[idx]})
        #     for j in range(len(listimg)):
        #         samples = listimg[j]
        #         tflib.save_images.save_images(
        #             samples.reshape((len(samples), 28, 28)),
        #             os.path.join('images_cw', 'samples_{}_{}.png'.format(FLAGS.model, j))
        #         )

        print('Test accuracy on adversarial examples: %0.4f\n' % acc_adv)
        return acc_adv, 0, roc_info
    else:
        preds_adv = gan.model(adv_x)
        acc_adv = model_eval(sess, images_pl, labels_pl, preds_adv,
                             test_images, test_labels,
                             args=eval_par, feed={K.learning_phase(): 0})
        print('Test accuracy on adversarial examples: %0.4f\n' % acc_adv)
        return acc_adv, 0, None
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      targeted=TARGETED):
    """
    MNIST tutorial for Carlini and Wagner's attack

    Trains (or loads from ``model_path``) an all-convolutional MNIST model,
    crafts adversarial examples with the Carlini-Wagner L2 attack and prints
    the success rate and mean L2 perturbation. The TF session is closed
    before returning.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires ``source_samples`` to equal the number of
                        classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as ``max_iterations``
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Collects the key accuracies; returned at the end
    report = AccuracyReport()

    # Fix the TF seed so that runs are more reproducible
    tf.set_random_seed(1234)

    # TF session
    session_config = tf.ConfigProto(
        intra_op_parallelism_threads=1,
        gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=session_config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # MNIST data
    mnist = MNIST(DATA_DIR, train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Image geometry and class count come from the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Input / label placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Model graph
    model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                  input_shape=[28, 28, 1])
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # Reuse a previously trained model when its .meta file is present
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Clean test accuracy
    clean_acc = model_eval(sess, x, y, preds, x_test, y_test,
                           args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(clean_acc))
    report.clean_train_clean_eval = clean_acc

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # CW attack object
    cw = CarliniWagnerL2(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # First test example of each class
        test_classes = np.argmax(y_test, axis=1)
        idxs = [np.where(test_classes == i)[0][0] for i in range(nb_classes)]

        # Grid for visualization: one column per target class when targeted,
        # otherwise an (original, adversarial) pair per class
        grid_cols = nb_classes if targeted else 2
        grid_shape = (nb_classes, grid_cols, img_rows, img_cols, nchannels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        seed_inputs = x_test[idxs]
    else:
        seed_inputs = x_test[:source_samples]

    if targeted:
        # Repeat every source image once per class and pair each copy with
        # a one-hot label for that class
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in seed_inputs],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))

        one_hot = np.eye(nb_classes)
        adv_ys = np.array([one_hot] * source_samples, dtype=np.float32)
        adv_ys = adv_ys.reshape((source_samples * nb_classes, nb_classes))
        yname = "y_target"
        cw_params_batch_size = source_samples * nb_classes
    else:
        adv_inputs = seed_inputs
        adv_ys = None
        yname = "y"
        cw_params_batch_size = source_samples

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': CW_LEARNING_RATE,
                 'batch_size': cw_params_batch_size,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Scored against the target labels
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        # Scored against the true labels; the complement is reported
        true_labels = y_test[idxs] if viz_enabled else y_test[:source_samples]
        err = model_eval(sess, x, y, preds, adv, true_labels,
                         args=eval_params)
        adv_accuracy = 1 - err

    if viz_enabled:
        if targeted:
            for j in range(nb_classes):
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
        else:
            for j in range(nb_classes):
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Success rate of the attack
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Mean L2 norm of the perturbations
    mean_l2 = np.mean(np.sum((adv - adv_inputs) ** 2, axis=(1, 2, 3)) ** .5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial

    Trains (or restores from ``train_dir``) a Keras CNN, measures its
    accuracy on clean and on FGSM (eps=0.3) inputs, then builds a second CNN,
    trains it adversarially and measures the same accuracies again.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model; created (including
                      missing parent directories) when it does not exist
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, accuracies on the training set are computed too
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        # assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        # makedirs (not mkdir) so a nested train_dir such as "runs/exp1/ckpt"
        # works when its parents are missing.
        try:
            os.makedirs(train_dir)
        except OSError:
            # Another process may have created the directory between the
            # existence check and the call; only a real failure is re-raised.
            if not os.path.isdir(train_dir):
                raise

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    # False (not None) marks "no checkpoint"; only truthiness is tested below.
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)

    # Calculate accuracy on the training set
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Same measurement on the training set
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        # FGSM against the second model; handed to the loss via `attack=`.
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x_train, y_train, evaluate=evaluate_2,
          args=train_params, rng=rng)

    # Training-set accuracies of the adversarially trained model
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def blackbox(gan, rec_data_path=None, batch_size=128,
             learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
             nb_epochs_s=10, lmbda=0.1, online_training=False,
             train_on_recs=False, test_on_dev=False,
             defense_type='none'):
    """Black-box substitute attack (arxiv.org/abs/1602.02697) on a classifier.

    Prepares the black-box model through `prep_bbox`, trains a substitute
    with `train_sub` on the first `holdout` test images, crafts FGSM
    examples on the substitute and evaluates the black-box model on them.
    When the GAN defense is active the adversarial (and the clean) images
    are first passed through the reconstructor.

    Args:
        gan: Object whose `sess` and `dataset_name` attributes are read
            here; it is also handed to `get_cached_gan_data`,
            `get_reconstructor` and `prep_bbox`.
            NOTE(review): `gan.dataset_name` is dereferenced
            unconditionally, so `None` fails even though two checks below
            guard against a missing gan.
        rec_data_path: Not referenced in this function body.
        batch_size: Batch size used for training, reconstruction and
            evaluation; also the number of debug images saved.
        learning_rate: Forwarded to `prep_bbox` and `train_sub`.
        nb_epochs: Forwarded to `prep_bbox`.
        holdout: Number of leading test samples reserved for the
            substitute and removed from the evaluation set.
        data_aug: Forwarded to `train_sub`.
        nb_epochs_s: Forwarded to `train_sub`.
        lmbda: Forwarded to `train_sub`.
        online_training: Not referenced in this function body.
        train_on_recs: Not referenced in this function body.
        test_on_dev: Forwarded to `get_cached_gan_data`.
        defense_type: 'defense_gan' (together with a truthy `gan`) turns
            the reconstruction defense on and reuses `gan.sess`; any value
            containing 'adv_tr' sets `adv_training` for `prep_bbox`.

    Returns:
        A dictionary with the keys 'acc_adv', 'acc_rec', 'roc_info_adv'
        and 'roc_info_rec'. Without the GAN defense 'acc_rec' is 0 and
        both ROC entries are None.
    """

    FLAGS = flags.FLAGS

    # Set logging level to see debug information.
    set_log_level(logging.WARNING)

    # Dictionary used to keep track of key accuracies. It is filled in
    # below but is not part of the returned value.
    accuracies = {}

    # Create TF session. The GAN's own session is reused only when the
    # defense is 'defense_gan' and a gan was supplied; every other case
    # opens a new session with GPU memory growth enabled.
    adv_training = False
    if defense_type:
        if defense_type == 'defense_gan' and gan:
            sess = gan.sess
            gan_defense_flag = True
        else:
            gan_defense_flag = False
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)

        if 'adv_tr' in defense_type:
            adv_training = True
    else:
        gan_defense_flag = False
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    train_images, train_labels, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev, orig_data_flag=True)

    # Per-sample input shape and number of label columns.
    x_shape, classes = list(train_images.shape[1:]), train_labels.shape[1]
    nb_classes = classes

    # Maps the architecture letter given in FLAGS to a model constructor.
    type_to_models = {
        'A': model_a, 'B': model_b, 'C': model_c, 'D': model_d, 'E': model_e,
        'F': model_f, 'Q': model_q, 'Y': model_y, 'Z': model_z
    }

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        bb_model = type_to_models[FLAGS.bb_model](
            input_shape=[None] + x_shape, nb_classes=train_labels.shape[1],
        )
    with tf.variable_scope("Substitute", reuse=tf.AUTO_REUSE):
        sub_model = type_to_models[FLAGS.sub_model](
            input_shape=[None] + x_shape, nb_classes=train_labels.shape[1],
        )

    if FLAGS.debug:
        # Debug runs train on 20 batches only and keep one batch of test
        # images for the qualitative dumps further down.
        train_images = train_images[:20 * batch_size]
        train_labels = train_labels[:20 * batch_size]
        debug_dir = os.path.join('debug', 'blackbox', FLAGS.debug_dir)
        ensure_dir(debug_dir)
        x_debug_test = test_images[:batch_size]

    # Initialize substitute training set reserved for adversary
    images_sub = test_images[:holdout]
    labels_sub = np.argmax(test_labels[:holdout], axis=1)
    print(labels_sub)

    # Redefine test set as remaining samples unavailable to adversaries
    if FLAGS.num_tests > 0:
        test_images = test_images[:FLAGS.num_tests]
        test_labels = test_labels[:FLAGS.num_tests]

    test_images = test_images[holdout:]
    test_labels = test_labels[holdout:]

    # Define input and output TF placeholders
    if FLAGS.image_dim[0] == 3:
        # Move a leading size-3 dimension to the last position.
        FLAGS.image_dim = [
            FLAGS.image_dim[1], FLAGS.image_dim[2], FLAGS.image_dim[0]
        ]

    images_tensor = tf.placeholder(tf.float32, shape=[None] + x_shape)
    labels_tensor = tf.placeholder(tf.float32, shape=(None, classes))

    rng = np.random.RandomState([11, 24, 1990])

    train_images_bb, train_labels_bb, test_images_bb, test_labels_bb = \
        train_images, train_labels, test_images, \
        test_labels

    cur_gan = gan

    if FLAGS.debug:
        # Same 20-batch cut as above; the arrays were already truncated.
        train_images_bb = train_images_bb[:20 * batch_size]
        train_labels_bb = train_labels_bb[:20 * batch_size]

    # Prepare the black_box model.
    prep_bbox_out = prep_bbox(sess, images_tensor, labels_tensor,
                              train_images_bb, train_labels_bb,
                              test_images_bb, test_labels_bb, nb_epochs,
                              batch_size, learning_rate, rng=rng,
                              gan=cur_gan, adv_training=adv_training,
                              cnn_arch=bb_model)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    reconstructor = get_reconstructor(gan)
    recon_tensors, _ = reconstructor.reconstruct(images_tensor,
                                                 batch_size=batch_size,
                                                 reconstructor_id=2)
    # The substitute is trained against the black-box logits computed on
    # reconstructed inputs.
    model_sub, preds_sub = train_sub(sess, images_tensor, labels_tensor,
                                     model.get_logits(recon_tensors),
                                     images_sub, labels_sub, nb_classes,
                                     nb_epochs_s, batch_size, learning_rate,
                                     data_aug, lmbda, rng=rng,
                                     substitute_model=sub_model,
                                     dataset_name=gan.dataset_name)

    # The substitute's own accuracy is not evaluated here.
    accuracies['sub'] = 0

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    eps = attack_config_dict[gan.dataset_name]['eps']
    min_val = attack_config_dict[gan.dataset_name]['clip_min']
    fgsm_par = {'eps': eps, 'ord': np.inf, 'clip_min': min_val,
                'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute.
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(images_tensor, **fgsm_par)

    if FLAGS.debug and gan is not None:
        # To see some qualitative results: save the adversarial images,
        # their reconstructions, the clean images and the clean
        # reconstructions for one batch.
        recon_tensors, _ = reconstructor.reconstruct(x_adv_sub,
                                                     batch_size=batch_size,
                                                     reconstructor_id=2)
        x_rec_orig, _ = reconstructor.reconstruct(images_tensor,
                                                  batch_size=batch_size,
                                                  reconstructor_id=3)
        x_adv_sub_val = sess.run(x_adv_sub,
                                 feed_dict={images_tensor: x_debug_test})
        x_rec_debug_val = sess.run(recon_tensors,
                                   feed_dict={images_tensor: x_debug_test})
        x_rec_orig_val = sess.run(x_rec_orig,
                                  feed_dict={images_tensor: x_debug_test})
        save_images_files(x_adv_sub_val, output_dir=debug_dir,
                          postfix='adv')
        postfix = 'gen_rec'
        save_images_files(x_rec_debug_val, output_dir=debug_dir,
                          postfix=postfix)
        save_images_files(x_debug_test, output_dir=debug_dir,
                          postfix='orig')
        save_images_files(x_rec_orig_val, output_dir=debug_dir,
                          postfix='orig_rec')

    if gan_defense_flag:
        # Average over every non-batch axis of the image tensor.
        num_dims = len(images_tensor.get_shape())
        avg_inds = list(range(1, num_dims))

        # Reconstruction of the adversarial images.
        recons_adv, zs = reconstructor.reconstruct(x_adv_sub,
                                                   batch_size=batch_size)
        diff_op = tf.reduce_mean(tf.square(x_adv_sub - recons_adv),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        acc_adv, diffs_mean, roc_info_adv = model_eval_gan(
            sess, images_tensor, labels_tensor,
            predictions=model.get_logits(recons_adv),
            test_images=test_images, test_labels=test_labels,
            args=eval_params, diff_op=diff_op, z_norm=z_norm,
            recons_adv=recons_adv, adv_x=x_adv_sub, debug=False)

        # reconstruction on clean images
        recons_clean, zs = reconstructor.reconstruct(images_tensor,
                                                     batch_size=batch_size)

        diff_op = tf.reduce_mean(tf.square(images_tensor - recons_clean),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        # NOTE(review): unlike the call above, the predictions and a None
        # are passed positionally here; confirm against the signature of
        # model_eval_gan.
        acc_rec, diffs_mean_rec, roc_info_rec = model_eval_gan(
            sess, images_tensor, labels_tensor,
            model.get_logits(recons_clean), None,
            test_images=test_images, test_labels=test_labels,
            args=eval_params, diff_op=diff_op, z_norm=z_norm,
            recons_adv=recons_clean, adv_x=images_tensor, debug=False)

        print('Evaluation accuracy with reconstruction: {}'.format(acc_rec))
        print('Test accuracy of oracle on cleaned images : {}'.format(acc_adv))

        return {
            'acc_adv': acc_adv, 'acc_rec': acc_rec,
            'roc_info_adv': roc_info_adv, 'roc_info_rec': roc_info_rec
        }
    else:
        # No defense: feed the adversarial images straight to the oracle.
        acc_adv = model_eval(sess, images_tensor, labels_tensor,
                             model.get_logits(x_adv_sub), test_images,
                             test_labels, args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(acc_adv))
        return {
            'acc_adv': acc_adv, 'acc_rec': 0,
            'roc_info_adv': None, 'roc_info_rec': None
        }
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    Trains (or loads) a Keras CNN, attacks the first test image of every
    digit class with the Carlini-Wagner L2 attack and reports the success
    rate and mean L2 distortion.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes; only used for the grid
                       shape and the progress message, the attack itself
                       is laid out for 10 classes
    :param source_samples: number of test inputs to attack; only used in
                           the progress message, one sample per digit
                           class is always attacked
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of iterations of the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        # The model is only saved when a "models" directory already exists.
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"))

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    # An untargeted run crafts a single adversarial example per source
    # image, so the message must not claim nb_classes - 1 of them.
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    wrap = KerasModelWrapper(model)
    cw = CarliniWagnerL2(wrap, back='tf', sess=sess)

    # Index of the first test example of each digit class.
    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)]
    if targeted:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        one_hot = np.zeros((10, 10))
        one_hot[np.arange(10), np.arange(10)] = 1

        # Every source image is repeated once per target class, and the
        # identity matrix supplies the ten one-hot targets for it.
        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': 100 if targeted else 10,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    # Targeted: success means the model predicts the target label.
    # Untargeted: success means the model no longer predicts the true label.
    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args={'batch_size': 10})
    else:
        adv_accuracy = 1 - model_eval(
            sess, x, y, preds, adv, Y_test[idxs], args={'batch_size': 10})

    for j in range(10):
        if targeted:
            for i in range(10):
                grid_viz_data[i, j] = adv[i * 10 + j]
        else:
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    mean_l2_distortion = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2_distortion))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
def prep_bbox(sess, images, labels, images_train, labels_train, images_test,
              labels_test, nb_epochs, batch_size, learning_rate, rng, gan=None,
              adv_training=False, cnn_arch=None):
    """Defines and trains a model that simulates the "remote"
    black-box oracle described in https://arxiv.org/abs/1602.02697.

    Args:
        sess: the TF session
        images: the input placeholder
        labels: the ouput placeholder
        images_train: the training data for the oracle
        labels_train: the training labels for the oracle
        images_test: the testing data for the oracle
        labels_test: the testing labels for the oracle
        nb_epochs: number of epochs to train model
        batch_size: size of training batches
        learning_rate: learning rate for training
        rng: numpy.random.RandomState
        gan: object whose `dataset_name` attribute selects how the model is
            built; must be one of 'mnist', 'f-mnist', 'cifar-10', 'celeba'
        adv_training: not referenced in this function body
        cnn_arch: the black-box model object; replaced by a wrapper around
            a separately constructed model for 'cifar-10'

    Returns:
        model: The blackbox model function.
        predictions: The predictions tensor.
        accuracy: Accuracy of the model.

    Raises:
        ValueError: if `gan` is missing or names an unsupported dataset.
        RuntimeError: if training is required for 'cifar-10', for which no
            training graph is defined here.
    """
    # Fail early and clearly; otherwise an unknown dataset only surfaces
    # later as an unbound local variable.
    dataset_name = getattr(gan, 'dataset_name', None)
    supported_datasets = ('mnist', 'f-mnist', 'cifar-10', 'celeba')
    if dataset_name not in supported_datasets:
        raise ValueError(
            'Unsupported dataset {!r}; expected one of {}'.format(
                dataset_name, ', '.join(supported_datasets)))

    # Define TF model graph (for the black-box model).
    model = cnn_arch
    checkpoint_dir = 'classifiers/model/{}'.format(dataset_name)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': checkpoint_dir,
        'filename': 'model_{}'.format(FLAGS.bb_model)
    }
    eval_params = {'batch_size': batch_size}

    # Stays None for datasets that define no training graph.
    pred_train = None

    if dataset_name in ('mnist', 'f-mnist'):
        used_vars = model.get_params()
        pred_train = model.get_logits(images, dropout=True)
        pred_eval = model.get_logits(images)
    elif dataset_name == 'cifar-10':
        pre_model = Model('classifiers/model/cifar-10', tiny=False,
                          mode='eval', sess=sess)
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            model = DefenseWrapper(pre_model, 'logits')
        used_vars = [
            var for var in tf.global_variables()
            if var.name.startswith('model')
        ]
        pred_eval = model.get_logits(images)
    else:  # 'celeba'
        # Rescale the inputs from [0, 255] to [-1, 1] before the model.
        images_pl_transformed = tf.cast(images, tf.float32) / 255. * 2. - 1.
        used_vars = model.get_params()
        pred_train = model.get_logits(images_pl_transformed, dropout=True)
        pred_eval = model.get_logits(images_pl_transformed)

    classifier_load_success = False
    if FLAGS.load_bb_model:
        # Loading stays best-effort, but only ordinary errors are caught so
        # Ctrl-C still interrupts, and the reason is no longer discarded.
        try:
            path = tf.train.latest_checkpoint(checkpoint_dir)
            saver = tf.train.Saver(var_list=used_vars)
            saver.restore(sess, path)
            print('[+] BB model loaded successfully ...')
            classifier_load_success = True
        except Exception as exc:
            print('[-] Fail to load BB model ...')
            print('    reason: {!r}'.format(exc))
            classifier_load_success = False

    if not classifier_load_success:
        if pred_train is None:
            raise RuntimeError(
                'No checkpoint was loaded for {!r} and this dataset has no '
                'training graph; provide a checkpoint in {}'.format(
                    dataset_name, checkpoint_dir))
        print('[+] Training classifier model ...')
        model_train(sess, images, labels, pred_train, images_train,
                    labels_train, args=train_params, rng=rng,
                    predictions_adv=None, init_all=False, save=False)

    # Print out the accuracy on legitimate test data.
    accuracy = model_eval(
        sess, images, labels, pred_eval, images_test,
        labels_test, args=eval_params,
    )
    print('Test accuracy of black-box on legitimate test examples: ' +
          str(accuracy))

    return model, pred_eval, accuracy
def evaluate():
    """Print the clean test-set accuracy of the surrounding scope's model.

    Takes no arguments: the session, placeholders, prediction tensor, test
    arrays and the test index bounds are all read from the enclosing scope.
    """
    clean_acc = model_eval(
        sess, x, y, preds, x_test, y_test,
        args=dict(batch_size=128),
    )
    # Sanity check that the test set has the requested number of rows.
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate examples: %0.4f' % clean_acc)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True,
                   testing=False, backprop_through_attack=False,
                   nb_filters=64, num_threads=None):
    """
    MNIST cleverhans tutorial

    Optionally trains a CNN on clean data and evaluates it against FGSM,
    then trains a second CNN adversarially and evaluates it on clean and
    FGSM inputs.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: forwarded to make_basic_cnn for both models
    :param num_threads: when truthy, the session is configured with
                        intra_op_parallelism_threads=1
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    # NOTE(review): num_threads only acts as a flag; the thread count is
    # fixed at 1 whatever value is passed. Confirm this is intended.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing: clipping the labels to
    # [label_smooth / 9, 1 - label_smooth] turns a 0/1 row into
    # 0.9 for the true class and 0.1 / 9 for each of the other nine.
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Not referenced again in this function.
    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    # The same generator is passed to both training runs below.
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train,
                             Y_train, args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")

    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng, var_list=model_2.get_params())

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mdt(model, data_dir, checkpoint_dir,
        train_dir='./tmp/cifar10_train',
        adversarial_dir='./tmp/cifar10_adv',
        batch_size=128,
        data_aug=False,
        data_norm=True):
    """Train (if needed) and evaluate a CIFAR-10 model against FGSM.

    :param model: callable model; must also provide ``get_probs`` and be
                  accepted by ``FastGradientMethod``
    :param data_dir: directory the CIFAR-10 data is read from / downloaded to
    :param checkpoint_dir: directory the evaluated checkpoint is loaded from
    :param train_dir: training output directory; training is run only when
                      this path does not exist yet
    :param adversarial_dir: not referenced in this function body
    :param batch_size: batch size of the training input pipeline
    :param data_aug: forwarded to ``mdt_cifar10_input.inputs``
    :param data_norm: forwarded to ``mdt_cifar10_input.inputs``
    :return: ``False`` when the checkpoint cannot be loaded, otherwise
             ``None``; all results are printed
    """
    # train model
    if not tf.gfile.Exists(train_dir):
        # set input and get logits
        images, labels = mdt_cifar10_input.inputs(False, data_dir, batch_size,
                                                  data_aug, data_norm)
        labels = tf.cast(labels, tf.int64)
        logits = model(images)
        loss = stand_loss(logits, labels)
        # Pass the cast label tensor defined above (the name `label` that
        # was used here is not defined anywhere in this function).
        train_process(model, loss, images, labels, train_dir, batch_size)

    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Get predict tensor
    pred = model(x)

    sess = tf.Session()
    # The session is created here and never handed back to the caller, so
    # release it on every exit path.
    try:
        if not checkpoint_load(sess, checkpoint_dir):
            return False

        # fetch data
        cifar10_data.maybe_download_and_return_python(data_dir)
        X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

        # create one-hot Y
        one_hot_Y = to_categorical(Y, nb_classes)

        # create mode feed
        # NOTE(review): train_feed is not used below; the call is kept in
        # case mode_feed has side effects. Confirm and drop if it has none.
        train_feed = mode_feed(sess, True)
        eval_feed = mode_feed(sess, False)

        fgsm_params = {'eps': 1,
                       'clip_min': 0.,
                       'clip_max': 255.}
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # eval model accuracy
        class_accuracy, accuracy = model_eval_each_class(
            sess, x, y, pred, nb_classes, X, one_hot_Y, feed=eval_feed,
            args={'batch_size': 128})
        print('model accuracy: {0}'.format(accuracy))

        for i in range(nb_classes):
            print('class {0} accuracy: {1}'.format(i, class_accuracy[i]))

        # eval model's accuracy on FGSM adversarial examples
        fgsm_accuracy = model_eval(sess, x, y, preds_adv, X, one_hot_Y,
                                   feed=eval_feed,
                                   args={'batch_size': 128})
        print('model fgsm_accuracy: {0}'.format(fgsm_accuracy))

        # Time the attack on the first 128 samples and measure how far the
        # adversarial images moved.
        n_adv_samples = 128
        x_batch = X[:n_adv_samples]
        y_batch = one_hot_Y[:n_adv_samples]
        # Feed the label slice that matches the image slice; the full
        # label array was fed here before.
        adv_feed = {x: x_batch, y: y_batch}
        adv_feed.update(eval_feed)

        sta = time.time()
        adv_X_ = sess.run(adv_x, feed_dict=adv_feed)
        end = time.time()
        duration = end - sta
        print('finished in {0} seconds'.format(duration))

        # Both image sets are divided by 255 before the distance is taken.
        l2_dis = calculate_l2_dis(x_batch / 255, adv_X_ / 255)
        print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))
    finally:
        sess.close()
def _sample_cluster_centers(img_samples, n_clusters, gamma):
    """Pick `n_clusters` centres from one image's noisy pixel samples."""
    n_samples = img_samples.shape[0]
    clusters = np.zeros((n_clusters, img_samples.shape[1]))
    clusters[0] = img_samples[0]
    for c_j in range(1, n_clusters):
        # Squared distance from every sample to each centre chosen so far,
        # shape (n_samples, c_j); keep the smallest per sample, capped at
        # the same 100000 sentinel the scalar search started from.
        diffs = img_samples[:, np.newaxis, :] - clusters[np.newaxis, :c_j, :]
        l2_min = np.minimum((diffs * diffs).sum(axis=-1).min(axis=1), 100000)
        # Samples far from all existing centres are more likely to be drawn.
        prob_cj = np.exp(gamma * l2_min)
        prob_cj /= prob_cj.sum()
        clusters[c_j] = img_samples[np.random.choice(n_samples, 1, p=prob_cj)]
    return clusters


def _quantize_to_clusters(image, clusters, alpha):
    """Return (nearest-centre image, weighted-mixture image) for one image."""
    # Distance of every pixel to every centre: shape (rows, cols, n_clusters).
    dists = np.linalg.norm(image[..., np.newaxis, :] - clusters, axis=-1)
    weights = np.exp(-1 * alpha * dists * dists)
    # argmin returns the first minimum, like a strict `<` scan would.
    nearest = clusters[dists.argmin(axis=-1)]
    mixture = ((weights[..., np.newaxis] * clusters).sum(axis=-2) /
               weights.sum(axis=-1, keepdims=True))
    return nearest, mixture


def main(argv=None):
    """Attack a CIFAR-10 CNN with Carlini-Wagner L2 and test noisy variants.

    Trains or loads the model, crafts untargeted CW examples for the first
    1000 test images, then evaluates the model on the adversarial images
    as crafted, with Gaussian noise added, and after two colour
    quantisations of the noisy images onto randomly seeded cluster centres
    (nearest centre, and a distance-weighted mixture of the centres).
    All results are printed; nothing is returned.

    :param argv: unused
    """
    tf.set_random_seed(1234)
    sess = tf.Session()
    keras.backend.set_session(sess)

    X_train, Y_train, X_test, Y_test = data_cifar10()
    # Label smoothing by clipping the label values.
    Y_train = Y_train.clip(.1 / 9., 1. - .1)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)

    def evaluate():
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate,
        'train_dir': FLAGS.train_dir,
        'filename': FLAGS.filename
    }

    # Reuse a previously saved model when its .meta file exists.
    model_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True)

    # NOTE(review): the wrapper is built but the attack below receives the
    # raw Keras model; confirm which of the two is intended.
    wrap = KerasModelWrapper(model)

    nb_classes = 10
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    n_adv = 1000
    adv_inputs = X_test[:n_adv]

    # Untargeted attack: no labels are supplied.
    cw_params = {
        'binary_search_steps': 1,
        'y': None,
        'max_iterations': 100,
        'learning_rate': 0.1,
        'batch_size': 10,
        'initial_const': 10,
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    sigma = 16.0 / 255
    gamma = 0.00061 * 255 * 255
    alpha = 0.00061 * 255 * 255
    n_clusters = 10
    n_samples = 50

    noise = np.random.normal(0.0, sigma, adv.shape)
    adv_gauss = adv + noise

    # Draw n_samples random pixel positions per adversarial image and
    # perturb the sampled pixels with Gaussian noise.
    i1 = np.repeat(np.arange(0, n_adv), n_samples)
    i2 = np.random.randint(32, size=n_adv * n_samples)
    i3 = np.random.randint(32, size=n_adv * n_samples)
    sample = adv[i1, i2, i3]
    noise = np.random.normal(0.0, sigma, sample.shape)
    noisy_samples = sample + noise
    noisy_samples = np.reshape(noisy_samples, (n_adv, n_samples, 3))
    # This draw is not used; it is kept so the global NumPy random stream
    # consumed by the cluster sampling below stays where it always was.
    noise = np.random.normal(0.0, sigma, adv.shape)

    adv_rdesc = np.zeros(adv.shape)
    adv_rmix = np.zeros(adv.shape)
    for img_no, img_samples in enumerate(noisy_samples):
        clusters = _sample_cluster_centers(img_samples, n_clusters, gamma)
        adv_rdesc[img_no], adv_rmix[img_no] = _quantize_to_clusters(
            adv_gauss[img_no], clusters, alpha)

    eval_params = {'batch_size': np.minimum(nb_classes, 10)}

    orig_accuracy = model_eval(sess, x, y, predictions, adv_inputs,
                               Y_test[:n_adv], args=eval_params)
    print('Original accuracy {0:.4f}'.format(orig_accuracy))

    adv_accuracy = model_eval(sess, x, y, predictions, adv, Y_test[:n_adv],
                              args=eval_params)
    print('Adversarial without noise {0:.4f}'.format(adv_accuracy))
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations without noise {0:.4f}'.format(percent_perturbed))

    # NOTE(review): the three values printed below as "rate of successful
    # adv. examples" are model_eval results against the true labels.
    adv_accuracy = model_eval(sess, x, y, predictions, adv_gauss,
                              Y_test[:n_adv], args=eval_params)
    print('Avg. rate of successful adv. examples with Gaussian noise {0:.4f}'.format(adv_accuracy))
    percent_perturbed = np.mean(
        np.sum((adv_gauss - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations with Gaussian noise {0:.4f}'.format(percent_perturbed))

    adv_accuracy = model_eval(sess, x, y, predictions, adv_rdesc,
                              Y_test[:n_adv], args=eval_params)
    print('Avg. rate of successful adv. examples with random descent {0:.4f}'.format(adv_accuracy))
    percent_perturbed = np.mean(
        np.sum((adv_rdesc - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations with random descent {0:.4f}'.format(percent_perturbed))

    adv_accuracy = model_eval(sess, x, y, predictions, adv_rmix,
                              Y_test[:n_adv], args=eval_params)
    print('Avg. rate of successful adv. examples with random mixture {0:.4f}'.format(adv_accuracy))
    percent_perturbed = np.mean(
        np.sum((adv_rmix - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations with random mixture {0:.4f}'.format(percent_perturbed))

    sess.close()
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, aug_batch_size=512):
    """
    Black-box attack tutorial on MNIST (arxiv.org/abs/1602.02697).

    A locally trained model plays the oracle, a substitute is trained on a
    small held-out slice of the test set, and FGSM examples crafted on the
    substitute are transferred to the oracle.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: overwritten from the label array before first use
    :param holdout: number of leading test samples given to the adversary
    :return: a dictionary with:
             * 'bbox': black-box model accuracy on test set
             * 'sub': substitute model accuracy on test set
             * 'bbox_on_sub_adv_ex': black-box model accuracy on
               adversarial examples transferred from the substitute
    """
    # Verbose logging for the tutorial run.
    set_log_level(logging.DEBUG)

    # Tutorial prerequisites must be satisfied before anything is built.
    assert setup_tutorial()

    sess = tf.Session()

    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # The first `holdout` test samples go to the adversary (labels as class
    # indices); the rest stay as the evaluation set.
    sub_inputs = x_test[:holdout]
    sub_labels = np.argmax(y_test[:holdout], axis=1)
    x_test, y_test = x_test[holdout:], y_test[holdout:]

    # Image geometry and class count come from the data itself.
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Fixed seed so the tutorial is reproducible.
    rng = np.random.RandomState([2017, 8, 30])

    # The oracle is simulated locally; a remote labelling API could take
    # its place.
    print("Preparing the black-box model.")
    model, bbox_preds, bbox_acc = prep_bbox(
        sess, x, y, x_train, y_train, x_test, y_test, nb_epochs, batch_size,
        learning_rate, rng, nb_classes, img_rows, img_cols, nchannels)

    print("Training the substitute model.")
    model_sub, preds_sub = train_sub(
        sess, x, y, bbox_preds, sub_inputs, sub_labels, nb_classes,
        nb_epochs_s, batch_size, learning_rate, data_aug, lmbda,
        aug_batch_size, rng, img_rows, img_cols, nchannels)

    # Substitute accuracy on the clean evaluation set.
    eval_params = {'batch_size': batch_size}
    sub_acc = model_eval(sess, x, y, preds_sub, x_test, y_test,
                         args=eval_params)

    # FGSM examples are crafted on the substitute ...
    attack_params = {'eps': 0.3, 'ord': np.inf,
                     'clip_min': 0., 'clip_max': 1.}
    attack = FastGradientMethod(model_sub, sess=sess)
    x_adv_sub = attack.generate(x, **attack_params)

    # ... and scored against the oracle.
    transfer_acc = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                              x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(transfer_acc))

    return {
        'bbox': bbox_acc,
        'sub': sub_acc,
        'bbox_on_sub_adv_ex': transfer_acc,
    }
def evaluate():
    """Print the clean test accuracy of the surrounding scope's model.

    The session, placeholders, prediction tensor, test arrays and FLAGS
    are all read from the enclosing scope.
    """
    clean_accuracy = model_eval(
        sess, x, y, predictions, X_test, Y_test,
        args=dict(batch_size=FLAGS.batch_size),
    )
    print('Test accuracy on legitimate test examples: ' + str(clean_accuracy))
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires source_samples to equal the number of
                        classes found in the data
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as 'max_iterations'
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Image geometry and number of classes are read off the loaded arrays
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    image_shape = (img_rows, img_cols, nchannels)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Split the checkpoint path into the directory and file name train() wants
    model_dir, model_file = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': model_dir,
        'filename': model_file
    }

    rng = np.random.RandomState([2017, 8, 30])

    # Reuse a previously trained model when its .meta file is present,
    # otherwise train from scratch
    if not os.path.exists(model_path + ".meta"):
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)
    else:
        tf_model_load(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    clean_accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                                args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'
          .format(clean_accuracy))
    report.clean_train_clean_eval = clean_accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    if targeted:
        nb_adv_per_sample = str(nb_classes - 1)
    else:
        nb_adv_per_sample = '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # Index of the first test example of every class
        test_labels = np.argmax(y_test, axis=1)
        idxs = [np.where(test_labels == cls)[0][0]
                for cls in range(nb_classes)]

        # Array backing the grid visualization: one column per target class
        # for a targeted attack, (original, adversarial) pairs otherwise
        grid_columns = nb_classes if targeted else 2
        grid_viz_data = np.zeros((nb_classes, grid_columns) + image_shape,
                                 dtype='f')
        source_images = x_test[idxs]
    else:
        source_images = x_test[:source_samples]

    if targeted:
        # Repeat every source image once per target class ...
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in source_images],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes,) + image_shape)

        # ... and pair the copies with the one-hot encoding of each class
        one_hot = np.eye(nb_classes)
        adv_ys = np.array([one_hot] * source_samples, dtype=np.float32)
        adv_ys = adv_ys.reshape((source_samples * nb_classes, nb_classes))
        yname = "y_target"
        attack_batch_size = source_samples * nb_classes
    else:
        adv_inputs = source_images
        adv_ys = None
        yname = "y"
        attack_batch_size = source_samples

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': attack_batch_size,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Fraction of adversarial inputs classified as their target class
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        # Fraction of adversarial inputs no longer classified as their label
        if viz_enabled:
            true_labels = y_test[idxs]
        else:
            true_labels = y_test[:source_samples]
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, true_labels,
                                      args=eval_params)

    if viz_enabled:
        if targeted:
            for col in range(nb_classes):
                for row in range(nb_classes):
                    grid_viz_data[row, col] = adv[row * nb_classes + col]
        else:
            for row in range(nb_classes):
                grid_viz_data[row, 0] = adv_inputs[row]
                grid_viz_data[row, 1] = adv[row]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    squared_dist = np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))
    mean_l2_norm = np.mean(squared_dist**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2_norm))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        grid_visual(grid_viz_data)

    return report
def generate_images():
    """
    Load a CIFAR10 VGG model, report its clean accuracy and write adversarial
    images to disk.

    The attack is selected by the local `args_attack` (hard-coded to 'jsma'):
    the FGSM branch saves one image per test example for each epsilon, the
    JSMA branch saves successful adversarial examples for each gamma and
    prints summary statistics. Takes no arguments and returns None; settings
    are read from FLAGS.

    NOTE(review): this function uses Python 2 print statements and xrange.
    NOTE(review): the nesting of the loops below was reconstructed from a
    whitespace-collapsed source -- the positions of `break`, `n_image += 1`
    and the `sess.close()` calls should be confirmed against the original.
    """
    print('==> Preparing data..')
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimension ordering is forced to 'tf' when keras is configured
    # differently
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print(
            "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
            "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)
    print "==> Beginning Session"

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Save the vgg labels
    np.save("vgg_adv_y_10000", Y_test)

    assert Y_train.shape[1] == 10.
    # Label smoothing of the training labels (Y_train is not used again in
    # this function)
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Load model
    args_load = 'cifar10vgg.h5'
    args_pool = 0.05
    args_attack = 'jsma'
    print "==> loading vgg model"
    model = vggbn(top=True, pool=args_pool)
    model.load_weights(args_load)
    predictions = model(x)

    # Accuracy of the loaded model on the unmodified test set
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    print '==> Accuracy : {}'.format(accuracy)

    # NOTE(review): this helper is defined but never called in this function
    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an CIFAR10 model
    # train_params = {
    #     'nb_epochs': FLAGS.nb_epochs,
    #     'batch_size': FLAGS.batch_size,
    #     'learning_rate': FLAGS.learning_rate
    # }

    # Prefix of every saved image file name (appended to the output directory)
    im_base = '/im_'
    if args_attack == 'fgsm' or args_attack == 'FGSM':
        result_dir = os.getcwd() + '/images/fgsm/'
        print "==> creating fgsm adversarial wrapper"
        epsilons = [0.01, 0.03, 0.07, 0.1, 0.2, 0.3]
        for eps in epsilons:
            # One output directory per epsilon
            model_name = "vgg_fgsm_" + str(eps)
            adv_x = fgsm_old(x, predictions, eps=eps)

            print "==> sending to batch evaluator to finalize adversarial images"
            eval_params = {'batch_size': FLAGS.batch_size}
            X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test],
                                     args=eval_params)

            # i numbers the saved images within the current epsilon directory
            i = 0
            if not os.path.exists(result_dir + model_name):
                os.makedirs(result_dir + model_name)
            print "==> saving images to {}".format(result_dir + model_name)
            for ad in X_test_adv:
                scipy.misc.imsave(
                    result_dir + model_name + im_base + str(i) + '.png', ad)
                i += 1
        # NOTE(review): placed after the epsilon loop; inside the loop it
        # would close the session before the second epsilon -- confirm
        sess.close()

    """ JSMA """
    if args_attack == 'jsma' or args_attack == 'JSMA':
        # Save the labels of the samples that are attacked
        np.save("JSMA_vgg_adv_y_" + str(FLAGS.source_samples),
                Y_test[0:FLAGS.source_samples])
        result_dir = os.getcwd() + '/images/jsma/trial_single_adv_'
        print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
              str(FLAGS.nb_classes - 1) + ' adversarial examples')

        # Success flag for each (target class, sample) pair
        results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

        # This array contains the fraction of perturbed features for each test set
        perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                                 dtype='f')

        # Define the TF graph for the model's Jacobian
        grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

        # Initialize our array for grid visualization
        grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                      FLAGS.img_cols, FLAGS.nb_channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        # i_saved counts images written to disk; n_image is incremented below
        # but never read in this function
        i_saved = 0
        n_image = 0
        gammas = [0.01, 0.05, 0.1, 0.2, 0.3]
        for gamma in gammas:
            # One output directory per gamma
            model_name = "vgg_jsma_" + str(gamma)

            # Loop over the samples we want to perturb into adversarial examples
            print "==> saving images to {}".format(result_dir + model_name)
            for sample_ind in xrange(0, FLAGS.source_samples):
                # We want to find an adversarial example for each possible target class
                current_class = int(np.argmax(Y_test[sample_ind]))
                target_classes = other_classes(FLAGS.nb_classes, current_class)

                # For the grid visualization, keep original images along the diagonal
                grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
                    X_test[sample_ind:(sample_ind + 1)],
                    (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                # Loop over all target classes
                # pdb.set_trace()
                for target in np.random.permutation(target_classes):
                    print "image {}".format(sample_ind)
                    # here we hold all successful adversarials for this iteration
                    # since we dont want 500k images, we will uniformly sample an image to save after each target
                    print('--------------------------------------')
                    print('Creating adv. example for target class ' + str(target))

                    # This call runs the Jacobian-based saliency map approach
                    adv_x, res, percent_perturb = jsma_old(
                        sess, x, predictions, grads,
                        X_test[sample_ind:(sample_ind + 1)], target,
                        num_classes=FLAGS.nb_classes, theta=1, gamma=gamma,
                        increase=True, clip_min=0, clip_max=1)

                    # Display the original and adversarial images side-by-side
                    adversarial = np.reshape(
                        adv_x,
                        (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))
                    original = np.reshape(
                        X_test[sample_ind:(sample_ind + 1)],
                        (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))
                    if FLAGS.viz_enabled:
                        # The first call creates the figure, later calls
                        # pass the existing one back in
                        if 'figure' not in vars():
                            figure = pair_visual(original, adversarial)
                        else:
                            figure = pair_visual(original, adversarial, figure)

                    if not os.path.exists(result_dir + model_name):
                        os.makedirs(result_dir + model_name)

                    # Only adversarial examples reported as successful
                    # (res == 1) are written to disk
                    if res == 1:
                        scipy.misc.imsave(
                            result_dir + model_name + im_base +
                            str(sample_ind) + '.png', adversarial)
                        i_saved += 1
                        print "==> images saved: {}".format(i_saved)

                    # Add our adversarial example to our grid data
                    grid_viz_data[target, current_class, :, :, :] = np.reshape(
                        adv_x,
                        (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                    # Update the arrays for later analysis
                    results[target, sample_ind] = res
                    perturbations[target, sample_ind] = percent_perturb

                    # NOTE(review): assumed to end the target loop after one
                    # randomly drawn target per sample; it may instead belong
                    # inside the `if res == 1` block -- confirm
                    break
                n_image += 1

        # Compute the number of adversarial examples that were successfuly found
        nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print('Avg. rate of successful adv. examples {0:.2f}'.format(
            succ_rate))

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(perturbations)
        print('Avg. rate of perturbed features {0:.2f}'.format(
            percent_perturbed))

        # Compute the average distortion introduced for successful samples only
        # NOTE(review): np.mean here divides by the number of all cells, not
        # by the number of successful ones
        percent_perturb_succ = np.mean(perturbations * (results == 1))
        print(
            'Avg. rate of perturbed features for successful '
            'adversarial examples {0:.2f}'.format(percent_perturb_succ))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if FLAGS.viz_enabled:
            grid_visual(grid_viz_data)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with clean_train_clean_eval (accuracy
             on the clean test set) and clean_train_adv_eval (1 - attack
             success rate) filled in
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters (the number of classes comes from the data)
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the fraction of input features that were modified
            adv_x_flat = adv_x.reshape(-1)
            nb_changed = np.count_nonzero(
                adv_x_flat != x_test[sample_ind].reshape(-1))
            percent_perturb = float(nb_changed) / adv_x_flat.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm.
    # Only (nb_classes - 1) cells per sample are ever written (the row of the
    # source class stays 0), so divide by the number of attacks actually run
    # rather than by the full array size.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # select the successful cells instead of averaging zeros for the failed
    # ones. Falls back to 0 when no attack succeeded.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def cifar10_blackbox(nb_classes=10, batch_size=128, nb_samples=10,
                     l2_weight=0.0001, momentum=0.9, initial_lr=0.1,
                     lr_step_epoch=100.0, lr_decay=0.1, num_residual_units=2,
                     num_train_instance=50000, num_test_instance=10000, k=1,
                     eps=0.3, learning_rate=0.001, nb_epochs=10, holdout=150,
                     data_aug=6, nb_epochs_s=10, lmbda=0.1, binary=False,
                     scale=False, model_path=None, targeted=False,
                     data_dir=None, adv=False, delay=0):
    """
    CIFAR10 black-box tutorial setup (attack from arxiv.org/abs/1602.02697):
    restore a ResNet black-box model from a checkpoint and print its accuracy
    on the legitimate test set.

    :param nb_classes: number of output classes
    :param batch_size: batch size for the ResNet hyper-parameters and for
                       evaluation
    :param l2_weight: passed to resnet.HParams as weight_decay
    :param momentum: passed to resnet.HParams
    :param initial_lr: passed to resnet.HParams
    :param lr_step_epoch: used with num_train_instance and batch_size to
                          compute the decay_step given to resnet.HParams
    :param lr_decay: passed to resnet.HParams
    :param num_residual_units: passed to resnet.HParams
    :param num_train_instance: see lr_step_epoch
    :param k: passed to resnet.HParams
    :param holdout: number of leading test examples reserved for the
                    adversary and removed from the evaluation set
    :param binary: coerced to bool and passed to resnet.ResNet
    :param model_path: required. When the last '/'-separated component
                       contains 'model' it is restored directly, otherwise
                       it is treated as a directory and its latest
                       checkpoint is restored
    :raises ValueError: if model_path is None or no checkpoint can be found

    The remaining parameters (nb_samples, num_test_instance, eps,
    learning_rate, nb_epochs, data_aug, nb_epochs_s, lmbda, scale, targeted,
    data_dir, adv, delay) are accepted but not used by this function.

    :return: None; the black-box test accuracy is printed
    """
    # model_path defaults to None but is dereferenced below; fail early with
    # a clear message instead of an AttributeError after the graph is built
    if model_path is None:
        raise ValueError('model_path is required to restore the black-box '
                         'model')

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    # (not filled in by this function)
    accuracies = {}

    # Perform tutorial setup. The call is made outside of the assert so that
    # it is still executed when Python runs with -O (asserts are stripped).
    setup_ok = setup_tutorial()
    assert setup_ok

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimension ordering is forced to 'tf' when keras is configured
    # differently
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10_std()

    Y_test_onehot = np_utils.to_categorical(Y_test, nb_classes)

    # Y_test is for evaluating oracle
    # (Y_test_bbox is computed but not used further in this function)
    Y_test_bbox = np.argmax(Y_test, axis=1)
    Y_test_bbox = Y_test_bbox.reshape(Y_test_bbox.shape[0], )
    Y_test_bbox = Y_test_bbox.astype('int32')

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test_onehot[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.int32, shape=(None))
    phase = tf.placeholder(tf.bool, name='phase')
    y_s = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the WideResNet black-box model.")

    decay_step = lr_step_epoch * num_train_instance / batch_size
    hp = resnet.HParams(batch_size=batch_size,
                        num_classes=nb_classes,
                        num_residual_units=num_residual_units,
                        k=k,
                        weight_decay=l2_weight,
                        initial_lr=initial_lr,
                        decay_step=decay_step,
                        lr_decay=lr_decay,
                        momentum=momentum)

    # The flag is printed before and after being coerced to a strict bool
    print(binary)
    binary = bool(binary)
    print(binary)
    network = resnet.ResNet(binary, hp, x, y, None)
    network.build_model()

    # Evaluate on the network's `probs` output
    bbox_preds = network.probs

    init = tf.global_variables_initializer()
    sess.run(init)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=10000)

    if 'model' in model_path.split('/')[-1]:
        # model_path names a checkpoint directly
        saver.restore(sess, model_path)
        print('restored %s' % model_path)
    else:
        # model_path is a directory: pick its most recent checkpoint
        latest_ckpt = tf.train.latest_checkpoint(model_path)
        if latest_ckpt is None:
            raise ValueError('No checkpoint found in %s' % model_path)
        saver.restore(sess, latest_ckpt)
        print('restored %s' % model_path)

    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, bbox_preds, X_test, Y_test, phase=phase,
                     args=eval_params)
    print('Test accuracy of black-box on legitimate test examples: %.4f' % acc)
def main(argv):
    """
    Attack a restored CIFAR10 model and print its accuracy on the resulting
    adversarial examples.

    Configuration is read from FLAGS: checkpoint_dir, dataset_dir,
    nb_samples, batch_size, attack_type ('cwl2', 'fgsm' or 'pgd') and sweep
    (evaluate eps = 1..16 instead of a single setting).

    :param argv: command line arguments; unused, options come from FLAGS
    :raises ValueError: if FLAGS.attack_type is not a supported attack
    """
    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if model_file is None:
        print('No model found')
        # Non-zero status so that calling scripts can detect the failure
        sys.exit(1)

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == nb_classes

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {'batch_size': FLAGS.batch_size,
                         'clip_min': 0., 'clip_max': 255.}

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({'binary_search_steps': 1,
                                  'max_iterations': 100,
                                  'learning_rate': 0.1,
                                  'initial_const': 10,
                                  'batch_size': 10
                                  })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)
            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)
            else:
                # Without this the code below would fail later with a
                # NameError on `attacker`
                raise ValueError('Unsupported attack_type: %r'
                                 % (FLAGS.attack_type,))

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            # Start the clock once so that the reported time covers the
            # whole sweep rather than only its last epsilon
            t1 = time.time()
            for e in epsilons:
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples],
                                 Y_test[:nb_samples], args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples],
                             Y_test[:nb_samples], args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
test_in_reshape = X_test_scaled[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] X_adv[sample_ind] = adv_x results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print() print(X_adv.shape) print("=========================== Evaluation of MLP Performance ==============================") print() eval_params = {'batch_size': FLAGS.batch_size} accuracy = model_eval(sess, x, y, predictions, X_test_scaled, y_test, args=eval_params) print("Test accuracy on normal examples: {}".format(accuracy)) accuracy_adv = model_eval(sess, x, y, predictions, X_adv, y_test, args=eval_params) print("Test accuracy on adversarial examples: {}".format(accuracy_adv)) print() print("=============================== Decision tree CLassifier ==============================") dt = OneVsRestClassifier(DecisionTreeClassifier(random_state=42)) dt.fit(X_train_scaled, y_train) y_pred = dt.predict(X_test_scaled) # Calculate FPR for normal class only fpr_dt, tpr_dt, _ = roc_curve(y_test[:, 0], y_pred[:, 0]) roc_auc_dt = auc(fpr_dt, tpr_dt)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial

    Trains (or loads) a Keras CNN, measures it on clean and FGSM inputs,
    repeats the process with adversarial training and writes two linear
    extrapolation plots ('lep_clean.png' and 'lep_adv.png').

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, the accuracies on the training set are
                    computed and stored in the report as well
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Force the 'tf' image dimension ordering if keras is set up otherwise
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    smooth_low = label_smooth / 9.
    smooth_high = 1. - label_smooth
    Y_train = Y_train.clip(smooth_low, smooth_high)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def _accuracy(pred_tensor, inputs, labels):
        # Accuracy of `pred_tensor` on (inputs, labels) in batches of
        # batch_size
        return model_eval(sess, x, y, pred_tensor, inputs, labels,
                          args={'batch_size': batch_size})

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        clean_acc = _accuracy(preds, X_test, Y_test)
        report.clean_train_clean_eval = clean_acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Look for an existing checkpoint in train_dir
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = ckpt.model_checkpoint_path if ckpt is not None else False

    # Created here but not handed to train() in this function
    rng = np.random.RandomState([2017, 8, 30])

    if not (load_model and ckpt_path):
        print("Model was not loaded, training from scratch.")
        train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
              args=train_params, save=True)
    else:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()

    # Calculate training error
    if testing:
        report.train_clean_train_clean_eval = _accuracy(preds, X_train,
                                                        Y_train)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(fgsm.generate(x, **fgsm_params))
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    adv_acc = _accuracy(preds_adv, X_test, Y_test)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    # Calculating train error
    if testing:
        report.train_clean_train_adv_eval = _accuracy(preds_adv, X_train,
                                                      Y_train)

    print("Repeating the process, using adversarial training")

    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        legit_acc = _accuracy(preds_2, X_test, Y_test)
        print('Test accuracy on legitimate examples: %0.4f' % legit_acc)
        report.adv_train_clean_eval = legit_acc

        # Accuracy of the adversarially trained model on adversarial examples
        attacked_acc = _accuracy(preds_2_adv, X_test, Y_test)
        print('Test accuracy on adversarial examples: %0.4f' % attacked_acc)
        report.adv_train_adv_eval = attacked_acc

    # Perform and evaluate adversarial training
    train(sess, x, y, preds_2, X_train, Y_train,
          predictions_adv=preds_2_adv, evaluate=evaluate_2,
          args=train_params, save=False)

    # Pick one random training example for the linear extrapolation plots
    random_idx = np.random.randint(0, X_train.shape[0])
    X_slice = X_train[random_idx]
    Y_slice = Y_train[random_idx]

    # One plot per model: the clean one first, then the adversarially
    # trained one
    clean_logits = get_logits_over_interval(sess, wrap, X_slice, fgsm_params)
    linear_extrapolation_plot(clean_logits, Y_slice, 'lep_clean.png')

    adv_logits = get_logits_over_interval(sess, wrap_2, X_slice, fgsm_params)
    linear_extrapolation_plot(adv_logits, Y_slice, 'lep_adv.png')

    # Calculate training errors
    if testing:
        report.train_adv_train_clean_eval = _accuracy(preds_2, X_train,
                                                      Y_train)
        report.train_adv_train_adv_eval = _accuracy(preds_2_adv, X_train,
                                                    Y_train)

    return report
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
              nb_epochs, batch_size, learning_rate, rng):
    """
    Build and train the model that stands in for the "remote" black-box
    oracle described in the original paper, then measure its accuracy on
    the legitimate test set.

    :param sess: the TF session
    :param x: the input placeholder for cifar
    :param y: the output placeholder for cifar
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training (and evaluation) batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :return: tuple ``(model, predictions, accuracy)``: the oracle model,
             its prediction tensor on ``x``, and the accuracy returned by
             ``model_eval`` on ``X_test`` / ``Y_test``
    """
    # Graph for the black-box model: predictions on clean inputs ...
    oracle = cnn_cifar10_model(img_rows=32, img_cols=32, channels=3)
    clean_preds = oracle(x)

    # ... and predictions on FGSM-perturbed inputs, which are handed to
    # model_train as `predictions_adv`.
    attack_kwargs = dict(eps=FLAGS.training_eps,
                         ord=np.inf,
                         clip_min=0.,
                         clip_max=1.)
    attack = FastGradientMethod(oracle, sess=sess)
    adv_preds = oracle(attack.generate(x, **attack_kwargs))
    logger.info("Defined TensorFlow model graph.")

    # Train the cifar model
    model_train(sess, x, y, clean_preds, X_train, Y_train,
                verbose=False,
                args=dict(nb_epochs=nb_epochs,
                          batch_size=batch_size,
                          learning_rate=learning_rate),
                rng=rng,
                predictions_adv=adv_preds)

    # Log the accuracy on legitimate data
    test_accuracy = model_eval(sess, x, y, clean_preds, X_test, Y_test,
                               args=dict(batch_size=batch_size))
    message = ('Test accuracy of adversarially trained black-box on '
               'legitimate test examples: ')
    logger.info(message + str(test_accuracy))

    return oracle, clean_preds, test_accuracy
def adv_net_exp(data_dir, adv_dir,
                target_model_dir='./tmp/cifar10_train_adv_encoder',
                clip_norm=1.5):
    """
    Evaluate an adversarial encoder against a pre-trained target model.

    The function
      1. loads the CIFAR-10 arrays and the target model checkpoint and
         prints the target model's accuracy on the unmodified images;
      2. builds the adversarial network on top of the input placeholder,
         restores its "adv_encoder" variables from ``adv_dir`` and prints
         the target model's accuracy on the generated images;
      3. adds the perturbation of a single generated example to every
         image and prints the accuracy on those "universal" examples.

    :param data_dir: directory handed to the CIFAR-10 download/load helpers
    :param adv_dir: checkpoint directory of the adversarial encoder
    :param target_model_dir: checkpoint directory of the target model
    :param clip_norm: forwarded unchanged to ``adv_train_net``
    :return: ``False`` if the target-model checkpoint or the adversarial
             encoder checkpoint cannot be loaded; otherwise ``None``
             (results are only printed)
    """
    eval_batch_size = 128

    sess = tf.Session()

    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    model = make_vgg16_clipRelu_model(name='vgg16_clipRelu_eval_mode',
                                      eval_mode=True)
    eval_feed = mode_feed(sess, False)

    # Get predict tensor
    pred = model(x)
    if not checkpoint_load(sess, target_model_dir):
        return False

    # eval model accuracy
    accuracy = model_eval(sess, x, y, pred, X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': eval_batch_size})
    print('model accuracy: {0}'.format(accuracy))

    # Only the generated images are needed here; the loss returned by
    # adv_train_net is not used for evaluation.
    _, output_images = adv_train_net(x, clip_norm)
    logits = model(output_images)

    # restore adv variables
    ckpt = tf.train.get_checkpoint_state(adv_dir)
    # get_checkpoint_state returns None when adv_dir holds no checkpoint;
    # fail the same way as a missing target-model checkpoint instead of
    # raising AttributeError on `ckpt.model_checkpoint_path`.
    if ckpt is None or not ckpt.model_checkpoint_path:
        print('no adversarial encoder checkpoint found in {0}'.format(
            adv_dir))
        return False
    adv_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      "adv_encoder")
    saver = tf.train.Saver(adv_variables)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # eval adv accuracy
    accuracy = model_eval(sess, x, y, logits, X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': eval_batch_size})
    print('transfer rate: {0}'.format(accuracy))

    # universal adversarial examples
    adv_imgs = adv_generate(sess, output_images, x, X, None,
                            eval_batch_size)
    # NOTE(review): despite its name this is the perturbation of the
    # single example at index 1, not a mean over the data set - confirm
    # that this is intended.
    mean_dif = adv_imgs[1] - X[1]
    print('mean dif\'s size: {0}'.format(mean_dif.shape))
    universal_adv_X = X + mean_dif

    # eval universal adv accuracy
    accuracy = model_eval(sess, x, y, pred, universal_adv_X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': eval_batch_size})
    print('universal adv transfer rate: {0}'.format(accuracy))
def mdt(model, data_dir, checkpoint_dir,
        train_dir='./tmp/cifar10_train',
        adversarial_dir='./tmp/cifar10_adv',
        batch_size=128,
        data_aug=False,
        data_norm=True):
    """
    Train (if needed) and evaluate ``model`` on CIFAR-10, on clean data
    and on Carlini-Wagner L2 adversarial examples.

    Steps performed:
      1. if ``train_dir`` does not exist, build the input pipeline and
         loss and run ``train_process``;
      2. restore the model from ``checkpoint_dir``;
      3. load cached CW examples from ``adversarial_dir``/cw_adv, or craft
         them for the first 100 images and cache them there;
      4. print overall and per-class clean accuracy, accuracy on the CW
         examples and their mean L2 distance to the originals;
      5. show/save one example image and plot histograms of the values
         returned by ``evaluate_uncertainty`` for clean and CW inputs.

    :param model: callable model; also passed to ``CarliniWagnerL2`` and
                  ``evaluate_uncertainty``; must expose ``name``
    :param data_dir: directory handed to the CIFAR-10 input helpers
    :param checkpoint_dir: directory the trained weights are loaded from
    :param train_dir: training output directory; training is skipped when
                      it already exists
    :param adversarial_dir: directory used to cache the CW examples
    :param batch_size: batch size for training and uncertainty evaluation
    :param data_aug: forwarded to ``mdt_cifar10_input.inputs``
    :param data_norm: forwarded to ``mdt_cifar10_input.inputs``
    :return: ``False`` if the checkpoint cannot be loaded, otherwise
             ``None`` (results are printed and plotted)
    """
    # Function-scope imports, as in the original code. Importing
    # `matplotlib.image` explicitly guarantees the submodule used by
    # `imsave` below is loaded.
    import numpy as np
    import matplotlib.image

    # train model
    if not tf.gfile.Exists(train_dir):
        # set input and get logits
        images, labels = mdt_cifar10_input.inputs(False, data_dir,
                                                  batch_size, data_aug,
                                                  data_norm)
        labels = tf.cast(labels, tf.int64)
        # target = False
        # adv_output_layer = 'adv_bounddecoder6'
        # loss = adv_net_loss(images, model, labels, target,
        #                     adv_output_layer, 0, 10)
        logits = model(images)
        loss = stand_loss(logits, labels)
        # Pass the label tensor defined above (the original referenced the
        # undefined name `label`, which raised NameError).
        train_process(model, loss, images, labels, train_dir, batch_size)

    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Define input TF placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Get predict tensor
    pred = model(x)

    sess = tf.Session()
    if not checkpoint_load(sess, checkpoint_dir):
        return False

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # print(sess.run(bn_moving_vars))

    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # create mode feed
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    # craft cw adversarial examples (or reuse the cached ones)
    if not os.path.exists(adversarial_dir):
        os.makedirs(adversarial_dir)
    cw_file = os.path.join(adversarial_dir, 'cw_adv')

    if os.path.isfile(cw_file):
        # NOTE: pickle.load can execute arbitrary code. The file is a cache
        # written by this function; only point `adversarial_dir` at
        # trusted locations.
        with open(cw_file, 'rb') as fr:
            cw_dict = pickle.load(fr)
        cw_adv = cw_dict['data']
        adv_ys = cw_dict['labels']
        assert cw_adv.shape[0] == adv_ys.shape[0]
        cw_setting = cw_dict['setting']
        print('settings of cw adversarial examples that have been loaded')
        print(cw_setting)
    else:
        print('crafting cw adversarial examples....')
        start_time = time.time()

        cw = CarliniWagnerL2(model, back='tf', sess=sess)
        num_for_test = 100
        adv_inputs = X[:num_for_test]
        adv_ys = one_hot_Y[:num_for_test]
        cw_params = {'binary_search_steps': 5,
                     'confidence': 0,
                     'max_iterations': 10000,
                     'learning_rate': 0.1,
                     'batch_size': 100,
                     'initial_const': 10,
                     'clip_min': 0,
                     'clip_max': 255}
        # Snapshot the plain hyper-parameters before the feed and the
        # labels are added, so the cached settings contain only those.
        cw_setting = cw_params.copy()
        cw_params['feed'] = eval_feed
        cw_params['y'] = adv_ys
        cw_adv = cw.generate_np(adv_inputs, **cw_params)

        cw_setting['model'] = model.name
        cw_dict = {'data': cw_adv, 'labels': adv_ys,
                   'setting': cw_setting}
        # Close the file deterministically so the cache is fully flushed.
        with open(cw_file, 'wb') as fw:
            pickle.dump(cw_dict, fw)

        duration = time.time() - start_time
        print('finished in {0} seconds'.format(duration))

    # eval model accuracy
    class_accuracy, accuracy = model_eval_each_class(
        sess, x, y, pred, nb_classes, X, one_hot_Y,
        feed=eval_feed, args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))
    for i in range(nb_classes):
        print('class {0} accuracy: {1}'.format(i, class_accuracy[i]))

    # eval model's accuracy on cw adversarial examples
    cw_accuracy = model_eval(sess, x, y, pred, cw_adv, adv_ys,
                             feed=eval_feed, args={'batch_size': 128})
    print('model cw_accuracy: {0}'.format(cw_accuracy))

    # Clean counterparts of the adversarial examples (same leading slice).
    part_X = X[:cw_adv.shape[0]]

    # eval adv's l2 distance; both arrays are divided by 255 first
    l2_dis = calculate_l2_dis(part_X / 255, cw_adv / 255)
    print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))

    # show and save img
    adv_imgs = np.around(cw_adv).astype(int)
    print(np.max(adv_imgs))
    compare_show(X[16], adv_imgs[16])
    matplotlib.image.imsave('cw.png', adv_imgs[16])

    # eval model's uncertainty on clean inputs and on cw examples.
    # NOTE(review): train_feed is used here, presumably so that dropout
    # stays active over the `dropout_num` passes - confirm against
    # evaluate_uncertainty.
    dropout_num = 30
    uncert = evaluate_uncertainty(sess, model, x, part_X, dropout_num,
                                  batch_size, nb_classes, train_feed)
    cw_uncert = evaluate_uncertainty(sess, model, x, cw_adv, dropout_num,
                                     batch_size, nb_classes, train_feed)

    # plot uncertainty histograms: clean, cw, then both overlaid
    plt.figure("uncertainty_X")
    plt.hist(uncert, bins=25, edgecolor='None', facecolor='blue')
    plt.show()

    plt.figure('uncertainty_CW')
    plt.hist(cw_uncert, bins=25, edgecolor='None', facecolor='red')
    plt.show()

    plt.figure('uncertainty_collections')
    plt.hist(uncert, bins=25, edgecolor='None', facecolor='blue')
    plt.hist(cw_uncert, bins=25, edgecolor='None', facecolor='red')
    plt.show()