def test_attacks(batch_size=128, source_samples=10,
                 model_path=os.path.join("models", "mnist"), targeted=True):
    """
    Test many attacks on MNIST with deep Bayes classifier.

    The classifier and the attack are selected on the command line:
    ``sys.argv[1]`` is the model name ('bayes', 'cnn' or 'wgan'),
    ``sys.argv[2]`` is the attack name (see ``attack_specs`` below),
    ``sys.argv[3]`` is K (bayes / wgan only) and ``sys.argv[4]`` is T
    (wgan only).

    Side effects: writes two image grids through ``plot_images`` under
    ``figs/`` and pickles ``[adv_inputs, y_input, adv, y_adv]`` under
    ``results/``.

    :param batch_size: size of the batches fed to the model; it is clipped
                       to ``source_samples`` because the input placeholder
                       has a fixed batch dimension
    :param source_samples: number of test inputs to attack
    :param model_path: path to the model file (not used by this function)
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    :raises ValueError: if the model name or the attack name is unknown
    """
    # Attack name -> (cleverhans attack class name, feed softmax probs?).
    # Attacks flagged False are built on the model's logits instead.
    attack_specs = {
        'fgsm': ('FastGradientMethod', True),
        'bim': ('BasicIterativeMethod', True),
        'mim': ('MomentumIterativeMethod', True),
        'jsma': ('SaliencyMapMethod', True),
        'vat': ('VirtualAdversarialMethod', False),
        'cw': ('CarliniWagnerL2', False),
        'elastic': ('ElasticNetMethod', False),
        'deepfool': ('DeepFool', False),
        'madry': ('MadryEtAl', True),
    }

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session. The config must be handed to the session,
    # otherwise the allow_growth setting has no effect.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    from cleverhans.utils_mnist import data_mnist
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    img_rows, img_cols, channels = X_train[0].shape
    nb_classes = Y_train.shape[1]

    # Define input TF placeholder (fixed batch dimension)
    batch_size = min(batch_size, source_samples)
    x = tf.placeholder(tf.float32,
                       shape=(batch_size, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(batch_size, nb_classes))

    # Define TF model graph
    model_name = str(sys.argv[1])
    if model_name == 'bayes':
        from load_bayes_classifier import BayesModel
        conv = True
        checkpoint = 0
        K = int(sys.argv[3])
        use_mean = True
        model = BayesModel(sess, 'mnist', conv, K, checkpoint=checkpoint,
                           attack_snapshot=False, use_mean=use_mean)
        if use_mean:
            model_name = 'bayes_mean_mlp'
        else:
            model_name = 'bayes_K%d' % K
    elif model_name == 'cnn':
        from load_cnn_classifier import CNNModel
        model = CNNModel(sess, 'mnist')
    elif model_name == 'wgan':
        from load_wgan_classifier import WGANModel
        conv = True
        checkpoint = 0
        K = int(sys.argv[3])
        T = int(sys.argv[4])
        model = WGANModel(sess, 'mnist', conv, K, T, checkpoint=checkpoint)
        model_name = 'wgan_K%d_T%d' % (K, T)
    else:
        raise ValueError("unknown model name %r (expected 'bayes', 'cnn' "
                         "or 'wgan')" % model_name)

    preds = model.predict(x, softmax=True)  # output probabilities
    print("Defined TensorFlow model graph.")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Craft adversarial examples
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # make adv inputs and labels for the attack if targeted
    if targeted:
        # Every source image is repeated once per class and paired with
        # each one-hot label in turn.
        adv_inputs = np.array(
            [[instance] * nb_classes
             for instance in X_test[:source_samples]], dtype=np.float32)
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        # Use the real channel count rather than a hard-coded 1.
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, channels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
            (source_samples * nb_classes, nb_classes))
    else:
        adv_inputs = X_test[:source_samples]
        adv_ys = Y_test[:source_samples]

    # Instantiate an attack object
    attack_method = str(sys.argv[2])
    if attack_method not in attack_specs:
        raise ValueError("unknown attack %r (expected one of %s)"
                         % (attack_method, ', '.join(sorted(attack_specs))))
    attack_class_name, use_softmax = attack_specs[attack_method]
    import cleverhans.attacks as cleverhans_attacks
    attack_class = getattr(cleverhans_attacks, attack_class_name)

    def model_fn(inputs):
        return model.predict(inputs, softmax=use_softmax)

    attack = attack_class(model_fn, sess=sess)
    # attack_config exposes one config_<attack name> function per attack.
    import attack_config
    make_params = getattr(attack_config, 'config_' + attack_method)
    attack_params = make_params(targeted, adv_ys)
    attack_params['batch_size'] = batch_size
    print('batchsize', batch_size)

    # perform the attack!
    adv = []
    # Only whole batches are processed because the placeholder has a fixed
    # batch dimension.
    n_batch = int(adv_inputs.shape[0] / batch_size)
    for i in range(n_batch):
        adv_batch = adv_inputs[i * batch_size:(i + 1) * batch_size]
        adv.append(attack.generate_np(adv_batch, **attack_params))
    adv = np.concatenate(adv, axis=0)

    # The predictions are recomputed five times and printed each time; only
    # the last pass is kept for scoring (presumably a check of how stable
    # the model's predictions are -- confirm with the model code).
    for _ in range(5):
        y_adv = []
        for i in range(n_batch):
            adv_batch = adv[i * batch_size:(i + 1) * batch_size]
            y_adv.append(sess.run(preds, {x: adv_batch}))
        y_adv = np.concatenate(y_adv, axis=0)
        print('--------------------------------------')
        for i in range(10):
            print(np.argmax(y_adv[i * 10:(i + 1) * 10], 1))

    correct_pred = np.asarray(np.argmax(y_adv, 1) == np.argmax(adv_ys, 1),
                              dtype='f')
    adv_accuracy = np.mean(correct_pred)
    if not targeted:
        # Untargeted: adv_ys holds the true labels, so success means the
        # prediction differs from them.
        adv_accuracy = 1. - adv_accuracy

    print('--------------------------------------')
    print(np.argmax(adv_ys[:10], 1))
    print(np.argmax(y_adv[:10], 1))
    for i in range(5):
        # Feed exactly one placeholder-sized batch; a longer slice does not
        # match the fixed batch dimension of x.
        tmp = sess.run(preds, {x: adv[:batch_size]})
        print(np.argmax(tmp[:10], 1))

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # visualisation
    vis_adv = True
    if vis_adv:
        N_vis = 100
        sys.path.append('../../utils')
        from visualisation import plot_images
        if channels == 1:
            shape = (img_rows, img_cols)
        else:
            shape = (img_rows, img_cols, channels)
        path = 'figs/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
        else:
            filename = filename + '_untargeted'
        plot_images(adv_inputs[:N_vis], shape, path, filename + '_data')
        plot_images(adv[:N_vis], shape, path, filename + '_adv')

    save_result = True
    if save_result:
        path = 'results/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
            y_input = adv_ys
        else:
            filename = filename + '_untargeted'
            y_input = Y_test[:source_samples]
        results = [adv_inputs, y_input, adv, y_adv]
        import pickle
        # Pickle data is binary; the context manager closes the file.
        with open(path + filename + '.pkl', 'wb') as result_file:
            pickle.dump(results, result_file)
        print("results saved at %s.pkl" % filename)

    return report
def run_attack(train_start=0, train_end=60000, test_start=0, test_end=10000,
               nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
               learning_rate=LEARNING_RATE, testing=False,
               label_smoothing=0.1):
    """
    MNIST CleverHans tutorial

    Trains a Keras CNN on ``X_train`` / ``y_train`` and evaluates it on
    ``X_test`` / ``y_test``, on clean and on FGSM examples. These four
    arrays are not parameters: they are read from the enclosing scope
    (presumably module-level globals -- confirm against the rest of the
    file). A second model is then built and compiled with an adversarial
    loss, but it is not trained here.

    :param train_start: index of first training set example (unused here)
    :param train_end: index of last training set example (unused here)
    :param test_start: index of first test set example (unused here)
    :param test_end: index of last test set example (unused here)
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param testing: if true, training error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
                            (unused here)
    :return: an AccuracyReport object
    :raises NotImplementedError: if Keras is not set to channels_last
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Force TensorFlow to use single thread to improve reproducibility
    config = tf.ConfigProto(intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError("this tutorial requires keras to be configured to channels_last format")

    # Create TF session and set as Keras backend session
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Define Keras model
    model = cnn_model(img_rows=32, img_cols=32, channels=1, nb_filters=64,
                      nb_classes=3)
    print("Defined Keras model.")

    # To be able to call the model in the custom loss, we need to call it once
    # before, see https://github.com/tensorflow/tensorflow/issues/23769
    model(model.input)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

    adv_acc_metric = get_adversarial_acc_metric(model, fgsm, fgsm_params)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy', keras.losses.mean_squared_error, adv_acc_metric]
    )

    # Train the model
    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=nb_epochs,
              validation_data=(X_test, y_test),
              verbose=1)

    # Evaluate the accuracy on legitimate and adversarial test examples.
    # evaluate() returns the loss followed by one value per compiled metric
    # (accuracy, mean squared error, adversarial accuracy): four values.
    _, acc, _, adv_acc = model.evaluate(X_test, y_test,
                                        batch_size=batch_size,
                                        verbose=1)
    report.clean_train_clean_eval = acc
    report.clean_train_adv_eval = adv_acc
    print('Test accuracy on legitimate examples: %0.4f' % acc)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)

    # Calculate training error
    if testing:
        _, train_acc, _, train_adv_acc = model.evaluate(X_train, y_train,
                                                        batch_size=batch_size,
                                                        verbose=1)
        report.train_clean_train_clean_eval = train_acc
        report.train_clean_train_adv_eval = train_adv_acc

    print("Repeating the process, using adversarial training")
    # Redefine Keras model
    model_2 = cnn_model(img_rows=32, img_cols=32, channels=1, nb_filters=64,
                        nb_classes=3)
    model_2(model_2.input)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm_2 = FastGradientMethod(wrap_2, sess=sess)

    # Use a loss function based on legitimate and adversarial examples
    adv_loss_2 = get_adversarial_loss(model_2, fgsm_2, fgsm_params)
    adv_acc_metric_2 = get_adversarial_acc_metric(model_2, fgsm_2, fgsm_params)
    model_2.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=adv_loss_2,
        metrics=['accuracy', adv_acc_metric_2]
    )
    # NOTE(review): model_2 is compiled but never fitted or evaluated, so
    # the adv_train_* fields of the report stay unset.

    return report
def zoo(viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
        source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE,
        attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH,
        targeted=TARGETED):
    """
    Train (or reload) a basic CNN on the dataset named by the module-level
    ``DATASET`` constant and attack it with the Zoo attack.

    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires ``source_samples`` to equal the number of
                        classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as ``max_iterations``
    :param model_path: path to the model file; an existing checkpoint at
                       this path is loaded instead of training
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    :raises ValueError: if ``DATASET`` is not 'MNIST', 'SVHN' or 'CIFAR10'
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    train_start = 0
    test_start = 0
    if DATASET == 'MNIST':
        train_end = 60000
        test_end = 10000
        ds = dataset.MNIST(train_start=train_start, train_end=train_end,
                           test_start=test_start, test_end=test_end,
                           center=False)
    elif DATASET == 'SVHN':
        train_end = 73257
        test_end = 26032
        ds = dataset.SVHN(train_start=train_start, train_end=train_end,
                          test_start=test_start, test_end=test_end)
    elif DATASET == 'CIFAR10':
        train_end = 60000
        test_end = 10000
        ds = dataset.CIFAR10(train_start=train_start, train_end=train_end,
                             test_start=test_start, test_end=test_end,
                             center=False)
    else:
        # Fail here with a clear message rather than with a NameError on
        # `ds` further down.
        raise ValueError("unsupported DATASET %r (expected 'MNIST', 'SVHN' "
                         "or 'CIFAR10')" % (DATASET,))

    x_train, y_train, x_test, y_test = ds.get_set('train') + ds.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN(DATASET, nb_classes, nb_filters,
                          (None, img_rows, img_cols, nchannels))
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2018, 10, 22])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Zoo attack
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a Zoo attack object. A distinct local name is used so it
    # does not shadow this function's own name.
    zoo_attack = Zoo(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # First test example of every class
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        # Every source image is paired with each one-hot target in turn.
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
            (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    zoo_params = {
        'binary_search_steps': BINARY_SEARCH_STEPS,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': ZOO_LEARNING_RATE,
        'batch_size': (source_samples * nb_classes
                       if targeted else source_samples),
        'initial_const': INIT_CONST,
        'solver': SOLVER,
        'image_shape': [img_rows, img_cols, nchannels],
        'nb_classes': nb_classes
    }

    adv = zoo_attack.generate_np(adv_inputs, **zoo_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Success rate = fraction classified as the requested target.
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        # Success rate = 1 - accuracy against the true labels.
        if viz_enabled:
            adv_accuracy = 1 - model_eval(
                sess, x, y, preds, adv, y_test[idxs], args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                # Column 0: original image, column 1: adversarial image
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def run_mnist_adv(num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                  testing=False, learning_rate=LEARNING_RATE):
    """
    Train a ConvNet on MNIST twice -- first with plain cross-entropy, then
    with an adversarial (FGSM-based) loss -- and record the accuracies.

    :param num_epochs: number of epochs each model is trained for
    :param batch_size: batch size used for training and evaluation
    :param testing: if true, accuracies on the training set are recorded too
    :param learning_rate: learning rate given to the Adam optimizer
    :return: an AccuracyReport object
    """
    # Holds (and finally returns) the key accuracies
    report = AccuracyReport()

    # Fix the TF random seed
    tf.set_random_seed(42)

    # Session allowed to use one GPU and one CPU, shared with Keras
    session_config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 1})
    sess = tf.Session(config=session_config)
    keras.backend.set_session(sess)

    # MNIST data
    mnist = MNIST()
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Image geometry and number of classes
    n_rows, n_cols, n_channels = x_train.shape[1:4]
    n_classes = y_train.shape[1]
    input_shape = (n_rows, n_cols, n_channels)

    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }

    def _fit_and_record(net, tag):
        """Fit `net`, then store its scores in the report fields for `tag`."""
        net.fit(x_train, y_train,
                batch_size=batch_size,
                epochs=num_epochs,
                validation_data=(x_test, y_test),
                verbose=1)

        # Accuracy on legitimate and adversarial test examples
        test_scores = net.evaluate(x_test, y_test,
                                   batch_size=batch_size,
                                   verbose=0)
        _, acc, adv_acc = test_scores
        setattr(report, tag + '_clean_eval', acc)
        setattr(report, tag + '_adv_eval', adv_acc)
        print('Test accuracy on legitimate examples: %0.4f' % acc)
        print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)

        # Optionally also record the training-set accuracies
        if testing:
            train_scores = net.evaluate(x_train, y_train,
                                        batch_size=batch_size,
                                        verbose=0)
            _, train_acc, train_adv_acc = train_scores
            setattr(report, 'train_' + tag + '_clean_eval', train_acc)
            setattr(report, 'train_' + tag + '_adv_eval', train_adv_acc)

    # ---- First pass: standard training -----------------------------------
    model = ConvNet(input_shape, n_classes)
    model(model.input)

    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    adv_acc_metric = get_adversarial_acc_metric(model, fgsm, fgsm_params)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy', adv_acc_metric]
    )
    _fit_and_record(model, 'clean_train')

    # ---- Second pass: adversarial training -------------------------------
    print("Repeating the process, using adversarial training")
    model_2 = ConvNet(input_shape, n_classes)
    model_2(model_2.input)

    wrap_2 = KerasModelWrapper(model_2)
    fgsm_2 = FastGradientMethod(wrap_2, sess=sess)

    # Loss built from both legitimate and adversarial examples
    adv_loss_2 = get_adversarial_loss(model_2, fgsm_2, fgsm_params)
    adv_acc_metric_2 = get_adversarial_acc_metric(model_2, fgsm_2,
                                                  fgsm_params)
    model_2.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=adv_loss_2,
        metrics=['accuracy', adv_acc_metric_2]
    )
    _fit_and_record(model_2, 'adv_train')

    return report
def mnist_tutorial_fgsm(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE,
                        attack_iterations=ATTACK_ITERATIONS,
                        model_path=MODEL_PATH, targeted=TARGETED,
                        noise_output=NOISE_OUTPUT):
    """
    MNIST tutorial for Fast Gradient Method's attack

    Trains (or reloads) a basic CNN, crafts FGSM adversarial examples
    against it, then trains a second CNN with FGSM adversarial training and
    scores it on the same adversarial examples.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) save a grid of adversarial examples to
                        output/fgsm_mnist.png (or
                        output/fgsm_mnist_noise.png); requires
                        ``source_samples`` to equal the number of classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: unused by this function
    :param model_path: path to the model file; an existing checkpoint at
                       this path is loaded instead of training
    :param targeted: should we run a targeted attack? or untargeted?
    :param noise_output: if true, the grid shows the perturbation
                         (adversarial minus original) instead of the
                         adversarial image
    :return: an AccuracyReport object
    """

    def save_visual(data, path):
        """
        Modified version of cleverhans.plot.pyplot

        Plots ``data[x, y]`` as a grid with ``data.shape[0]`` columns and
        ``data.shape[1]`` rows, saves it to ``path`` and returns the figure.
        """
        figure = plt.figure()

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for row in range(num_rows):
            for col in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (col + 1) + (row * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[col, row, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[col, row, :, :, :])

        # Draw the plot and return
        plt.savefig(path)
        return figure

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Fast Gradient Method
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a FGSM attack object
    fgsm = FastGradientMethod(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # First test example of every class
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        sources = x_test[idxs]
        true_labels = y_test[idxs]
        # Only column 0 of the grid is ever filled, so one column suffices
        # for both the targeted and the untargeted case.
        grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')
    else:
        sources = x_test[:source_samples]
        true_labels = y_test[:source_samples]

    # Shared by the crafting step and by adversarial training
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    # Per-call arguments of the crafting step only; the target labels must
    # not leak into adversarial training, whose batches have another size.
    attack_kwargs = dict(fgsm_params)

    if targeted:
        # Every source image is repeated once per class and paired with each
        # one-hot target in turn.
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in sources],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.tile(np.eye(nb_classes, dtype=np.float32),
                         (source_samples, 1))
        # Hand the targets to the attack; without them the attack is
        # untargeted and the targeted success rate below is meaningless.
        attack_kwargs['y_target'] = adv_ys
    else:
        adv_inputs = sources
        adv_ys = None

    adv = fgsm.generate_np(adv_inputs, **attack_kwargs)

    def _attack_success_rate(logits):
        """Fraction of `adv` that fools the model producing `logits`."""
        small_batches = {'batch_size': np.minimum(nb_classes, source_samples)}
        if targeted:
            # Success = classified as the requested target
            return model_eval(sess, x, y, logits, adv, adv_ys,
                              args=small_batches)
        # Success = no longer classified as the true label
        err = model_eval(sess, x, y, logits, adv, true_labels,
                         args=small_batches)
        return 1 - err

    adv_accuracy = _attack_success_rate(preds)

    if viz_enabled:
        # Targeted: rows of `adv` come in groups of nb_classes per source
        # image. Untargeted: one row per source image.
        stride = nb_classes if targeted else 1
        for i in range(nb_classes):
            image = adv[i * stride]
            if noise_output:
                image = image - adv_inputs[i * stride]
            grid_viz_data[i, 0] = image

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    ###########################################################################
    # Adversarial Training
    ###########################################################################

    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack_fgsm(x):
        # Attack the batch handed in by the loss, not the fixed numpy array
        # of source images.
        return fgsm2.generate(x, **fgsm_params)

    preds2 = model2.get_logits(x)
    loss2 = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm)

    train(sess, loss2, x_train, y_train, args=train_params, rng=rng)

    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds2, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on adversarial fgsm test examples: {0}'.format(accuracy))
    # Store the second model's scores in the adv_train_* fields so that the
    # first model's clean_train_* results are not overwritten.
    report.adv_train_clean_eval = accuracy

    print("Defined TensorFlow model graph.")

    # Score the adversarially trained model (preds2) on the examples that
    # were crafted against the first model.
    adv_accuracy = _attack_success_rate(preds2)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.adv_train_adv_eval = 1. - adv_accuracy

    # The adversarial examples are unchanged, so this repeats the value
    # printed above.
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, save a grid of all the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/fgsm_mnist_noise.png"
        else:
            image_name = "output/fgsm_mnist.png"
        _ = save_visual(grid_viz_data, image_name)

    return report
def test_run_single_gpu_fgsm(self):
    """
    Check the single-GPU trainer against the FGSM tutorial: both are run on
    the same reduced MNIST subset and their accuracies must agree within
    the tolerances listed at the end of the test.
    """
    from cleverhans_tutorials import mnist_tutorial_tf

    # Reference numbers: the MNIST tutorial on a dataset of reduced size
    flags = {
        'train_start': 0,
        'train_end': 5000,
        'test_start': 0,
        'test_end': 333,
        'nb_epochs': 5,
        'testing': True
    }
    report = mnist_tutorial_tf.mnist_tutorial(**flags)

    def _run_trainer_with_flags():
        """Seed numpy and TF, then run the trainer on the current flags."""
        hparams_type = namedtuple('HParams', flags.keys())
        hparams = hparams_type(**flags)
        np.random.seed(42)
        tf.compat.v1.set_random_seed(42)
        with tf.compat.v1.variable_scope(None, 'runner'):
            return run_trainer(hparams)

    # Multi-gpu trainer, clean training
    flags.update({
        'batch_size': 128,
        'adam_lrn': 0.001,
        'dataset': 'mnist',
        'only_adv_train': False,
        'eval_iters': 1,
        'ngpu': 1,
        'fast_tests': False,
        'attack_type_train': '',
        'save_dir': None,
        'save_steps': 10000,
        'attack_nb_iter_train': None,
        'save': False,
        'model_type': 'basic',
        'attack_type_test': 'FGSM'
    })
    flags.update({'adv_train': False})
    clean_scores = _run_trainer_with_flags()

    report_2 = AccuracyReport()
    report_2.train_clean_train_clean_eval = clean_scores['train']
    report_2.clean_train_clean_eval = clean_scores['test']
    report_2.clean_train_adv_eval = clean_scores['FGSM']

    # Multi-gpu trainer, adversarial training
    flags.update({'adv_train': True, 'attack_type_train': 'FGSM'})
    adv_scores = _run_trainer_with_flags()

    report_2.train_adv_train_clean_eval = adv_scores['train']
    report_2.adv_train_clean_eval = adv_scores['test']
    report_2.adv_train_adv_eval = adv_scores['FGSM']

    # (report field, absolute tolerance), compared in this order
    expectations = (
        ('train_clean_train_clean_eval', 5e-2),
        ('clean_train_clean_eval', 2e-2),
        ('clean_train_adv_eval', 5e-2),
        ('train_adv_train_clean_eval', 1e-1),
        ('adv_train_clean_eval', 2e-2),
        ('adv_train_adv_eval', 1e-1),
    )
    for field, tolerance in expectations:
        self.assertClose(getattr(report, field),
                         getattr(report_2, field),
                         atol=tolerance)
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=False, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=1,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    This variant loads a .mat file of pre-computed perturbed images, then
    trains (or restores) the basic CNN and evaluates it on the clean MNIST
    test set. The attack-related arguments are accepted but not used by the
    code below.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of attack iterations
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # NOTE(review): `filename` is not a parameter of this function, so it
    # has to be defined at module level - confirm.
    mat_contents = read_mat_file(filename)
    label = mat_contents["label"]
    data = mat_contents["data"]

    # Rows 10000..79999 are split into seven interleaved groups. The group
    # names are taken from the original variable names; none of the groups
    # (nor `label`) is used further in this function.
    adv_data = data[10000:80000, :, :, :]
    cw = adv_data[0::7, :, :, :]
    fgsm01 = adv_data[1::7, :, :, :]
    fgsm03 = adv_data[2::7, :, :, :]
    fgsm05 = adv_data[3::7, :, :, :]
    gaussian01 = adv_data[4::7, :, :, :]
    gaussian03 = adv_data[5::7, :, :, :]
    gaussian05 = adv_data[6::7, :, :, :]

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # NOTE: the TF random seed is not set in this variant.

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples.
    # (A leftover pdb.set_trace() used to sit here; it stalled every
    # non-interactive run and has been removed.)
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Return the report, as the docstring promises
    return report
def mnist_tutorial_cw(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
    attack_iterations=ATTACK_ITERATIONS,
    model_path=MODEL_PATH,
    targeted=TARGETED,
):
    """
    MNIST tutorial for Carlini and Wagner's attack

    Trains a basic CNN on MNIST (or restores it when a checkpoint exists at
    `model_path`), measures its clean test accuracy, crafts Carlini-Wagner
    L2 adversarial examples and reports success rate and mean L2 distortion.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: `max_iterations` handed to the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Image geometry and class count come from the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    image_shape = (img_rows, img_cols, nchannels)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "filename": os.path.split(model_path)[-1],
    }
    rng = np.random.RandomState([2017, 8, 30])

    # Train and save only when no checkpoint is present
    if not os.path.exists(model_path + ".meta"):
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)
    else:
        tf_model_load(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    accuracy = model_eval(
        sess, x, y, preds, x_test, y_test, args={"batch_size": batch_size}
    )
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else "1"
    print(
        "Crafting "
        + str(source_samples)
        + " * "
        + nb_adv_per_sample
        + " adversarial examples"
    )
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, sess=sess)

    # Pick the source images: for visualization, the first test example of
    # every class; otherwise simply the first `source_samples` test examples.
    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
        source_images = x_test[idxs]
        source_labels = y_test[idxs]
        # Grid for visualization: one column per target class when targeted,
        # otherwise an (original, adversarial) pair per class.
        grid_columns = nb_classes if targeted else 2
        grid_viz_data = np.zeros(
            (nb_classes, grid_columns) + image_shape, dtype="f"
        )
    else:
        source_images = x_test[:source_samples]
        source_labels = y_test[:source_samples]

    if targeted:
        # Every source image is repeated once per class and paired with
        # each one-hot target in turn.
        adv_inputs = np.repeat(source_images, nb_classes, axis=0)
        adv_inputs = adv_inputs.astype(np.float32).reshape(
            (source_samples * nb_classes,) + image_shape
        )
        adv_ys = np.tile(
            np.eye(nb_classes, dtype=np.float32), (source_samples, 1)
        )
        yname = "y_target"
    else:
        adv_inputs = source_images
        adv_ys = None
        yname = "y"

    cw_params = {
        "binary_search_steps": 1,
        yname: adv_ys,
        "max_iterations": attack_iterations,
        "learning_rate": CW_LEARNING_RATE,
        "batch_size": source_samples * nb_classes if targeted else source_samples,
        "initial_const": 10,
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {"batch_size": np.minimum(nb_classes, source_samples)}
    if targeted:
        # Success = the model predicts the requested target label
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        # Success = the model no longer predicts the true label
        err = model_eval(sess, x, y, preds, adv, source_labels, args=eval_params)
        adv_accuracy = 1 - err

    if viz_enabled:
        if targeted:
            for j in range(nb_classes):
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
        else:
            for j in range(nb_classes):
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    print("Avg. rate of successful adv. examples {0:.4f}".format(adv_accuracy))
    report.clean_train_adv_eval = 1.0 - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    squared_distance = np.sum((adv - adv_inputs) ** 2, axis=(1, 2, 3))
    percent_perturbed = np.mean(squared_distance ** 0.5)
    print("Avg. L_2 norm of perturbations {0:.4f}".format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def mnist_attack(train_start=0, train_end=60000, test_start=0, test_end=10000,
                 viz_enabled=True, nb_epochs=6, batch_size=128, nb_filters=64,
                 nb_samples=10, learning_rate=0.001, eps=0.3, attack=0,
                 attack_iterations=100, model_path=None, targeted=False,
                 binary=False, scale=False, rand=False, debug=None, test=False,
                 data_dir=None, delay=0, adv=0, nb_iter=40):
    """
    MNIST tutorial for generic attack

    Builds one of several CNN variants, trains it (optionally with
    adversarial training) or restores it from `model_path`, then crafts
    adversarial examples with the selected attack and reports accuracies
    and distortion statistics.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_filters: `nb_filters` argument of the model constructor
    :param nb_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param eps: `eps` given to the FGSM / MadryEtAl / BasicIterative attacks
    :param attack: attack selector, compared with the module-level
                   ATTACK_* constants; an unknown value exits the process
    :param attack_iterations: `max_iterations` for the Carlini-Wagner attack
    :param model_path: directory of the model. If it exists and contains a
                       .meta file, the model is restored and binary, scale,
                       nb_filters, batch_size, learning_rate, nb_epochs and
                       adv are overwritten by parse_model_settings(); if it
                       exists without one, a save path is built below it and
                       the model is trained and saved; if None, the model is
                       trained without saving
    :param targeted: should we run a targeted attack? or untargeted?
    :param binary: together with scale and rand, selects the model
                   constructor
    :param scale: see binary
    :param rand: see binary
    :param debug: if truthy log at DEBUG level, otherwise at WARNING level
    :param test: if truthy, pass the evaluate callback to model_train
    :param data_dir: forwarded to data_mnist as `datadir`
    :param delay: when adversarial training is on and delay > 0, number of
                  clean epochs trained before the adversarial epochs
    :param adv: adversarial-training selector, compared with the
                module-level ADVERSARIAL_TRAINING_* constants; 0 disables it
    :param nb_iter: `nb_iter` for the iterative attacks
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1237)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    if debug:
        set_log_level(logging.DEBUG)
    else:
        set_log_level(logging.WARNING)  # for running on sharcnet

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(datadir=data_dir,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    phase = tf.placeholder(tf.bool, name='phase')

    # for attempting to break unscaled network.
    logits_scalar = tf.placeholder_with_default(
        INIT_T, shape=(), name="logits_temperature")

    save = False
    train_from_scratch = False
    if model_path is not None:
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                binary, scale, nb_filters, batch_size, learning_rate, nb_epochs, adv = parse_model_settings(
                    model_path)
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, binary, batch_size, nb_filters, learning_rate,
                    nb_epochs, adv, delay, scale)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    # Define TF model graph
    if binary:
        print('binary=True')
        if scale:
            print('scale=True')
            if rand:
                print('rand=True')
                from cleverhans_tutorials.tutorial_models import make_scaled_binary_rand_cnn
                model = make_scaled_binary_rand_cnn(
                    phase, logits_scalar, 'binsc_',
                    input_shape=(None, img_rows, img_cols, channels),
                    nb_filters=nb_filters)
            else:
                from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn
                model = make_scaled_binary_cnn(
                    phase, logits_scalar, 'binsc_',
                    input_shape=(None, img_rows, img_cols, channels),
                    nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn
            model = make_basic_binary_cnn(
                phase, logits_scalar, 'bin_', nb_filters=nb_filters)
    else:
        if rand:
            print('rand=True')
            from cleverhans_tutorials.tutorial_models import make_scaled_rand_cnn
            model = make_scaled_rand_cnn(
                phase, logits_scalar, 'fp_rand', nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_cnn
            model = make_basic_cnn(
                phase, logits_scalar, 'fp_', nb_filters=nb_filters)

    preds = model(x, reuse=False)  # * logits_scalar
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    rng = np.random.RandomState([2017, 8, 30])

    # Train an MNIST model
    train_params = {
        'binary': binary,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'train loss',
        'filename': 'model',
        'reuse_global_step': False,
        'train_scope': 'train',
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            train_attack_params = {
                'eps': MAX_EPS,
                'eps_iter': 0.01,
                'nb_iter': nb_iter
            }
            train_attacker = MadryEtAl(model, sess=sess)

        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            stddev = int(np.ceil((MAX_EPS * 255) // 2))
            train_attack_params = {
                'eps': tf.abs(
                    tf.truncated_normal(shape=(batch_size, 1, 1, 1),
                                        mean=0, stddev=stddev))
            }
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)

        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase, **train_attack_params)
        preds_adv_train = model.get_probs(adv_x_train)

        eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.}
        adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params)
        preds_adv_eval = model.get_probs(adv_x_eval)  # * logits_scalar

    def evaluate():
        # Evaluate the accuracy of the MNIST model on clean test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test,
                         phase=phase, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

        if adv != 0:
            # Accuracy of the adversarially trained model on adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test,
                             phase=phase, args=eval_params)
            print('Test accuracy on adversarial examples: %0.4f' % acc)

            acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test,
                             phase=phase, args=eval_params,
                             feed={logits_scalar: ATTACK_T})
            print('Test accuracy on adversarial examples (scaled): %0.4f' % acc)

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs
        if test:
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        evaluate=evaluate, args=train_params, save=save,
                        rng=rng)
        else:
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        args=train_params, save=save, rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            if test:
                model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                            predictions_adv=preds_adv_train,
                            evaluate=evaluate, args=train_params, save=save,
                            rng=rng)
            else:
                model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                            predictions_adv=preds_adv_train,
                            args=train_params, save=save, rng=rng)
    else:
        tf_model_load(sess, model_path)
        print('Restored model from %s' % model_path)
        evaluate()

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase,
                          feed={phase: False}, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Build dataset
    ###########################################################################
    if viz_enabled:
        assert nb_samples == nb_classes
        # first test example of every class
        idxs = [
            np.where(np.argmax(Y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
        viz_rows = nb_classes if targeted else 2
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, viz_rows, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

    if targeted:
        from cleverhans.utils import build_targeted_dataset
        if viz_enabled:
            adv_inputs, true_labels, adv_ys = build_targeted_dataset(
                X_test, Y_test, idxs, nb_classes, img_rows, img_cols,
                channels)
        else:
            adv_inputs, true_labels, adv_ys = build_targeted_dataset(
                X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows,
                img_cols, channels)
    else:
        if viz_enabled:
            adv_inputs = X_test[idxs]
        else:
            adv_inputs = X_test[:nb_samples]

    ###########################################################################
    # Craft adversarial examples using generic approach
    ###########################################################################
    if targeted:
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE, a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"
    else:
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    if attack == ATTACK_CARLINI_WAGNER_L2:
        print('Attack: CarliniWagnerL2')
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, back='tf', sess=sess)
        attack_params = {
            'binary_search_steps': 1,
            'max_iterations': attack_iterations,
            'learning_rate': 0.1,
            'batch_size': att_batch_size,
            'initial_const': 10,
        }
    elif attack == ATTACK_JSMA:
        print('Attack: SaliencyMapMethod')
        from cleverhans.attacks import SaliencyMapMethod
        attacker = SaliencyMapMethod(model, back='tf', sess=sess)
        attack_params = {'theta': 1., 'gamma': 0.1}
    elif attack == ATTACK_FGSM:
        print('Attack: FastGradientMethod')
        from cleverhans.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps}
    elif attack == ATTACK_MADRYETAL:
        print('Attack: MadryEtAl')
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, back='tf', sess=sess)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    elif attack == ATTACK_BASICITER:
        print('Attack: BasicIterativeMethod')
        from cleverhans.attacks import BasicIterativeMethod
        attacker = BasicIterativeMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    else:
        print("Attack undefined")
        sys.exit(1)

    attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.})
    adv_np = attacker.generate_np(adv_inputs, phase, **attack_params)

    eval_params = {'batch_size': att_batch_size}
    if targeted:
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, adv_np, true_labels,
                                  phase=phase, args=eval_params)
    else:
        print("Evaluating untargeted results")
        if viz_enabled:
            adv_accuracy = model_eval(sess, x, y, preds, adv_np, Y_test[idxs],
                                      phase=phase, args=eval_params)
        else:
            adv_accuracy = model_eval(sess, x, y, preds, adv_np,
                                      Y_test[:nb_samples], phase=phase,
                                      args=eval_params)

    if viz_enabled:
        n = nb_classes - 1
        for i in range(nb_classes):
            if targeted:
                for j in range(nb_classes):
                    if i != j:
                        if j != 0 and i != n:
                            grid_viz_data[i, j] = adv_np[j * n + i]
                        if j == 0 and i > 0 or i == n and j > 0:
                            grid_viz_data[i, j] = adv_np[j * n + i - 1]
                    else:
                        grid_viz_data[i, j] = adv_inputs[j * n]
            else:
                # Index with the loop variable `i`. The previous code used
                # `j`, which is never bound on the untargeted path and
                # raised NameError with the default arguments.
                grid_viz_data[i, 0] = adv_inputs[i]
                grid_viz_data[i, 1] = adv_np[i]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv_np - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Compute number of modified features (L_0 norm)
    nb_changed = np.where(adv_np != adv_inputs)[0].shape[0]
    percent_perturb = np.mean(float(nb_changed) / adv_np.reshape(-1).shape[0])

    # Compute the average distortion introduced by the algorithm
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturb))

    # Friendly output for pasting into spreadsheet
    print('{0:.4f}'.format(accuracy))
    print('{0:.4f}'.format(adv_accuracy))
    print('{0:.4f}'.format(percent_perturbed))
    print('{0:.4f}'.format(percent_perturb))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        # Imported here so the grid can be shown for untargeted runs too;
        # previously the import only happened on the targeted path.
        from cleverhans.utils import grid_visual
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial(nb_epochs=6, batch_size=128, train_end=-1, test_end=-1,
                   learning_rate=0.001):
    """
    MNIST cleverhans tutorial

    Trains a PyTorch MNIST model, measures its clean test accuracy, then
    converts it to a TensorFlow callable and measures its accuracy on FGSM
    adversarial examples.

    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param train_end: the training images are truncated to `[:train_end]`
                      (so the default -1 drops the last image)
    :param test_end: the test images are truncated to `[:test_end]`
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Train a pytorch MNIST model
    torch_model = PytorchMnistModel()
    if torch.cuda.is_available():
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
        batch_size=batch_size)

    # Truncate the datasets so that our test run more quickly
    train_loader.dataset.train_data = train_loader.dataset.train_data[
        :train_end]
    test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end]

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)
    train_loss = []

    total = 0
    correct = 0
    step = 0
    for epoch in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if torch.cuda.is_available():
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()  # calc gradients
            train_loss.append(loss.data.item())
            optimizer.step()  # update parameters

            # Move predictions and labels to NumPy before comparing. The
            # labels may live on the GPU, and mixing an ndarray with a
            # tensor in `==` is not reliable.
            preds_np = preds.data.cpu().numpy()
            labels_np = ys.data.cpu().numpy()
            correct += int((np.argmax(preds_np, axis=1) == labels_np).sum())
            total += len(xs)
            step += 1
            if total % 1000 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if torch.cuda.is_available():
            xs, ys = xs.cuda(), ys.cuda()

        preds = torch_model(xs)
        preds_np = preds.data.cpu().numpy()
        labels_np = ys.data.cpu().numpy()

        correct += int((np.argmax(preds_np, axis=1) == labels_np).sum())
        total += len(xs)

    acc = float(correct) / total
    report.clean_train_clean_eval = acc
    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # We use tf for evaluation on adversarial data
    sess = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(torch_model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an FGSM attack
    fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Run an evaluation of our model against fgsm
    total = 0
    correct = 0
    for xs, ys in test_loader:
        adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
        labels_np = ys.cpu().numpy()
        correct += int((np.argmax(adv_preds, axis=1) == labels_np).sum())
        total += len(xs)

    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc
    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    Besides the usual train / evaluate / attack sequence, this variant plots
    a histogram of the gap between the largest and second-largest predicted
    probability over the clean test set.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: `max_iterations` handed to the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model.get_probs(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    # Per-example gap between the top two predicted probabilities
    s = []
    for i in range(0, len(X_test), 1):
        pred = sess.run(preds, {x: X_test[i:i + 1]})
        print(pred)
        print(Y_test[i:i + 1])
        ranked = np.sort(pred)[0]  # ascending; sort once, not twice
        s.append(ranked[-1] - ranked[-2])

    # Draw a histogram
    def draw_hist(myList, Title, Xlabel, Ylabel):
        # NOTE(review): `normed` is deprecated in newer matplotlib releases
        # in favour of `density`; switch if the plotting stack is upgraded.
        plt.hist(myList, np.arange(0, 1, 0.01), normed=True, stacked=True,
                 facecolor='blue')
        plt.xlabel(Xlabel)
        plt.ylabel(Ylabel)
        plt.title(Title)
        plt.show()

    draw_hist(myList=s, Title='legitimate',
              Xlabel='difference between max and second largest',
              Ylabel='Probability')

    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # first test example of every class
        idxs = [
            np.where(np.argmax(Y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in X_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in X_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        # `channels` replaces the hard-coded 1 (same value here)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, channels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = X_test[idxs]
        else:
            adv_inputs = X_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': source_samples * nb_classes if targeted else source_samples,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    # A forward pass over `adv` used to follow here. Its result was never
    # used and its feed was ignored, so it has been removed.

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - \
                model_eval(sess, x, y, preds, adv, Y_test[idxs],
                           args=eval_params)
        else:
            adv_accuracy = 1 - \
                model_eval(sess, x, y, preds, adv, Y_test[:source_samples],
                           args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    return report
def gen_adv(sess, dataset, dataset_name, attack_method, attack_params,
            attack_name, testing=False, adv_range=range(0, 20),
            output_dir='./adv_output', show_prediction=False):
    """
    Train a CNNModel, evaluate it, and write targeted adversarial images.

    For every test index in `adv_range` the input image is saved as
    `<index>_input.tiff`, and one adversarial image per non-true class is
    saved as `<index>_adv<target>.tiff`, all under
    `<output_dir>/<dataset_name>/<attack_name>/`.

    :param sess: session forwarded to the attack constructor
    :param dataset: argument forwarded to `CNNModel`
    :param dataset_name: name used in log messages and in the output path
    :param attack_method: callable invoked as `attack_method(wrapper, sess=sess)`;
                          the returned object must provide `generate_np`
    :param attack_params: dict of keyword arguments for the attack; it is
                          read but never modified by this function
    :param attack_name: name used in log messages and in the output path
    :param testing: when true, `model.evaluate()` is called once more after
                    crafting and stored in the `train_*` report fields
    :param adv_range: iterable of test-set indices to attack
    :param output_dir: root directory for the written images
    :param show_prediction: when true, print `model.predict_one` for every
                            input and adversarial image
    :return: an AccuracyReport object
    """
    print("========= Start attack with method {} on {} =========".format(
        attack_name, dataset_name))

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()
    model = CNNModel(dataset)

    # Wrap the Keras model and instantiate the requested attack on it
    wrap = KerasModelWrapper(model.model)
    attack = attack_method(wrap, sess=sess)

    adv_acc_metric = get_adversarial_acc_metric(model.model, attack,
                                                attack_params)
    model.compile(loss='categorical_crossentropy',
                  metrics=['accuracy', adv_acc_metric])

    # Train the model
    model.fit()

    # Evaluate the accuracy on legitimate and adversarial test examples
    _, acc, adv_acc = model.evaluate()
    report.clean_train_clean_eval = acc
    report.clean_train_adv_eval = adv_acc
    print('Test accuracy on legitimate examples: %0.4f' % acc)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)

    # The destination directory does not depend on the sample, so build the
    # path once instead of on every use.
    out_dir = osp.join(output_dir, dataset_name, attack_name)
    image_shape = (model.img_rows, model.img_cols)

    for sample_ind in adv_range:
        sample = model.x_test[sample_ind:(sample_ind + 1)]
        current_class = int(np.argmax(model.y_test[sample_ind]))
        target_classes = other_classes(model.nb_classes, current_class)

        if not osp.isdir(out_dir):
            os.makedirs(out_dir)

        fn = osp.join(out_dir, str(sample_ind) + "_input.tiff")
        imageio.imwrite(fn, np.reshape(sample, image_shape))
        if show_prediction:
            print("Prediction for the input is: \n",
                  model.predict_one(sample))

        for target in target_classes:
            one_hot_target = np.zeros((1, model.nb_classes),
                                      dtype=np.float32)
            one_hot_target[0, target] = 1

            # Work on a copy so the caller's dict is not left carrying the
            # last target after this function returns.
            target_params = dict(attack_params)
            target_params['y_target'] = one_hot_target
            adv_x = attack.generate_np(sample, **target_params)

            fn = osp.join(out_dir,
                          str(sample_ind) + "_adv{}.tiff".format(target))
            imageio.imwrite(fn, np.reshape(adv_x, image_shape))
            if show_prediction:
                print("Prediction for the target {} is: \n".format(target),
                      model.predict_one(adv_x))

    # Second evaluation, stored in the train_* report fields
    if testing:
        _, train_acc, train_adv_acc = model.evaluate()
        report.train_clean_train_clean_eval = train_acc
        report.train_clean_train_adv_eval = train_adv_acc

    print("========= Finish attack with method {} on {} =========".format(
        attack_name, dataset_name))
    return report
def mnist_tutorial_jsma(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
):
    """
    Train a basic CNN on MNIST and attack it with the Jacobian-based
    saliency map approach (JSMA), one targeted attack per wrong class.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Holds the accuracies that are handed back to the caller
    report = AccuracyReport()

    # Fixed TF seed for reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Load the requested slices of MNIST
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Image geometry and class count are read off the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    image_shape = (img_rows, img_cols, nchannels)

    # Input / label placeholders
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Model graph
    nb_filters = 64
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    #######################################################################
    # Training
    #######################################################################
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Accuracy on legitimate test examples
    accuracy = model_eval(
        sess, x, y, preds, x_test, y_test, args={"batch_size": batch_size}
    )
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    #######################################################################
    # JSMA crafting
    #######################################################################
    print("Crafting " + str(source_samples) + " * " + str(nb_classes - 1) +
          " adversarial examples")

    # results[t, s] is 1 when sample s was classified as target t
    results = np.zeros((nb_classes, source_samples), dtype="i")

    # perturbations[t, s] is the fraction of features changed for (t, s)
    perturbations = np.zeros((nb_classes, source_samples), dtype="f")

    # Grid of images: originals on the diagonal, adversarial elsewhere
    grid_viz_data = np.zeros((nb_classes, nb_classes) + image_shape,
                             dtype="f")

    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        "theta": 1.0,
        "gamma": 0.1,
        "clip_min": 0.0,
        "clip_max": 1.0,
        "y_target": None,
    }

    figure = None
    for sample_ind in range(source_samples):
        print("--------------------------------------")
        print("Attacking input %i/%i" % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]
        original_flat = x_test[sample_ind].reshape(-1)

        # Every class other than the dataset label is used as a target
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # The unmodified image sits on the diagonal of the grid
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, image_shape)

        for target in target_classes:
            print("Generating adv. example for target class %i" % target)

            # Run JSMA towards this target
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params["y_target"] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Success means the model now predicts the target class
            predicted = model_argmax(sess, x, preds, adv_x)
            res = int(predicted == target)

            # Fraction of input features that differ from the original
            adv_flat = adv_x.reshape(-1)
            nb_changed = int(np.sum(adv_flat != original_flat))
            percent_perturb = float(nb_changed) / adv_flat.shape[0]

            # Optional side-by-side display of original and adversarial
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, image_shape),
                    np.reshape(adv_x, image_shape),
                    figure,
                )

            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, image_shape)

            # Record the outcome for the summary statistics
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print("--------------------------------------")

    # Share of attempted (sample, target) pairs that succeeded
    nb_targets_tried = (nb_classes - 1) * source_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print("Avg. rate of successful adv. examples {0:.4f}".format(succ_rate))
    report.clean_train_adv_eval = 1.0 - succ_rate

    # Mean distortion over the entries with a non-zero perturbation
    perturbed_mask = perturbations != 0
    nonzero_perturbations = perturbations[perturbed_mask]
    percent_perturbed = np.mean(nonzero_perturbations)
    print("Avg. rate of perturbed features {0:.4f}".format(percent_perturbed))

    # Same entries, with the unsuccessful ones zeroed before averaging
    success_mask = results[perturbed_mask] == 1
    percent_perturb_succ = np.mean(nonzero_perturbations * success_mask)
    print("Avg. rate of perturbed features for successful "
          "adversarial examples {0:.4f}".format(percent_perturb_succ))

    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt

        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def test_run_single_gpu_fgsm(self):
    """
    Check that the single-GPU trainer reproduces the accuracies of the
    FGSM tutorial, for both clean and adversarial training.
    """
    from cleverhans_tutorials import mnist_tutorial_tf

    def launch_trainer(settings):
        # Freeze the current flags into an HParams tuple, reseed both RNGs
        # and run the trainer inside its own variable scope.
        hparams_type = namedtuple('HParams', settings.keys())
        hparams = hparams_type(**settings)
        np.random.seed(42)
        tf.set_random_seed(42)
        with tf.variable_scope(None, 'runner'):
            return run_trainer(hparams)

    # Reference numbers: the MNIST tutorial on a reduced dataset
    flags = {'train_start': 0,
             'train_end': 5000,
             'test_start': 0,
             'test_end': 333,
             'nb_epochs': 5,
             'testing': True}
    report = mnist_tutorial_tf.mnist_tutorial(**flags)

    # Trainer with clean training
    flags.update({'batch_size': 128,
                  'adam_lrn': 0.001,
                  'dataset': 'mnist',
                  'only_adv_train': False,
                  'eval_iters': 1,
                  'ngpu': 1,
                  'fast_tests': False,
                  'attack_type_train': '',
                  'save_dir': None,
                  'save_steps': 10000,
                  'attack_nb_iter_train': None,
                  'save': False,
                  'model_type': 'basic',
                  'attack_type_test': 'FGSM',
                  'adv_train': False})
    clean_results = launch_trainer(flags)

    report_2 = AccuracyReport()
    report_2.train_clean_train_clean_eval = clean_results['train']
    report_2.clean_train_clean_eval = clean_results['test']
    report_2.clean_train_adv_eval = clean_results['FGSM']

    # Trainer with adversarial training
    flags.update({'adv_train': True, 'attack_type_train': 'FGSM'})
    adv_results = launch_trainer(flags)

    report_2.train_adv_train_clean_eval = adv_results['train']
    report_2.adv_train_clean_eval = adv_results['test']
    report_2.adv_train_adv_eval = adv_results['FGSM']

    # Compare every reported accuracy against the tutorial's value
    self.assertClose(report.train_clean_train_clean_eval,
                     report_2.train_clean_train_clean_eval,
                     atol=5e-2)
    self.assertClose(report.clean_train_clean_eval,
                     report_2.clean_train_clean_eval,
                     atol=2e-2)
    self.assertClose(report.clean_train_adv_eval,
                     report_2.clean_train_adv_eval,
                     atol=5e-2)
    self.assertClose(report.train_adv_train_clean_eval,
                     report_2.train_adv_train_clean_eval,
                     atol=1e-1)
    self.assertClose(report.adv_train_clean_eval,
                     report_2.adv_train_clean_eval,
                     atol=2e-2)
    self.assertClose(report.adv_train_adv_eval,
                     report_2.adv_train_adv_eval,
                     atol=1e-1)
def test_run_single_gpu_fgsm(self):
    """
    Compare the single-GPU trainer with the FGSM tutorial: both are run on
    the same reduced dataset and their accuracies must agree within the
    per-metric tolerances below.
    """
    from cleverhans_tutorials import mnist_tutorial_tf

    # Tutorial run on a dataset of reduced size (reference values)
    flags = {
        "train_start": 0,
        "train_end": 5000,
        "test_start": 0,
        "test_end": 333,
        "nb_epochs": 5,
        "testing": True,
    }
    report = mnist_tutorial_tf.mnist_tutorial(**flags)

    # Extra settings required by the multi-gpu trainer
    flags.update(
        {
            "batch_size": 128,
            "adam_lrn": 0.001,
            "dataset": "mnist",
            "only_adv_train": False,
            "eval_iters": 1,
            "ngpu": 1,
            "fast_tests": False,
            "attack_type_train": "",
            "save_dir": None,
            "save_steps": 10000,
            "attack_nb_iter_train": None,
            "save": False,
            "model_type": "basic",
            "attack_type_test": "FGSM",
        }
    )

    report_2 = AccuracyReport()

    # Each phase: flag overrides, then the report fields that receive the
    # trainer's 'train', 'test' and 'FGSM' results respectively.
    phases = (
        (
            {"adv_train": False},
            (
                "train_clean_train_clean_eval",
                "clean_train_clean_eval",
                "clean_train_adv_eval",
            ),
        ),
        (
            {"adv_train": True, "attack_type_train": "FGSM"},
            (
                "train_adv_train_clean_eval",
                "adv_train_clean_eval",
                "adv_train_adv_eval",
            ),
        ),
    )
    for overrides, fields in phases:
        flags.update(overrides)
        HParams = namedtuple("HParams", flags.keys())
        hparams = HParams(**flags)
        np.random.seed(42)
        tf.set_random_seed(42)
        with tf.variable_scope(None, "runner"):
            report_dict = run_trainer(hparams)
        for field, key in zip(fields, ("train", "test", "FGSM")):
            setattr(report_2, field, report_dict[key])

    # Field name -> absolute tolerance, checked in this order
    tolerances = (
        ("train_clean_train_clean_eval", 5e-2),
        ("clean_train_clean_eval", 2e-2),
        ("clean_train_adv_eval", 5e-2),
        ("train_adv_train_clean_eval", 1e-1),
        ("adv_train_clean_eval", 2e-2),
        ("adv_train_adv_eval", 1e-1),
    )
    for field, atol in tolerances:
        self.assertClose(
            getattr(report, field), getattr(report_2, field), atol=atol
        )
def _count_modified_neighbors(diff):
    """
    Count modified pixels and their modified neighbours in a 2-D image.

    A pixel is "modified" when its value is greater than zero. For every
    modified pixel, the number of modified pixels among its in-bounds
    neighbours (up to eight, fewer on edges and corners) is accumulated.

    :param diff: 2-D array of per-pixel differences
    :return: tuple (number of modified pixels, summed neighbour count)
    """
    modified = np.asarray(diff) > 0
    rows, cols = modified.shape
    # Zero padding lets border pixels use the same offsets as interior ones.
    padded = np.pad(modified.astype(np.int64), 1, mode='constant')
    neighbor_counts = np.zeros((rows, cols), dtype=np.int64)
    for row_off in (0, 1, 2):
        for col_off in (0, 1, 2):
            if row_off == 1 and col_off == 1:
                continue  # the pixel itself is not its own neighbour
            neighbor_counts += padded[row_off:row_off + rows,
                                      col_off:col_off + cols]
    return int(modified.sum()), int(neighbor_counts[modified].sum())


def _stat_or_nan(reducer, values):
    """Apply `reducer` to `values`, or return NaN when `values` is empty."""
    if values.size == 0:
        return float('nan')
    return reducer(values)


def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA),
    attacking randomly drawn test images and reporting perturbation and
    pixel-neighbourhood statistics.

    The model is trained and saved to "test.joblib" when the module-level
    TRAIN_NEW equals 1, otherwise it is loaded from that file. Each
    adversarial difference image is round-tripped through "test.png" in the
    working directory.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session with a default configuration
    sess = tf.Session(config=tf.ConfigProto())
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_picklable_cnn()
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])

    if TRAIN_NEW == 1:
        with sess.as_default():
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            save("test.joblib", model)
    else:
        with sess.as_default():
            # NOTE(review): this deserialises a local file; only load
            # "test.joblib" files produced by a trusted run of this script.
            model = load("test.joblib")
            assert len(model.get_params()) > 0
            # The graph tensors must be rebuilt from the loaded model
            preds = model.get_logits(x)
            loss = CrossEntropy(model, smoothing=0.1)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None

    # Neighbour statistics are accumulated over every attacked sample so the
    # figure printed after the loop covers the whole run.
    tn = 0
    totc = 0

    # Highest valid test index. Using it as the upper bound keeps the draw
    # in range whether `randint` treats that bound as inclusive or exclusive.
    last_test_index = x_test.shape[0] - 1

    # Loop over the samples we want to perturb into adversarial examples
    seed(SEED)
    for sample_ind in xrange(0, source_samples):
        img = randint(0, last_test_index)
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[img:(img + 1)]
        sample_flat = sample.reshape(-1)

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the dataset label)
        current_class = int(np.argmax(y_test[img]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features, measured against the
            # image that was actually attacked (index `img`)
            adv_x_reshape = adv_x.reshape(-1)
            nb_changed = np.where(adv_x_reshape != sample_flat)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Build an 8-bit grayscale difference image by writing it to
            # disk and reading it back.
            # NOTE(review): the round trip presumably exists only to quantise
            # the float difference to 8 bits - confirm before replacing it.
            diff = np.array(adv_x - sample)
            diff = np.reshape(diff, (img_rows, img_cols))
            diff = diff * 255
            cv2.imwrite("test.png", diff)
            diff = cv2.imread("test.png")
            diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)

            modified_pixels, nieghbors = _count_modified_neighbors(diff)
            totc = totc + modified_pixels
            tn = tn + nieghbors

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')
    # Guard against a run in which no pixel was modified at all
    avg_neighbors = float(tn) / totc if totc else 0.0
    print("average neighbors per modified pixel ", avg_neighbors)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.8f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Statistics are taken over entries with a non-zero perturbation only;
    # (target, sample) pairs that were never attacked stay at zero and would
    # otherwise dilute the averages.
    attempted = perturbations > 0
    myPert = perturbations[attempted]
    myResults = results[attempted]
    final = myPert[myResults > 0]

    percent_perturbed = _stat_or_nan(np.mean, myPert)
    min_perturbed = _stat_or_nan(np.min, myPert)
    max_perturbed = _stat_or_nan(np.max, myPert)
    print('Avg. rate of perturbed features {0:.8f}'.format(percent_perturbed))
    print('MIN of perturbed features {0:.8f}'.format(min_perturbed))
    print('MAX of perturbed features {0:.8f}'.format(max_perturbed))

    # Average over the same entries with unsuccessful ones counted as zero;
    # min and max are taken over the successful entries only.
    percent_perturb_succ = _stat_or_nan(np.mean, myPert * (myResults == 1))
    min_perturb_succ = _stat_or_nan(np.min, final)
    max_perturb_succ = _stat_or_nan(np.max, final)
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(percent_perturb_succ))
    print('Min of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(min_perturb_succ))
    print('Max of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(max_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    Train (or reload) a basic CNN on MNIST and attack it with the
    Carlini and Wagner L2 attack.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires source_samples to equal the class count
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as 'max_iterations'
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Collects the accuracies returned to the caller
    report = AccuracyReport()

    # Fixed TF seed for reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Load MNIST
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Image geometry and class count come from the data
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    image_shape = (img_rows, img_cols, nchannels)

    # Input / label placeholders
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Model graph
    nb_filters = 64
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training
    ###########################################################################
    train_dir, filename = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    rng = np.random.RandomState([2017, 8, 30])

    # Reuse a saved model when its .meta file exists, otherwise train
    already_trained = os.path.exists(model_path + ".meta")
    if already_trained:
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)

    # Accuracy on legitimate test examples
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Carlini and Wagner crafting
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    # With visualization on, attack the first test example of each class;
    # otherwise attack the first `source_samples` test examples.
    if viz_enabled:
        assert source_samples == nb_classes
        test_labels = np.argmax(y_test, axis=1)
        idxs = [np.where(test_labels == i)[0][0] for i in range(nb_classes)]
        seed_images = x_test[idxs]
    else:
        seed_images = x_test[:source_samples]

    if targeted:
        if viz_enabled:
            # One grid cell per (target class, source class) pair
            grid_viz_data = np.zeros((nb_classes, nb_classes) + image_shape,
                                     dtype='f')

        # Every seed image is repeated once per class ...
        total = source_samples * nb_classes
        adv_inputs = np.repeat(np.asarray(seed_images, dtype=np.float32),
                               nb_classes, axis=0)
        adv_inputs = adv_inputs.reshape((total,) + image_shape)

        # ... and paired with each one-hot label in turn
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_ys = np.tile(one_hot.astype(np.float32), (source_samples, 1))
        adv_ys = adv_ys.reshape((total, nb_classes))

        yname = "y_target"
        attack_batch_size = total
    else:
        if viz_enabled:
            # Two cells per class: original and adversarial image
            grid_viz_data = np.zeros((nb_classes, 2) + image_shape,
                                     dtype='f')

        adv_inputs = seed_images
        adv_ys = None
        yname = "y"
        attack_batch_size = source_samples

    cw_params = {
        'binary_search_steps': 1,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': attack_batch_size,
        'initial_const': 10,
    }
    cw_params[yname] = adv_ys

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Accuracy against the target labels
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        # One minus the accuracy against the true labels
        true_labels = y_test[idxs] if viz_enabled else y_test[:source_samples]
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, true_labels,
                                      args=eval_params)

    if viz_enabled:
        if targeted:
            for j in range(nb_classes):
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
        else:
            for j in range(nb_classes):
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Mean L2 norm of the perturbation over all crafted examples
    squared_distance = np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))
    percent_perturbed = np.mean(squared_distance**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_deepfool(train_start=0, train_end=60000,
                            test_start=0, test_end=10000,
                            viz_enabled=True, nb_epochs=6,
                            batch_size=128, nb_classes=2,
                            source_samples=10, learning_rate=0.001,
                            attack_iterations=100,
                            model_path=os.path.join("models", "mnist")):
    """
    MNIST tutorial for the DeepFool attack.

    Builds the model returned by ``make_basic_picklable_cnn``, trains it on
    MNIST (or reloads it when ``model_path + ".meta"`` exists), attacks one
    test image per digit with DeepFool and reports clean and adversarial
    accuracy.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model (one epoch is one
                      full forward/backward pass over the training data)
    :param batch_size: size of training batches
    :param nb_classes: number of output classes; only used in the progress
                       message printed before crafting
    :param source_samples: number of test inputs to attack; only used in
                           the progress message (ten images, one per
                           digit, are always attacked)
    :param learning_rate: learning rate for training
    :param attack_iterations: value passed to DeepFool as ``max_iter``
    :param model_path: path to the model file
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions: 28 x 28 pixels, 1 channel, 10 digits
    img_rows = 28
    img_cols = 28
    channels = 1
    nb_digits = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_digits))

    # Define TF model graph
    model = make_basic_picklable_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2018, 8, 9])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)
        print("save success")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the DeepFool approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a DeepFool attack object
    deepfool = DeepFool(model, back='tf', sess=sess)

    # For every digit pick the index of its second occurrence ([0][1]) in
    # the test set
    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][1]
            for i in range(nb_digits)]
    print("idxs:", idxs)

    # Size the grid by the number of attacked images rather than by
    # nb_classes: the fill loop below writes one row per attacked image, so
    # the default nb_classes=2 used to raise an IndexError at row 2.
    nb_attacked = len(idxs)
    grid_shape = (nb_attacked, 2, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')
    print("grid_viz_data", grid_viz_data.shape)

    # construct adv_inputs
    adv_inputs = X_test[idxs].reshape([-1, img_rows, img_cols, channels])

    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': attack_iterations,
                       'nb_classes': 10,
                       'clip_min': 0.,
                       'clip_max': 1.}

    adv = deepfool.generate_np(adv_inputs, **deepfool_params)
    print("adv success")

    # Untargeted attack: success rate = 1 - accuracy on the true labels
    adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                  args={'batch_size': 10})

    # Column 0 holds the clean image, column 1 its adversarial counterpart
    for j in range(nb_attacked):
        grid_viz_data[j, 0] = adv_inputs[j]
        grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Load and set up all models from stn.conv_model import build_cnn_no_stn clf = build_cnn_no_stn() clf.load_weights("./keras_weights/cnn_v4.hdf5") # clf = conv_model_no_color_adjust() # clf.load_weights("./keras_weights/stn_v5.hdf5") wrap_clf = KerasModelWrapper(clf) preds = clf(x) eval_par = {'batch_size': 128} acc = model_eval(sess, x, y, preds, X_test, y_test, args=eval_par) print('Test accuracy on legitimate test examples: {0}'.format(acc)) report.clean_train_clean_eval = acc # FGSM # fgsm = FastGradientMethod(wrap_clf, sess=sess) # fgsm_params = {'eps': 0.1, # 'clip_min': 0., # 'clip_max': 1.} # adv_x = fgsm.generate_np(X_atk, **fgsm_params) # # Evaluate the accuracy of the MNIST model on adversarial examples # acc = model_eval(sess, x, y, preds_adv, X_test, y_test, args=eval_par) # print('Test accuracy on adversarial examples: %0.4f\n' % acc) # CarliniWagner attack # attack_iterations = 200 # cw_params = {'binary_search_steps': 3,
def cifar10_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                          test_end=10000, viz_enabled=VIZ_ENABLED,
                          nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                          source_samples=SOURCE_SAMPLES,
                          learning_rate=LEARNING_RATE,
                          model_path=MODEL_PATH,
                          noise_output=NOISE_OUTPUT):
    """
    CIFAR10 tutorial for the Jacobian-based saliency map approach (JSMA)

    Trains a ``ModelAllConvolutional`` on CIFAR10, runs a targeted JSMA
    attack against every other class for each of the first
    ``source_samples`` test images, prints success / distortion statistics
    and optionally saves a grid image under ``output/``.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) save a grid image of the results
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path whose last component is passed to training as
                       ``filename``
    :param noise_output: (boolean) if true the saved grid shows the
                         perturbation (adversarial minus original) instead
                         of the adversarial image
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    cifar10 = CIFAR10(train_start=train_start, train_end=train_end,
                      test_start=test_start, test_end=test_end)
    x_train, y_train = cifar10.get_set('train')
    x_test, y_test = cifar10.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                  input_shape=[32, 32, 3])
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    # One slot per true class; each slot ends up holding the last
    # adversarial example / original generated for an input of that class.
    adv_all = np.zeros((nb_classes, img_rows, img_cols, nchannels), dtype='f')
    sample_all = np.zeros((nb_classes, img_rows, img_cols, nchannels),
                          dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            adv_all[current_class] = adv_x
            sample_all[current_class] = sample

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm. The
    # (own class, sample) cells are never attacked and stay 0, so divide by
    # the number of targets actually tried instead of by the array size.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # average over the successful cells, not over every cell with failures
    # counted as zero.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Compute the average L2 distortion over the per-class slots
    l2_norm = np.mean(np.sum((adv_all - sample_all)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(l2_norm))

    # One grid cell per class: either the perturbation or the adversarial image
    for i in range(nb_classes):
        if noise_output:
            image = adv_all[i] - sample_all[i]
        else:
            image = adv_all[i]
        grid_viz_data[i, 0] = image

    # Close TF session
    sess.close()

    def save_visual(data, path):
        """
        Modified version of cleverhans.plot.pyplot: draw the
        (cols, rows, height, width, channels) array ``data`` as a grid of
        images and save it to ``path``.
        """
        import matplotlib.pyplot as plt
        figure = plt.figure()

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for row in range(num_rows):
            for col in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (col + 1) + (row * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[col, row, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[col, row, :, :, :])

        # Write the plot to disk, then release the figure
        plt.savefig(path)
        plt.close(figure)

    # Finally, save a grid of all the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/jsma_cifar10_noise.png"
        else:
            image_name = "output/jsma_cifar10.png"
        # savefig does not create missing directories
        out_dir = os.path.dirname(image_name)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        _ = save_visual(grid_viz_data, image_name)

    return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Trains a ``ModelBasicCNN`` on MNIST, then runs a targeted JSMA attack
    against every other class for each of the first ``source_samples`` test
    images and prints success / distortion statistics.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm. The
    # (own class, sample) cells are never attacked and stay 0, so divide by
    # the number of targets actually tried instead of by the array size.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # average over the successful cells, not over every cell with failures
    # counted as zero.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    Run the Carlini and Wagner L2 attack against a Keras MNIST model.

    The model is restored from ``model_path`` when a ``.meta`` checkpoint
    exists and trained otherwise. One test image per digit is attacked,
    either towards every class (targeted) or away from its own label.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: value passed to the attack as
                              ``max_iterations``
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Collects the accuracies that are handed back to the caller
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows, img_cols, channels = 28, 28, 1

    # Keras is only used for inference here, so pin the learning phase
    keras.layers.core.K.set_learning_phase(0)

    # Fixed seed for reproducibility
    tf.set_random_seed(1234)

    # The TensorFlow dimension ordering is required
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # One session shared by TensorFlow and Keras
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Load MNIST
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Input / label placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    model_dir_parts = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*model_dir_parts[:-1]),
        'filename': model_dir_parts[-1]
    }

    # Reuse a previously trained model when its checkpoint is on disk
    already_trained = os.path.exists(model_path+".meta")
    if not already_trained:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"))
    else:
        tf_model_load(sess, model_path)

    # Accuracy on legitimate test examples
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args={'batch_size': batch_size})
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Attack object built on the wrapped Keras model
    cw = CarliniWagnerL2(KerasModelWrapper(model), back='tf', sess=sess)

    # Index of the first test example of every digit
    labels = np.argmax(Y_test, axis=1)
    idxs = [np.where(labels == digit)[0][0] for digit in range(10)]

    if not targeted:
        # Grid: one row per digit, columns = (clean, adversarial)
        grid_viz_data = np.zeros(
            (nb_classes, 2, img_rows, img_cols, channels), dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"
    else:
        # Grid: one cell per (target class, source digit) pair
        grid_viz_data = np.zeros(
            (nb_classes, nb_classes, img_rows, img_cols, channels),
            dtype='f')

        # Every source image is repeated ten times in a row and paired with
        # the ten one-hot targets in class order.
        adv_inputs = np.repeat(np.asarray(X_test[idxs], dtype=np.float32),
                               10, axis=0)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.tile(np.eye(10, dtype=np.float32), (10, 1))
        adv_ys = adv_ys.reshape((100, 10))
        yname = "y_target"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': 100 if targeted else 10,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    attack_eval_args = {'batch_size': 10}
    if targeted:
        # Success = classified as the requested target
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=attack_eval_args)
        for j in range(10):
            for i in range(10):
                grid_viz_data[i, j] = adv[i * 10 + j]
    else:
        # Success = no longer classified as the true label
        adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                    args=attack_eval_args)
        for j in range(10):
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Share of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1.-adv_accuracy

    # Mean L2 norm of the perturbation per example
    squared_dist = np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))
    percent_perturbed = np.mean(squared_dist**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
def test_attacks(data_name, batch_size=128, source_samples=10,
                 model_path=os.path.join("models", "mnist"), targeted=True):
    """
    Evaluate a classifier on clean test data and record which test inputs
    it classifies correctly.

    The classifier name is read from ``sys.argv[1]``. The indices of the
    correctly classified test examples are pickled to
    ``data_ind/<data_name>_<model_name>.pkl``.

    :param data_name: dataset to load: 'mnist', 'cifar10' or 'plane_frog'
    :param batch_size: upper bound on the evaluation batch size (the
                       smaller of this and ``source_samples`` is used)
    :param source_samples: caps the evaluation batch size
    :param model_path: not used by this function
    :param targeted: not used by this function
    :return: an AccuracyReport object
    :raises ValueError: if ``data_name`` is not a supported dataset
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    if data_name == 'mnist':
        from cleverhans.utils_mnist import data_mnist
        X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                      train_end=60000,
                                                      test_start=0,
                                                      test_end=10000)
    elif data_name in ['cifar10', 'plane_frog']:
        from import_data_cifar10 import load_data_cifar10
        labels = None
        if data_name == 'plane_frog':
            labels = [0, 6]
        datapath = '../cifar_data/'
        X_train, X_test, Y_train, Y_test = load_data_cifar10(datapath,
                                                             labels=labels)
    else:
        # Fail with a clear message instead of an unbound-variable error
        raise ValueError('unsupported data_name: %r' % (data_name,))

    img_rows, img_cols, channels = X_test[0].shape
    nb_classes = Y_test.shape[1]

    # Define input TF placeholder (fixed batch dimension)
    batch_size = min(batch_size, source_samples)
    x = tf.placeholder(tf.float32, shape=(batch_size, img_rows, img_cols,
                                          channels))
    y = tf.placeholder(tf.float32, shape=(batch_size, nb_classes))

    # Define TF model graph
    model_name = str(sys.argv[1])
    model = load_classifier(sess, model_name, data_name)
    if 'bayes' in model_name:
        model_name = model_name + '_cnn'

    # Models whose name contains 'bnn' are evaluated with the Keras learning
    # phase set to 1, all others with 0.
    if 'bnn' not in model_name:
        keras.backend.set_learning_phase(0)
    else:
        keras.backend.set_learning_phase(1)

    # Evaluate the accuracy of the model on legitimate test examples
    preds = model.predict(x, softmax=False)  # output logits
    eval_params = {'batch_size': batch_size}
    accuracy, y_pred_clean = model_eval(sess, x, y, preds, X_test, Y_test,
                                        args=eval_params, return_pred=True)
    print('Test accuracy on legitimate test examples: %.2f' % (accuracy * 100))
    report.clean_train_clean_eval = accuracy

    # Keep one prediction row per test example
    y_pred_clean = y_pred_clean[:Y_test.shape[0]]
    correct_prediction = (np.argmax(Y_test, 1) == np.argmax(y_pred_clean, 1))
    ind = np.where(correct_prediction == 1)[0]
    print('crafting adversarial examples only on correctly prediced images...')
    print('%d / %d in total' % (len(ind), X_test.shape[0]))

    path = 'data_ind/'
    if not os.path.isdir(path):
        os.mkdir(path)
        print('create path ' + path)
    filename = data_name + '_' + model_name

    import pickle
    # Context manager so the file is flushed and closed deterministically
    with open(path + filename + '.pkl', 'wb') as f:
        pickle.dump(ind, f)
    print("results saved at %s.pkl" % (path + filename))

    return report
def mnist(nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, train_end=-1,
          test_end=-1, learning_rate=LEARNING_RATE):
    """
    MNIST cleverhans tutorial (PyTorch model, sevens-vs-twos data)

    Trains ``MNIST_arch_0`` on the dataset returned by
    ``mnist_sevens_vs_twos``, evaluates it on clean data, then attacks it
    with FGSM through TensorFlow and writes the adversarial predictions,
    images and labels to disk.

    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param train_end: not used by this function
    :param test_end: not used by this function
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Train a pytorch MNIST model
    torch_model = MNIST_arch_0()
    if torch.cuda.is_available():
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    data_dir = '/scratch/etv21/conv_gp_data/MNIST_data/expA'
    training_dataset, test_dataset = mnist_sevens_vs_twos(data_dir, noisy=True)
    train_loader = torch.utils.data.DataLoader(training_dataset,
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size)

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)
    train_loss = []

    total = 0
    correct = 0
    step = 0
    for _epoch in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if torch.cuda.is_available():
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()  # calc gradients
            train_loss.append(loss.data.item())
            optimizer.step()  # update gradients

            preds_np = preds.data.cpu().numpy()
            correct += (np.argmax(preds_np, axis=1) == ys.cpu().numpy()).sum()
            total += len(xs)
            step += 1
            # Report (and reset) the running accuracy whenever the running
            # sample count is a multiple of 200
            if total % 200 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if torch.cuda.is_available():
            xs, ys = xs.cuda(), ys.cuda()

        preds = torch_model(xs)
        preds_np = preds.data.cpu().numpy()

        correct += (np.argmax(preds_np, axis=1) == ys.cpu().numpy()).sum()
        total += len(xs)

    acc = float(correct) / total
    report.clean_train_clean_eval = acc
    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # NOTE: an evaluation of transfer from GP adversarial examples to this
    # CNN used to live here as disabled code; it iterated an
    # Adversarial_MNIST_Dataset loader exactly like the clean-data loop
    # above and printed '[%s] Adversarial accuracy: %.2f%%'.

    # We use tf for evaluation on adversarial data
    sess = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(torch_model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an FGSM attack
    fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
    epsilon = 10
    norm = 2
    fgsm_params = {'eps': epsilon,
                   'clip_min': 0.,
                   'clip_max': 1.,
                   'ord': norm}

    attack_name = 'CNN_FGSM_eps={}_norm={}'.format(epsilon, norm)
    attack_dir = os.path.join(data_dir, attack_name)
    if not os.path.exists(attack_dir):
        os.makedirs(attack_dir)
        print("Directory ", attack_dir, " Created ")

    adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Run an evaluation of our model against fgsm
    total = 0
    correct = 0
    # Collect per-batch predictions in a list and join once at the end. The
    # previous np.array(0) seed put a spurious leading 0 into the saved
    # file, and np.append re-copied the whole array on every batch.
    adv_pred_chunks = []
    for xs, ys in test_loader:
        adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
        adv_pred_chunks.append(np.ravel(adv_preds))
        correct += (np.argmax(adv_preds, axis=1) == ys.cpu().numpy()).sum()
        total += len(xs)

    # Saved as a flat 1-D array (same layout as before, minus the bogus 0)
    if adv_pred_chunks:
        all_adv_preds = np.concatenate(adv_pred_chunks)
    else:
        all_adv_preds = np.array([])
    np.save('adv_predictions', all_adv_preds)
    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc

    # Op that encodes a single (1, 28, 28) image in [0, 1] as a PNG
    single_adv_x_op = tf.placeholder(tf.float32, shape=(1, 28, 28))
    encode_op = tf.image.encode_png(
        tf.reshape(tf.cast(single_adv_x_op * 255, tf.uint8), (28, 28, 1)))

    adv_image_chunks = []
    adv_label_chunks = []

    # Write PNGs for the batches with index 0 and 10 only.
    # NOTE(review): the original comment said "first and 8th batches, i.e.
    # a batch of 2s and a batch of 7s", which does not match index 10 -
    # confirm which batch is intended.
    b = 0
    for xs, ys in test_loader:
        adv_xs = sess.run(adv_x_op, feed_dict={x_op: xs})

        if b == 0 or b == 10:
            c = b * batch_size
            for i in range(0, adv_xs.shape[0]):
                enc_img = sess.run(encode_op,
                                   feed_dict={single_adv_x_op: adv_xs[i]})
                png_path = os.path.join(attack_dir, '{}.png'.format(c))
                with open(png_path, "wb+") as f:
                    f.write(enc_img)
                c += 1

        # Every batch contributes to the saved adversarial dataset
        adv_image_chunks.append(adv_xs.reshape(adv_xs.shape[0], 28, 28))
        adv_label_chunks.append(np.array(ys))
        b += 1

    adv_images = np.concatenate(adv_image_chunks, 0)
    adv_labels = np.concatenate(adv_label_chunks, 0)

    np.save(os.path.join(data_dir,
                         'two_vs_seven_adv_{}'.format(attack_name)),
            adv_images, allow_pickle=False)
    np.save(os.path.join(data_dir, 'two_vs_seven_labels'),
            adv_labels, allow_pickle=False)

    return report
def mnist_tutorial(nb_epochs=6, batch_size=128, train_end=-1, test_end=-1,
                   learning_rate=0.001):
    """
    MNIST cleverhans tutorial: train a PyTorch model, then evaluate it on
    clean test data and on FGSM adversarial examples crafted through a
    TensorFlow wrapper of the same model.

    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param train_end: upper slice bound applied to the training images
                      (``train_data[:train_end]``); the default of -1
                      therefore drops the final example
    :param test_end: upper slice bound applied to the test images
                     (``test_data[:test_end]``); the default of -1
                     therefore drops the final example
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    use_cuda = torch.cuda.is_available()

    # Train a pytorch MNIST model
    torch_model = PytorchMnistModel()
    if use_cuda:
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False,
                       transform=transforms.ToTensor()),
        batch_size=batch_size)

    # Truncate the datasets so that our test run more quickly.
    # Only the image tensors are sliced here; this function does not touch
    # the datasets' label attributes.
    train_loader.dataset.train_data = \
        train_loader.dataset.train_data[:train_end]
    test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end]

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)

    total = 0
    correct = 0
    step = 0
    for _ in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if use_cuda:
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()  # calc gradients
            optimizer.step()  # update gradients

            # Compare NumPy with NumPy: the labels may live on the GPU, and
            # a NumPy array cannot be compared against a CUDA tensor.
            preds_np = preds.data.cpu().numpy()
            labels_np = ys.data.cpu().numpy()
            correct += (np.argmax(preds_np, axis=1) == labels_np).sum()
            total += len(xs)
            step += 1
            if total % 1000 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if use_cuda:
            xs, ys = xs.cuda(), ys.cuda()

        preds = torch_model(xs)
        preds_np = preds.data.cpu().numpy()
        labels_np = ys.data.cpu().numpy()

        correct += (np.argmax(preds_np, axis=1) == labels_np).sum()
        total += len(xs)

    acc = float(correct) / total
    report.clean_train_clean_eval = acc

    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # We use tf for evaluation on adversarial data
    sess = tf.Session()
    try:
        x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,))

        # Convert pytorch model to a tf_model and wrap it in cleverhans
        tf_model_fn = convert_pytorch_model_to_tf(torch_model)
        cleverhans_model = CallableModelWrapper(tf_model_fn,
                                                output_layer='logits')

        # Create an FGSM attack
        fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
        adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
        adv_preds_op = tf_model_fn(adv_x_op)

        # Run an evaluation of our model against fgsm
        total = 0
        correct = 0
        for xs, ys in test_loader:
            adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
            labels_np = ys.cpu().numpy()
            correct += (np.argmax(adv_preds, axis=1) == labels_np).sum()
            total += len(xs)
    finally:
        # Release the TF session even if the attack or evaluation fails.
        sess.close()

    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc
    return report
def mnist_tutorial_adv_train(train_start=0, train_end=60000, test_start=0,
                             test_end=10000, viz_enabled=VIZ_ENABLED,
                             nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                             source_samples=SOURCE_SAMPLES,
                             learning_rate=LEARNING_RATE,
                             attack_iterations=ATTACK_ITERATIONS,
                             model_path=MODEL_PATH, targeted=TARGETED,
                             noise_output=NOISE_OUTPUT):
    """
    MNIST tutorial for Adversarial Training

    Trains (or loads) a basic CNN, crafts FGSM / BIM / MIM adversarial
    examples against it, then trains a second CNN with an FGSM-augmented
    loss and evaluates that second model.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) when true, attack the first test example
                        of each class (requires
                        ``source_samples == nb_classes``) instead of the
                        first ``source_samples`` test examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: not read by this function (BIM / MIM use a
                              fixed ``nb_iter`` of 50)
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :param noise_output: not read by this function
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using FGSM - BIM - MIM approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate the FGSM, BIM and MIM attack objects
    fgsm = FastGradientMethod(model, sess=sess)
    bim = BasicIterativeMethod(model, sess=sess)
    mim = MomentumIterativeMethod(model, sess=sess)

    # Pick the clean inputs to attack together with their true labels.
    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        sources = x_test[idxs]
        true_labels = y_test[idxs]
    else:
        sources = x_test[:source_samples]
        true_labels = y_test[:source_samples]

    if targeted:
        # Repeat every source once per class and pair it with each one-hot
        # target in turn.
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in sources],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))

        # The targets must actually be handed to the attacks; otherwise the
        # attacks run untargeted while being scored against target labels.
        label_kwargs = {'y_target': adv_ys}
        eval_labels = adv_ys
    else:
        adv_inputs = sources
        label_kwargs = {}
        eval_labels = true_labels

    # Label-free parameter sets: fgsm_params is also reused symbolically for
    # adversarial training below, so it must not carry fixed-size labels.
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    bim_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.,
        'nb_iter': 50,
        'eps_iter': .01
    }
    mim_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.,
        'nb_iter': 50,
        'eps_iter': .01
    }

    adv_fgsm = fgsm.generate_np(adv_inputs,
                                **dict(fgsm_params, **label_kwargs))
    adv_bim = bim.generate_np(adv_inputs, **dict(bim_params, **label_kwargs))
    adv_mim = mim.generate_np(adv_inputs, **dict(mim_params, **label_kwargs))
    crafted = [('FGSM', adv_fgsm), ('BIM', adv_bim), ('MIM', adv_mim)]

    def success_rates(logits, args):
        """Return [(attack name, success rate)] for `crafted` under `logits`."""
        rates = []
        for name, adv in crafted:
            match = model_eval(sess, x, y, logits, adv, eval_labels,
                               args=args)
            # Targeted: success means matching the target label.
            # Untargeted: success means no longer matching the true label.
            rates.append((name, match if targeted else 1 - match))
        return rates

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    clean_model_rates = success_rates(preds, eval_params)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    for name, rate in clean_model_rates:
        print('Avg. rate of successful adv. (' + name +
              ') examples {0:.4f}'.format(rate))
        setattr(report, 'clean_train_adv_%s_eval' % name.lower(), 1. - rate)

    # Compute the average distortion introduced by the algorithm
    for name, adv in crafted:
        mean_l2 = np.mean(
            np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
        print('Avg. L_2 norm of (' + name +
              ') perturbations {0:.4f}'.format(mean_l2))

    ###########################################################################
    # Adversarial Training
    ###########################################################################

    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    # The attack used during training must target the model being trained.
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack_fgsm(x):
        # Perturb the batch handed in by the loss, not a fixed NumPy array.
        return fgsm2.generate(x, **fgsm_params)

    preds2 = model2.get_logits(x)
    loss2_fgsm = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm)

    train(sess, loss2_fgsm, x_train, y_train, args=train_params, rng=rng)

    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds2, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on adversarial fgsm test examples: {0}'.format(
        accuracy))
    # Stored separately so the clean model's accuracy recorded above is not
    # overwritten by the adversarially trained model's accuracy.
    report.adv_train_clean_eval = accuracy
    print("Defined TensorFlow model graph.")

    # Score the adversarially trained model on the examples crafted above.
    # NOTE(review): those examples were crafted against `model`, so this
    # measures transfer to `model2`, not a white-box attack on it -- confirm
    # this is the intended comparison.
    for name, rate in success_rates(preds2, eval_params):
        setattr(report, 'adv_train_adv_%s_eval' % name.lower(), 1. - rate)

    # Close TF session
    sess.close()
    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    Run the Carlini and Wagner L2 attack against a basic MNIST CNN.

    The CNN is restored from `model_path` when a checkpoint exists there and
    trained otherwise; adversarial examples are then crafted and scored.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as 'max_iterations'
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Accumulates the accuracies that are handed back to the caller
    report = AccuracyReport()

    # Fixed TF seed for reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Load MNIST
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Image geometry and class count come from the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    image_shape = (img_rows, img_cols, nchannels)

    # Input placeholders
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    path_parts = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*path_parts[:-1]),
        'filename': path_parts[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # Train only when no checkpoint is found at model_path
    if not os.path.exists(model_path + ".meta"):
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)
    else:
        tf_model_load(sess, model_path)

    # Accuracy on legitimate test examples
    clean_acc = model_eval(sess, x, y, preds, x_test, y_test,
                           args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(clean_acc))
    report.clean_train_clean_eval = clean_acc

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    per_sample = '1'
    if targeted:
        per_sample = str(nb_classes - 1)
    print('Crafting ' + str(source_samples) + ' * ' + per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # CW attack object
    attack = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # Index of the first test example of every class
        first_of_class = [
            np.where(np.argmax(y_test, axis=1) == label)[0][0]
            for label in range(nb_classes)
        ]
        sources = x_test[first_of_class]
        # Grid for visualization: one column per target when targeted,
        # otherwise a (clean, adversarial) pair per class
        n_cols = nb_classes if targeted else 2
        grid = np.zeros((nb_classes, n_cols) + image_shape, dtype='f')
    else:
        sources = x_test[:source_samples]

    if targeted:
        n_adv = source_samples * nb_classes
        # Each source repeated once per class ...
        adv_inputs = np.array(
            [[img] * nb_classes for img in sources], dtype=np.float32)
        adv_inputs = adv_inputs.reshape((n_adv,) + image_shape)

        # ... paired with every one-hot target in turn
        identity = np.zeros((nb_classes, nb_classes))
        diag = np.arange(nb_classes)
        identity[diag, diag] = 1
        adv_ys = np.array([identity] * source_samples, dtype=np.float32)
        adv_ys = adv_ys.reshape((n_adv, nb_classes))
        yname = "y_target"
        attack_batch = n_adv
    else:
        adv_inputs = sources
        adv_ys = None
        yname = "y"
        attack_batch = source_samples

    attack_kwargs = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': attack_batch,
        'initial_const': 10
    }

    adv = attack.generate_np(adv_inputs, **attack_kwargs)

    eval_args = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_args)
    elif viz_enabled:
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                      y_test[first_of_class],
                                      args=eval_args)
    else:
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                      y_test[:source_samples],
                                      args=eval_args)

    if viz_enabled:
        if targeted:
            for row in range(nb_classes):
                for col in range(nb_classes):
                    grid[row, col] = adv[row * nb_classes + col]
        else:
            for k in range(nb_classes):
                grid[k, 0] = adv_inputs[k]
                grid[k, 1] = adv[k]

        print(grid.shape)

    print('--------------------------------------')

    # Fraction of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Mean L2 norm of the perturbations
    squared_dist = np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))
    mean_l2 = np.mean(squared_dist**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=False, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=1,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST experiment chaining FGSM, a 3x3 median filter and Carlini and
    Wagner's attack, one test example at a time.

    For every FGSM eps in (0.1, 0.3, 0.5) and every test example, the
    predicted class is recorded for: the clean input, the FGSM example,
    CW applied to the FGSM example, the median-filtered FGSM example, and
    CW applied to the filtered example. The results are written to
    'fgsm_mnist.mat'.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: not read by this function
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: passed to the CW attack as its 'batch_size';
                           inputs are attacked one at a time
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the CW attack as 'max_iterations'
    :param model_path: path to the model file
    :param targeted: not read by this function (no target labels are
                     passed to either attack)
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1
    sample_shape = (1, img_rows, img_cols, channels)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples (FGSM, then Carlini and Wagner on top)
    ###########################################################################

    # Instantiate the attack objects
    cw = CarliniWagnerL2(model, back='tf', sess=sess)
    fgsm = FastGradientMethod(model, sess=sess)

    cw_params = {'binary_search_steps': 1,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': source_samples,
                 'initial_const': 10}

    def predicted_class(batch):
        """Return the argmax class of the model output for `batch`."""
        return np.argmax(preds.eval(feed_dict={x: batch}, session=sess))

    def mean_l2(perturbed, reference):
        """Return the mean per-example L2 distance between two batches."""
        return np.mean(
            np.sum((perturbed - reference)**2, axis=(1, 2, 3))**.5)

    n_test = len(X_test)
    fgsm_eps = [0.1, 0.3, 0.5]
    saved = {}
    for eps in fgsm_eps:
        fgsm_params = {'eps': eps, 'clip_min': 0., 'clip_max': 1.}

        # Fresh arrays per eps so that one eps does not overwrite another.
        # Rows of `result`: clean, FGSM, CW(FGSM), filtered FGSM,
        # CW(filtered FGSM) predicted classes.
        result = np.zeros((5, n_test))
        # Rows of `strength`: L2 of FGSM vs clean, CW vs FGSM, and
        # CW(filtered) vs filtered.
        strength = np.zeros((3, n_test))

        for i in range(n_test):
            clean = X_test[i].reshape(sample_shape)
            result[0, i] = predicted_class(clean)

            adv = fgsm.generate_np(clean, **fgsm_params)
            result[1, i] = predicted_class(adv)
            strength[0, i] = mean_l2(adv, clean)

            adv2 = cw.generate_np(adv, **cw_params)
            result[2, i] = predicted_class(adv2)
            strength[1, i] = mean_l2(adv2, adv)

            # 3x3 spatial median filter; batch and channel axes untouched
            adv_f = sig.medfilt(adv, (1, 3, 3, 1))
            result[3, i] = predicted_class(adv_f)

            adv2_f = cw.generate_np(adv_f, **cw_params)
            result[4, i] = predicted_class(adv2_f)
            strength[2, i] = mean_l2(adv2_f, adv_f)

            if i % 100 == 0:
                print(i)

        # 0.1 -> '01', 0.3 -> '03', 0.5 -> '05'
        tag = ('%.1f' % eps).replace('.', '')
        # NOTE(review): the original referenced undefined names adv_01 /
        # adv_03 / adv_05; they are presumed to be the per-eps `result`
        # arrays -- confirm against whatever consumes the .mat file.
        saved['adv_' + tag] = result
        saved['strength_' + tag] = strength

    # 'strength' keeps the array for the last eps, which is what a single
    # shared array would have held once the loop finished.
    saved['strength'] = strength

    # Close TF session
    sess.close()

    sio.savemat('fgsm_mnist.mat', saved)

    return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Trains a basic CNN, then for each of the first `source_samples` test
    inputs crafts one adversarial example per class other than the input's
    own label and records success and the fraction of changed features.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1
    image_shape = (img_rows, img_cols, channels)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder (sized from the declared dimensions rather
    # than repeating the literals 28 / 28 / 1 / 10)
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes) + image_shape
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples.
    # `range` works on Python 2 and 3; `xrange` needs a shim on Python 3.
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, image_shape)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the fraction of modified features
            adv_flat = adv_x.reshape(-1)
            clean_flat = X_test[sample_ind].reshape(-1)
            nb_changed = int(np.sum(adv_flat != clean_flat))
            percent_perturb = float(nb_changed) / adv_flat.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, image_shape),
                    np.reshape(adv_x, image_shape), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, image_shape)

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = (nb_classes - 1) * source_samples
    if nb_targets_tried:
        succ_rate = float(np.sum(results)) / nb_targets_tried
    else:
        # Nothing was attacked (e.g. source_samples == 0)
        succ_rate = 0.
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm.
    # Average over the attacks actually run: the (source class, sample)
    # cells are never written and would otherwise dilute the mean.
    if nb_targets_tried:
        percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    else:
        percent_perturbed = 0.
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # divide by the number of successes, not by every cell of the matrix.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial(train_start=0, train_end=600, test_start=0, test_end=200,
                   batch_size=128, model_name="", model_dir="",
                   print_examples=False):
    """
    MNIST CleverHans tutorial

    Loads a saved Keras classifier and measures its accuracy on clean MNIST
    test examples and on FGSM adversarial examples. No training happens
    here.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param batch_size: batch size used for evaluation
    :param model_name: filename for desired classifier model
    :param model_dir: location of classifier models
    :param print_examples: (boolean) plot the first ten clean and the first
                           ten adversarial test images
    :return: an AccuracyReport object
    """
    # -------------------------------------------------------------------------
    # I) Setup (load data / model /etc.)
    # -------------------------------------------------------------------------
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data. The training images are not needed because the
    # model is loaded rather than trained; the training labels are only
    # used for the class-count sanity check below.
    _, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                            train_end=train_end,
                                            test_start=test_start,
                                            test_end=test_end)
    assert Y_train.shape[1] == 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Load Model. os.path.join inserts the separator when model_dir has no
    # trailing slash, which plain string concatenation did not.
    filepath = os.path.join(model_dir, model_name)
    model = keras.models.load_model(filepath)
    print("Model loaded from:")
    print(filepath)
    preds = model(x)  # predictions
    print("Defined TensorFlow model graph.")

    # Evaluate the accuracy of the MNIST model on legitimate examples (for
    # comparison)
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    report.clean_train_clean_eval = accuracy
    print('Test accuracy on legitimate examples: %0.4f' % accuracy)

    # Print Normal Examples
    if print_examples:
        plt.figure()
        plt.title("Normal")
        for i in range(0, 10):
            example = X_test[i:i + 1, :, :, :]
            img = example[0, :, :, 0]
            plt.subplot(2, 5, i + 1)
            plt.imshow(img, cmap="gray")

    # -------------------------------------------------------------------------
    # II) Adversarial operations / evaluation
    # -------------------------------------------------------------------------

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Print Adversarial Examples
    if print_examples:
        plt.figure()
        for i in range(0, 10):
            adversarial_example = sess.run(
                adv_x, feed_dict={x: X_test[i:i + 1, :, :, :]})
            img = adversarial_example[0, :, :, 0]
            plt.subplot(2, 5, i + 1)
            plt.imshow(img, cmap="gray")
        # Blocks until the figure windows are closed
        plt.show()

    return report