def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
              nb_epochs, batch_size, learning_rate):
    """
    Build the model that plays the role of the "remote" black-box oracle
    described in the original paper, restore or train it, and measure its
    accuracy on the legitimate test set.
    :param sess: the TF session
    :param x: the input placeholder
    :param y: the output (label) placeholder
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training (and evaluation) batches
    :param learning_rate: learning rate for training
    :return: a tuple ``(model, predictions, accuracy)`` where ``predictions``
             is ``model(x)`` and ``accuracy`` is the value returned by
             ``model_eval`` on ``X_test`` / ``Y_test``
    """
    # The module-level DATASET setting selects the oracle architecture
    oracle_cls = MNISTModel if DATASET == "mnist" else CIFARModel
    model = oracle_cls(use_log=True).model
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    # Train from scratch unless a pre-trained model was requested via FLAGS
    if not FLAGS.load_pretrain:
        model_train(
            sess, x, y, predictions, X_train, Y_train,
            verbose=True, save=True,
            args={
                'nb_epochs': nb_epochs,
                'batch_size': batch_size,
                'learning_rate': learning_rate
            })
    else:
        tf_model_load(sess)

    # Report the accuracy on legitimate (unperturbed) test data
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args={'batch_size': batch_size})
    print('Test accuracy of black-box on legitimate test examples: '
          + str(accuracy))

    return model, predictions, accuracy
def checkpoint_load(sess, checkpoint_dir, moving_variables=None):
    """
    Restore the checkpoint recorded in ``checkpoint_dir`` into ``sess``.
    :param sess: the TF session to restore into
    :param checkpoint_dir: directory handed to
                           ``tf.train.get_checkpoint_state``
    :param moving_variables: optional variables; when truthy they are first
                             restored through the name mapping produced by
                             ``ExponentialMovingAverage(0.9)`` and their
                             values are printed
    :return: True if a checkpoint was found and loaded, False otherwise
    """
    state = tf.train.get_checkpoint_state(checkpoint_dir)
    if not (state and state.model_checkpoint_path):
        print('restore fails: please provide correct checkpoint directory')
        return False

    if moving_variables:
        # Map the given variables to their moving-average checkpoint names
        ema = tf.train.ExponentialMovingAverage(0.9)
        ema_saver = tf.train.Saver(ema.variables_to_restore(moving_variables))
        ema_saver.restore(sess, state.model_checkpoint_path)
        print(sess.run(moving_variables))

    tf_model_load(sess, state.model_checkpoint_path)
    return True
def checkpoint_load(sess, checkpoint_dir, moving_variables=None):
    """
    Load the model checkpoint listed for ``checkpoint_dir`` into ``sess``.
    :param sess: the TF session receiving the restored values
    :param checkpoint_dir: directory looked up with
                           ``tf.train.get_checkpoint_state``
    :param moving_variables: optional variables restored via the
                             ``ExponentialMovingAverage(0.9)`` name mapping
                             (and printed) before the full model is loaded
    :return: True on success, False when no usable checkpoint is found
    """
    ckpt_state = tf.train.get_checkpoint_state(checkpoint_dir)
    has_checkpoint = bool(ckpt_state) and bool(ckpt_state.model_checkpoint_path)

    if has_checkpoint:
        if moving_variables:
            averages = tf.train.ExponentialMovingAverage(0.9)
            restore_map = averages.variables_to_restore(moving_variables)
            tf.train.Saver(restore_map).restore(
                sess, ckpt_state.model_checkpoint_path)
            # Show the restored values of the requested variables
            print(sess.run(moving_variables))
        tf_model_load(sess, ckpt_state.model_checkpoint_path)
        return True

    print('restore fails: please provide correct checkpoint directory')
    return False
def __test():
    """
    Train (or reload) a basic MNIST CNN and print its accuracy on the
    legitimate test set.

    The model is restored from ``MODEL_PATH`` when ``MODEL_PATH + ".meta"``
    exists; otherwise it is trained with ``NB_EPOCHS`` / ``BATCH_SIZE`` /
    ``LEARNING_RATE`` and saved to ``MODEL_PATH``.
    """
    # Data ranges. They are locals so that the size check below can use
    # them; the original body read undefined names (test_start / test_end /
    # model_path) and raised NameError.
    train_start, train_end = 0, 60000
    test_start, test_end = 0, 10000

    tf.set_random_seed(1234)
    sess = tf.Session()
    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    train_params = {
        'nb_epochs': NB_EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'filename': os.path.split(MODEL_PATH)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])

    # Reuse a previously trained model when one has been saved
    if os.path.exists(MODEL_PATH + ".meta"):
        tf_model_load(sess, MODEL_PATH)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, MODEL_PATH)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': BATCH_SIZE}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    # The session is local to this function, so release it before returning
    sess.close()
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0,
                     test_end=10000, nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE, model_path=MODEL_PATH,
                     learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN,
                     testing=False,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS, num_threads=None,
                     label_smoothing=0.1):
    """
    CIFAR10 cleverhans tutorial: load a wide-ResNet, continue training it on
    augmented clean data, save it, and record accuracies in a report.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param model_path: path the trained model is saved to
    :param learning_rate: learning rate stored in the training arguments
    :param clean_train: when false, nothing is trained and the empty report
                        is returned
    :param testing: if true, also record the accuracy on the training set
    :param backprop_through_attack: not referenced by this function body
    :param nb_filters: not referenced by this function body
    :param num_threads: when truthy, intra-op parallelism is limited
    :param label_smoothing: float, amount of label smoothing for cross
                            entropy
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Fixed TF seed to improve reproducibility; verbose logging
    tf.set_random_seed(1234)
    set_log_level(logging.DEBUG)

    # NOTE(review): the value of num_threads is not used; any truthy value
    # pins intra-op parallelism to a single thread -- confirm intent.
    session_options = (dict(intra_op_parallelism_threads=1)
                       if num_threads else {})
    sess = tf.Session(config=tf.ConfigProto(**session_options))

    # Get CIFAR10 data and build the augmented training pipeline
    data = CIFAR10(train_start=train_start, train_end=train_end,
                   test_start=test_start, test_end=test_end)
    dataset_size = data.x_train.shape[0]
    dataset_train = (
        data.to_tensorflow()[0]
        .map(lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
        .batch(batch_size)
        .prefetch(16)
    )

    x_train, y_train = data.get_set('train')
    x_test, y_test = data.get_set('test')

    # Image parameters are read off the test set
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    # Define input TF placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    eval_params = {'batch_size': batch_size}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """Evaluate ``preds`` on a set and store the result on the report."""
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            # Silent mode: record the accuracy without printing it
            return
        kind = 'adversarial' if is_adv else 'legitimate'
        print('Test accuracy on %s examples: %0.4f' % (kind, acc))

    if not clean_train:
        return report

    print('start')
    model = make_wresnet(scope='model1')
    preds = model.get_logits(x)
    # Built as in the original; the result is not passed to train() below
    loss = CrossEntropy(model, smoothing=label_smoothing)
    tf_model_load(
        sess, '/nfs/nas4/data-hanwei/data-hanwei/DATA/models/wresnet/cifar1')

    def evaluate():
        """Callback handed to train(): clean test accuracy."""
        do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

    optimizer = tf.train.MomentumOptimizer(learning_rate=0.0008, momentum=0)
    train(sess, x, y, model, None, None,
          dataset_train=dataset_train, dataset_size=dataset_size,
          evaluate=evaluate, args=train_params, rng=rng,
          var_list=model.get_params(), optimizer=optimizer)

    saver = tf.train.Saver()
    saver.save(sess, model_path)

    # Calculate training error
    if testing:
        do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=False, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=1,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST experiment chaining FGSM with Carlini and Wagner's attack.

    For each FGSM strength (0.1, 0.3, 0.5) and each test image, the image is
    perturbed with FGSM, then attacked again with CW, both directly and
    after a 3x3 median filter. Predicted classes and L2 distortions are
    recorded, and the results are written to ``fgsm_mnist.mat``.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: currently unused by this function
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: batch size given to the CW attack; every attack
                           call here feeds a single image
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of CW iterations
    :param model_path: path to the model file
    :param targeted: currently unused by this function (the CW attack is
                     called without target labels)
    :return: an AccuracyReport object with ``clean_train_clean_eval`` set
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1
    sample_shape = (1, img_rows, img_cols, channels)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])

    # Reuse a previously trained model when one has been saved
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples: FGSM followed by Carlini and Wagner
    ###########################################################################
    cw = CarliniWagnerL2(model, back='tf', sess=sess)
    fgsm = FastGradientMethod(model, sess=sess)

    nb_test = len(X_test)
    # Rows: clean, FGSM, FGSM+CW, filtered FGSM, filtered FGSM+CW predictions
    result = np.zeros((5, nb_test))
    # Rows: FGSM, CW-on-FGSM, CW-on-filtered-FGSM L2 distortions
    strength = np.zeros((3, nb_test))

    cw_params = {'binary_search_steps': 1,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': source_samples,
                 'initial_const': 10}

    def predict_class(batch):
        """Return the arg-max class of the model output for ``batch``."""
        return np.argmax(preds.eval(feed_dict={x: batch}, session=sess))

    def mean_l2(perturbed, reference):
        """Average L2 norm of ``perturbed - reference`` over the batch."""
        return np.mean(
            np.sum((perturbed - reference)**2, axis=(1, 2, 3))**.5)

    # NOTE(review): the original savemat call referenced the undefined names
    # adv_01 / adv_03 / adv_05. They are assumed here to be the FGSM
    # adversarial images for eps 0.1 / 0.3 / 0.5 -- confirm.
    fgsm_runs = [(0.1, 'adv_01'), (0.3, 'adv_03'), (0.5, 'adv_05')]
    saved_arrays = {}

    for eps, mat_key in fgsm_runs:
        fgsm_params = {'eps': eps, 'clip_min': 0., 'clip_max': 1.}
        eps_advs = np.zeros((nb_test, img_rows, img_cols, channels),
                            dtype=np.float32)

        for i in range(nb_test):
            adv_inputs = X_test[i].reshape(sample_shape)
            result[0, i] = predict_class(adv_inputs)

            # Stage 1: FGSM on the clean image
            adv = fgsm.generate_np(adv_inputs, **fgsm_params)
            eps_advs[i] = adv[0]
            result[1, i] = predict_class(adv)
            strength[0, i] = mean_l2(adv, adv_inputs)

            # Stage 2: CW on top of the FGSM example
            adv2 = cw.generate_np(adv, **cw_params)
            result[2, i] = predict_class(adv2)
            strength[1, i] = mean_l2(adv2, adv)

            # Stage 3: median-filter the FGSM example, then attack with CW
            adv_f = sig.medfilt(adv, (1, 3, 3, 1))
            result[3, i] = predict_class(adv_f)

            adv2_f = cw.generate_np(adv_f, **cw_params)
            result[4, i] = predict_class(adv2_f)
            strength[2, i] = mean_l2(adv2_f, adv_f)

            if i % 100 == 0:
                print(i)

        saved_arrays[mat_key] = eps_advs

    # Close TF session only after every FGSM strength has been processed
    sess.close()

    # As in the original loop, `strength` is overwritten for each eps, so the
    # saved values belong to the last FGSM strength.
    saved_arrays['strength'] = strength
    sio.savemat('fgsm_mnist.mat', saved_arrays)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires ``source_samples`` to equal the number of
                        classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of iterations of the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Image parameters are read off the training set
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    image_shape = (img_rows, img_cols, nchannels)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_dir, filename = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    rng = np.random.RandomState([2017, 8, 30])

    # Only train when no previously saved model is available
    if not os.path.exists(model_path + ".meta"):
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)
    else:
        tf_model_load(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    # Choose the clean inputs: one example per class for the grid
    # visualization, otherwise the first `source_samples` test inputs.
    if viz_enabled:
        assert source_samples == nb_classes
        test_labels = np.argmax(y_test, axis=1)
        idxs = [np.where(test_labels == cls)[0][0]
                for cls in range(nb_classes)]
        clean_inputs = x_test[idxs]
    else:
        clean_inputs = x_test[:source_samples]

    if targeted:
        # Every input is repeated once per target class ...
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in clean_inputs],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes,) + image_shape)
        # ... and paired with each one-hot target in turn
        one_hot = np.eye(nb_classes)
        adv_ys = np.array([one_hot] * source_samples, dtype=np.float32)
        adv_ys = adv_ys.reshape((source_samples * nb_classes, nb_classes))
        yname = "y_target"
        attack_batch_size = source_samples * nb_classes
        grid_shape = (nb_classes, nb_classes) + image_shape
    else:
        adv_inputs = clean_inputs
        adv_ys = None
        yname = "y"
        attack_batch_size = source_samples
        grid_shape = (nb_classes, 2) + image_shape

    if viz_enabled:
        # Initialize our array for grid visualization
        grid_viz_data = np.zeros(grid_shape, dtype='f')

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': attack_batch_size,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Success means the adversarial example is classified as the target
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        # Success means the adversarial example loses its true label
        true_labels = y_test[idxs] if viz_enabled else y_test[:source_samples]
        adv_accuracy = 1 - model_eval(
            sess, x, y, preds, adv, true_labels, args=eval_params)

    if viz_enabled:
        if targeted:
            for i in range(nb_classes):
                for j in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
        else:
            for j in range(nb_classes):
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    l2_per_example = np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5
    percent_perturbed = np.mean(l2_per_example)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_cw(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
    attack_iterations=ATTACK_ITERATIONS,
    model_path=MODEL_PATH,
    targeted=TARGETED,
):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires ``source_samples`` to equal the number of
                        classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of iterations of the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    nb_attack_targeted = source_samples * nb_classes

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "filename": os.path.split(model_path)[-1],
    }
    rng = np.random.RandomState([2017, 8, 30])

    # Train and save only when no previously saved model is available
    already_trained = os.path.exists(model_path + ".meta")
    if already_trained:
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        tf.train.Saver().save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    accuracy = model_eval(
        sess, x, y, preds, x_test, y_test, args={"batch_size": batch_size}
    )
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else "1"
    print(
        "Crafting "
        + str(source_samples)
        + " * "
        + nb_adv_per_sample
        + " adversarial examples"
    )
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # Index of the first test example of every class
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
        sources = x_test[idxs]
    else:
        sources = x_test[:source_samples]

    if not targeted:
        if viz_enabled:
            # One row per class: original next to its adversarial example
            grid_viz_data = np.zeros(
                (nb_classes, 2, img_rows, img_cols, nchannels), dtype="f"
            )
        adv_inputs = sources
        adv_ys = None
        yname = "y"
        cw_params_batch_size = source_samples
    else:
        if viz_enabled:
            # Full source-class x target-class grid
            grid_viz_data = np.zeros(
                (nb_classes, nb_classes, img_rows, img_cols, nchannels),
                dtype="f",
            )
        # Repeat each source once per target class, then flatten the pairs
        repeated = np.array(
            [[instance] * nb_classes for instance in sources], dtype=np.float32
        )
        adv_inputs = repeated.reshape(
            (nb_attack_targeted, img_rows, img_cols, nchannels)
        )

        one_hot = np.zeros((nb_classes, nb_classes))
        diagonal = np.arange(nb_classes)
        one_hot[diagonal, diagonal] = 1
        adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape(
            (nb_attack_targeted, nb_classes)
        )
        yname = "y_target"
        cw_params_batch_size = nb_attack_targeted

    cw_params = {
        "binary_search_steps": 1,
        yname: adv_ys,
        "max_iterations": attack_iterations,
        "learning_rate": CW_LEARNING_RATE,
        "batch_size": cw_params_batch_size,
        "initial_const": 10,
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {"batch_size": np.minimum(nb_classes, source_samples)}
    if targeted:
        # Fraction of examples classified as their target class
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        # Fraction of examples that no longer carry their true label
        if viz_enabled:
            reference_labels = y_test[idxs]
        else:
            reference_labels = y_test[:source_samples]
        err = model_eval(
            sess, x, y, preds, adv, reference_labels, args=eval_params
        )
        adv_accuracy = 1 - err

    if viz_enabled:
        if targeted:
            for j in range(nb_classes):
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
        else:
            for j in range(nb_classes):
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    print("Avg. rate of successful adv. examples {0:.4f}".format(adv_accuracy))
    report.clean_train_adv_eval = 1.0 - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    squared_distance = np.sum((adv - adv_inputs) ** 2, axis=(1, 2, 3))
    percent_perturbed = np.mean(squared_distance ** 0.5)
    print("Avg. L_2 norm of perturbations {0:.4f}".format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    This variant does not train a model: it restores
    ``./clean_trained_mnist_model``, reads the data from ``mnist.npz`` and
    attacks the first ``source_samples`` examples of the shuffled test set
    towards every other class.
    :param train_start: not used by this function body
    :param train_end: not used by this function body
    :param test_start: index of first test set example (only used to check
                       the size of the loaded test set)
    :param test_end: index of last test set example (only used to check the
                     size of the loaded test set)
    :param viz_enabled: not used by this function body (plotting is disabled)
    :param nb_epochs: not used by this function body
    :param batch_size: size of evaluation batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: not used by this function body
    :return: an AccuracyReport object (no field is filled in here)
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4254264)

    set_log_level(logging.DEBUG)

    # Only the test split of the stored data set is needed
    with np.load("mnist.npz") as data:
        X_test, Y_test = data['X_test'], data['Y_test']

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    # Define TF model graph and restore the previously saved weights
    model_path = "./"
    model_name = "clean_trained_mnist_model"
    model = make_basic_cnn(nb_classes=nb_classes)
    if tf_model_load(sess, file_path=os.path.join(model_path, model_name)):
        print(model_name, " reloaded.")
    preds = model.get_probs(x)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')
    # Keep track of plain misclassification (any class but the true one)
    results2 = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    # Shuffle the test set with a fixed seed so the attacked samples are
    # reproducible
    rng = np.random.RandomState([1358, 23, 234])
    index_shuf = list(range(len(X_test)))
    rng.shuffle(index_shuf)
    X_test = X_test[index_shuf]
    Y_test = Y_test[index_shuf]

    # Number of attacked samples per true class; sized from nb_classes so a
    # class index above 9 no longer raises KeyError
    occurence = {cls: 0 for cls in range(nb_classes)}
    # rate_table[source, target]: successful attacks, normalized further down
    rate_table = np.zeros((nb_classes, nb_classes), dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    # (built-in range works on both Python 2 and 3, unlike xrange)
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]
        original_flat = X_test[sample_ind].reshape(-1)

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        occurence[current_class] += 1

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Classify the adversarial example once and derive both
            # outcomes from it (assumes model_argmax is deterministic for a
            # fixed input)
            predicted = model_argmax(sess, x, preds, adv_x)
            res = int(predicted == target)
            res2 = int(predicted != current_class)

            if res == 1:
                rate_table[current_class, target] += 1.

            # Compute number of modified features
            adv_flat = adv_x.reshape(-1)
            nb_changed = np.where(adv_flat != original_flat)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_flat.shape[0]

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            results2[target, sample_ind] = res2
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Close TF session
    sess.close()

    # Turn success counts into rates per true class (classes that never
    # occurred keep a row of zeros)
    for cur in range(nb_classes):
        if occurence[cur] != 0:
            rate_table[cur, :] /= float(occurence[cur])
    print("The table of rate of successful attacking is shown below")
    print(rate_table)
    print("the number of occurrence of each class is ", occurence)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    succ_rate2 = float(np.sum(results2)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    print(
        'Avg. rate of misclassified adv. examples {0:.4f}'.format(succ_rate2))

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    return report
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0,
                     test_end=10000, nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE, architecture=ARCHITECTURE,
                     load_model=LOAD_MODEL, ckpt_dir='None',
                     learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS, num_threads=None,
                     label_smoothing=0.):
    """
    CIFAR10 cleverhans tutorial

    When ``clean_train`` is true the function builds (or restores) a
    classifier, optionally wraps it in the statistics-based predictor,
    evaluates it on clean and adversarial test data and then terminates the
    process with ``exit(0)``; it does not return in that case. When
    ``clean_train`` is false it goes straight to adversarial training of a
    second model and returns the report.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param architecture: classifier to build, 'ConvNet' or 'ResNet'; any
                         other value raises an Exception
    :param load_model: if true, restore weights from a checkpoint instead
                       of training
    :param ckpt_dir: directory holding the 'naturally_trained' /
                     'adv_trained' checkpoint sub-folders; the string
                     'None' selects './models/'
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: number of filters passed to ModelAllConvolutional
    :param num_threads: if truthy, the session is created with
                        intra_op_parallelism_threads=1
    :param label_smoothing: float, amount of label smoothing for cross
                            entropy
    :return: an AccuracyReport object (the nested do_eval helper never
             writes to it)
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Seed TF and NumPy from the wall clock, so every run uses a new seed
    tf.set_random_seed(int(time.time() * 1000) % 2**31)
    np.random.seed(int(time.time() * 1001) % 2**31)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    # NOTE(review): the thread count is fixed to 1 whenever num_threads is
    # truthy; the value of num_threads itself is not used -- confirm intent.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get CIFAR10 data
    data = CIFAR10(train_start=train_start, train_end=train_end,
                   test_start=test_start, test_end=test_end)
    dataset_size = data.x_train.shape[0]
    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(
        lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)
    x_train, y_train = data.get_set('train')

    # Optionally replace the training set by pre-computed clean/PGD samples
    pgd_train = None
    if FLAGS.load_pgd_train_samples:
        pgd_path = os.path.expanduser('~/data/advhyp/{}/samples'.format(
            FLAGS.load_pgd_train_samples))
        x_train = np.load(os.path.join(pgd_path, 'train_clean.npy'))
        y_train = np.load(os.path.join(pgd_path, 'train_y.npy'))
        pgd_train = np.load(os.path.join(pgd_path, 'train_pgd.npy'))
        if x_train.shape[1] == 3:
            # Stored channels-first; move channels to the last axis
            x_train = x_train.transpose((0, 2, 3, 1))
            pgd_train = pgd_train.transpose((0, 2, 3, 1))
        if len(y_train.shape) == 1:
            # Integer labels -> one-hot
            y_tmp = np.zeros((len(y_train), np.max(y_train) + 1),
                             y_train.dtype)
            y_tmp[np.arange(len(y_tmp)), y_train] = 1.
            y_train = y_tmp

    x_test, y_test = data.get_set('test')

    # Same optional replacement for the test set
    pgd_test = None
    if FLAGS.load_pgd_test_samples:
        pgd_path = os.path.expanduser('~/data/advhyp/{}/samples'.format(
            FLAGS.load_pgd_test_samples))
        x_test = np.load(os.path.join(pgd_path, 'test_clean.npy'))
        y_test = np.load(os.path.join(pgd_path, 'test_y.npy'))
        pgd_test = np.load(os.path.join(pgd_path, 'test_pgd.npy'))
        if x_test.shape[1] == 3:
            x_test = x_test.transpose((0, 2, 3, 1))
            pgd_test = pgd_test.transpose((0, 2, 3, 1))
        if len(y_test.shape) == 1:
            y_tmp = np.zeros((len(y_test), np.max(y_test) + 1),
                             y_test.dtype)
            y_tmp[np.arange(len(y_tmp)), y_test] = 1.
            y_test = y_tmp

    # Shuffle clean and PGD samples with the same permutation
    train_idcs = np.arange(len(x_train))
    np.random.shuffle(train_idcs)
    x_train, y_train = x_train[train_idcs], y_train[train_idcs]
    if pgd_train is not None:
        pgd_train = pgd_train[train_idcs]

    # Only the first FLAGS.test_size test samples are kept (then shuffled)
    test_idcs = np.arange(len(x_test))[:FLAGS.test_size]
    np.random.shuffle(test_idcs)
    x_test, y_test = x_test[test_idcs], y_test[test_idcs]
    if pgd_test is not None:
        pgd_test = pgd_test[test_idcs]

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Training / evaluation / attack hyper-parameters
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    pgd_params = {
        # ord: ,
        'eps': FLAGS.eps,
        'eps_iter': (FLAGS.eps / 5),
        'nb_iter': 10,
        'clip_min': 0,
        'clip_max': 255
    }
    cw_params = {
        'binary_search_steps': FLAGS.cw_search_steps,
        'max_iterations': FLAGS.cw_steps,  # 1000
        'abort_early': True,
        'learning_rate': FLAGS.cw_lr,
        'batch_size': batch_size,
        'confidence': 0,
        'initial_const': FLAGS.cw_c,
        'clip_min': 0,
        'clip_max': 255
    }

    # Madry doesn't divide by 255
    x_train *= 255
    x_test *= 255
    if pgd_train is not None:
        pgd_train *= 255
    if pgd_test is not None:
        pgd_test *= 255

    print('x_train amin={} amax={}'.format(np.amin(x_train),
                                           np.amax(x_train)))
    print('x_test amin={} amax={}'.format(np.amin(x_test), np.amax(x_test)))
    print(
        'clip_min : {}, clip_max : {} >> CHECK WITH WHICH VALUES THE CLASSIFIER WAS PRETRAINED !!! <<'
        .format(pgd_params['clip_min'], pgd_params['clip_max']))

    rng = np.random.RandomState()  # [2017, 8, 30]
    debug_dict = dict() if FLAGS.save_debug_dict else None

    def do_eval(preds, x_set, y_set, report_key, is_adv=None,
                predictor=None, x_adv=None):
        """
        Evaluate accuracy on (x_set, y_set), print it and log it to swriter.

        Without a predictor the accuracy comes from model_eval on `preds`.
        With a predictor, the uncorrected accuracy is reported first (via a
        recursive call), then the samples (replaced by the output of the
        `x_adv` tensor when given) are sent batch-wise through the
        predictor generator, whose two output columns are used as predicted
        label and detection flag.
        `report_key` is accepted for call-site compatibility but unused.
        """
        if predictor is None:
            acc = model_eval(sess, x, y, preds, x_set, y_set,
                             args=eval_params)
        else:
            # Report the uncorrected accuracy first
            do_eval(preds, x_set, y_set, report_key, is_adv=is_adv)
            if x_adv is not None:
                x_set_adv, = batch_eval(sess, [x], [x_adv], [x_set],
                                        batch_size=batch_size)
                assert x_set.shape == x_set_adv.shape
                x_set = x_set_adv
            n_batches = math.ceil(x_set.shape[0] / batch_size)
            p_set, p_det = np.concatenate([
                predictor.send(x_set[b * batch_size:(b + 1) * batch_size])
                for b in tqdm.trange(n_batches)
            ]).T
            acc = np.equal(p_set, y_set[:len(p_set)].argmax(-1)).mean()
            # if is_adv:
            #     import IPython ; IPython.embed() ; exit(1)
            if FLAGS.save_debug_dict:
                debug_dict['x_set'] = x_set
                debug_dict['y_set'] = y_set
                ddfn = 'logs/debug_dict_{}.pkl'.format(
                    'adv' if is_adv else 'clean')
                # Never overwrite an existing dump
                if not os.path.exists(ddfn):
                    with open(ddfn, 'wb') as f:
                        pickle.dump(debug_dict, f)
                debug_dict.clear()

        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples %s: %0.4f' %
                  (report_text, 'with correction'
                   if predictor is not None else 'without correction', acc))
            if is_adv is not None:
                label = 'test_acc_{}_{}'.format(
                    report_text, 'corrected' if predictor else 'uncorrected')
                swriter.add_scalar(label, acc)
                if predictor is not None:
                    # Fraction of samples whose detection flag equals is_adv
                    detect = np.equal(p_det, is_adv).mean()
                    label = 'test_det_{}_{}'.format(
                        report_text,
                        'corrected' if predictor else 'uncorrected')
                    print(label, detect)
                    swriter.add_scalar(label, detect)
                    # Accuracy restricted to those matching samples
                    label = 'test_dac_{}_{}'.format(
                        report_text,
                        'corrected' if predictor else 'uncorrected')
                    swriter.add_scalar(
                        label,
                        np.equal(p_set,
                                 y_set[:len(p_set)].argmax(-1))[np.equal(
                                     p_det, is_adv)].mean())
        return acc

    if clean_train:
        if architecture == 'ConvNet':
            model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                          input_shape=[32, 32, 3])
        elif architecture == 'ResNet':
            model = ResNet(scope='ResNet')
        else:
            raise Exception('Specify valid classifier architecture!')

        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        if load_model:
            model_name = 'naturally_trained'
            if FLAGS.load_adv_trained:
                model_name = 'adv_trained'
            # Compare by value: `is not 'None'` tests object identity and
            # treats a runtime-built 'None' string as a real directory.
            if ckpt_dir != 'None':
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(os.path.expanduser(ckpt_dir), model_name))
            else:
                ckpt = tf.train.get_checkpoint_state('./models/' +
                                                     model_name)
            ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
            if not ckpt_path:
                # Fail with a clear message instead of handing False to
                # saver.restore
                raise ValueError(
                    'No checkpoint found for model {!r} (ckpt_dir={!r})'
                    .format(model_name, ckpt_dir))

            # Checkpoint names are the graph variable names with their first
            # scope component and the ':0' suffix removed
            saver = tf.train.Saver(var_list={
                v.name.split('/', 1)[1].split(':')[0]: v
                for v in tf.global_variables()
            })
            saver.restore(sess, ckpt_path)
            print('\nMODEL SUCCESSFULLY LOADED from : {}'.format(ckpt_path))

            initialize_uninitialized_global_variables(sess)
        else:

            def evaluate():
                do_eval(preds, x_test, y_test, 'clean_train_clean_eval',
                        False)

            train(sess, loss, None, None, dataset_train=dataset_train,
                  dataset_size=dataset_size, evaluate=evaluate,
                  args=train_params, rng=rng, var_list=model.get_params())

        # Walk back from the logits to the MatMul op that produces them; its
        # two inputs are taken as the latent representation and the weights
        logits_op = preds.op
        while logits_op.type != 'MatMul':
            logits_op = logits_op.inputs[0].op
        latent_x_tensor, weights = logits_op.inputs
        logits_tensor = preds

        nb_classes = weights.shape[-1].value

        if not FLAGS.save_pgd_samples:
            noise_eps = FLAGS.noise_eps.split(',')
            if FLAGS.noise_eps_detect is None:
                FLAGS.noise_eps_detect = FLAGS.noise_eps
            noise_eps_detect = FLAGS.noise_eps_detect.split(',')
            if pgd_train is not None:
                pgd_train = pgd_train[:FLAGS.n_collect]
            if not FLAGS.passthrough:
                predictor = tf_robustify.collect_statistics(
                    x_train[:FLAGS.n_collect],
                    y_train[:FLAGS.n_collect],
                    x,
                    sess,
                    logits_tensor=logits_tensor,
                    latent_x_tensor=latent_x_tensor,
                    weights=weights,
                    nb_classes=nb_classes,
                    p_ratio_cutoff=FLAGS.p_ratio_cutoff,
                    noise_eps=noise_eps,
                    noise_eps_detect=noise_eps_detect,
                    pgd_eps=pgd_params['eps'],
                    pgd_lr=pgd_params['eps_iter'] / pgd_params['eps'],
                    pgd_iters=pgd_params['nb_iter'],
                    save_alignments_dir='logs/stats'
                    if FLAGS.save_alignments else None,
                    load_alignments_dir=os.path.expanduser(
                        '~/data/advhyp/madry/stats')
                    if FLAGS.load_alignments else None,
                    clip_min=pgd_params['clip_min'],
                    clip_max=pgd_params['clip_max'],
                    batch_size=batch_size,
                    num_noise_samples=FLAGS.num_noise_samples,
                    debug_dict=debug_dict,
                    debug=FLAGS.debug,
                    targeted=False,
                    pgd_train=pgd_train,
                    fit_classifier=FLAGS.fit_classifier,
                    clip_alignments=FLAGS.clip_alignments,
                    just_detect=FLAGS.just_detect)
            else:

                def _predictor():
                    # Pass-through predictor: plain argmax of the model, the
                    # detection column is always zero
                    _x = yield
                    while (_x is not None):
                        _y = sess.run(preds, {x: _x}).argmax(-1)
                        _x = yield np.stack((_y, np.zeros_like(_y)), -1)

                predictor = _predictor()
                next(predictor)  # advance the generator to its first yield
            if FLAGS.save_alignments:
                exit(0)

            # Evaluate the accuracy of the model on clean examples
            acc_clean = do_eval(preds, x_test, y_test,
                                'clean_train_clean_eval', False,
                                predictor=predictor)

        # Initialize the attack object and graph
        if FLAGS.attack == 'pgd':
            pgd = MadryEtAl(model, sess=sess)
            adv_x = pgd.generate(x, **pgd_params)
        elif FLAGS.attack == 'cw':
            cw = CarliniWagnerL2(model, sess=sess)
            adv_x = cw.generate(x, **cw_params)
        elif FLAGS.attack == 'mean':
            pgd = MadryEtAl(model, sess=sess)
            mean_eps = FLAGS.mean_eps * FLAGS.eps

            def _attack_mean(x_single):
                # PGD on FLAGS.mean_samples noisy copies of one input, each
                # clipped to the eps-ball around the input and to [0, 255]
                x_many = tf.tile(x_single[None],
                                 (FLAGS.mean_samples, 1, 1, 1))
                x_noisy = x_many + tf.random_uniform(x_many.shape, -mean_eps,
                                                     mean_eps)
                x_noisy = tf.clip_by_value(x_noisy, 0, 255)
                x_pgd = pgd.generate(x_noisy, **pgd_params)
                x_clip = tf.minimum(x_pgd, x_many + FLAGS.eps)
                x_clip = tf.maximum(x_clip, x_many - FLAGS.eps)
                x_clip = tf.clip_by_value(x_clip, 0, 255)
                return x_clip

            adv_x = tf.map_fn(_attack_mean, x)
            # Average over the noisy copies
            adv_x = tf.reduce_mean(adv_x, 1)
        else:
            # Without this, an unknown attack fails below with a NameError
            raise ValueError('Unknown attack: {!r}'.format(FLAGS.attack))

        preds_adv = model.get_logits(adv_x)

        if FLAGS.save_pgd_samples:
            # `labels` (not `y`) so the label placeholder that do_eval closes
            # over is not rebound
            for ds, labels, name in ((x_train, y_train, 'train'),
                                     (x_test, y_test, 'test')):
                train_batches = math.ceil(len(ds) / FLAGS.batch_size)
                train_pgd = np.concatenate([
                    sess.run(adv_x, {
                        x: ds[b * FLAGS.batch_size:(b + 1) * FLAGS.batch_size]
                    }) for b in tqdm.trange(train_batches)
                ])
                np.save('logs/{}_clean.npy'.format(name), ds / 255.)
                np.save('logs/{}_y.npy'.format(name), labels)
                train_pgd /= 255.
                np.save('logs/{}_pgd.npy'.format(name), train_pgd)
            exit(0)

        # Evaluate the accuracy of the model on adversarial examples
        if not FLAGS.load_pgd_test_samples:
            acc_pgd = do_eval(preds_adv, x_test, y_test,
                              'clean_train_adv_eval', True,
                              predictor=predictor, x_adv=adv_x)
        else:
            acc_pgd = do_eval(preds, pgd_test, y_test,
                              'clean_train_adv_eval', True,
                              predictor=predictor)
        swriter.add_scalar('test_acc_mean', (acc_clean + acc_pgd) / 2., 0)

        print('Repeating the process, using adversarial training')
        # The clean-training path ends the process here
        exit(0)

    # Create a new model and train it to be robust to MadryEtAl
    if architecture == 'ConvNet':
        model2 = ModelAllConvolutional('model2', nb_classes, nb_filters,
                                       input_shape=[32, 32, 3])
    elif architecture == 'ResNet':
        # Was bound to `model`, leaving `model2` undefined just below
        model2 = ResNet()
    else:
        raise Exception('Specify valid classifier architecture!')

    pgd2 = MadryEtAl(model2, sess=sess)

    def attack(x):
        return pgd2.generate(x, **pgd_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For some attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    if load_model:
        if ckpt_dir != 'None':
            ckpt = tf.train.get_checkpoint_state(
                os.path.join(os.path.expanduser(ckpt_dir), 'adv_trained'))
        else:
            ckpt = tf.train.get_checkpoint_state('./models/adv_trained')
        ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

        assert ckpt_path and tf_model_load(
            sess, file_path=ckpt_path), '\nMODEL LOADING FAILED'
        print('\nMODEL SUCCESSFULLY LOADED from : {}'.format(ckpt_path))

        initialize_uninitialized_global_variables(sess)
    else:

        def evaluate2():
            # Accuracy of adversarially trained model on legitimate test
            # inputs
            do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
            # Accuracy of the adversarially trained model on adversarial
            # examples
            do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

        # Perform and evaluate adversarial training
        train(sess, loss2, None, None, dataset_train=dataset_train,
              dataset_size=dataset_size, evaluate=evaluate2,
              args=train_params, rng=rng, var_list=model2.get_params())

    # Evaluate model
    do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
    do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=False, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=1,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    As written, the function loads a .mat file of pre-computed perturbed
    samples, trains (or restores) the basic MNIST CNN and records its clean
    test accuracy in the report. `viz_enabled`, `source_samples`,
    `attack_iterations` and `targeted` are accepted but not used here.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of attack iterations
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # NOTE(review): `filename` is not a parameter or local; it must be
    # provided at module level -- confirm.
    mat_contents = read_mat_file(filename)
    label = mat_contents["label"]
    data = mat_contents["data"]
    #data[data>1]= 1
    #data[data<0]= 0

    # Rows 10000..79999 are split into seven interleaved groups (every 7th
    # row). None of the slices below is used further down in this function.
    adv_data = data[10000:80000, :, :, :]
    cw = adv_data[0::7, :, :, :]
    fgsm01 = adv_data[1::7, :, :, :]
    fgsm03 = adv_data[2::7, :, :, :]
    fgsm05 = adv_data[3::7, :, :, :]
    gaussian01 = adv_data[4::7, :, :, :]
    gaussian03 = adv_data[5::7, :, :, :]
    gaussian05 = adv_data[6::7, :, :, :]

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    #tf.set_random_seed(1234)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #config.log_device_placement=True
    sess = tf.Session(config=config)
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        # The model is only saved when a "models" directory already exists
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    # (a leftover pdb.set_trace() breakpoint was removed here: it halted
    # every run at this point)
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Hand the report back, as the docstring promises
    return report
def mnist_tutorial_deepfool(train_start=0, train_end=60000,  # 60000 training samples
                            test_start=0, test_end=10000,  # 10000 test samples
                            viz_enabled=True, nb_epochs=6,
                            batch_size=128, nb_classes=2,
                            source_samples=10, learning_rate=0.001,
                            attack_iterations=100,
                            model_path=os.path.join("models", "mnist")):
    """
    MNIST tutorial for Deepfool's attack

    Trains (or restores) the basic MNIST CNN, attacks one test image per
    digit class with DeepFool, and reports the clean accuracy, the attack
    success rate and the mean L2 norm of the perturbations.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model (one epoch is one full
                      forward and backward pass of all the data through the
                      network)
    :param batch_size: size of training batches
    :param nb_classes: number of output classes; only used in the progress
                       message, the model and the attack always work with
                       the ten MNIST classes
    :param source_samples: number of test inputs to attack; only used in
                           the progress message
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of DeepFool iterations
    :param model_path: path to the model file
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions: 28 x 28 x 1 images, ten digit classes
    img_rows = 28
    img_cols = 28
    channels = 1
    mnist_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, mnist_classes))

    # Define TF model graph
    model = make_basic_picklable_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2018, 8, 9])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path+".meta"):
        tf_model_load(sess, model_path)
    else:
        # The model is only saved when a "models" directory already exists
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)
        print("save success")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the DeepFool approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a DeepFool attack object
    deepfool = DeepFool(model, back='tf', sess=sess)

    # For every digit class take the second test sample carrying that label
    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][1]
            for i in range(mnist_classes)]
    print("idxs:",idxs)

    # The grid holds one (original, adversarial) pair per attacked sample.
    # It is sized by the number of attacked samples: sizing it by nb_classes
    # raised an IndexError in the fill loop whenever nb_classes < 10
    # (including the default nb_classes=2).
    grid_shape = (len(idxs), 2, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')
    print("grid_viz_data",grid_viz_data.shape)

    # construct adv_inputs
    adv_inputs = X_test[idxs].reshape([-1, img_rows, img_cols, channels])

    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': attack_iterations,
                       'nb_classes': 10,
                       'clip_min': 0.,
                       'clip_max': 1.}

    adv = deepfool.generate_np(adv_inputs, **deepfool_params)
    print("adv success")

    # One minus the model's accuracy on the adversarial inputs, i.e. the
    # attack success rate despite the variable name
    adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                args={'batch_size': 10})

    for j in range(len(idxs)):
        grid_viz_data[j, 0] = adv_inputs[j]
        grid_viz_data[j, 1] = adv[j]
    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1.-adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
def main(argv=None):
    """
    Run the attack / counter-attack experiment for the configured dataset.

    Steps, all driven by CONFIG and FLAGS:

    1. prepare output directories, seeds and the TF session;
    2. load or generate the dataset ('mnist', 'cifar10', 'moon' or 'dims');
    3. train the classifier, or restore it if a saved model exists;
    4. attack the second half of the test set with Carlini-Wagner L2;
    5. attack the clean first half and the once-attacked samples again;
    6. compute L2 distances between the inputs and outputs of the second
       attack and produce the plots / threshold / AUROC evaluation.

    Intermediate arrays are cached as .npy files and reused when present.
    Accuracies are appended to 'acc_param.txt' under CONFIG.SAVE_PATH.

    :param argv: unused
    :return: an AccuracyReport object
    :raises ValueError: if CONFIG.DATASET is not a supported dataset
    """
    from cleverhans_tutorials import check_installation
    check_installation(__file__)

    # exist_ok makes each directory independent of the others: previously
    # 'data/' under model_path was only created when model_path itself was
    # missing, so an existing model_path without 'data/' broke later saves.
    os.makedirs(CONFIG.SAVE_PATH, exist_ok=True)
    save_path_data = CONFIG.SAVE_PATH + 'data/'
    os.makedirs(save_path_data, exist_ok=True)
    model_path = CONFIG.SAVE_PATH + '../all/' + CONFIG.DATASET + '/'
    os.makedirs(model_path, exist_ok=True)
    os.makedirs(model_path + 'data/', exist_ok=True)

    nb_epochs = FLAGS.nb_epochs
    batch_size = FLAGS.batch_size
    learning_rate = FLAGS.learning_rate
    nb_filters = FLAGS.nb_filters
    # Size of the first half of the test set (kept clean for the first attack)
    len_x = int(CONFIG.NUM_TEST/2)

    start = time.time()

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set seeds to improve reproducibility
    if CONFIG.DATASET in ('mnist', 'cifar10'):
        tf.set_random_seed(1234)
        np.random.seed(1234)
        rd.seed(1234)
    elif CONFIG.DATASET in ('moon', 'dims'):
        tf.set_random_seed(13)
        np.random.seed(1234)
        rd.seed(0)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    tf_config = tf.ConfigProto(allow_soft_placement=True,
                               log_device_placement=True)
    tf_config.gpu_options.per_process_gpu_memory_fraction = 0.2
    sess = tf.Session(config=tf_config)

    if CONFIG.DATASET == 'mnist':
        # Get MNIST data
        mnist = MNIST(train_start=0, train_end=CONFIG.NUM_TRAIN,
                      test_start=0, test_end=CONFIG.NUM_TEST)
        x_train, y_train = mnist.get_set('train')
        x_test, y_test = mnist.get_set('test')
    elif CONFIG.DATASET == 'cifar10':
        # Get CIFAR10 data
        data = CIFAR10(train_start=0, train_end=CONFIG.NUM_TRAIN,
                       test_start=0, test_end=CONFIG.NUM_TEST)
        dataset_size = data.x_train.shape[0]
        dataset_train = data.to_tensorflow()[0]
        dataset_train = dataset_train.map(
            lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
        dataset_train = dataset_train.batch(batch_size)
        dataset_train = dataset_train.prefetch(16)
        x_train, y_train = data.get_set('train')
        x_test, y_test = data.get_set('test')
    elif CONFIG.DATASET == 'moon':
        # Create a two moon example
        X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN+CONFIG.NUM_TEST),
                          noise=0.2, random_state=0)
        X = StandardScaler().fit_transform(X)
        x_train1, x_test1, y_train1, y_test1 = train_test_split(
            X, y,
            test_size=(CONFIG.NUM_TEST/(CONFIG.NUM_TRAIN + CONFIG.NUM_TEST)),
            random_state=0)
        x_train, y_train, x_test, y_test = normalize_reshape_inputs_2d(
            model_path, x_train1, y_train1, x_test1, y_test1)
    elif CONFIG.DATASET == 'dims':
        # Two moons again, then passed through add_noise_and_QR with
        # CONFIG.NUM_DIMS
        X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN+CONFIG.NUM_TEST),
                          noise=0.2, random_state=0)
        X = StandardScaler().fit_transform(X)
        x_train1, x_test1, y_train1, y_test1 = train_test_split(
            X, y,
            test_size=(CONFIG.NUM_TEST/(CONFIG.NUM_TRAIN + CONFIG.NUM_TEST)),
            random_state=0)
        x_train2, y_train, x_test2, y_test = normalize_reshape_inputs_2d(
            model_path, x_train1, y_train1, x_test1, y_test1)
        x_train, x_test = add_noise_and_QR(x_train2, x_test2,
                                           CONFIG.NUM_DIMS)
    else:
        # Previously an unknown dataset surfaced as a NameError further down
        raise ValueError('Unsupported CONFIG.DATASET: {!r}'.format(
            CONFIG.DATASET))

    np.save(os.path.join(save_path_data, 'x_test'), x_test)
    np.save(os.path.join(save_path_data, 'y_test'), y_test)

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train a model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': 1}
    rng = np.random.RandomState([2017, 8, 30])

    # The accuracy log stays open for the whole experiment because do_eval
    # writes to it on every call
    with open(CONFIG.SAVE_PATH + 'acc_param.txt', 'a') as fi:

        def do_eval(adv_x, preds, x_set, y_set, report_key):
            """
            Run this project's model_eval on (x_set, y_set), store the
            accuracy on the report under `report_key`, append it to the
            accuracy log, and return the last two values model_eval yields
            (named pred_np and adv_x_np here).
            """
            acc, pred_np, adv_x_np = model_eval(sess, x, y, preds, adv_x,
                                                nb_classes, x_set, y_set,
                                                args=eval_params)
            setattr(report, report_key, acc)
            if report_key:
                print('Accuracy on %s examples: %0.4f' % (report_key, acc),
                      file=fi)
            return pred_np, adv_x_np

        # Pick the model; only 'dims' stores its trained model per run
        if CONFIG.DATASET == 'mnist':
            trained_model_path = model_path + 'data/trained_model'
            model = ModelBasicCNN('model1', nb_classes, nb_filters)
        elif CONFIG.DATASET == 'cifar10':
            trained_model_path = model_path + 'data/trained_model'
            model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                          input_shape=[32, 32, 3])
        elif CONFIG.DATASET == 'moon':
            trained_model_path = model_path + 'data/trained_model'
            model = ModelMLP('model1', nb_classes)
        elif CONFIG.DATASET == 'dims':
            trained_model_path = save_path_data + 'trained_model'
            model = ModelMLP_dyn('model1', nb_classes, CONFIG.NUM_DIMS)

        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)

        def evaluate():
            _, _ = do_eval(x, preds, x_test, y_test, 'test during train')

        if os.path.isfile(trained_model_path + '.index'):
            tf_model_load(sess, trained_model_path)
        else:
            if CONFIG.DATASET == 'mnist':
                train(sess, loss, x_train, y_train, evaluate=evaluate,
                      args=train_params, rng=rng,
                      var_list=model.get_params())
            elif CONFIG.DATASET == 'cifar10':
                train(sess, loss, None, None, dataset_train=dataset_train,
                      dataset_size=dataset_size, evaluate=evaluate,
                      args=train_params, rng=rng,
                      var_list=model.get_params())
            elif CONFIG.DATASET == 'moon':
                train_2d(sess, loss, x, y, x_train, y_train, save=False,
                         evaluate=evaluate, args=train_params, rng=rng,
                         var_list=model.get_params())
            elif CONFIG.DATASET == 'dims':
                train_2d(sess, loss, x, y, x_train, y_train,
                         evaluate=evaluate, args=train_params, rng=rng,
                         var_list=model.get_params())
            saver = tf.train.Saver()
            saver.save(sess, trained_model_path)

        # Evaluate the accuracy on test examples
        if os.path.isfile(save_path_data + 'logits_zero_attacked.npy'):
            logits_0 = np.load(save_path_data + 'logits_zero_attacked.npy')
        else:
            _, _ = do_eval(x, preds, x_train, y_train, 'train')
            logits_0, _ = do_eval(x, preds, x_test, y_test, 'test')
            np.save(os.path.join(save_path_data, 'logits_zero_attacked'),
                    logits_0)

        if CONFIG.DATASET == 'moon':
            # Evaluate the model on a regular grid over [0, 1] x [0, 1]
            num_grid_points = 5000
            if os.path.isfile(model_path + 'data/images_mesh' +
                              str(num_grid_points) + '.npy'):
                x_mesh = np.load(model_path + 'data/images_mesh' +
                                 str(num_grid_points) + '.npy')
                logits_mesh = np.load(model_path + 'data/logits_mesh' +
                                      str(num_grid_points) + '.npy')
            else:
                xx, yy = np.meshgrid(np.linspace(0, 1, num_grid_points),
                                     np.linspace(0, 1, num_grid_points))
                x_mesh1 = np.stack([np.ravel(xx), np.ravel(yy)]).T
                # Placeholder labels; only the inputs matter for the mesh
                y_mesh1 = np.ones((x_mesh1.shape[0]), dtype='int64')
                x_mesh, y_mesh, _, _ = normalize_reshape_inputs_2d(
                    model_path, x_mesh1, y_mesh1)
                logits_mesh, _ = do_eval(x, preds, x_mesh, y_mesh, 'mesh')
                x_mesh = np.squeeze(x_mesh)
                np.save(os.path.join(model_path, 'data/images_mesh' +
                                     str(num_grid_points)), x_mesh)
                np.save(os.path.join(model_path, 'data/logits_mesh' +
                                     str(num_grid_points)), logits_mesh)

        # First half of the test set stays clean for the first attack, the
        # second half is attacked
        points_x = x_test[:len_x]
        points_y = y_test[:len_x]
        points_x_bar = x_test[len_x:]
        points_y_bar = y_test[len_x:]

        # Initialize the CW attack object and graph
        cw = CarliniWagnerL2(model, sess=sess)

        # first attack
        attack_params = {
            'learning_rate': CONFIG.CW_LEARNING_RATE,
            'max_iterations': CONFIG.CW_MAX_ITERATIONS
        }

        if CONFIG.DATASET == 'moon':
            out_a = compute_polytopes_a(x_mesh, logits_mesh, model_path)
            attack_params['const_a_min'] = out_a
            attack_params['const_a_max'] = 100

        adv_x = cw.generate(x, **attack_params)

        if os.path.isfile(save_path_data + 'images_once_attacked.npy'):
            adv_img_1 = np.load(save_path_data + 'images_once_attacked.npy')
            logits_1 = np.load(save_path_data + 'logits_once_attacked.npy')
        else:
            # Evaluate the accuracy on adversarial examples
            preds_adv = model.get_logits(adv_x)
            logits_1, adv_img_1 = do_eval(adv_x, preds_adv, points_x_bar,
                                          points_y_bar, 'test once attacked')
            np.save(os.path.join(save_path_data, 'images_once_attacked'),
                    adv_img_1)
            np.save(os.path.join(save_path_data, 'logits_once_attacked'),
                    logits_1)

        # counter attack
        attack_params['max_iterations'] = 1024

        if CONFIG.DATASET == 'moon':
            out_alpha2 = compute_epsilons_balls_alpha(
                x_mesh, np.squeeze(x_test), np.squeeze(adv_img_1),
                model_path, CONFIG.SAVE_PATH)
            attack_params['learning_rate'] = out_alpha2
            attack_params['const_a_min'] = -1
            attack_params['max_iterations'] = 2048
            plot_data(np.squeeze(adv_img_1), logits_1,
                      CONFIG.SAVE_PATH+'data_pred1.png', x_mesh, logits_mesh)

        adv_adv_x = cw.generate(x, **attack_params)

        # Second attack input: clean first half followed by the once-attacked
        # second half, labelled with the logits_1 output of the first attack
        x_k = np.concatenate((points_x, adv_img_1), axis=0)
        y_k = np.concatenate((points_y, logits_1), axis=0)

        if os.path.isfile(save_path_data + 'images_twice_attacked.npy'):
            adv_img_2 = np.load(save_path_data + 'images_twice_attacked.npy')
            logits_2 = np.load(save_path_data + 'logits_twice_attacked.npy')
        else:
            # Evaluate the accuracy on adversarial examples
            preds_adv_adv = model.get_logits(adv_adv_x)
            logits_2, adv_img_2 = do_eval(adv_adv_x, preds_adv_adv, x_k,
                                          y_k, 'test twice attacked')
            np.save(os.path.join(save_path_data, 'images_twice_attacked'),
                    adv_img_2)
            np.save(os.path.join(save_path_data, 'logits_twice_attacked'),
                    logits_2)

        if CONFIG.DATASET == 'moon':
            plot_data(np.squeeze(adv_img_2[:len_x]), logits_2[:len_x],
                      CONFIG.SAVE_PATH+'data_pred2.png', x_mesh, logits_mesh)
            plot_data(np.squeeze(adv_img_2[len_x:]), logits_2[len_x:],
                      CONFIG.SAVE_PATH+'data_pred12.png', x_mesh,
                      logits_mesh)
            test_balls(np.squeeze(x_k), np.squeeze(adv_img_2), logits_0,
                       logits_1, logits_2, CONFIG.SAVE_PATH)

        compute_returnees(logits_0[len_x:], logits_1, logits_2[len_x:],
                          logits_0[:len_x], logits_2[:len_x],
                          CONFIG.SAVE_PATH)

        # Axes to sum over for the per-sample L2 distance
        if x_test.shape[-1] > 1:
            num_axis = (1, 2, 3)
        else:
            num_axis = (1, 2)

        # D_p: clean input -> once-attacked output (first half)
        # D_p_p: once-attacked input -> twice-attacked output (second half)
        D_p = np.squeeze(np.sqrt(np.sum(
            np.square(points_x-adv_img_2[:len_x]), axis=num_axis)))
        D_p_p = np.squeeze(np.sqrt(np.sum(
            np.square(adv_img_1-adv_img_2[len_x:]), axis=num_axis)))

        D_p_mod, D_p_p_mod = modify_D(D_p, D_p_p, logits_0[len_x:],
                                      logits_1, logits_2[len_x:],
                                      logits_0[:len_x], logits_2[:len_x])

        # np.size works for lists and arrays alike; comparing an ndarray
        # with `!= []` is elementwise and has no reliable truth value.
        if np.size(D_p_mod) > 0 and np.size(D_p_p_mod) > 0:
            plot_violins(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
            threshold_evaluation(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)
            _ = compute_auroc(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH)

        plot_results_models(len_x, CONFIG.DATASET, CONFIG.SAVE_PATH)

        print('Time needed:', time.time()-start)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial

    Builds a full-precision or binarized CNN as selected by the command-line
    flags, trains it (or restores it from ``FLAGS.model_path``), crafts
    adversarial examples from the first ``FLAGS.nb_samples`` test images with
    the attack selected by ``FLAGS.attack`` and prints the clean accuracy, the
    adversarial accuracy and the mean L2 norm of the perturbations.

    :param argv: unused
    :return: None
    """

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    set_log_level(logging.WARNING)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10.
    # Label smoothing: clip the training targets into
    # [label_smooth / 9, 1 - label_smooth]
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    phase = tf.placeholder(tf.bool, name="phase")

    # Snapshot the command-line flags into locals; several of them are
    # overwritten below when an existing model is restored.
    model_path = FLAGS.model_path
    targeted = bool(FLAGS.targeted)
    binary = bool(FLAGS.binary)
    scale = bool(FLAGS.scale)
    learning_rate = FLAGS.learning_rate
    nb_filters = FLAGS.nb_filters
    batch_size = FLAGS.batch_size
    nb_samples = FLAGS.nb_samples
    nb_epochs = FLAGS.nb_epochs
    delay = FLAGS.delay
    eps = FLAGS.eps
    adv = FLAGS.adv
    attack = FLAGS.attack
    attack_iterations = FLAGS.attack_iterations

    save = False
    train_from_scratch = False

    if model_path is not None:
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                # The settings recorded with the stored model take precedence
                # over the flags.
                (binary, scale, nb_filters, batch_size, learning_rate,
                 nb_epochs, adv) = parse_model_settings(model_path)
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, binary, batch_size, nb_filters,
                    learning_rate, nb_epochs, adv, delay, scale)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    input_shape = (None, img_rows, img_cols, channels)
    if binary:
        if scale:
            from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn
            model = make_scaled_binary_cnn(phase, 'bin_',
                                           input_shape=input_shape,
                                           nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn
            model = make_basic_binary_cnn(phase, 'bin_',
                                          input_shape=input_shape,
                                          nb_filters=nb_filters)
    else:
        from cleverhans_tutorials.tutorial_models import make_basic_cnn
        model = make_basic_cnn(phase, 'fp_', input_shape=input_shape,
                               nb_filters=nb_filters)

    preds = model(x, reuse=False)
    print("Defined TensorFlow model graph.")

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase,
                         args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an CIFAR10 model
    train_params = {
        'binary': binary,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'train loss',
        'filename': 'model',
        'reuse_global_step': False,
        'train_scope': 'train',
        'is_training': True
    }

    if adv:
        # Graph used for the optional FGSM adversarial-training phase
        from cleverhans.attacks import FastGradientMethod
        fgsm = FastGradientMethod(model, back='tf', sess=sess)
        fgsm_params = {'eps': eps, 'clip_min': 0., 'clip_max': 1.}
        adv_x_train = fgsm.generate(x, phase, **fgsm_params)
        preds_adv = model.get_probs(adv_x_train)

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})

        # With adversarial training and a positive delay the clean phase only
        # lasts 'delay' epochs, whether or not checkpoints are being saved.
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs
        model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                    evaluate=evaluate, args=train_params, save=save, rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        predictions_adv=preds_adv, evaluate=evaluate,
                        args=train_params, save=save, rng=rng)
    else:
        tf_model_load(sess, model_path)
        print('Restored model from %s' % model_path)
        evaluate()

    # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase,
                          feed={phase: False}, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Build dataset
    ###########################################################################
    if targeted:
        from cleverhans.utils import build_targeted_dataset
        adv_inputs, true_labels, adv_ys = build_targeted_dataset(
            X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows,
            img_cols, channels)
    else:
        adv_inputs = X_test[:nb_samples]

    ###########################################################################
    # Craft adversarial examples using generic approach
    ###########################################################################
    if targeted:
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE, a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"
    else:
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    if attack == ATTACK_CARLINI_WAGNER_L2:
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, back='tf', sess=sess)
        attack_params = {
            'binary_search_steps': 1,
            'max_iterations': attack_iterations,
            'learning_rate': 0.1,
            'batch_size': att_batch_size,
            'initial_const': 10,
        }
    elif attack == ATTACK_JSMA:
        from cleverhans.attacks import SaliencyMapMethod
        attacker = SaliencyMapMethod(model, back='tf', sess=sess)
        attack_params = {'theta': 1., 'gamma': 0.1}
    elif attack == ATTACK_FGSM:
        from cleverhans.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps}
    elif attack == ATTACK_MADRYETAL:
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, back='tf', sess=sess)
        # NOTE(review): nb_iter is not assigned anywhere in this function;
        # presumably it is a module-level name -- confirm, otherwise this
        # branch raises NameError.
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    else:
        print("Attack undefined")
        sys.exit(1)

    attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.})
    X_test_adv = attacker.generate_np(adv_inputs, phase, **attack_params)

    if targeted:
        assert X_test_adv.shape[0] == nb_samples * \
            (nb_classes - 1), X_test_adv.shape
        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, X_test_adv, true_labels,
                                  phase=phase, args=eval_params)
    else:
        assert X_test_adv.shape[0] == nb_samples, X_test_adv.shape
        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        print("Evaluating un-targeted results")
        # Only the first nb_samples test images were attacked, so evaluate
        # against the labels of exactly those images.
        adv_accuracy = model_eval(sess, x, y, preds, X_test_adv,
                                  Y_test[:nb_samples], phase=phase,
                                  args=eval_params)

    # Compute the number of adversarial examples that were successfully found
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))

    # Compute the average distortion introduced by the algorithm
    # (despite its name this is the mean L2 norm, not a percentage)
    percent_perturbed = np.mean(
        np.sum((X_test_adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Friendly output for pasting into spreadsheet
    print('{0:.4f},'.format(accuracy))
    print('{0:.4f},'.format(adv_accuracy))
    print('{0:.4f},'.format(percent_perturbed))

    sess.close()
def main(argv=None):
    """
    Train (or restore) a CNN on the data returned by ``data_lmrc``, attack the
    first test images with the Carlini-Wagner L2 attack and report how
    non-local-means denoising of the adversarial images changes accuracy.

    Prints the accuracy on the original, adversarial and denoised adversarial
    images, followed by the mean L2 distance of the adversarial and of the
    denoised adversarial images from the originals.

    :param argv: unused
    :return: None
    """
    tf.set_random_seed(1234)
    sess = tf.Session()
    keras.backend.set_session(sess)

    X_train, Y_train, X_test, Y_test = data_lmrc()
    # Smooth the training labels by clipping them into [0.1 / 9, 0.9]
    Y_train = Y_train.clip(.1 / 9., 1. - .1)

    x = tf.placeholder(tf.float32, shape=(None, 128, 128, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model = cnn_model(img_rows=128, img_cols=128, channels=3)
    predictions = model(x)

    def evaluate():
        # Callback handed to model_train: accuracy on the clean test set
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate,
        'train_dir': FLAGS.train_dir,
        'filename': FLAGS.filename
    }

    # Re-use a previously saved checkpoint when one exists, else train and save
    model_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True)

    # NOTE(review): the wrapper is built but the attack below receives the raw
    # model -- confirm which of the two CarliniWagnerL2 should be given.
    wrap = KerasModelWrapper(model)

    nb_classes = 10
    n_adv = 1000

    # Untargeted Carlini-Wagner L2 attack on the first n_adv test images
    cw = CarliniWagnerL2(model, back='tf', sess=sess)
    adv_inputs = X_test[:n_adv]
    adv_ys = None
    yname = "y"
    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': 100,
        'learning_rate': 0.1,
        'batch_size': 10,
        'initial_const': 10,
    }
    adv = cw.generate_np(adv_inputs, **cw_params)

    # Denoise every adversarial image. OpenCV works on uint8 images, so scale
    # to 0..255 for the call and back afterwards.
    adv_den = np.zeros(adv.shape)
    for i, sample in enumerate(adv):
        img = (sample * 255).astype('uint8')
        # positional arguments: h=10, hColor=10, templateWindowSize=7,
        # searchWindowSize=21
        dst = cv2.fastNlMeansDenoisingColored(img, None, 10, 10, 7, 21)
        adv_den[i] = dst.astype('float32') / 255

    eval_params = {'batch_size': np.minimum(nb_classes, 10)}
    original_accuracy = model_eval(sess, x, y, predictions, adv_inputs,
                                   Y_test[:n_adv], args=eval_params)
    adv_accuracy = model_eval(sess, x, y, predictions, adv, Y_test[:n_adv],
                              args=eval_params)
    print('Accuracy on original images {0:.4f}'.format(original_accuracy))
    print('Accuracy on adversarial images {0:.4f}'.format(adv_accuracy))

    adv_accuracy = model_eval(sess, x, y, predictions, adv_den,
                              Y_test[:n_adv], args=eval_params)
    print(
        'Accuracy on denoised adversarial images {0:.4f}'.format(adv_accuracy))

    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations adversarial samples {0:.4f}'.format(
        percent_perturbed))

    # Distance of the *denoised* adversarial images from the originals
    percent_perturbed = np.mean(
        np.sum((adv_den - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of denoised perturbations {0:.4f}'.format(
        percent_perturbed))

    sess.close()
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    Trains (or restores) a basic CNN, shows a histogram of the gap between
    the two largest predicted probabilities on the clean test set, then crafts
    Carlini-Wagner L2 adversarial examples and reports their success rate and
    mean L2 distortion.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) collect the adversarial examples in a grid
                        array; requires source_samples == nb_classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as 'max_iterations'
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model.get_probs(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        # The model is only saved when a "models" directory already exists
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    # For every test example record the gap between the largest and the
    # second largest predicted probability.
    s = []
    for i in range(len(X_test)):
        pred = sess.run(preds, {x: X_test[i:i + 1]})
        print(pred)
        print(Y_test[i:i + 1])
        ranked = np.sort(pred)  # ascending along the class axis
        s.append(ranked[0, -1] - ranked[0, -2])

    # Draw a histogram
    def draw_hist(myList, Title, Xlabel, Ylabel):
        # NOTE(review): 'normed' was removed from plt.hist in Matplotlib 3.1
        # in favour of 'density'; kept unchanged for the Matplotlib version
        # this script was written against -- confirm before upgrading.
        plt.hist(myList, np.arange(0, 1, 0.01), normed=True, stacked=True,
                 facecolor='blue')
        plt.xlabel(Xlabel)
        plt.ylabel(Ylabel)
        plt.title(Title)
        plt.show()

    draw_hist(myList=s, Title='legitimate',
              Xlabel='difference between max and second largest',
              Ylabel='Probability')

    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # First test example of every class
        test_classes = np.argmax(Y_test, axis=1)
        idxs = [np.where(test_classes == i)[0][0] for i in range(nb_classes)]
        sources = X_test[idxs]
        source_labels = Y_test[idxs]

        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes if targeted else 2,
                      img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')
    else:
        sources = X_test[:source_samples]
        source_labels = Y_test[:source_samples]

    if targeted:
        # Repeat every source image once per target class ...
        adv_inputs = np.array([[instance] * nb_classes
                               for instance in sources], dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, channels))

        # ... and pair the copies with the one-hot targets 0..nb_classes-1
        one_hot = np.eye(nb_classes)
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        adv_inputs = sources
        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': source_samples * nb_classes if targeted
        else source_samples,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Success = the model predicts the requested target class
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        # Success = the model no longer predicts the true class
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, source_labels,
                                      args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    return report
def mnist_attack(train_start=0, train_end=60000, test_start=0, test_end=10000,
                 viz_enabled=True, nb_epochs=6, batch_size=128, nb_filters=64,
                 nb_samples=10, learning_rate=0.001, eps=0.3, attack=0,
                 attack_iterations=100, model_path=None, targeted=False,
                 binary=False, scale=False, rand=False, debug=None, test=False,
                 data_dir=None, delay=0, adv=0, nb_iter=40):
    """
    MNIST tutorial for generic attack

    Builds the model variant selected by binary/scale/rand, trains it
    (optionally followed by adversarial training) or restores it from
    model_path, attacks nb_samples test images and reports clean accuracy,
    adversarial accuracy and perturbation statistics.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires nb_samples == 10
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_filters: passed to the model constructor
    :param nb_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param eps: 'eps' of the FGSM, MadryEtAl and BasicIterativeMethod attacks
    :param attack: one of the ATTACK_* constants selecting the attack
    :param attack_iterations: 'max_iterations' of the Carlini-Wagner attack
    :param model_path: path to the model directory. A directory containing a
                       '.meta' file is restored and its stored settings
                       replace binary, scale, nb_filters, batch_size,
                       learning_rate, nb_epochs and adv. An existing
                       directory without one gets a new save path built from
                       it. None trains without saving.
    :param targeted: should we run a targeted attack? or untargeted?
    :param binary: build a binarized model
    :param scale: with binary, build the scaled binarized model
    :param rand: build the '_rand' variant of the model where one exists
    :param debug: if truthy log at DEBUG level, otherwise at WARNING level
    :param test: if truthy, pass the evaluation callback to model_train
    :param data_dir: passed to data_mnist as 'datadir'
    :param delay: number of clean epochs before adversarial training starts
    :param adv: 0 for no adversarial training, otherwise
                ADVERSARIAL_TRAINING_MADRYETAL or ADVERSARIAL_TRAINING_FGSM
    :param nb_iter: 'nb_iter' of the MadryEtAl / BasicIterativeMethod attacks
    :return: an AccuracyReport object
    :raises ValueError: if adv is nonzero but not a supported scheme
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1237)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    if debug:
        set_log_level(logging.DEBUG)
    else:
        set_log_level(logging.WARNING)  # for running on sharcnet

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(datadir=data_dir,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    phase = tf.placeholder(tf.bool, name='phase')

    # for attempting to break unscaled network.
    logits_scalar = tf.placeholder_with_default(INIT_T, shape=(),
                                                name="logits_temperature")

    save = False
    train_from_scratch = False
    if model_path is not None:
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                # The settings recorded with the stored model take precedence
                # over the arguments.
                (binary, scale, nb_filters, batch_size, learning_rate,
                 nb_epochs, adv) = parse_model_settings(model_path)
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, binary, batch_size, nb_filters,
                    learning_rate, nb_epochs, adv, delay, scale)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    # Define TF model graph
    input_shape = (None, img_rows, img_cols, channels)
    if binary:
        print('binary=True')
        if scale:
            print('scale=True')
            if rand:
                print('rand=True')
                from cleverhans_tutorials.tutorial_models import make_scaled_binary_rand_cnn
                model = make_scaled_binary_rand_cnn(
                    phase, logits_scalar, 'binsc_', input_shape=input_shape,
                    nb_filters=nb_filters)
            else:
                from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn
                model = make_scaled_binary_cnn(
                    phase, logits_scalar, 'binsc_', input_shape=input_shape,
                    nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn
            model = make_basic_binary_cnn(phase, logits_scalar, 'bin_',
                                          nb_filters=nb_filters)
    else:
        if rand:
            print('rand=True')
            from cleverhans_tutorials.tutorial_models import make_scaled_rand_cnn
            model = make_scaled_rand_cnn(phase, logits_scalar, 'fp_rand',
                                         nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_cnn
            model = make_basic_cnn(phase, logits_scalar, 'fp_',
                                   nb_filters=nb_filters)

    preds = model(x, reuse=False)  # * logits_scalar
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    rng = np.random.RandomState([2017, 8, 30])

    # Train an MNIST model
    train_params = {
        'binary': binary,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'train loss',
        'filename': 'model',
        'reuse_global_step': False,
        'train_scope': 'train',
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            train_attack_params = {'eps': MAX_EPS, 'eps_iter': 0.01,
                                   'nb_iter': nb_iter}
            train_attacker = MadryEtAl(model, sess=sess)
        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            # A per-example eps is drawn from a truncated normal and made
            # non-negative with tf.abs.
            # NOTE(review): stddev is derived from MAX_EPS * 255 although the
            # inputs are clipped to [0, 1] below -- confirm the intended
            # scale.
            stddev = int(np.ceil((MAX_EPS * 255) // 2))
            train_attack_params = {
                'eps': tf.abs(tf.truncated_normal(
                    shape=(batch_size, 1, 1, 1), mean=0, stddev=stddev))
            }
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)
        else:
            # Fail with a clear message instead of a NameError further down
            raise ValueError(
                'Unsupported adversarial training scheme: %r' % (adv,))

        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase, **train_attack_params)
        preds_adv_train = model.get_probs(adv_x_train)

        # Evaluation always attacks with the fixed budget MAX_EPS
        eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.}
        adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params)
        preds_adv_eval = model.get_probs(adv_x_eval)  # * logits_scalar

    def evaluate():
        # Evaluate the accuracy of the MNIST model on clean test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

        if adv != 0:
            # Accuracy of the adversarially trained model on adversarial
            # examples
            acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test,
                             phase=phase, args=eval_params)
            print('Test accuracy on adversarial examples: %0.4f' % acc)

            # Same evaluation with the logits temperature set to ATTACK_T
            acc = model_eval(sess, x, y, preds_adv_eval, X_test, Y_test,
                             phase=phase, args=eval_params,
                             feed={logits_scalar: ATTACK_T})
            print('Test accuracy on adversarial examples (scaled): %0.4f'
                  % acc)

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})

        # With adversarial training and a positive delay the clean phase only
        # lasts 'delay' epochs, whether or not checkpoints are being saved.
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # The evaluation callback is only passed on when 'test' is set
        callbacks = {'evaluate': evaluate} if test else {}

        # do clean training for 'nb_epochs' or 'delay' epochs
        model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                    args=train_params, save=save, rng=rng, **callbacks)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        predictions_adv=preds_adv_train, args=train_params,
                        save=save, rng=rng, **callbacks)
    else:
        tf_model_load(sess, model_path)
        print('Restored model from %s' % model_path)
        evaluate()

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, phase=phase,
                          feed={phase: False}, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Build dataset
    ###########################################################################
    if viz_enabled:
        assert nb_samples == nb_classes
        # First test example of every class
        test_classes = np.argmax(Y_test, axis=1)
        idxs = [np.where(test_classes == i)[0][0] for i in range(nb_classes)]
        viz_rows = nb_classes if targeted else 2
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, viz_rows, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

    if targeted:
        from cleverhans.utils import build_targeted_dataset
        source_idxs = idxs if viz_enabled else np.arange(nb_samples)
        adv_inputs, true_labels, adv_ys = build_targeted_dataset(
            X_test, Y_test, source_idxs, nb_classes, img_rows, img_cols,
            channels)
    else:
        adv_inputs = X_test[idxs] if viz_enabled else X_test[:nb_samples]

    ###########################################################################
    # Craft adversarial examples using generic approach
    ###########################################################################
    if targeted:
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE, a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"
    else:
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    if attack == ATTACK_CARLINI_WAGNER_L2:
        print('Attack: CarliniWagnerL2')
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, back='tf', sess=sess)
        attack_params = {
            'binary_search_steps': 1,
            'max_iterations': attack_iterations,
            'learning_rate': 0.1,
            'batch_size': att_batch_size,
            'initial_const': 10,
        }
    elif attack == ATTACK_JSMA:
        print('Attack: SaliencyMapMethod')
        from cleverhans.attacks import SaliencyMapMethod
        attacker = SaliencyMapMethod(model, back='tf', sess=sess)
        attack_params = {'theta': 1., 'gamma': 0.1}
    elif attack == ATTACK_FGSM:
        print('Attack: FastGradientMethod')
        from cleverhans.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps}
    elif attack == ATTACK_MADRYETAL:
        print('Attack: MadryEtAl')
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, back='tf', sess=sess)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    elif attack == ATTACK_BASICITER:
        print('Attack: BasicIterativeMethod')
        from cleverhans.attacks import BasicIterativeMethod
        attacker = BasicIterativeMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    else:
        print("Attack undefined")
        sys.exit(1)

    attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.})
    adv_np = attacker.generate_np(adv_inputs, phase, **attack_params)

    eval_params = {'batch_size': att_batch_size}
    if targeted:
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess, x, y, preds, adv_np, true_labels,
                                  phase=phase, args=eval_params)
    else:
        print("Evaluating untargeted results")
        # Labels of exactly the images that were attacked
        attacked_labels = Y_test[idxs] if viz_enabled else Y_test[:nb_samples]
        adv_accuracy = model_eval(sess, x, y, preds, adv_np, attacked_labels,
                                  phase=phase, args=eval_params)

    if viz_enabled:
        n = nb_classes - 1
        for i in range(nb_classes):
            if targeted:
                # Targeted runs hold n adversarial images per source (the
                # true class is skipped), so off-diagonal cells are remapped
                # into adv_np; the diagonal shows the clean source image.
                for j in range(nb_classes):
                    if i != j:
                        if j != 0 and i != n:
                            grid_viz_data[i, j] = adv_np[j * n + i]
                        if j == 0 and i > 0 or i == n and j > 0:
                            grid_viz_data[i, j] = adv_np[j * n + i - 1]
                    else:
                        grid_viz_data[i, j] = adv_inputs[j * n]
            else:
                # One row per class: clean image next to its adversarial
                # counterpart.
                grid_viz_data[i, 0] = adv_inputs[i]
                grid_viz_data[i, 1] = adv_np[i]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    # (despite its name this is the mean L2 norm, not a percentage)
    percent_perturbed = np.mean(
        np.sum((adv_np - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Compute number of modified features (L_0 norm)
    nb_changed = np.where(adv_np != adv_inputs)[0].shape[0]
    percent_perturb = np.mean(float(nb_changed) / adv_np.reshape(-1).shape[0])

    # Compute the average distortion introduced by the algorithm
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturb))

    # Friendly output for pasting into spreadsheet
    print('{0:.4f}'.format(accuracy))
    print('{0:.4f}'.format(adv_accuracy))
    print('{0:.4f}'.format(percent_perturbed))
    print('{0:.4f}'.format(percent_perturb))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        # Imported here so that untargeted runs can display the grid as well
        from cleverhans.utils import grid_visual
        _ = grid_visual(grid_viz_data)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial.

    Builds the CIFAR10 classifier selected by the command-line FLAGS (a
    three-model ensemble, a low-precision CNN, a distilled CNN or the basic
    CNN), trains it from scratch or restores it from a checkpoint, prints its
    accuracy on the legitimate test set, then crafts adversarial examples
    with the attack selected by FLAGS.attack and prints the accuracy on them
    together with the mean L2 norm of the perturbations.

    :param argv: unused; accepted so the function can be handed to a
                 launcher that passes the command line
    :return: None (all results are printed); the process exits with status 1
             when the flags are inconsistent
    """

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    set_log_level(logging.WARNING)

    # Get CIFAR10 data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10.
    # Label smoothing: clip the one-hot training labels away from 0 and 1
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    phase = tf.placeholder(tf.bool, name="phase")
    logits_scalar = tf.placeholder_with_default(INIT_T, shape=(),
                                                name="logits_temperature")

    model_path = FLAGS.model_path
    targeted = bool(FLAGS.targeted)
    learning_rate = FLAGS.learning_rate
    nb_filters = FLAGS.nb_filters
    batch_size = FLAGS.batch_size
    nb_samples = FLAGS.nb_samples
    nb_epochs = FLAGS.nb_epochs
    delay = FLAGS.delay
    eps = FLAGS.eps
    adv = FLAGS.adv

    attack = FLAGS.attack
    attack_iterations = FLAGS.attack_iterations
    nb_iter = FLAGS.nb_iter

    #### EMPIR extra flags
    lowprecision = FLAGS.lowprecision
    abits = FLAGS.abits
    wbits = FLAGS.wbits
    abitsList = FLAGS.abitsList
    wbitsList = FLAGS.wbitsList
    stocRound = bool(FLAGS.stocRound)
    model_path2 = FLAGS.model_path2
    model_path1 = FLAGS.model_path1
    model_path3 = FLAGS.model_path3
    ensembleThree = bool(FLAGS.ensembleThree)
    abits2 = FLAGS.abits2
    wbits2 = FLAGS.wbits2
    abits2List = FLAGS.abits2List
    wbits2List = FLAGS.wbits2List
    inpgradreg = bool(FLAGS.inpgradreg)
    distill = bool(FLAGS.distill)
    student_epochs = FLAGS.student_epochs
    l2dbl = FLAGS.l2dbl
    l2cs = FLAGS.l2cs
    ####

    # The ensemble has its own evaluation routine; pick the matching one once
    # so every evaluation of `preds` below uses the same function.
    eval_fn = model_eval_ensemble if ensembleThree else model_eval

    # Decide whether to train or restore, and whether to save checkpoints
    save = False
    train_from_scratch = False
    if ensembleThree:
        train_from_scratch = (model_path1 is None or model_path2 is None or
                              model_path3 is None)
    elif model_path is not None:
        # NOTE(review): a model_path that does not exist leaves
        # train_from_scratch False, so the restore below is attempted on it
        # -- confirm this is intended.
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, batch_size, nb_filters, learning_rate,
                    nb_epochs, adv, delay)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    input_shape = (None, img_rows, img_cols, channels)

    if ensembleThree:
        # First model: either one precision for every layer (wbits/abits) or
        # one precision per convolutional layer (wbitsList/abitsList).
        if (wbitsList is None) or (abitsList is None):
            # Layer wise separate quantization not specified for first model
            if (wbits == 0) or (abits == 0):
                print(
                    "Error: the number of bits for constant precision weights and activations across layers for the first model have to specified using wbits and abits flags"
                )
                sys.exit(1)
            fixed_prec1 = True
        elif (len(wbitsList) != 3) or (len(abitsList) != 3):
            print(
                "Error: Need to specify the precisions for activations and weights for the atleast the three convolutional layers of the first model"
            )
            sys.exit(1)
        else:
            fixed_prec1 = False

        # Second model: same rules with the *2 flags
        if (wbits2List is None) or (abits2List is None):
            # Layer wise separate quantization not specified for second model
            if (wbits2 == 0) or (abits2 == 0):
                print(
                    "Error: the number of bits for constant precision weights and activations across layers for the second model have to specified using wbits2 and abits2 flags"
                )
                sys.exit(1)
            fixed_prec2 = True
        elif (len(wbits2List) != 3) or (len(abits2List) != 3):
            print(
                "Error: Need to specify the precisions for activations and weights for the atleast the three convolutional layers of the second model"
            )
            sys.exit(1)
        else:
            fixed_prec2 = False

        fixed_prec = fixed_prec1 and fixed_prec2
        if not fixed_prec:
            # At least one of the models has separate precisions per layer,
            # so the layer-wise builder is used; expand the constant
            # precision of the other model (if any) into a per-layer tuple.
            print("Within atleast one model has separate precisions")
            if fixed_prec1:
                abitsList = (abits, abits, abits)
                wbitsList = (wbits, wbits, wbits)
            if fixed_prec2:
                abits2List = (abits2, abits2, abits2)
                wbits2List = (wbits2, wbits2, wbits2)

        if train_from_scratch:
            print("The ensemble model cannot be trained from scratch")
            sys.exit(1)

        if fixed_prec:
            from cleverhans_tutorials.tutorial_models import make_ensemble_three_cifar_cnn
            model = make_ensemble_three_cifar_cnn(
                phase, logits_scalar, 'lp1_', 'lp2_', 'fp_', wbits, abits,
                wbits2, abits2, input_shape=input_shape,
                nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_ensemble_three_cifar_cnn_layerwise
            model = make_ensemble_three_cifar_cnn_layerwise(
                phase, logits_scalar, 'lp1_', 'lp2_', 'fp_', wbitsList,
                abitsList, wbits2List, abits2List, input_shape=input_shape,
                nb_filters=nb_filters)
    elif lowprecision:
        if (wbitsList is None) or (abitsList is None):
            # Layer wise separate quantization not specified
            if (wbits == 0) or (abits == 0):
                print(
                    "Error: the number of bits for constant precision weights and activations across layers have to specified using wbits and abits flags"
                )
                sys.exit(1)
            fixed_prec = True
        elif (len(wbitsList) != 3) or (len(abitsList) != 3):
            print(
                "Error: Need to specify the precisions for activations and weights for the atleast the three convolutional layers"
            )
            sys.exit(1)
        else:
            fixed_prec = False

        if fixed_prec:
            from cleverhans_tutorials.tutorial_models import make_basic_lowprecision_cifar_cnn
            model = make_basic_lowprecision_cifar_cnn(
                phase, logits_scalar, 'lp_', wbits, abits,
                input_shape=input_shape, nb_filters=nb_filters,
                stocRound=stocRound)
        else:
            from cleverhans_tutorials.tutorial_models import make_layerwise_lowprecision_cifar_cnn
            model = make_layerwise_lowprecision_cifar_cnn(
                phase, logits_scalar, 'lp_', wbitsList, abitsList,
                input_shape=input_shape, nb_filters=nb_filters,
                stocRound=stocRound)
    elif distill:
        from cleverhans_tutorials.tutorial_models import make_distilled_cifar_cnn
        model = make_distilled_cifar_cnn(phase, logits_scalar, 'teacher_fp_',
                                         'fp_', nb_filters=nb_filters,
                                         input_shape=input_shape)
    ####
    else:
        from cleverhans_tutorials.tutorial_models import make_basic_cifar_cnn
        model = make_basic_cifar_cnn(phase, logits_scalar, 'fp_',
                                     input_shape=input_shape,
                                     nb_filters=nb_filters)

    # separate predictions of teacher for distilled training
    if distill:
        teacher_preds = model.teacher_call(x, reuse=False)
        teacher_logits = model.get_teacher_logits(x, reuse=False)

    # separate calling function for ensemble models
    if ensembleThree:
        preds = model.ensemble_call(x, reuse=False)
    else:
        # default
        preds = model(x, reuse=False)
    print("Defined TensorFlow model graph.")

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': batch_size}
        acc = eval_fn(sess, x, y, preds, X_test, Y_test, phase=phase,
                      args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'train loss',
        'filename': 'model',
        'reuse_global_step': False,
        'train_scope': 'train',
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            train_attack_params = {
                'eps': MAX_EPS,
                'eps_iter': 0.01,
                'nb_iter': nb_iter
            }
            train_attacker = MadryEtAl(model, sess=sess)
        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            stddev = int(np.ceil((MAX_EPS * 255) // 2))
            # one random (non-negative) eps per example of the batch
            train_attack_params = {
                'eps': tf.abs(
                    tf.truncated_normal(shape=(batch_size, 1, 1, 1), mean=0,
                                        stddev=stddev))
            }
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)
        else:
            # Previously an unknown value fell through and crashed with a
            # NameError on train_attack_params; fail explicitly instead.
            print("Adversarial training method undefined")
            sys.exit(1)

        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase, **train_attack_params)
        preds_adv_train = model.get_probs(adv_x_train)

        # NOTE(review): adv_x_eval / preds_adv_eval are not used below. They
        # are kept because they add ops to the graph and removing them may
        # shift op-level random seeds -- confirm before deleting.
        eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.}
        adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params)
        preds_adv_eval = model.get_probs(adv_x_eval)  # * logits_scalar

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})
        # The clean phase lasts 'delay' epochs when adversarial training
        # follows, independently of whether checkpoints are saved.
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs
        if distill:
            temperature = 10  # 1 means the teacher predictions are used as it is
            teacher_scaled_preds_val = model_train_teacher(
                sess, x, y, teacher_preds, teacher_logits, temperature,
                X_train, Y_train, phase=phase, args=train_params, rng=rng)
            eval_params = {'batch_size': batch_size}
            teacher_acc = model_eval(sess, x, y, teacher_preds, X_test,
                                     Y_test, phase=phase, args=eval_params)
            print(
                'Test accuracy of the teacher model on legitimate examples: %0.4f'
                % teacher_acc)
            print('Training the student model...')
            student_train_params = {
                'nb_epochs': student_epochs,
                'batch_size': batch_size,
                'learning_rate': learning_rate,
                'loss_name': 'train loss',
                'filename': 'model',
                'reuse_global_step': False,
                'train_scope': 'train',
                'is_training': True
            }
            if save:
                student_train_params.update({'log_dir': model_path})
            y_teacher = tf.placeholder(tf.float32, shape=(None, nb_classes))
            model_train_student(sess, x, y, preds, temperature, X_train,
                                Y_train, y_teacher=y_teacher,
                                teacher_preds=teacher_scaled_preds_val,
                                alpha=0.3, beta=0.7, phase=phase,
                                evaluate=evaluate, args=student_train_params,
                                save=save, rng=rng)
        elif inpgradreg:
            model_train_inpgrad_reg(sess, x, y, preds, X_train, Y_train,
                                    phase=phase, evaluate=evaluate,
                                    l2dbl=l2dbl, l2cs=l2cs,
                                    args=train_params, save=save, rng=rng)
        else:
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        evaluate=evaluate, args=train_params, save=save,
                        rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        predictions_adv=preds_adv_train, evaluate=evaluate,
                        args=train_params, save=save, rng=rng)
    else:
        if ensembleThree:
            # The global variables are consumed positionally: the first five
            # belong to the first member, the next five to the second and the
            # rest to the third; each group is restored from its own
            # checkpoint under the names stored in that checkpoint.
            variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            stored_variables = [
                'lp_conv1_init/k', 'lp_conv2_init/k', 'lp_conv3_init/k',
                'lp_ip1init/W', 'lp_logits_init/W'
            ]

            # Restore the first set of variables from model_path1
            variable_dict = dict(zip(stored_variables, variables[:5]))
            saver = tf.train.Saver(variable_dict)
            saver.restore(sess, tf.train.latest_checkpoint(model_path1))

            # Restore the second set of variables from model_path2
            variable_dict = dict(zip(stored_variables, variables[5:10]))
            saver2 = tf.train.Saver(variable_dict)
            saver2.restore(sess, tf.train.latest_checkpoint(model_path2))

            # Restore the third set of variables from model_path3
            stored_variables = [
                'fp_conv1_init/k', 'fp_conv2_init/k', 'fp_conv3_init/k',
                'fp_ip1init/W', 'fp_logits_init/W'
            ]
            variable_dict = dict(zip(stored_variables, variables[10:]))
            saver3 = tf.train.Saver(variable_dict)
            saver3.restore(sess, tf.train.latest_checkpoint(model_path3))
        else:
            tf_model_load(sess, model_path)
            print('Restored model from %s' % model_path)
        evaluate()

    # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = eval_fn(sess, x, y, preds, X_test, Y_test, phase=phase,
                       feed={phase: False}, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Build dataset
    ###########################################################################

    if targeted:
        from cleverhans.utils import build_targeted_dataset
        adv_inputs, true_labels, adv_ys = build_targeted_dataset(
            X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows,
            img_cols, channels)
    else:
        adv_inputs = X_test[:nb_samples]

    ###########################################################################
    # Craft adversarial examples using generic approach
    ###########################################################################
    if targeted:
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE, a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"
    else:
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    model_type = 'ensembleThree' if ensembleThree else 'default'

    if attack == ATTACK_CARLINI_WAGNER_L2:
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, back='tf', model_type=model_type,
                                   num_classes=nb_classes, sess=sess)
        attack_params = {
            'binary_search_steps': 1,
            'max_iterations': attack_iterations,
            'learning_rate': 0.1,
            'batch_size': att_batch_size,
            'initial_const': 10,
        }
    elif attack == ATTACK_JSMA:
        from cleverhans.attacks import SaliencyMapMethod
        attacker = SaliencyMapMethod(model, back='tf', model_type=model_type,
                                     sess=sess, num_classes=nb_classes)
        attack_params = {'theta': 1., 'gamma': 0.1}
    elif attack == ATTACK_FGSM:
        from cleverhans.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, back='tf', model_type=model_type,
                                      sess=sess, num_classes=nb_classes)
        attack_params = {'eps': eps}
    elif attack == ATTACK_MADRYETAL:
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, back='tf', model_type=model_type,
                             sess=sess, num_classes=nb_classes)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    elif attack == ATTACK_BASICITER:
        from cleverhans.attacks import BasicIterativeMethod
        attacker = BasicIterativeMethod(model, back='tf', sess=sess,
                                        model_type=model_type,
                                        num_classes=nb_classes)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    else:
        print("Attack undefined")
        sys.exit(1)

    attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.})
    X_test_adv = attacker.generate_np(adv_inputs, phase, **attack_params)

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    if targeted:
        assert X_test_adv.shape[0] == nb_samples * \
            (nb_classes - 1), X_test_adv.shape
        print("Evaluating targeted results")
        adv_labels = true_labels
    else:
        print("Evaluating un-targeted results")
        # Only the first nb_samples test inputs were attacked, so only their
        # labels are paired with the adversarial examples.
        adv_labels = Y_test[:nb_samples]
    adv_accuracy = eval_fn(sess, x, y, preds, X_test_adv, adv_labels,
                           phase=phase, args=eval_params)

    # Compute the number of adversarial examples that were successfully found
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))

    # Compute the average distortion introduced by the algorithm
    avg_l2_norm = np.mean(
        np.sum((X_test_adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(avg_l2_norm))

    # Friendly output for pasting into spreadsheet
    print('{0:.4f},'.format(accuracy))
    print('{0:.4f},'.format(adv_accuracy))
    print('{0:.4f},'.format(avg_l2_norm))

    sess.close()
def restore(self, path):
    """
    Restore saved model variables into this object's session.

    Thin wrapper around the cleverhans ``tf_model_load`` utility.

    :param path: checkpoint location, forwarded unchanged to tf_model_load
    :return: whatever tf_model_load returns
    """
    session = self.session
    return tf_model_load(session, path)
def mnist_tutorial_fgsm(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE,
                        attack_iterations=ATTACK_ITERATIONS,
                        model_path=MODEL_PATH,
                        targeted=TARGETED,
                        noise_output=NOISE_OUTPUT):
    """
    MNIST tutorial for the Fast Gradient Method attack.

    Trains (or reloads) a basic CNN, attacks it with FGSM, then trains a
    second CNN adversarially and attacks that one too. The accuracies of
    both models are printed and stored in the returned report.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) save a grid image of adversarial examples;
                        requires source_samples == number of classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: unused by FGSM; kept so existing callers that
                              pass it keep working
    :param model_path: checkpoint path of the first model; it is loaded when
                       ``model_path + ".meta"`` exists, otherwise the model
                       is trained and saved there
    :param targeted: should we run a targeted attack? or untargeted?
    :param noise_output: (boolean) when visualising, store the perturbation
                         (adversarial minus original) instead of the
                         adversarial image
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Fast Gradient Sign Method
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a FGSM attack object
    fgsm = FastGradientMethod(model, sess=sess)

    # Select the source inputs (and their true labels) to attack
    if viz_enabled:
        assert source_samples == nb_classes
        # first test example of every class
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        sources = x_test[idxs]
        clean_labels = y_test[idxs]
        # Grid for visualization: one column, one image per class
        grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')
    else:
        sources = x_test[:source_samples]
        clean_labels = y_test[:source_samples]

    if targeted:
        # Every source is repeated nb_classes times in a row and paired with
        # each class in turn as its target label.
        adv_inputs = np.repeat(np.asarray(sources, dtype=np.float32),
                               nb_classes, axis=0)
        adv_ys = np.tile(np.eye(nb_classes, dtype=np.float32),
                         (source_samples, 1))
    else:
        adv_inputs = sources
        adv_ys = None

    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    # Parameters for crafting on the fixed inputs above: the target labels
    # must reach the attack, otherwise a "targeted" run is really untargeted
    # while still being scored against adv_ys.
    craft_params = dict(fgsm_params)
    if targeted:
        craft_params['y_target'] = adv_ys

    adv_eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}

    def adv_success_rate(logits, examples):
        # Targeted: fraction classified as the target label.
        # Untargeted: fraction no longer classified as the true label.
        if targeted:
            return model_eval(sess, x, y, logits, examples, adv_ys,
                              args=adv_eval_params)
        return 1 - model_eval(sess, x, y, logits, examples, clean_labels,
                              args=adv_eval_params)

    adv = fgsm.generate_np(adv_inputs, **craft_params)
    adv_accuracy = adv_success_rate(preds, adv)

    if viz_enabled:
        # In targeted mode each source owns nb_classes consecutive rows; in
        # untargeted mode there is exactly one row per source.
        stride = nb_classes if targeted else 1
        for i in range(nb_classes):
            row = i * stride
            if noise_output:
                image = adv[row] - adv_inputs[row]
            else:
                image = adv[row]
            grid_viz_data[i, 0] = image

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    ###########################################################################
    # Adversarial Training
    ###########################################################################

    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack_fgsm(x):
        # Attack the batch the loss hands in, not the fixed evaluation inputs
        return fgsm2.generate(x, **fgsm_params)

    preds2 = model2.get_logits(x)
    loss2 = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm)
    print("Defined TensorFlow model graph.")

    train(sess, loss2, x_train, y_train, args=train_params, rng=rng)

    # Accuracy of the adversarially trained model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy2 = model_eval(sess, x, y, preds2, x_test, y_test,
                           args=eval_params)
    print('Test accuracy of adversarially trained model on legitimate '
          'test examples: {0}'.format(accuracy2))
    report.adv_train_clean_eval = accuracy2

    # Attack the adversarially trained model itself and score it; results go
    # to the adv_train_* fields so the first model's results are preserved.
    adv2 = fgsm2.generate_np(adv_inputs, **craft_params)
    adv_accuracy2 = adv_success_rate(preds2, adv2)

    print('--------------------------------------')

    print('Avg. rate of successful adv. examples {0:.4f}'.format(
        adv_accuracy2))
    report.adv_train_adv_eval = 1. - adv_accuracy2

    percent_perturbed2 = np.mean(np.sum((adv2 - adv_inputs)**2,
                                        axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed2))

    # Close TF session
    sess.close()

    def save_visual(data, path):
        """
        Modified version of cleverhans.plot.pyplot: draws the image grid
        and saves it to `path` instead of displaying it.
        """
        figure = plt.figure()
        # figure.canvas.set_window_title('Cleverhans: Grid Visualization')

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for row in range(num_rows):
            for col in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (col + 1) + (row * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[col, row, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[col, row, :, :, :])

        # Draw the plot and return
        plt.savefig(path)
        return figure

    # Finally, save a grid of the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/fgsm_mnist_noise.png"
        else:
            image_name = "output/fgsm_mnist.png"
        _ = save_visual(grid_viz_data, image_name)

    return report
def main(argv=None):
    """
    Build, train or restore, and attack an AlexNet-style ImageNet model.

    All configuration is read from the module-level ``FLAGS`` object. The
    function:

    1. builds the input pipeline and placeholders,
    2. builds one of four models (three-model ensemble, low-precision with
       fixed or layer-wise bit widths, or full precision),
    3. either trains it from scratch (optionally followed by adversarial
       training) or restores it from checkpoint(s),
    4. reports accuracy on clean test data and on adversarial examples
       produced by the attack selected with ``FLAGS.attack``.

    The process exits with status 1 when the ImageNet path is missing, when
    the bit-width flags are inconsistent, when an ensemble would have to be
    trained from scratch, or when the attack name is unknown.

    :param argv: unused; present so the function can be used as an
                 application entry point
    :raises RuntimeError: if keras is not configured with the TF backend
    :raises ValueError: if ``FLAGS.adv`` is non-zero but is not one of the
                        supported adversarial training modes
    """
    model_path = FLAGS.model_path
    learning_rate = FLAGS.learning_rate
    nb_filters = FLAGS.nb_filters
    batch_size = FLAGS.batch_size
    nb_epochs = FLAGS.nb_epochs
    delay = FLAGS.delay
    eps = FLAGS.eps
    adv = FLAGS.adv

    attack = FLAGS.attack
    attack_iterations = FLAGS.attack_iterations
    nb_iter = FLAGS.nb_iter

    # EMPIR extra flags
    lowprecision = FLAGS.lowprecision
    abits = FLAGS.abits
    wbits = FLAGS.wbits
    abitsList = FLAGS.abitsList
    wbitsList = FLAGS.wbitsList
    model_path1 = FLAGS.model_path1
    model_path2 = FLAGS.model_path2
    model_path3 = FLAGS.model_path3
    ensembleThree = True if FLAGS.ensembleThree else False
    abits2 = FLAGS.abits2
    wbits2 = FLAGS.wbits2
    abits2List = FLAGS.abits2List
    wbits2List = FLAGS.wbits2List

    save = False
    train_from_scratch = False

    # Imagenet flags
    imagenet_path = FLAGS.imagenet_path
    if imagenet_path is None:
        print("Error: Imagenet data path not specified")
        sys.exit(1)

    # Imagenet specific dimensions
    img_rows = _DEFAULT_IMAGE_SIZE
    img_cols = _DEFAULT_IMAGE_SIZE
    channels = _NUM_CHANNELS
    nb_classes = _NUM_CLASSES

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    set_log_level(logging.WARNING)

    # Get imagenet datasets
    train_dataset, test_dataset = data_imagenet(
        nb_epochs, batch_size, imagenet_path)

    # Creating initializable iterators
    train_iterator = train_dataset.make_initializable_iterator()
    test_iterator = test_dataset.make_initializable_iterator()

    # Getting next elements from the iterators.
    # NOTE(review): next_test_element / next_train_element are never read
    # below. They are kept because each get_next() call adds an op to the
    # graph and removing ops may shift anything derived from graph state
    # (e.g. op-level seeds) -- verify before deleting.
    next_test_element = test_iterator.get_next()
    next_train_element = train_iterator.get_next()

    train_x, train_y = train_iterator.get_next()
    test_x, test_y = test_iterator.get_next()

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    phase = tf.placeholder(tf.bool, name="phase")
    logits_scalar = tf.placeholder_with_default(
        INIT_T, shape=(), name="logits_temperature")

    # Decide between training from scratch and restoring from checkpoint(s).
    if ensembleThree:
        if (model_path1 is None or model_path2 is None
                or model_path3 is None):
            train_from_scratch = True
        else:
            train_from_scratch = False
    elif model_path is not None:
        # NOTE(review): a model_path that does not exist on disk leaves
        # train_from_scratch False, so a restore is attempted further down
        # -- confirm this is intended.
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, batch_size, nb_filters, learning_rate,
                    nb_epochs, adv, delay)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    if ensembleThree:
        if (wbitsList is None) or (abitsList is None):
            # Layer wise separate quantization not specified for first model
            if (wbits == 0) or (abits == 0):
                print("Error: the number of bits for constant precision weights and activations across layers for the first model have to specified using wbits and abits flags")
                sys.exit(1)
            else:
                fixedPrec1 = 1
        elif (len(wbitsList) != 6) or (len(abitsList) != 6):
            print("Error: Need to specify the precisions for activations and weights for the atleast the four convolutional layers of alexnet excluding the first layer and 2 fully connected layers excluding the last layer of the first model")
            sys.exit(1)
        else:
            fixedPrec1 = 0

        if (wbits2List is None) or (abits2List is None):
            # Layer wise separate quantization not specified for second model
            if (wbits2 == 0) or (abits2 == 0):
                print("Error: the number of bits for constant precision weights and activations across layers for the second model have to specified using wbits2 and abits2 flags")
                sys.exit(1)
            else:
                fixedPrec2 = 1
        elif (len(wbits2List) != 6) or (len(abits2List) != 6):
            print("Error: Need to specify the precisions for activations and weights for the atleast the four convolutional layers of alexnet excluding the first layer and 2 fully connected layers excluding the last layer of the second model")
            sys.exit(1)
        else:
            fixedPrec2 = 0

        if (fixedPrec2 != 1) or (fixedPrec1 != 1):
            # Atleast one of the models have separate precisions per layer
            fixedPrec = 0
            print("Within atleast one model has separate precisions")
            if fixedPrec1 == 1:
                # first model has fixed precision: expand it to six layers
                abitsList = (abits, abits, abits, abits, abits, abits)
                wbitsList = (wbits, wbits, wbits, wbits, wbits, wbits)
            if fixedPrec2 == 1:
                # second model has fixed precision: expand it to six layers
                abits2List = (abits2, abits2, abits2, abits2, abits2, abits2)
                wbits2List = (wbits2, wbits2, wbits2, wbits2, wbits2, wbits2)
        else:
            fixedPrec = 1

        if train_from_scratch:
            print ("The ensemble model cannot be trained from scratch")
            sys.exit(1)
        if fixedPrec == 1:
            from cleverhans_tutorials.tutorial_models import make_ensemble_three_alexnet
            model = make_ensemble_three_alexnet(
                phase, logits_scalar, 'lp1_', 'lp2_', 'fp_', wbits, abits,
                wbits2, abits2,
                input_shape=(None, img_rows, img_cols, channels),
                nb_filters=nb_filters, nb_classes=nb_classes)
        else:
            from cleverhans_tutorials.tutorial_models import make_layerwise_three_combined_alexnet
            model = make_layerwise_three_combined_alexnet(
                phase, logits_scalar, 'lp1_', 'lp2_', 'fp_', wbitsList,
                abitsList, wbits2List, abits2List,
                input_shape=(None, img_rows, img_cols, channels),
                nb_filters=nb_filters, nb_classes=nb_classes)
    elif lowprecision:
        if (wbitsList is None) or (abitsList is None):
            # Layer wise separate quantization not specified
            if (wbits == 0) or (abits == 0):
                print("Error: the number of bits for constant precision weights and activations across layers have to specified using wbits and abits flags")
                sys.exit(1)
            else:
                fixedPrec = 1
        elif (len(wbitsList) != 6) or (len(abitsList) != 6):
            print("Error: Need to specify the precisions for activations and weights for the atleast the four convolutional layers of alexnet excluding the first layer and 2 fully connected layers excluding the last layer")
            sys.exit(1)
        else:
            fixedPrec = 0

        if fixedPrec:
            # For training from scratch
            from cleverhans_tutorials.tutorial_models import make_basic_lowprecision_alexnet
            model = make_basic_lowprecision_alexnet(
                phase, logits_scalar, 'lp_', wbits, abits,
                input_shape=(None, img_rows, img_cols, channels),
                nb_filters=nb_filters, nb_classes=nb_classes)
        else:
            from cleverhans_tutorials.tutorial_models import make_layerwise_lowprecision_alexnet
            model = make_layerwise_lowprecision_alexnet(
                phase, logits_scalar, 'lp_', wbitsList, abitsList,
                input_shape=(None, img_rows, img_cols, channels),
                nb_filters=nb_filters, nb_classes=nb_classes)
    else:
        # For training from scratch
        from cleverhans_tutorials.tutorial_models import make_basic_alexnet_from_scratch
        model = make_basic_alexnet_from_scratch(
            phase, logits_scalar, 'fp_',
            input_shape=(None, img_rows, img_cols, channels),
            nb_filters=nb_filters, nb_classes=nb_classes)

    # separate calling function for ensemble models
    if ensembleThree:
        preds = model.ensemble_call(x, reuse=False)
    else:
        # default
        preds = model(x, reuse=False)
    print("Defined TensorFlow model graph.")

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate():
        # Print the accuracy of the current model on legitimate test
        # examples; passed to the training routines as a callback.
        eval_params = {'batch_size': batch_size}
        if ensembleThree:
            acc = model_eval_ensemble_imagenet(
                sess, x, y, preds, test_iterator, test_x, test_y,
                phase=phase, args=eval_params)
        else:
            # default
            acc = model_eval_imagenet(
                sess, x, y, preds, test_iterator, test_x, test_y,
                phase=phase, args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an Imagenet model
    train_params = {
        'lowprecision': lowprecision,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'train loss',
        'filename': 'model',
        'reuse_global_step': False,
        'train_scope': 'train',
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            train_attack_params = {'eps': MAX_EPS, 'eps_iter': 0.01,
                                   'nb_iter': nb_iter}
            train_attacker = MadryEtAl(model, sess=sess)
        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            stddev = int(np.ceil((MAX_EPS * 255) // 2))
            train_attack_params = {'eps': tf.abs(tf.truncated_normal(
                shape=(batch_size, 1, 1, 1), mean=0, stddev=stddev))}
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)
        else:
            # Fail with a clear message instead of a NameError on
            # train_attacker a few lines below.
            raise ValueError(
                "Unsupported adversarial training mode: %r" % (adv,))
        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase, **train_attack_params)
        preds_adv_train = model.get_probs(adv_x_train)

        # NOTE(review): preds_adv_eval is not read below; kept because it
        # adds ops to the graph (see the note on the iterators above).
        eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.}
        adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params)
        preds_adv_eval = model.get_probs(adv_x_eval)  # * logits_scalar

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs with learning
        # rate reducing with time
        model_train_imagenet2(
            sess, x, y, preds, train_iterator, train_x, train_y,
            phase=phase, evaluate=evaluate, args=train_params, save=save,
            rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            model_train_imagenet(
                sess, x, y, preds, train_iterator, train_x, train_y,
                phase=phase, predictions_adv=preds_adv_train,
                evaluate=evaluate, args=train_params, save=save, rng=rng)
    else:
        if ensembleThree:
            # The three ensemble members are loaded from different paths.
            # The slices below rely on the order in which the graph created
            # its global variables: 3 x 11 weight variables followed by
            # 3 x 24 batch-norm variables.
            variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            lp_weights = _alexnet_weight_names('lp_')
            fp_weights = _alexnet_weight_names('fp_')
            lp_batch_norm = _alexnet_batch_norm_names('lp_')
            fp_batch_norm = _alexnet_batch_norm_names('fp_')
            restore_plan = (
                # weights of the first, second and third model
                (model_path1, lp_weights, variables[:11]),
                (model_path2, lp_weights, variables[11:22]),
                (model_path3, fp_weights, variables[22:33]),
                # batch norm variables of the first, second and third model
                (model_path1, lp_batch_norm, variables[33:57]),
                (model_path2, lp_batch_norm, variables[57:81]),
                (model_path3, fp_batch_norm, variables[81:105]),
            )
            for checkpoint_dir, stored_names, graph_vars in restore_plan:
                _restore_renamed_variables(
                    sess, checkpoint_dir, stored_names, graph_vars)
        else:
            # restoring the model trained using this setup, not a
            # downloaded one
            tf_model_load(sess, model_path)
            print('Restored model from %s' % model_path)

    # Evaluate the accuracy of the model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    if ensembleThree:
        accuracy = model_eval_ensemble_imagenet(
            sess, x, y, preds, test_iterator, test_x, test_y, phase=phase,
            feed={phase: False}, args=eval_params)
    else:
        # default
        accuracy = model_eval_imagenet(
            sess, x, y, preds, test_iterator, test_x, test_y, phase=phase,
            feed={phase: False}, args=eval_params)

    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using generic approach
    ###########################################################################
    print('Crafting adversarial examples')
    print("This could take some time ...")

    if ensembleThree:
        model_type = 'ensembleThree'
    else:
        model_type = 'default'

    if attack == ATTACK_CARLINI_WAGNER_L2:
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, back='tf', sess=sess,
                                   model_type=model_type,
                                   num_classes=nb_classes)
        attack_params = {'binary_search_steps': 1,
                         'max_iterations': attack_iterations,
                         'learning_rate': 0.1,
                         'batch_size': batch_size,
                         'initial_const': 10,
                         }
    elif attack == ATTACK_JSMA:
        from cleverhans.attacks import SaliencyMapMethod
        attacker = SaliencyMapMethod(model, back='tf', sess=sess,
                                     model_type=model_type,
                                     num_classes=nb_classes)
        attack_params = {'theta': 1., 'gamma': 0.1}
    elif attack == ATTACK_FGSM:
        from cleverhans.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, back='tf', sess=sess,
                                      model_type=model_type,
                                      num_classes=nb_classes)
        attack_params = {'eps': eps}
    elif attack == ATTACK_MADRYETAL:
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, back='tf', sess=sess,
                             model_type=model_type, num_classes=nb_classes)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    elif attack == ATTACK_BASICITER:
        print('Attack: BasicIterativeMethod')
        from cleverhans.attacks import BasicIterativeMethod
        attacker = BasicIterativeMethod(model, back='tf', sess=sess,
                                        model_type=model_type,
                                        num_classes=nb_classes)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    else:
        print("Attack undefined")
        sys.exit(1)

    # Since max and min for imagenet turns out to be around -2.11 and 2.12
    attack_params.update({'clip_min': -2.2, 'clip_max': 2.7})

    eval_params = {'batch_size': batch_size}
    print("Evaluating un-targeted results")
    if ensembleThree:
        adv_accuracy = model_eval_ensemble_adv_imagenet(
            sess, x, y, preds, test_iterator, test_x, test_y, phase=phase,
            args=eval_params, attacker=attacker, attack_params=attack_params)
    else:
        adv_accuracy = model_eval_adv_imagenet(
            sess, x, y, preds, test_iterator, test_x, test_y, phase=phase,
            args=eval_params, attacker=attacker, attack_params=attack_params)

    # Report the accuracy that survives the attack
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))

    # Close TF session
    sess.close()


def _alexnet_weight_names(prefix):
    """Return the 11 checkpoint names of the weight variables for `prefix`."""
    suffixes = ('conv1_init/k', 'conv1_init/b', 'conv2_init/k',
                'conv3_init/k', 'conv4_init/k', 'conv5_init/k',
                'ip1init/W', 'ip1init/b', 'ip2init/W',
                'logits_init/W', 'logits_init/b')
    return [prefix + suffix for suffix in suffixes]


def _alexnet_batch_norm_names(prefix):
    """Return the 24 checkpoint names of the batch-norm variables for `prefix`."""
    stats = ('gamma', 'beta', 'moving_mean', 'moving_variance')
    return ['%s_batchNorm%d/batch_normalization/%s' % (prefix, layer, stat)
            for layer in range(1, 7) for stat in stats]


def _restore_renamed_variables(sess, checkpoint_dir, stored_names,
                               graph_variables):
    """Restore `graph_variables` from the latest checkpoint in `checkpoint_dir`,
    reading each one from the checkpoint entry named in `stored_names`."""
    saver = tf.train.Saver(dict(zip(stored_names, graph_variables)))
    saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
X_train, Y_train, evaluate=evaluate, args=train_params, rng=rng, save=True) eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_par) print('Train accuracy on legitimate examples: %0.4f\n' % acc) acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par) print('Test accuracy on legitimate examples: %0.4f\n' % acc) if not TRAIN: tf_model_load(sess, cache_dir + "/" + MODEL_NAME) fgsm_params = {'eps': 0.25, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model, sess=sess) adversarial_sample = fgsm.generate(x, **fgsm_params) # Predictions and class of the adversarial examples. adversarial_preds = model.get_probs(adversarial_sample) adversarial_class = tf.argmax(adversarial_preds, axis=-1) X_adv = np.ndarray(shape=X_test.shape) Y_adv = np.copy(Y_test) X_slic = np.ndarray(shape=X_test.shape) Y_slic = np.copy(Y_test)
def prep_bbox(sess, logits_scalar, x, y, X_train, Y_train, X_test, Y_test,
              img_rows, img_cols, channels, nb_epochs, batch_size,
              learning_rate, rng, phase=None, binary=False, scale=False,
              nb_filters=64, model_path=None, adv=0, delay=0, eps=0.3):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.

    The model is trained from scratch when `model_path` is None or points to
    an existing directory without a ``.meta`` file; it is restored when the
    directory already contains a ``.meta`` file. In the restore case the
    settings parsed from `model_path` override `binary`, `scale`,
    `nb_filters`, `batch_size`, `learning_rate`, `nb_epochs` and `adv`.

    :param sess: the TF session
    :param logits_scalar: passed through to the model constructor
    :param x: the input placeholder
    :param y: the output placeholder
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param img_rows: input height used for the model's input shape
    :param img_cols: input width used for the model's input shape
    :param channels: number of input channels
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :param phase: passed through to the model, training and evaluation calls
    :param binary: build a binary CNN instead of the full-precision one
    :param scale: with `binary`, build the scaled binary variant
    :param nb_filters: number of filters passed to the model constructor
    :param model_path: directory to restore from / save to, or None
    :param adv: adversarial training mode (0 disables it)
    :param delay: number of clean epochs before adversarial training starts
    :param eps: perturbation size used for FGSM adversarial training
    :return: tuple (model, preds, accuracy, model_path) where accuracy is
             the test accuracy measured after training / restoring
    :raises ValueError: if `adv` is non-zero but not a supported mode
    """
    # Define TF model graph (for the black-box model)
    save = False
    train_from_scratch = False

    if model_path is not None:
        # NOTE(review): a model_path that does not exist on disk leaves
        # train_from_scratch False, so a restore is attempted below --
        # confirm this is intended.
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                (binary, scale, nb_filters, batch_size, learning_rate,
                 nb_epochs, adv) = parse_model_settings(model_path)
                train_from_scratch = False
            else:
                model_path = build_model_save_path(
                    model_path, binary, batch_size, nb_filters,
                    learning_rate, nb_epochs, adv, delay, scale)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        # train from scratch, but don't save since no path given
        train_from_scratch = True

    input_shape = (None, img_rows, img_cols, channels)
    if binary:
        if scale:
            from cleverhans_tutorials.tutorial_models import make_scaled_binary_rand_cnn
            model = make_scaled_binary_rand_cnn(
                phase, logits_scalar, 'bb_binsc_', input_shape=input_shape,
                nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn
            model = make_basic_binary_cnn(
                phase, logits_scalar, 'bb_bin_', input_shape=input_shape,
                nb_filters=nb_filters)
    else:
        from cleverhans_tutorials.tutorial_models import make_basic_cnn
        model = make_basic_cnn(
            phase, logits_scalar, 'bb_fp_', input_shape=input_shape,
            nb_filters=nb_filters)

    preds = model(x, reuse=False)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Print out the accuracy on legitimate data and hand it back so the
        # caller below can return it (the training callback ignores it).
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test,
                         phase=phase, args=eval_params)
        print('Test accuracy of black-box on legitimate test '
              'examples: %.4f' % acc)
        return acc

    # Train an MNIST model
    train_params = {
        'binary': binary,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'bb train loss',
        'filename': 'bb_model',
        'train_scope': 'bb_model',
        'reuse_global_step': False,
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            nb_iter = 20
            train_attack_params = {
                'eps': MAX_EPS,
                'eps_iter': 0.01,
                'nb_iter': nb_iter
            }
            train_attacker = MadryEtAl(model, sess=sess)
        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            # The parameter dict was previously left undefined on this
            # branch, which made the .update() below raise NameError.
            train_attack_params = {'eps': eps}
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)
        else:
            raise ValueError(
                "Unsupported adversarial training mode: %r" % (adv,))
        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase, **train_attack_params)
        preds_adv = model.get_probs(adv_x_train)

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})
        if adv and delay > 0:
            train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs
        model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                    evaluate=evaluate, args=train_params, save=save, rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            model_train(sess, x, y, preds, X_train, Y_train, phase=phase,
                        predictions_adv=preds_adv, evaluate=evaluate,
                        args=train_params, save=save, rng=rng)
    else:
        tf_model_load(sess, model_path)
        print('Restored model from %s' % model_path)

    accuracy = evaluate()

    return model, preds, accuracy, model_path
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        requires source_samples to equal the number of
                        classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of iterations of the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_dir, filename = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        # Only save when the directory the checkpoint is written to exists.
        # An empty train_dir means the current directory.
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists(train_dir or os.curdir), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # index of the first test example of each class
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')
            sources = x_test[idxs]
        else:
            sources = x_test[:source_samples]

        # every source image is repeated once per target class
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in sources],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))

        # one one-hot target row per (source, class) pair
        one_hot = np.eye(nb_classes)
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')
            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': source_samples * nb_classes if targeted
                      else source_samples,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # success = the model predicts the requested target class
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        # success = the model no longer predicts the true class
        true_labels = y_test[idxs] if viz_enabled else y_test[:source_samples]
        adv_accuracy = 1 - model_eval(
            sess, x, y, preds, adv, true_labels, args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack (Keras model variant)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes; one source image per
                       class is attacked
    :param source_samples: number of test inputs to attack (only used in
                           the progress message)
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of iterations of the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    # NOTE(review): cnn_model() is built with its own defaults, so a
    # non-default nb_classes presumably has to match the model's output
    # size -- verify before calling with anything other than 10.
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_dir, filename = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        # Only save when the directory the checkpoint is written to exists.
        # An empty train_dir means the current directory.
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists(train_dir or os.curdir))

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    wrap = KerasModelWrapper(model)
    cw = CarliniWagnerL2(wrap, back='tf', sess=sess)

    # index of the first test example of each class
    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)]
    if targeted:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        # one one-hot target row per class
        one_hot = np.eye(nb_classes)

        # every source image is repeated once per target class
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in X_test[idxs]],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (nb_classes * nb_classes, img_rows, img_cols, channels))
        adv_ys = np.array([one_hot] * nb_classes,
                          dtype=np.float32).reshape(
                              (nb_classes * nb_classes, nb_classes))
        yname = "y_target"
    else:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': nb_classes * nb_classes if targeted
                               else nb_classes,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    adv_eval_params = {'batch_size': nb_classes}
    if targeted:
        # success = the model predicts the requested target class
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=adv_eval_params)
    else:
        # success = the model no longer predicts the true class
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                      args=adv_eval_params)

    for j in range(nb_classes):
        if targeted:
            for i in range(nb_classes):
                grid_viz_data[i, j] = adv[i * nb_classes + j]
        else:
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1.-adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64):
    """
    Evaluate a reloaded CNN on FGSM adversarial examples of notMNIST.

    The data is read from ``notmnist.npz`` in the working directory, the
    model weights are restored from ``./clean_trained__model_notmnist`` and
    the accuracy on FGSM-perturbed test images is stored in the report.

    Only ``batch_size`` is read by this variant. The remaining parameters
    are kept so the signature stays compatible with the other tutorials.

    :param train_start: index of first training set example (unused)
    :param train_end: index of last training set example (unused)
    :param test_start: index of first test set example (unused)
    :param test_end: index of last test set example (unused)
    :param nb_epochs: number of epochs to train model (unused)
    :param batch_size: size of evaluation batches
    :param learning_rate: learning rate for training (unused)
    :param clean_train: train on clean examples before adversarial
                        training (unused)
    :param testing: if true, complete an AccuracyReport for unit tests
                    (unused)
    :param backprop_through_attack: backprop through adversarial example
                                    construction (unused)
    :param nb_filters: number of convolutional filters (unused)
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    model_path = "./"
    model_name = "clean_trained__model_notmnist"

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(7895)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # notMNIST is loaded from a local archive instead of the stock MNIST
    # loader; the context manager closes the archive once arrays are read.
    with np.load("notmnist.npz") as data:
        X_train, Y_train = data['examples_train'], data['labels_train']
        X_test, Y_test = data['examples_test'], data['labels_test']

    # Use label smoothing. The training arrays are prepared but not consumed
    # below, because this variant only evaluates a restored checkpoint.
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

    # Define TF model graph
    model = make_basic_cnn()

    # The session is closed even when loading or evaluation raises.
    sess = tf.Session()
    try:
        if tf_model_load(sess, file_path=os.path.join(model_path,
                                                      model_name)):
            print(model_name, " reloaded.")

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc
    finally:
        sess.close()

    return report
def main(argv=None):
    """
    Attack a CIFAR-10 Keras CNN with Carlini-Wagner L2 and compare variants.

    The model is restored from ``FLAGS.train_dir/FLAGS.filename`` when a
    ``.meta`` file exists, otherwise it is trained and saved. Ten test
    images are attacked, and four versions of the result are evaluated:

    * the raw adversarial images,
    * the adversarial images plus Gaussian noise,
    * "random descent": every noisy pixel replaced by its nearest colour
      among ``n_clusters`` colours sampled from the adversarial image,
    * "random mixture": every noisy pixel replaced by an exponentially
      weighted average of those colours.

    For each version the misclassification rate and the mean L2 distance to
    the clean inputs are printed.

    :param argv: unused; present for the app-runner calling convention
    """
    tf.set_random_seed(1234)
    sess = tf.Session()
    keras.backend.set_session(sess)

    X_train, Y_train, X_test, Y_test = data_cifar10()
    # Label smoothing on the training targets
    Y_train = Y_train.clip(.1 / 9., 1. - .1)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)

    def evaluate():
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate,
        'train_dir': FLAGS.train_dir,
        'filename': FLAGS.filename,
    }
    model_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True)

    # The attack is built on the wrapped model; the wrapper used to be
    # created and then ignored in favour of the raw Keras model.
    wrap = KerasModelWrapper(model)
    nb_classes = 10
    n_images = 10
    cw = CarliniWagnerL2(wrap, back='tf', sess=sess)

    adv_inputs = X_test[:n_images]
    true_labels = Y_test[:n_images]

    # This first report covers the clean inputs, before the attack runs.
    edges = edgeDetect(adv_inputs)
    print('Length of edges of adv samples: ')
    print(edges[:10])

    cw_params = {
        'binary_search_steps': 1,
        'y': None,
        'max_iterations': 100,
        'learning_rate': 0.1,
        'batch_size': 10,
        'initial_const': 10,
    }
    adv = cw.generate_np(adv_inputs, **cw_params)

    edges = edgeDetect(adv)
    print('Length of edges of adv samples: ')
    print(edges[:10])

    sigma = 16.0 / 255
    gamma = 0.00061 * 255 * 255
    alpha = 0.00061 * 255 * 255
    n_clusters = 10
    n_samples = 50
    _, img_rows, img_cols, channels = adv.shape

    adv_gauss = adv + np.random.normal(0.0, sigma, adv.shape)

    # Draw n_samples random pixels from every adversarial image and jitter
    # them; these are the candidate cluster colours.
    i1 = np.repeat(np.arange(0, n_images), n_samples)
    i2 = np.random.randint(img_rows, size=n_images * n_samples)
    i3 = np.random.randint(img_cols, size=n_images * n_samples)
    sample = adv[i1, i2, i3]
    noisy_samples = sample + np.random.normal(0.0, sigma, sample.shape)
    noisy_samples = np.reshape(noisy_samples,
                               (n_images, n_samples, channels))

    adv_rdesc = np.zeros(adv.shape)
    adv_rmix = np.zeros(adv.shape)
    for img_no, img_samples in enumerate(noisy_samples):
        # Seed the cluster colours one at a time: a candidate is picked with
        # probability proportional to exp(gamma * d^2), where d is its
        # distance to the closest colour chosen so far.
        clusters = np.zeros((n_clusters, channels))
        clusters[0] = img_samples[0]
        for c_j in range(1, n_clusters):
            prob_cj = np.zeros(n_samples)
            for pix_no, pix in enumerate(img_samples):
                l2_min = 100000
                for chosen in clusters[:c_j]:
                    diff = pix - chosen
                    l2_norm_sq = np.inner(diff, diff)
                    if l2_norm_sq < l2_min:
                        l2_min = l2_norm_sq
                prob_cj[pix_no] = math.exp(gamma * l2_min)
            prob_cj /= prob_cj.sum()
            clusters[c_j] = img_samples[
                np.random.choice(n_samples, 1, p=prob_cj)]

        for pix_i in range(img_rows):
            for pix_j in range(img_cols):
                pixel = adv_gauss[img_no][pix_i][pix_j]
                c_dist_min = 100000
                c_min = np.zeros(channels)
                c_sum = np.zeros(channels)
                weight_sum = 0
                for c_j in clusters:
                    c_dist = np.linalg.norm(pixel - c_j)
                    weight_j = math.exp(-1 * alpha * c_dist * c_dist)
                    weight_sum = weight_sum + weight_j
                    c_sum = c_sum + weight_j * c_j
                    if c_dist < c_dist_min:
                        c_dist_min = c_dist
                        c_min = c_j
                adv_rdesc[img_no][pix_i][pix_j] = c_min
                adv_rmix[img_no][pix_i][pix_j] = c_sum / weight_sum

    eval_params = {'batch_size': np.minimum(nb_classes, n_images)}

    def report_variant(images, label):
        # Print misclassification rate and mean L2 distortion for one
        # version of the adversarial images.
        success = 1 - model_eval(sess, x, y, predictions, images,
                                 true_labels, args=eval_params)
        print('Avg. rate of successful adv. examples {0} {1:.4f}'.format(
            label, success))
        distortion = np.mean(
            np.sum((images - adv_inputs) ** 2, axis=(1, 2, 3)) ** .5)
        print('Avg. L_2 norm of perturbations {0} {1:.4f}'.format(
            label, distortion))

    report_variant(adv, 'without noise')
    report_variant(adv_gauss, 'with Gaussian noise')
    report_variant(adv_rdesc, 'with random descent')
    report_variant(adv_rmix, 'with random mixture')

    sess.close()
def main(_):
    """
    Run the KKTFun5 attack over an image directory and collect statistics.

    Builds an Inception graph for a fixed batch shape, restores weights from
    ``FLAGS.checkpoint_path`` and, for every batch yielded by
    ``load_images``, records the adversarial images, the top-1 accuracy on
    clean and adversarial inputs, the prediction scores for both, and the
    per-image L2 distortion. The result buffers are sized for 1000 images.
    """
    per_batch = FLAGS.batch_size
    batch_shape = [per_batch, FLAGS.image_height, FLAGS.image_width, 3]
    num_classes = 1001

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    sess = tf.Session(config=session_config)

    # Prepare graph
    x_input = tf.placeholder(tf.float32, shape=batch_shape)
    y_label = tf.placeholder(tf.int32, shape=(per_batch, ))
    y_hot = tf.one_hot(y_label, num_classes)
    model = InceptionModel(num_classes)
    preds = model(x_input)
    logits = model.get_logits(x_input)
    acc = _top_1_accuracy(logits, y_label)
    tf_model_load(sess, FLAGS.checkpoint_path)

    attack = KKTFun5(model, sess=sess)
    eps = FLAGS.eps
    alp = FLAGS.alp
    # NOTE(review): FLAGS.eps is passed as 'nb_iter' while 'eps' and 'alp'
    # are hard-coded and FLAGS.alp is not used -- confirm this is intended.
    params = {
        'eps': 0.3,
        'alp': 1.0,
        'ord': 2,
        'nb_iter': eps,
        'clip_min': 0.,
        'clip_max': 1.
    }
    # log_step and log_suc are returned by the attack but not run here.
    adv_x, log_step, log_suc = attack.generate(x_input, y_label, **params)

    adv_image = np.zeros((1000, 299, 299, 3))
    l2_norm = np.zeros((1000))
    acc_ori = np.zeros((1000))
    acc_val = np.zeros((1000))
    pred_score = np.zeros((1000, 1001))
    pred_score_adv = np.zeros((1000, 1001))
    name = []

    offset = 0
    begin = time.time()
    batches = load_images(FLAGS.input_dir, FLAGS.input_dir,
                          FLAGS.metadata_file_path, batch_shape)
    for images, _, labels, filenames in batches:
        stop = offset + per_batch
        window = slice(offset, stop)

        # One-hot copy of the labels (built but not consumed below).
        y_labels = np.zeros((per_batch, num_classes))
        for row in range(per_batch):
            y_labels[row][labels[row]] = 1

        clean_feed = {x_input: images, y_label: labels}
        x_adv = sess.run(adv_x, feed_dict=clean_feed)
        adv_feed = {x_input: x_adv, y_label: labels}

        adv_image[window] = x_adv
        acc_ori[window] = sess.run(acc, feed_dict=clean_feed)
        acc_val[window] = sess.run(acc, feed_dict=adv_feed)
        pred_score[window] = sess.run(preds, feed_dict=clean_feed)
        pred_score_adv[window] = sess.run(preds, feed_dict=adv_feed)
        l2_norm[window] = np.sum((x_adv - images) ** 2,
                                 axis=(1, 2, 3)) ** .5
        name.append(filenames)
        offset = stop
#exit() # restore from pretrained config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.run(tf.global_variables_initializer()) using_aug = True #ckpt_path = '../tfmodels/cifar10_simple_model_epoch50' ckpt_path = '../tfmodels/cifar10_resnet_model_epoch200' if using_aug: print('Using model trained by augmented data.') ckpt_path += '_aug' tf_model_load(sess, ckpt_path) # prepare data (x_train, y_train), (x_test, y_test) = load_cifar10() x_filtered, y_filtered, _ = filter_data(sess, x, y, model, x_test, y_test) # make sure the model load properly by running it against the test set accuracy = validate_model(sess, x, y, model, x_filtered, y_filtered) print('Base accuracy of the target model on legitimate images: ' + str(accuracy)) # initiate attack target = None attack_method = 'basic_iterative' recover_method = 'middleground_flip' recover_params = {'eps': 8./255, 'eps_iter': 1./255, # only used when ord=np.inf
def mnist_tutorial_adv_train(train_start=0, train_end=60000, test_start=0,
                             test_end=10000, viz_enabled=VIZ_ENABLED,
                             nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                             source_samples=SOURCE_SAMPLES,
                             learning_rate=LEARNING_RATE,
                             attack_iterations=ATTACK_ITERATIONS,
                             model_path=MODEL_PATH, targeted=TARGETED,
                             noise_output=NOISE_OUTPUT):
    """
    MNIST tutorial for Adversarial Training

    Trains (or reloads) a basic CNN, attacks it with FGSM, BIM and MIM,
    then trains a second CNN with FGSM adversarial training and evaluates
    it on clean test data and on the previously crafted examples.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) when true, attack one test example per
                        class (requires source_samples == number of classes)
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: unused by this function
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :param noise_output: unused by this function
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained
    # model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using FGSM - BIM - MIM approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate the three attack objects against the clean model
    fgsm = FastGradientMethod(model, sess=sess)
    bim = BasicIterativeMethod(model, sess=sess)
    mim = MomentumIterativeMethod(model, sess=sess)

    # Pick the source examples: one per class in visualization mode,
    # otherwise the first source_samples test examples.
    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        source_x, source_y = x_test[idxs], y_test[idxs]
    else:
        source_x = x_test[:source_samples]
        source_y = y_test[:source_samples]

    if targeted:
        # Repeat every source example once per target class and pair the
        # copies with one-hot target labels.
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in source_x],
            dtype=np.float32)
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        # The targets used to be built but never passed to the attacks.
        target_kwargs = {'y_target': adv_ys}
        eval_labels = adv_ys
    else:
        adv_inputs = source_x
        adv_ys = None
        target_kwargs = {}
        eval_labels = source_y

    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    bim_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.,
        'nb_iter': 50,
        'eps_iter': .01
    }
    mim_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.,
        'nb_iter': 50,
        'eps_iter': .01
    }

    adv_fgsm = fgsm.generate_np(adv_inputs,
                                **dict(fgsm_params, **target_kwargs))
    adv_bim = bim.generate_np(adv_inputs,
                              **dict(bim_params, **target_kwargs))
    adv_mim = mim.generate_np(adv_inputs,
                              **dict(mim_params, **target_kwargs))

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}

    def success_rate(adv_examples):
        # Targeted: fraction classified as the target label.
        # Untargeted: fraction no longer classified as the true label.
        matched = model_eval(sess, x, y, preds, adv_examples, eval_labels,
                             args=eval_params)
        return matched if targeted else 1 - matched

    adv_fgsm_accuracy = success_rate(adv_fgsm)
    adv_bim_accuracy = success_rate(adv_bim)
    adv_mim_accuracy = success_rate(adv_mim)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. (FGSM) examples {0:.4f}'.format(
        adv_fgsm_accuracy))
    report.clean_train_adv_fgsm_eval = 1. - adv_fgsm_accuracy
    print('Avg. rate of successful adv. (BIM) examples {0:.4f}'.format(
        adv_bim_accuracy))
    report.clean_train_adv_bim_eval = 1. - adv_bim_accuracy
    print('Avg. rate of successful adv. (MIM) examples {0:.4f}'.format(
        adv_mim_accuracy))
    report.clean_train_adv_mim_eval = 1. - adv_mim_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed_fgsm = np.mean(
        np.sum((adv_fgsm - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of (FGSM) perturbations {0:.4f}'.format(
        percent_perturbed_fgsm))
    percent_perturbed_bim = np.mean(
        np.sum((adv_bim - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of (BIM) perturbations {0:.4f}'.format(
        percent_perturbed_bim))
    percent_perturbed_mim = np.mean(
        np.sum((adv_mim - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of (MIM) perturbations {0:.4f}'.format(
        percent_perturbed_mim))

    ###########################################################################
    # Adversarial Training
    ###########################################################################

    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    # The training-time attack must target the model being trained.
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack_fgsm(x):
        # Perturb the symbolic batch handed in by the loss.
        return fgsm2.generate(x, **fgsm_params)

    preds2 = model2.get_logits(x)
    loss2_fgsm = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm)

    train(sess, loss2_fgsm, x_train, y_train, args=train_params, rng=rng)

    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds2, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy of FGSM adversarially trained model on legitimate '
          'test examples: {0}'.format(accuracy))
    # Stored separately so the clean model's accuracy is not overwritten.
    report.adv_train_clean_eval = accuracy

    # Evaluate the adversarially trained model on the examples crafted
    # against the clean model. The earlier code evaluated `preds` (model1)
    # with adv_ys, which is None for untargeted attacks.
    label_kind = 'target' if targeted else 'true'
    crafted = (('FGSM', adv_fgsm), ('BIM', adv_bim), ('MIM', adv_mim))
    for attack_name, adv_examples in crafted:
        adv_train_accuracy = model_eval(sess, x, y, preds2, adv_examples,
                                        eval_labels, args=eval_params)
        print('Adversarially trained model, agreement with {0} labels on '
              '({1}) examples {2:.4f}'.format(label_kind, attack_name,
                                              adv_train_accuracy))

    # Close TF session
    sess.close()

    return report
def zoo(viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
        source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE,
        attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH,
        targeted=TARGETED):
    """
    Train (or reload) a basic CNN on DATASET and attack it with Zoo.

    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: maximum number of attack iterations
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    :raises ValueError: if DATASET is not 'MNIST', 'SVHN' or 'CIFAR10'
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    train_start = 0
    test_start = 0
    if DATASET == 'MNIST':
        train_end = 60000
        test_end = 10000
        ds = dataset.MNIST(train_start=train_start, train_end=train_end,
                           test_start=test_start, test_end=test_end,
                           center=False)
    elif DATASET == 'SVHN':
        train_end = 73257
        test_end = 26032
        ds = dataset.SVHN(train_start=train_start, train_end=train_end,
                          test_start=test_start, test_end=test_end)
    elif DATASET == 'CIFAR10':
        train_end = 60000
        test_end = 10000
        ds = dataset.CIFAR10(train_start=train_start, train_end=train_end,
                             test_start=test_start, test_end=test_end,
                             center=False)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        sess.close()
        raise ValueError(
            "Unsupported DATASET {0!r}; expected 'MNIST', 'SVHN' or "
            "'CIFAR10'".format(DATASET))

    x_train, y_train, x_test, y_test = (ds.get_set('train') +
                                        ds.get_set('test'))

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN(DATASET, nb_classes, nb_filters,
                          (None, img_rows, img_cols, nchannels))
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2018, 10, 22])
    # check if we've trained before, and if we have, use that pre-trained
    # model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Zoo attack
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a Zoo attack object (named `attack` so the local does not
    # shadow this function's own name)
    attack = Zoo(model, sess=sess)

    # Pick the source examples: one per class in visualization mode,
    # otherwise the first source_samples test examples.
    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        source_x, source_y = x_test[idxs], y_test[idxs]
    else:
        source_x = x_test[:source_samples]
        source_y = y_test[:source_samples]

    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

        # Repeat every source example once per target class
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in source_x],
            dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = source_x
        adv_ys = None
        yname = "y"

    zoo_params = {
        'binary_search_steps': BINARY_SEARCH_STEPS,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': ZOO_LEARNING_RATE,
        'batch_size': (source_samples * nb_classes
                       if targeted else source_samples),
        'initial_const': INIT_CONST,
        'solver': SOLVER,
        'image_shape': [img_rows, img_cols, nchannels],
        'nb_classes': nb_classes
    }

    adv = attack.generate_np(adv_inputs, **zoo_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Success = classified as the target label
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        # Success = no longer classified as the true label
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, source_y,
                                      args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(
        adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(
        percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
def cifar10_tutorial(train_start=0, train_end=50000, test_start=0,
                     test_end=10000, nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                     clean_train=CLEAN_TRAIN, testing=False,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS, num_threads=None,
                     label_smoothing=0.1):
    """
    CIFAR10 cleverhans tutorial with anisotropic-diffusion filtering

    Trains and/or reloads an all-convolutional model, evaluates it on FGSM
    adversarial examples, then filters the adversarial test images with
    anisotropic diffusion and prints the accuracy on the filtered images.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, train the model on clean examples
    :param testing: if true, restore the weights from
                    ./checkpoint_dir/clean_model_100.ckpt and evaluate them
                    on the clean test set
    :param backprop_through_attack: unused by this variant
    :param nb_filters: number of filters passed to the model constructor
    :param num_threads: if truthy, the session is limited to a single
                        intra-op thread
    :param label_smoothing: float, amount of label smoothing for cross
                            entropy
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get CIFAR10 data
    data = CIFAR10(train_start=train_start, train_end=train_end,
                   test_start=test_start, test_end=test_end)
    dataset_size = data.x_train.shape[0]
    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(
        lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)
    x_train, y_train = data.get_set('train')
    x_test, y_test = data.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    # Define input TF placeholders
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Training, evaluation and attack settings for the CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.13, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        # Evaluate preds on (x_set, y_set), store the accuracy under
        # report_key and print it unless is_adv is None.
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                  input_shape=[32, 32, 3])
    preds = model.get_logits(x)

    if clean_train:
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, None, None,
              dataset_train=dataset_train, dataset_size=dataset_size,
              evaluate=evaluate, args=train_params, rng=rng,
              var_list=model.get_params())

    # load model and compute testing accuracy
    if testing:
        tf_model_load(sess, file_path="./checkpoint_dir/clean_model_100.ckpt")
        do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and
    # graph
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model.get_logits(adv_x)

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

    # Generate adversarial samples in chunks of 1000. Iterating over the
    # actual test-set length covers every image whatever the value of
    # test_end - test_start is.
    chunk = 1000
    x_test_adv = np.zeros(shape=x_test.shape)
    for start in range(0, x_test.shape[0], chunk):
        stop = start + chunk
        x_test_adv[start:stop] = adv_x.eval(
            session=sess, feed_dict={x: x_test[start:stop]})

    # implement anisotropic diffusion on adversarial samples
    x_test_filtered = np.zeros(shape=x_test_adv.shape)
    for i in range(y_test.shape[0]):
        x_test_filtered[i] = filter.anisotropic_diffusion(x_test_adv[i])

    acc = model_eval(sess, x, y, preds, x_test_filtered, y_test,
                     args=eval_params)
    print("acc after anisotropic diffusion is {}".format(acc))

    # Close TF session
    sess.close()

    return report