class AdverseCNN: def __init__(self, nb_epochs, batch_size, learning_rate, eps = 0.3, clip_min=0, clip_max=1): self.train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } self.eval_params = {'batch_size': batch_size} self.fgsm_params = { 'eps': eps, 'clip_min': clip_min, 'clip_max': clip_max } self.x_train = None self.y_train = None self.x_test = None self.y_test = None self.range = np.random.RandomState([2019, 11, 25]) self.model = None self.preds = None self.loss = None self.img_rows = None self.img_cols = None self.nchannels = None self.nb_classes = None self.preds_adv = None def get_data(self, train_start, train_end, test_start, test_end): mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) self.x_train, self.y_train = mnist.get_set('train') self.x_test, self.y_test = mnist.get_set('test') self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] self.nb_classes = self.y_train.shape[1] def adverse_train(self, nb_filters, label_smoothing): self.model = ModelBasicCNN('model2', self.nb_classes, nb_filters) fgsm = FastGradientMethod(self.model, sess=sess) def attack(x): return fgsm.generate(x, **self.fgsm_params) self.preds = self.model.get_logits(x) self.loss = CrossEntropy(self.model, smoothing=label_smoothing, attack=attack) adv_x = attack(x) self.preds_adv = self.model.get_logits(adv_x) train(sess, self.loss, self.x_train, self.y_train, evaluate=self.evaluate, args=self.train_params, rng=self.range, var_list=self.model.get_params()) def evaluate(self): do_eval(self.preds, self.eval_params, self.x_test, self.y_test, 'adv_train_clean_eval', False) do_eval(self.preds_adv, self.eval_params, self.x_test, self.y_test, 'adv_train_adv_eval', True) def test(self): do_eval(self.preds, self.eval_params, self.x_train, self.y_train, 'train_adv_train_clean_eval') do_eval(self.preds_adv, self.eval_params, self.x_train, self.y_train, 'train_adv_train_adv_eval')
def prep_bbox(sess, x, y, x_train, y_train, x_test, y_test, nb_epochs, batch_size, learning_rate, rng, nb_classes=10, img_rows=28, img_cols=28, nchannels=1): """ Define and train a model that simulates the "remote" black-box oracle described in the original paper. :param sess: the TF session :param x: the input placeholder for MNIST :param y: the ouput placeholder for MNIST :param x_train: the training data for the oracle :param y_train: the training labels for the oracle :param x_test: the testing data for the oracle :param y_test: the testing labels for the oracle :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param rng: numpy.random.RandomState :return: """ # Define TF model graph (for the black-box model) nb_filters = 64 model = ModelBasicCNN('model1', nb_classes, nb_filters) loss = CrossEntropy(model, smoothing=0.1) predictions = model.get_logits(x) print("Defined TensorFlow model graph.") # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } train(sess, loss, x_train, y_train, args=train_params, rng=rng) # Print out the accuracy on legitimate data eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, predictions, x_test, y_test, args=eval_params) print('Test accuracy of black-box on legitimate test ' 'examples: ' + str(accuracy)) return model, predictions, accuracy
def prepblackbox(self, nb_filters): model = ModelBasicCNN('model1', self.nb_classes, nb_filters) loss = CrossEntropy(model, smoothing=0.1) predictions = model.get_logits(x) print("Defined TensorFlow model graph.") train(sess, loss, self.x_train, self.y_train, args=self.train_params, rng=self.range) eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, predictions, self.x_test, self.y_test, args=eval_params) print('Test accuracy of black-box on legitimate test ' 'examples: ' + str(accuracy)) return model, predictions, accuracy
def __test(): # report = AccuracyReport() tf.set_random_seed(1234) sess = tf.Session() set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=0, train_end=60000, test_start=0, test_end=10000) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") # Train an MNIST model train_params = { 'nb_epochs': NB_EPOCHS, 'batch_size': BATCH_SIZE, 'learning_rate': LEARNING_RATE, 'filename': os.path.split(MODEL_PATH)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': BATCH_SIZE} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {'allow_soft_placement': True} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_fgsm_x = fgsm.generate(x, **fgsm_params) preds_adv_fgsm = model.get_logits(adv_fgsm_x) # Generate fgsm adversarial examples and save to disk dir = 'images/fgsm_adv/' if not os.path.exists('images'): os.mkdir('images') if not os.path.exists(dir): os.mkdir(dir) if not os.path.exists(dir + 'train/'): os.mkdir(dir + 'train/') if not os.path.exists(dir + 'test/'): os.mkdir(dir + 'test/') for index in range(len(y_test)): print('test ' + str(index)) x_ = x_test[index] label = np.argmax(y_test[index]) raw_data = (fgsm.generate_np(x_.reshape( (1, 28, 28, 1)), **fgsm_params).reshape( (28, 28)) * 255).astype('uint8') im = Image.fromarray(raw_data, mode='P') im.save(dir + 'test/' + str(label) + '_' + str(uuid.uuid4()) + '.png') for index in range(len(y_train)): print('train ' + str(index)) x_ = x_train[index] label = np.argmax(y_train[index]) raw_data = (fgsm.generate_np(x_.reshape( (1, 28, 28, 1)), **fgsm_params).reshape( (28, 28)) * 255).astype('uint8') im = Image.fromarray(raw_data, mode='P') im.save(dir + 'train/' + str(label) + '_' + str(uuid.uuid4()) + '.png') return report
def mnist_tutorial_fgsm(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED, noise_output=NOISE_OUTPUT): """ MNIST tutorial for Fast Gradient Method's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a FGSM attack object fgsm = FastGradientMethod(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" if targeted: fgsm_params_batch_size = source_samples * nb_classes else: fgsm_params_batch_size = source_samples fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv = fgsm.generate_np(adv_inputs, **fgsm_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval( sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) adv_accuracy = 1 - err else: err = model_eval(sess, x, y, preds, adv, y_test[:source_samples], args=eval_params) adv_accuracy = 1 - err if viz_enabled: for i in range(nb_classes): if noise_output: image = adv[i * nb_classes] - adv_inputs[i * nb_classes] else: image = adv[i * nb_classes] grid_viz_data[i, 0] = image print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) ########################################################################### # Adversarial Training ########################################################################### model2 = ModelBasicCNN('model2', nb_classes, nb_filters) fgsm2 = FastGradientMethod(model2, sess=sess) def attack_fgsm(x): return fgsm2.generate(adv_inputs, **fgsm_params) preds2 = model2.get_logits(x) loss2 = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm) train(sess, loss2, x_train, y_train, args=train_params, rng=rng) eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds2, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on adversarial fgsm test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy print("Defined TensorFlow model graph.") eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval( sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) adv_accuracy = 1 - err else: err = model_eval(sess, x, y, preds, adv, y_test[:source_samples], args=eval_params) adv_accuracy = 1 - err if viz_enabled: for i in range(nb_classes): if noise_output: image = adv[i * nb_classes] - adv_inputs[i * nb_classes] else: image = adv[i * nb_classes] grid_viz_data[i, 0] = image print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() def save_visual(data, path): """ Modified version of cleverhans.plot.pyplot """ figure = plt.figure() # figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = data.shape[0] num_rows = data.shape[1] num_channels = data.shape[4] for y in range(num_rows): for x in range(num_cols): figure.add_subplot(num_rows, num_cols, (x + 1) + (y * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(data[x, y, :, :, 0], cmap='gray') else: plt.imshow(data[x, y, :, :, :]) # Draw the plot and return plt.savefig(path) return figure # Finally, block & display a grid of all the adversarial examples if viz_enabled: # _ = grid_visual(grid_viz_data) # cleverhans_image.save("output", grid_viz_data) if noise_output: image_name = "output/fgsm_mnist_noise.png" else: image_name = "output/fgsm_mnist.png" _ = save_visual(grid_viz_data, image_name) return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" if targeted: cw_params_batch_size = source_samples * nb_classes else: cw_params_batch_size = source_samples cw_params = { 'binary_search_steps': 1, yname: adv_ys, 'max_iterations': attack_iterations, 'learning_rate': CW_LEARNING_RATE, 'batch_size': cw_params_batch_size, 'initial_const': 10 } adv = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params) else: if viz_enabled: err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params) adv_accuracy = 1 - err else: err = model_eval(sess, x, y, preds, adv, y_test[:source_samples], args=eval_params) adv_accuracy = 1 - err if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): grid_viz_data[i, j] = adv[i * nb_classes + j] else: grid_viz_data[j, 0] = adv_inputs[j] grid_viz_data[j, 1] = adv[j] print(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy)) report.clean_train_adv_eval = 1. - adv_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean( np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: _ = grid_visual(grid_viz_data) return report
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get CIFAR10 data # data = MNIST(train_start=train_start, train_end=train_end, # test_start=test_start, test_end=test_end) # Get Fashion MNIST test data data = keras.datasets.fashion_mnist (x_train, y_train), (x_test, y_test) = data.load_data() x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) y_train = np_utils.to_categorical(y_train, 10) y_test = np_utils.to_categorical(y_test, 10) x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 # dataset_size = data.x_train.shape[0] # dataset_train = data.to_tensorflow()[0] # dataset_train = dataset_train.map( # lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) # dataset_train = dataset_train.batch(batch_size) # dataset_train = dataset_train.prefetch(16) # x_train, y_train = data.get_set('train') # x_test, y_test = data.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_test.shape[1:4] nb_classes = y_test.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1. } rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: # model = ModelAllConvolutional('model1', nb_classes, nb_filters, input_shape=[32,32,3]) model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng) # train(sess, loss, None, None, # dataset_train=dataset_train, dataset_size=dataset_size, # evaluate=evaluate, args=train_params, rng=rng, # var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph # fgsm = FastGradientMethod(model, sess=sess) # fgsm = BasicIterativeMethod(model, sess=sess) fgsm = MomentumIterativeMethod(model, sess=sess) #fgsm = SaliencyMapMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # x_train = x_train[:100] # adversarial_images = adv_x.eval(session=sess, feed_dict={x: x_train}) # adversarial_labels = preds_adv.eval(session=sess, feed_dict = {x: x_train}) # for i in range(20): # ori_images = x_train[i] # adv_images = adversarial_images[i] # y_true_label = np.argmax(y_train[i]) # y_adv_label = np.argmax(adversarial_labels[i]) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') # Create a new model and train it to be robust to FastGradientMethod # model2 = ModelAllConvolutional('model2', nb_classes, nb_filters, input_shape=[32,32,3]) model2 = ModelBasicCNN('model2', nb_classes, nb_filters) # fgsm2 = FastGradientMethod(model2, sess=sess) # fgsm2 = BasicIterativeMethod(model2, sess=sess) fgsm2 = MomentumIterativeMethod(model2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, x_train, y_train, evaluate=evaluate2, args=train_params, rng=rng) # train(sess, loss2, None, None, # dataset_train=dataset_train, dataset_size=dataset_size, # evaluate=evaluate2, args=train_params, rng=rng, # var_list=model2.get_params()) # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
def mnist_tutorial_adv_train(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED, noise_output=NOISE_OUTPUT): """ MNIST tutorial for Adversarial Training :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using FGSM - BIM - MIM approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object fgsm = FastGradientMethod(model, sess=sess) bim = BasicIterativeMethod(model, sess=sess) mim = MomentumIterativeMethod(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [ np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes) ] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array([[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array([[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape( (source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_ys = None yname = "y" fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} bim_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1., 'nb_iter': 50, 'eps_iter': .01 } mim_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1., 'nb_iter': 50, 'eps_iter': .01 } adv_fgsm = fgsm.generate_np(adv_inputs, **fgsm_params) adv_bim = bim.generate_np(adv_inputs, **bim_params) adv_mim = mim.generate_np(adv_inputs, **mim_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: adv_fgsm_accuracy = model_eval(sess, x, y, preds, adv_fgsm, adv_ys, args=eval_params) adv_bim_accuracy = model_eval(sess, x, y, preds, adv_bim, adv_ys, args=eval_params) adv_mim_accuracy = model_eval(sess, x, y, preds, adv_mim, adv_ys, args=eval_params) else: if viz_enabled: err_fgsm = model_eval(sess, x, y, preds, adv_fgsm, y_test[idxs], args=eval_params) err_bim = model_eval(sess, x, y, preds, adv_bim, y_test[idxs], args=eval_params) err_mim = model_eval(sess, x, y, preds, adv_mim, y_test[idxs], args=eval_params) adv_fgsm_accuracy = 1 - err_fgsm adv_bim_accuracy = 1 - err_bim adv_mim_accuracy = 1 - err_mim else: err_fgsm = model_eval(sess, x, y, preds, adv_fgsm, y_test[:source_samples], args=eval_params) err_bim = model_eval(sess, x, y, preds, adv_bim, y_test[:source_samples], args=eval_params) err_mim = model_eval(sess, x, y, preds, adv_mim, y_test[:source_samples], args=eval_params) adv_fgsm_accuracy = 1 - err_fgsm adv_bim_accuracy = 1 - err_bim adv_mim_accuracy = 1 - err_mim print('--------------------------------------') # Compute the number of adversarial examples that were successfully found print('Avg. rate of successful adv. (FGSM) examples {0:.4f}'.format( adv_fgsm_accuracy)) report.clean_train_adv_fgsm_eval = 1. - adv_fgsm_accuracy print('Avg. rate of successful adv. (BIM) examples {0:.4f}'.format( adv_bim_accuracy)) report.clean_train_adv_bim_eval = 1. - adv_bim_accuracy print('Avg. rate of successful adv. (MIM) examples {0:.4f}'.format( adv_mim_accuracy)) report.clean_train_adv_mim_eval = 1. - adv_mim_accuracy # Compute the average distortion introduced by the algorithm percent_perturbed_fgsm = np.mean( np.sum((adv_fgsm - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of (FGSM) perturbations {0:.4f}'.format( percent_perturbed_fgsm)) percent_perturbed_bim = np.mean( np.sum((adv_bim - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of (BIM) perturbations {0:.4f}'.format( percent_perturbed_bim)) percent_perturbed_mim = np.mean( np.sum((adv_mim - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of (MIM) perturbations {0:.4f}'.format( percent_perturbed_mim)) ########################################################################### # Adversarial Training ########################################################################### model2 = ModelBasicCNN('model2', nb_classes, nb_filters) fgsm2 = FastGradientMethod(model, sess=sess) # bim2 = BasicIterativeMethod(model, sess=sess) # mim2 = MomentumIterativeMethod(model, sess=sess) def attack_fgsm(x): return fgsm2.generate(adv_inputs, **fgsm_params) # def attack_bim(x): # return bim2.generate(adv_inputs, **bim_params) # def attack_mim(x): # return mim2.generate(adv_inputs, **mim_params) preds2 = model2.get_logits(x) loss2_fgsm = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm) # loss2_bim = CrossEntropy(model2, smoothing=0.1, attack=attack_bim) # loss2_mim = CrossEntropy(model2, smoothing=0.1, attack=attack_mim) train(sess, loss2_fgsm, x_train, y_train, args=train_params, rng=rng) eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds2, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on adversarial fgsm test examples: {0}'.format( accuracy)) report.clean_train_clean_eval = accuracy print("Defined TensorFlow model graph.") adv_fgsm_accuracy = model_eval(sess, x, y, preds, adv_fgsm, adv_ys, args=eval_params) adv_bim_accuracy = model_eval(sess, x, y, preds, adv_bim, adv_ys, args=eval_params) adv_mim_accuracy = model_eval(sess, x, y, preds, adv_mim, adv_ys, args=eval_params) # Close TF session sess.close() return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST data mnist = MNIST(path=file, train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # x_train = x_train[0:1].reshape(784) # k = np.unique(x_train.reshape(-1, 784)) # k = list(set(x_train.reshape(784))) # nk = [k.index(x_train[x]) for x in len(x_train)] # print(k, np.shape(k), nk) ############################### # Transform image to uniimage # ############################### # x_train = convert_uniimage(x_train) # x_test = transform_4_in_1(x_test) # trans_x_text = np.copy(x_test) # x_test = convert_uniimage(x_test) # uni_x_test = np.copy(x_test) # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': save_dir, 'filename': filename, } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': eps, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None, ae=None, type=None, datasetName=None, discretizeColor=1): accuracy, distortion = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params, is_adv=is_adv, ae=ae, type=type, datasetName=datasetName, discretizeColor=discretizeColor) setattr(report, report_key, accuracy) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, accuracy)) return accuracy, distortion if clean_train: model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False, type=type, datasetName="MNIST", discretizeColor=discretizeColor) saveFileNumArr = [] # saveFileNumArr = [50, 500, 1000] count = 0 appendNum = 50 while count < 1000: count = count + appendNum saveFileNumArr.append(count) distortionArr = [] accuracyArr = [] for i in range(len(saveFileNumArr)): saveFileNum = saveFileNumArr[i] model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum)) print("Trying to load trained model from: " + model_path) if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) print("Load trained model") else: # train(sess, loss, x_train, y_train, evaluate=evaluate, # args=train_params, rng=rng, var_list=model.get_params()) train_with_noise(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params(), save=save, type=type, datasetName="MNIST", retrain=retrain, discretizeColor=discretizeColor) # Calculate training error accuracy, distortion = do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False, type=type, datasetName="MNIST", discretizeColor=discretizeColor) # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) # fgsm = BasicIterativeMethod(model, sess=sess) # fgsm = MomentumIterativeMethod(model, sess=sess) # fgsm_params = { # 'clip_min': 0., # 'clip_max': 1., # 'verbose': False # } # fgsm = HopSkipJumpAttack(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) # adv_x = fgsm.generate_np(x, **fgsm_params) # adv = sess.run(adv_x, feed_dict={x: x_test}) preds_adv = model.get_logits(adv_x) # print(sess.run(preds_adv, feed_dict={x: x_test})) ############################# # Create adversarial images # ############################# # We have to produce adversarial image 1 by 1 by using HopSkipJumpAttack # adv_test = [] # for i in range(len(x_test)): # tmp_adv_test = sess.run(adv_x, feed_dict={x: [x_test[i]]}) # adv_test.append(tmp_adv_test[0]) # if (i+1) % 100 == 0: # print((i+1),"/",len(x_test), " adversarial images") # # adv_test = np.array(adv_test) # print(np.shape(adv_test)) # Evaluate the accuracy of the MNIST model on adversarial examples # do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # accuracy, distortion = do_eval(preds, x_test, y_test, 'clean_train_adv_eval', True, ae=adv_x, # type=type, datasetName="MNIST", discretizeColor=discretizeColor) # do_eval(preds, adv_test, y_test, 'clean_train_adv_eval', True) distortionArr.append(distortion) accuracyArr.append(accuracy) # print(str(accuracy)) # print(str(distortion)) print("accuracy:") for accuracy in accuracyArr: print(accuracy) print("distortion:") for distortion in distortionArr: print(distortion) return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, attack_iterations=ATTACK_ITERATIONS, model_path=MODEL_PATH, targeted=TARGETED): """ MNIST tutorial for Carlini and Wagner's attack :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :param model_path: path to the model file :param targeted: should we run a targeted attack? or untargeted? :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(path=file, train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] saveFileNumArr = [] # saveFileNumArr = [50, 500, 1000] count = 0 while count < 1000: count = count + 50 saveFileNumArr.append(count) distortionArr = [] accuracyArr = [] for i in range(len(saveFileNumArr)): saveFileNum = saveFileNumArr[i] model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum)) # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session sess = tf.Session() print("Created TensorFlow session.") # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } rng = np.random.RandomState([2017, 8, 30]) print("Trying to load trained model from: " + model_path) # check if we've trained before, and if we have, use that pre-trained model if os.path.exists(model_path + ".meta"): tf_model_load(sess, model_path) print("Load trained model") else: train(sess, loss, x_train, y_train, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, model_path) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} # accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) # assert x_test.shape[0] == test_end - test_start, x_test.shape # print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) # report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object cw = CarliniWagnerL2(model, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32) else: adv_inputs = np.array( [[instance] * nb_classes for instance in x_test[:source_samples]], dtype=np.float32) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_inputs = adv_inputs.reshape( (source_samples * nb_classes, img_rows, img_cols, nchannels)) adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" else: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') adv_inputs = x_test[idxs] else: adv_inputs = x_test[:source_samples] adv_inputs = x_test adv_ys = None yname = "y" if targeted: cw_params_batch_size = source_samples * nb_classes else: cw_params_batch_size = source_samples cw_params = {'binary_search_steps': 1, 'max_iterations': attack_iterations, 'learning_rate': CW_LEARNING_RATE, 'batch_size': cw_params_batch_size, 'initial_const': 10} adv2 = cw.generate(x, **cw_params) cw_params[yname] = adv_ys adv_x = None # adv_x = cw.generate_np(adv_inputs, **cw_params) eval_params = {'batch_size': np.minimum(nb_classes, source_samples)} if targeted: accuracy = model_eval( sess, x, y, preds, adv_x, adv_ys, args=eval_params) else: # err = model_eval(sess, x, y, preds, adv, y_test[:source_samples], # args=eval_params) accuracy, distortion = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params, is_adv=True, ae=adv2, type=type, datasetName="MNIST", discretizeColor=discretizeColor) print('--------------------------------------') print("load save file: ", saveFileNum) # Compute the number of adversarial examples that were successfully found # print('Test with adv. examples {0:.4f}'.format(adv_accuracy)) print('Test accuracy on examples: %0.4f ,distortion: %0.4f' % (accuracy, distortion)) distortionArr.append(distortion) accuracyArr.append(accuracy) # print(str(accuracy)) # print(str(distortion)) tf.reset_default_graph() print("accuracy:") for accuracy in accuracyArr: print(accuracy) print("distortion:") for distortion in distortionArr: print(distortion) # Close TF session sess.close() return report
class CleanCNN: def __init__(self, nb_epochs, batch_size, learning_rate, eps = 0.3, clip_min=0, clip_max=1): self.train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } self.eval_params = {'batch_size': batch_size} self.fgsm_params = { 'eps': eps, 'clip_min': clip_min, 'clip_max': clip_max } self.x_train = None self.y_train = None self.x_test = None self.y_test = None self.range = np.random.RandomState([2019, 11, 25]) self.model = None self.preds = None self.loss = None self.img_rows = None self.img_cols = None self.nchannels = None self.nb_classes = None ''' def get_data(self, train_start, train_end, test_start, test_end): train = np.genfromtxt("drive/My Drive/train.csv", delimiter=',') test = np.genfromtxt("drive/My Drive/test.csv", delimiter=',') self.x_test = test[:,1:].astype(int) self.y_test = oneHotEncodeY(test[:,0].astype(int),47) self.x_train = train[:,1:].astype(int) self.y_train = oneHotEncodeY(train[:,0].astype(int),47) self.x_train = np.reshape(self.x_train,(self.x_train.shape[0], 28, 28, 1)) self.x_test = np.reshape(self.x_test,(self.x_test.shape[0], 28, 28, 1)) print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape) self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] self.nb_classes = self.y_train.shape[1] mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) self.x_train, self.y_train = mnist.get_set('train') self.x_test, self.y_test = mnist.get_set('test') self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] self.nb_classes = self.y_train.shape[1] ''' def get_data(self): self.x_train, self.y_train = extract_training_samples('byclass') self.x_test, self.y_test = extract_test_samples('byclass') self.y_test = oneHotEncodeY(self.y_test, 62) self.y_train = oneHotEncodeY(self.y_train, 62) self.x_train = self.x_train.astype('float32') self.y_train = self.y_train.astype('float32') self.x_test = self.x_test.astype('float32') self.y_test = self.y_test.astype('float32') #print(np.amax(self.y_train)) #print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape) self.x_train = self.x_train /255. self.y_train = self.y_train self.x_test = self.x_test/ 255. self.y_test = self.y_test self.x_train = np.reshape(self.x_train,(self.x_train.shape[0], 28, 28, 1)) self.x_test = np.reshape(self.x_test,(self.x_test.shape[0], 28, 28, 1)) #self.y_test = np.reshape(self.y_test,(self.y_test.shape[0],1)) #self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],1)) self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] #images = np.reshape(images,(images.shape[0], 28, 28, 1)) #self.x_train, self.y_train = mnist.get_set('train') #self.x_test, self.y_test = mnist.get_set('test') #print(type(images)) #print(images.shape[1:4]) #print(labels.shape) #print(images.shape) ''' self.x_train, self.y_train = mnist.get_set('train') self.x_test, self.y_test = mnist.get_set('test') self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] self.nb_classes = self.y_train.shape[1] print(np.amax(self.y_train)) ''' self.nb_classes = 62 #self.x_sub = self.x_test[:s0] #self.y_sub = np.argmax(self.y_test[:s0], axis=1) #self.x_test = self.x_test[s0:] #self.y_test = self.y_test[s0:] def train(self, nb_filters, label_smoothing): self.model = ModelBasicCNN('model1', self.nb_classes, nb_filters) self.preds = self.model.get_logits(x) self.loss = CrossEntropy(self.model, smoothing=label_smoothing) train(sess, self.loss, self.x_train, self.y_train, evaluate=self.evaluate, args=self.train_params, rng=self.range, var_list=self.model.get_params()) def evaluate(self): do_eval(self.preds, self.eval_params, self.x_test, self.y_test, 'clean_train_clean_eval', False) def test(self): do_eval(self.preds, self.eval_params, self.x_train, self.y_train, 'train_clean_train_clean_eval') def adverserial_testing(self): fgsm = FastGradientMethod(self.model, sess=sess) adv_x = fgsm.generate(x, **self.fgsm_params) preds_adv = self.model.get_logits(adv_x) #Call from mail do_eval(preds_adv, self.eval_params, self.x_test, self.y_test, 'clean_train_adv_eval', True) do_eval(preds_adv, self.eval_params, self.x_train, self.y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training')
class AdverseCNN: def __init__(self, nb_epochs, batch_size, learning_rate, eps = 0.3, clip_min=0, clip_max=1): self.train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } self.eval_params = {'batch_size': batch_size} self.fgsm_params = { 'eps': eps, 'clip_min': clip_min, 'clip_max': clip_max } self.x_train = None self.y_train = None self.x_test = None self.y_test = None self.range = np.random.RandomState([2019, 11, 25]) self.model = None self.preds = None self.loss = None self.img_rows = None self.img_cols = None self.nchannels = None self.nb_classes = None self.preds_adv = None def get_data(self): self.x_train, self.y_train = extract_training_samples('byclass') self.x_test, self.y_test = extract_test_samples('byclass') self.y_test = oneHotEncodeY(self.y_test, 62) self.y_train = oneHotEncodeY(self.y_train, 62) self.x_train = self.x_train.astype('float32') self.y_train = self.y_train.astype('float32') self.x_test = self.x_test.astype('float32') self.y_test = self.y_test.astype('float32') print(np.amax(self.y_train)) print(self.x_train.shape, self.y_train.shape, self.x_test.shape, self.y_test.shape) self.x_train = self.x_train /255. self.y_train = self.y_train self.x_test = self.x_test/ 255. self.y_test = self.y_test self.x_train = np.reshape(self.x_train,(self.x_train.shape[0], 28, 28, 1)) self.x_test = np.reshape(self.x_test,(self.x_test.shape[0], 28, 28, 1)) #self.y_test = np.reshape(self.y_test,(self.y_test.shape[0],1)) #self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],1)) self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] #images = np.reshape(images,(images.shape[0], 28, 28, 1)) #self.x_train, self.y_train = mnist.get_set('train') #self.x_test, self.y_test = mnist.get_set('test') print("//////////////////////////////") #print(type(images)) #print(images.shape[1:4]) #print(labels.shape) #print(images.shape) ''' self.x_train, self.y_train = mnist.get_set('train') self.x_test, self.y_test = mnist.get_set('test') self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4] self.nb_classes = self.y_train.shape[1] print(np.amax(self.y_train)) ''' self.nb_classes = 62 #self.x_sub = self.x_test[:s0] #self.y_sub = np.argmax(self.y_test[:s0], axis=1) #self.x_test = self.x_test[s0:] #self.y_test = self.y_test[s0:] def adverse_train(self, nb_filters, label_smoothing): self.model = ModelBasicCNN('model2', self.nb_classes, nb_filters) fgsm = FastGradientMethod(self.model, sess=sess) def attack(x): return fgsm.generate(x, **self.fgsm_params) self.preds = self.model.get_logits(x) self.loss = CrossEntropy(self.model, smoothing=label_smoothing, attack=attack) adv_x = attack(x) self.preds_adv = self.model.get_logits(adv_x) train(sess, self.loss, self.x_train, self.y_train, evaluate=self.evaluate, args=self.train_params, rng=self.range, var_list=self.model.get_params()) def evaluate(self): do_eval(self.preds, self.eval_params, self.x_test, self.y_test, 'adv_train_clean_eval', False) do_eval(self.preds_adv, self.eval_params, self.x_test, self.y_test, 'adv_train_adv_eval', True) def test(self): do_eval(self.preds, self.eval_params, self.x_train, self.y_train, 'train_adv_train_clean_eval') do_eval(self.preds_adv, self.eval_params, self.x_train, self.y_train, 'train_adv_train_adv_eval')
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) x_train1, y_train1 = get_train(FLAGS.train1) x_test1, y_test1 = get_test(FLAGS.test1) x_train, y_train = x_train1, y_train1 x_test, y_test = x_test1, y_test1 if (FLAGS.train2): x_train2, y_train2, x_test2, y_test2 = get_train(FLAGS.train2) x_train, y_train = zip_trains(x_train1, y_train1, x_train2, y_train2, 0.5) x_test, y_test = zip_tests(x_test1, y_test1, x_test2, y_test2, 0.5) # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) print(x) print(y) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') return report
def main(argv=None): from cleverhans_tutorials import check_installation check_installation(__file__) if not os.path.exists( CONFIG.SAVE_PATH ): os.makedirs( CONFIG.SAVE_PATH ) save_path_data = CONFIG.SAVE_PATH + 'data/' if not os.path.exists( save_path_data ): os.makedirs( save_path_data ) model_path = CONFIG.SAVE_PATH + '../all/' + CONFIG.DATASET + '/' if not os.path.exists( model_path ): os.makedirs( model_path ) os.makedirs( model_path + 'data/' ) nb_epochs = FLAGS.nb_epochs batch_size = FLAGS.batch_size learning_rate = FLAGS.learning_rate nb_filters = FLAGS.nb_filters len_x = int(CONFIG.NUM_TEST/2) start = time.time() # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set seeds to improve reproducibility if CONFIG.DATASET == 'mnist' or CONFIG.DATASET == 'cifar10': tf.set_random_seed(1234) np.random.seed(1234) rd.seed(1234) elif CONFIG.DATASET == 'moon' or CONFIG.DATASET == 'dims': tf.set_random_seed(13) np.random.seed(1234) rd.seed(0) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session tf_config = tf.ConfigProto(allow_soft_placement=True,log_device_placement=True) tf_config.gpu_options.per_process_gpu_memory_fraction = 0.2 sess = tf.Session(config=tf_config) if CONFIG.DATASET == 'mnist': # Get MNIST data mnist = MNIST(train_start=0, train_end=CONFIG.NUM_TRAIN, test_start=0, test_end=CONFIG.NUM_TEST) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') elif CONFIG.DATASET == 'cifar10': # Get CIFAR10 data data = CIFAR10(train_start=0, train_end=CONFIG.NUM_TRAIN, test_start=0, test_end=CONFIG.NUM_TEST) dataset_size = data.x_train.shape[0] dataset_train = data.to_tensorflow()[0] dataset_train = dataset_train.map( lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) dataset_train = dataset_train.batch(batch_size) dataset_train = dataset_train.prefetch(16) x_train, y_train = data.get_set('train') x_test, y_test = data.get_set('test') elif CONFIG.DATASET == 'moon': # Create a two moon example X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN+CONFIG.NUM_TEST), noise=0.2, random_state=0) X = StandardScaler().fit_transform(X) x_train1, x_test1, y_train1, y_test1 = train_test_split(X, y, test_size=(CONFIG.NUM_TEST/(CONFIG.NUM_TRAIN +CONFIG.NUM_TEST)), random_state=0) x_train, y_train, x_test, y_test = normalize_reshape_inputs_2d(model_path, x_train1, y_train1, x_test1, y_test1) elif CONFIG.DATASET == 'dims': X, y = make_moons(n_samples=(CONFIG.NUM_TRAIN+CONFIG.NUM_TEST), noise=0.2, random_state=0) X = StandardScaler().fit_transform(X) x_train1, x_test1, y_train1, y_test1 = train_test_split(X, y, test_size=(CONFIG.NUM_TEST/(CONFIG.NUM_TRAIN +CONFIG.NUM_TEST)), random_state=0) x_train2, y_train, x_test2, y_test = normalize_reshape_inputs_2d(model_path, x_train1, y_train1,x_test1, y_test1) x_train, x_test = add_noise_and_QR(x_train2, x_test2, CONFIG.NUM_DIMS) np.save(os.path.join(save_path_data, 'x_test'), x_test) np.save(os.path.join(save_path_data, 'y_test'), y_test) # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': 1} rng = np.random.RandomState([2017, 8, 30]) with open(CONFIG.SAVE_PATH + 'acc_param.txt', 'a') as fi: def do_eval(adv_x, preds, x_set, y_set, report_key): acc, pred_np, adv_x_np = model_eval(sess, x, y, preds, adv_x, nb_classes, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if report_key: print('Accuracy on %s examples: %0.4f' % (report_key, acc), file=fi) return pred_np, adv_x_np if CONFIG.DATASET == 'mnist': trained_model_path = model_path + 'data/trained_model' model = ModelBasicCNN('model1', nb_classes, nb_filters) elif CONFIG.DATASET == 'cifar10': trained_model_path = model_path + 'data/trained_model' model = ModelAllConvolutional('model1', nb_classes, nb_filters, input_shape=[32, 32, 3]) elif CONFIG.DATASET == 'moon': trained_model_path = model_path + 'data/trained_model' model = ModelMLP('model1', nb_classes) elif CONFIG.DATASET == 'dims': trained_model_path = save_path_data + 'trained_model' model = ModelMLP_dyn('model1', nb_classes, CONFIG.NUM_DIMS) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) def evaluate(): _, _ = do_eval(x, preds, x_test, y_test, 'test during train') if os.path.isfile( trained_model_path + '.index' ): tf_model_load(sess, trained_model_path) else: if CONFIG.DATASET == 'mnist': train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) elif CONFIG.DATASET == 'cifar10': train(sess, loss, None, None, dataset_train=dataset_train, dataset_size=dataset_size, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) elif CONFIG.DATASET == 'moon': train_2d(sess, loss, x, y, x_train, y_train, save=False, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) elif CONFIG.DATASET == 'dims': train_2d(sess, loss, x, y, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) saver = tf.train.Saver() saver.save(sess, trained_model_path) # Evaluate the accuracy on test examples if os.path.isfile( save_path_data + 'logits_zero_attacked.npy' ): logits_0 = np.load(save_path_data + 'logits_zero_attacked.npy') else: _, _ = do_eval(x, preds, x_train, y_train, 'train') logits_0, _ = do_eval(x, preds, x_test, y_test, 'test') np.save(os.path.join(save_path_data, 'logits_zero_attacked'), logits_0) if CONFIG.DATASET == 'moon': num_grid_points = 5000 if os.path.isfile( model_path + 'data/images_mesh' + str(num_grid_points) + '.npy' ): x_mesh = np.load(model_path + 'data/images_mesh' + str(num_grid_points) + '.npy') logits_mesh = np.load(model_path + 'data/logits_mesh' + str(num_grid_points) + '.npy') else: xx, yy = np.meshgrid(np.linspace(0, 1, num_grid_points), np.linspace(0, 1, num_grid_points)) x_mesh1 = np.stack([np.ravel(xx), np.ravel(yy)]).T y_mesh1 = np.ones((x_mesh1.shape[0]),dtype='int64') x_mesh, y_mesh, _, _ = normalize_reshape_inputs_2d(model_path, x_mesh1, y_mesh1) logits_mesh, _ = do_eval(x, preds, x_mesh, y_mesh, 'mesh') x_mesh = np.squeeze(x_mesh) np.save(os.path.join(model_path, 'data/images_mesh'+str(num_grid_points)), x_mesh) np.save(os.path.join(model_path, 'data/logits_mesh'+str(num_grid_points)), logits_mesh) points_x = x_test[:len_x] points_y = y_test[:len_x] points_x_bar = x_test[len_x:] points_y_bar = y_test[len_x:] # Initialize the CW attack object and graph cw = CarliniWagnerL2(model, sess=sess) # first attack attack_params = { 'learning_rate': CONFIG.CW_LEARNING_RATE, 'max_iterations': CONFIG.CW_MAX_ITERATIONS } if CONFIG.DATASET == 'moon': out_a = compute_polytopes_a(x_mesh, logits_mesh, model_path) attack_params['const_a_min'] = out_a attack_params['const_a_max'] = 100 adv_x = cw.generate(x, **attack_params) if os.path.isfile( save_path_data + 'images_once_attacked.npy' ): adv_img_1 = np.load(save_path_data + 'images_once_attacked.npy') logits_1 = np.load(save_path_data + 'logits_once_attacked.npy') else: #Evaluate the accuracy on adversarial examples preds_adv = model.get_logits(adv_x) logits_1, adv_img_1 = do_eval(adv_x, preds_adv, points_x_bar, points_y_bar, 'test once attacked') np.save(os.path.join(save_path_data, 'images_once_attacked'), adv_img_1) np.save(os.path.join(save_path_data, 'logits_once_attacked'), logits_1) # counter attack attack_params['max_iterations'] = 1024 if CONFIG.DATASET == 'moon': out_alpha2 = compute_epsilons_balls_alpha(x_mesh, np.squeeze(x_test), np.squeeze(adv_img_1), model_path, CONFIG.SAVE_PATH) attack_params['learning_rate'] = out_alpha2 attack_params['const_a_min'] = -1 attack_params['max_iterations'] = 2048 plot_data(np.squeeze(adv_img_1), logits_1, CONFIG.SAVE_PATH+'data_pred1.png', x_mesh, logits_mesh) adv_adv_x = cw.generate(x, **attack_params) x_k = np.concatenate((points_x, adv_img_1), axis=0) y_k = np.concatenate((points_y, logits_1), axis=0) if os.path.isfile( save_path_data + 'images_twice_attacked.npy' ): adv_img_2 = np.load(save_path_data + 'images_twice_attacked.npy') logits_2 = np.load(save_path_data + 'logits_twice_attacked.npy') else: # Evaluate the accuracy on adversarial examples preds_adv_adv = model.get_logits(adv_adv_x) logits_2, adv_img_2 = do_eval(adv_adv_x, preds_adv_adv, x_k, y_k, 'test twice attacked') np.save(os.path.join(save_path_data, 'images_twice_attacked'), adv_img_2) np.save(os.path.join(save_path_data, 'logits_twice_attacked'), logits_2) if CONFIG.DATASET == 'moon': plot_data(np.squeeze(adv_img_2[:len_x]), logits_2[:len_x], CONFIG.SAVE_PATH+'data_pred2.png', x_mesh, logits_mesh) plot_data(np.squeeze(adv_img_2[len_x:]), logits_2[len_x:], CONFIG.SAVE_PATH+'data_pred12.png', x_mesh, logits_mesh) test_balls(np.squeeze(x_k), np.squeeze(adv_img_2), logits_0, logits_1, logits_2, CONFIG.SAVE_PATH) compute_returnees(logits_0[len_x:], logits_1, logits_2[len_x:], logits_0[:len_x], logits_2[:len_x], CONFIG.SAVE_PATH) if x_test.shape[-1] > 1: num_axis=(1,2,3) else: num_axis=(1,2) D_p = np.squeeze(np.sqrt(np.sum(np.square(points_x-adv_img_2[:len_x]), axis=num_axis))) D_p_p = np.squeeze(np.sqrt(np.sum(np.square(adv_img_1-adv_img_2[len_x:]), axis=num_axis))) D_p_mod, D_p_p_mod = modify_D(D_p, D_p_p, logits_0[len_x:], logits_1, logits_2[len_x:], logits_0[:len_x], logits_2[:len_x]) if D_p_mod != [] and D_p_p_mod != []: plot_violins(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH) threshold_evaluation(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH) _ = compute_auroc(D_p_mod, D_p_p_mod, CONFIG.SAVE_PATH) plot_results_models(len_x, CONFIG.DATASET, CONFIG.SAVE_PATH) print('Time needed:', time.time()-start) return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE): """ MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, sess=sess) jsma_params = {'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None} figure = None # Loop over the samples we want to perturb into adversarial examples for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal grid_viz_data[current_class, current_class, :, :, :] = np.reshape( sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side if viz_enabled: figure = pair_visual( np.reshape(sample, (img_rows, img_cols, nchannels)), np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # Add our adversarial example to our grid data grid_viz_data[target, current_class, :, :, :] = np.reshape( adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Close TF session sess.close() # Finally, block & display a grid of all the adversarial examples if viz_enabled: import matplotlib.pyplot as plt plt.close(figure) _ = grid_visual(grid_viz_data) return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') # Create a new model and train it to be robust to FastGradientMethod model2 = ModelBasicCNN('model2', nb_classes, nb_filters) fgsm2 = FastGradientMethod(model2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, x_train, y_train, evaluate=evaluate2, args=train_params, rng=rng, var_list=model2.get_params()) # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
def fashion_tutorial_jsma(train_start=0, train_end=60000, test_start=0, test_end=10000, viz_enabled=VIZ_ENABLED, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, source_samples=SOURCE_SAMPLES, learning_rate=LEARNING_RATE, model_path=MODEL_PATH, noise_output=NOISE_OUTPUT): """ Fashion MNIST tutorial for the Jacobian-based saliency map approach (JSMA) :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param viz_enabled: (boolean) activate plots of adversarial examples :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param nb_classes: number of output classes :param source_samples: number of test inputs to attack :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Create TF session and set as Keras backend session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) # Get Fashion MNIST test data fashion = keras.datasets.fashion_mnist (x_train, y_train), (x_test, y_test) = fashion.load_data() # cifar10 = CIFAR10(train_start=train_start, train_end=train_end, # test_start=test_start, test_end=test_end) # x_train, y_train = cifar10.get_set('train') # x_test, y_test = cifar10.get_set('test') x_train = x_train.reshape(x_train.shape[0], 28, 28, 1) x_test = x_test.reshape(x_test.shape[0], 28, 28, 1) y_train = np_utils.to_categorical(y_train, 10) y_test = np_utils.to_categorical(y_test, 10) x_train = x_train.astype('float32') x_test = x_test.astype('float32') x_train /= 255 x_test /= 255 # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) nb_filters = 64 # Define TF model graph model = ModelBasicCNN('model1', nb_classes, nb_filters) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.1) print("Defined TensorFlow model graph.") ########################################################################### # Training the model using TensorFlow ########################################################################### # Train an Fashion MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'filename': os.path.split(model_path)[-1] } sess.run(tf.global_variables_initializer()) rng = np.random.RandomState([2017, 8, 30]) train(sess, loss, x_train, y_train, args=train_params, rng=rng) # Evaluate the accuracy of the Fashion MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) assert x_test.shape[0] == test_end - test_start, x_test.shape print('Test accuracy on legitimate test examples: {0}'.format(accuracy)) report.clean_train_clean_eval = accuracy ########################################################################### # Craft adversarial examples using the Jacobian-based saliency map approach ########################################################################### print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples') # Keep track of success (adversarial example classified in target) results = np.zeros((nb_classes, source_samples), dtype='i') # Rate of perturbed features for each test set example and target class perturbations = np.zeros((nb_classes, source_samples), dtype='f') # Initialize our array for grid visualization grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') # Instantiate a SaliencyMapMethod attack object jsma = SaliencyMapMethod(model, sess=sess) jsma_params = { 'theta': 1., 'gamma': 0.1, 'clip_min': 0., 'clip_max': 1., 'y_target': None } # Loop over the samples we want to perturb into adversarial examples adv_all = np.zeros((nb_classes, img_rows, img_cols, nchannels), dtype='f') sample_all = np.zeros((nb_classes, img_rows, img_cols, nchannels), dtype='f') for sample_ind in xrange(0, source_samples): print('--------------------------------------') print('Attacking input %i/%i' % (sample_ind + 1, source_samples)) sample = x_test[sample_ind:(sample_ind + 1)] # We want to find an adversarial example for each possible target class # (i.e. all classes that differ from the label given in the dataset) current_class = int(np.argmax(y_test[sample_ind])) target_classes = other_classes(nb_classes, current_class) # For the grid visualization, keep original images along the diagonal # grid_viz_data[current_class, current_class, :, :, :] = np.reshape( # sample, (img_rows, img_cols, nchannels)) # Loop over all target classes for target in target_classes: print('Generating adv. example for target class %i' % target) # This call runs the Jacobian-based saliency map approach one_hot_target = np.zeros((1, nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) adv_all[current_class] = adv_x sample_all[current_class] = sample # Check if success was achieved res = int(model_argmax(sess, x, preds, adv_x) == target) # Computer number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = x_test[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] # Display the original and adversarial images side-by-side # if viz_enabled: # figure = pair_visual( # np.reshape(sample, (img_rows, img_cols, nchannels)), # np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure) # # Add our adversarial example to our grid data # grid_viz_data[target, current_class, :, :, :] = np.reshape( # adv_x, (img_rows, img_cols, nchannels)) # Update the arrays for later analysis results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print('--------------------------------------') # Compute the number of adversarial examples that were successfully found nb_targets_tried = ((nb_classes - 1) * source_samples) succ_rate = float(np.sum(results)) / nb_targets_tried print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate)) report.clean_train_adv_eval = 1. - succ_rate # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(perturbations) print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed)) # Compute the average distortion introduced for successful samples only percent_perturb_succ = np.mean(perturbations * (results == 1)) print('Avg. rate of perturbed features for successful ' 'adversarial examples {0:.4f}'.format(percent_perturb_succ)) # Compute the average distortion introduced by the algorithm l2_norm = np.mean(np.sum((adv_all - sample_all)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(l2_norm)) for i in range(nb_classes): if noise_output: image = adv_all[i] - sample_all[i] else: image = adv_all[i] grid_viz_data[i, 0] = image # Close TF session sess.close() def save_visual(data, path): """ Modified version of cleverhans.plot.pyplot """ import matplotlib.pyplot as plt figure = plt.figure() # figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = data.shape[0] num_rows = data.shape[1] num_channels = data.shape[4] for y in range(num_rows): for x in range(num_cols): figure.add_subplot(num_rows, num_cols, (x + 1) + (y * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(data[x, y, :, :, 0], cmap='gray') else: plt.imshow(data[x, y, :, :, :]) # Draw the plot and return plt.savefig(path) # Finally, block & display a grid of all the adversarial examples if viz_enabled: if noise_output: image_name = "output/jsma_fashion_mnist_noise.png" else: image_name = "output/jsma_fashion_mnist.png" _ = save_visual(grid_viz_data, image_name) return report