def train(self, nb_filters, label_smoothing):
    """Build the 'model1' CNN and fit it on the stored training data.

    :param nb_filters: filter count forwarded to ``ModelBasicCNN``
    :param label_smoothing: smoothing amount forwarded to ``CrossEntropy``

    The network, its logits and its loss are stored on ``self`` as
    ``model``, ``preds`` and ``loss``.
    """
    # NOTE(review): ``x`` and ``sess`` are not parameters of this method;
    # they are resolved from a scope that is not visible here.
    network = ModelBasicCNN('model1', self.nb_classes, nb_filters)
    self.model = network
    self.preds = network.get_logits(x)
    self.loss = CrossEntropy(network, smoothing=label_smoothing)

    # NOTE(review): presumably ``train`` here is the library training routine
    # imported at module level, not this method -- confirm.
    train(
        sess,
        self.loss,
        self.x_train,
        self.y_train,
        evaluate=self.evaluate,
        args=self.train_params,
        rng=self.range,
        var_list=network.get_params(),
    )
def train_model(model):
    """Train ``model`` on the data returned by ``get_mnist`` and print accuracies.

    :param model: object exposing ``input_dir`` and ``get_logits``; it is also
                  handed to ``CrossEntropy``

    Accuracy is printed through the ``evaluate`` callback given to ``train``
    (on the test split) and once more after training (on the training split).
    """
    # NOTE(review): ``x``, ``y`` and ``sess`` come from an outer scope that
    # is not visible here.
    x_train, y_train, x_test, y_test = get_mnist(model.input_dir)

    fit_args = {
        'nb_epochs': EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE
    }
    eval_args = {'batch_size': BATCH_SIZE}
    rng = np.random.RandomState([2017, 8, 30])

    logits = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)

    def report_accuracy(x_set, y_set):
        # The same label is printed for whichever split is evaluated.
        accuracy = model_eval(sess, x, y, logits, x_set, y_set,
                              args=eval_args)
        print('Test accuracy on train: %0.4f' % (accuracy))

    def evaluate():
        report_accuracy(x_test, y_test)

    train(sess, loss, x_train, y_train, evaluate=evaluate,
          args=fit_args, rng=rng)

    # Accuracy on the training split itself.
    report_accuracy(x_train, y_train)
def adverse_train(self, nb_filters, label_smoothing):
    """Build the 'model2' CNN and train it with an FGSM attack in the loss.

    :param nb_filters: filter count forwarded to ``ModelBasicCNN``
    :param label_smoothing: smoothing amount forwarded to ``CrossEntropy``

    Stores ``model``, ``preds`` (logits on ``x``), ``loss`` and ``preds_adv``
    (logits on the attacked ``x``) on ``self``.
    """
    # NOTE(review): ``x`` and ``sess`` are resolved from a scope that is not
    # visible here.
    network = ModelBasicCNN('model2', self.nb_classes, nb_filters)
    self.model = network
    fgsm = FastGradientMethod(network, sess=sess)

    def attack(inputs):
        # Attack parameters are read from the instance at call time.
        return fgsm.generate(inputs, **self.fgsm_params)

    self.preds = network.get_logits(x)
    self.loss = CrossEntropy(network, smoothing=label_smoothing,
                             attack=attack)

    adversarial_x = attack(x)
    self.preds_adv = network.get_logits(adversarial_x)

    train(
        sess,
        self.loss,
        self.x_train,
        self.y_train,
        evaluate=self.evaluate,
        args=self.train_params,
        rng=self.range,
        var_list=network.get_params(),
    )
def __test():
    """Train (or reload) the 'model1' MNIST CNN and print its test accuracy.

    A checkpoint at ``MODEL_PATH`` (detected through its ``.meta`` file) is
    loaded when present; otherwise the model is trained and saved there.
    The function takes no arguments and returns nothing.
    """
    # The body used to read `model_path`, `test_start` and `test_end`, none
    # of which this function binds. They are pinned here to the values the
    # rest of the body already implies: the checkpoint name is derived from
    # MODEL_PATH and the test split is requested as [0, 10000).
    model_path = MODEL_PATH
    test_start, test_end = 0, 10000

    tf.set_random_seed(1234)
    sess = tf.Session()
    try:
        set_log_level(logging.DEBUG)

        # Get MNIST data
        mnist = MNIST(train_start=0, train_end=60000,
                      test_start=test_start, test_end=test_end)
        x_train, y_train = mnist.get_set('train')
        x_test, y_test = mnist.get_set('test')

        # Image and label dimensions are read off the arrays.
        img_rows, img_cols, nchannels = x_train.shape[1:4]
        nb_classes = y_train.shape[1]

        # Input placeholders
        x = tf.placeholder(tf.float32,
                           shape=(None, img_rows, img_cols, nchannels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))
        nb_filters = 64

        # Model graph
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)
        print("Defined TensorFlow model graph.")

        train_params = {
            'nb_epochs': NB_EPOCHS,
            'batch_size': BATCH_SIZE,
            'learning_rate': LEARNING_RATE,
            'filename': os.path.split(model_path)[-1]
        }
        rng = np.random.RandomState([2017, 8, 30])

        # Reuse a previously trained model when its checkpoint exists.
        if os.path.exists(model_path + ".meta"):
            tf_model_load(sess, model_path)
        else:
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            saver = tf.train.Saver()
            saver.save(sess, model_path)

        # Accuracy on legitimate test examples
        eval_params = {'batch_size': BATCH_SIZE}
        accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                              args=eval_params)
        assert x_test.shape[0] == test_end - test_start, x_test.shape
        print('Test accuracy on legitimate test examples: {0}'.format(
            accuracy))
    finally:
        # The session is local to this function; release it on every path.
        sess.close()
def _train_model(self):
    """Run the training routine while flagging this object as busy.

    ``_busy`` is set to True and 'busy_changed' is announced before training;
    afterwards ``_busy`` is reset to False and 'busy_changed' is announced
    again. The reset sits in a ``finally`` clause so that an exception raised
    while training cannot leave the object permanently marked busy; the
    exception itself still propagates to the caller.
    """
    self._busy = True
    self.change('busy_changed')

    def evaluate():
        self.evaluate_model(self._x_test, self._y_test)

    # FIXME[problem]: the train routine offers no progress reporting. The
    # only callback that can be registered is 'evaluate', which can run
    # arbitrary operations but is only called after every epoch.
    try:
        with self._graph.as_default():
            train(self._sess, self._loss, self._x_train, self._y_train,
                  evaluate=evaluate, args=self._train_params,
                  rng=self._rng)
    finally:
        self._busy = False
        self.change('busy_changed')
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS, num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial: train a clean CNN, then write one FGSM
    adversarial PNG per test and training image under ``images/fgsm_adv/``.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: when false, nothing is trained or written and the
                        empty report is returned
    :param testing: if true, also record the accuracy on the training set
    :param backprop_through_attack: accepted for interface compatibility;
                                    not read by this function
    :param nb_filters: filter count forwarded to ``ModelBasicCNN``
    :param num_threads: if truthy, the session is created with
                        ``intra_op_parallelism_threads=1``
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        # NOTE(review): the thread count is fixed at 1 whatever the value of
        # num_threads -- confirm this is intended.
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {'allow_soft_placement': True}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object
        fgsm = FastGradientMethod(model, sess=sess)

        # Output layout: images/fgsm_adv/{train,test}/<label>_<uuid>.png
        out_dir = os.path.join('images', 'fgsm_adv')
        for split in ('train', 'test'):
            split_dir = os.path.join(out_dir, split)
            if not os.path.isdir(split_dir):
                os.makedirs(split_dir)

        def save_adversarial_images(split, images, one_hot_labels):
            """Attack each image on its own and save it as a PNG."""
            pairs = zip(images, one_hot_labels)
            for index, (image, one_hot) in enumerate(pairs):
                print(split + ' ' + str(index))
                label = np.argmax(one_hot)
                batch = image.reshape((1, img_rows, img_cols, nchannels))
                adv = fgsm.generate_np(batch, **fgsm_params)
                # Scale to 0..255; truncation to uint8 is kept as before.
                raw_data = (adv.reshape((img_rows, img_cols)) *
                            255).astype('uint8')
                im = Image.fromarray(raw_data, mode='P')
                file_name = str(label) + '_' + str(uuid.uuid4()) + '.png'
                im.save(os.path.join(out_dir, split, file_name))

        # Generate fgsm adversarial examples and save to disk
        save_adversarial_images('test', x_test, y_test)
        save_adversarial_images('train', x_train, y_train)

    return report
def mnist_tutorial_fgsm(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE,
                        attack_iterations=ATTACK_ITERATIONS,
                        model_path=MODEL_PATH, targeted=TARGETED,
                        noise_output=NOISE_OUTPUT):
    """
    MNIST tutorial for the Fast Gradient Method attack: evaluate a clean
    model, then an adversarially trained one, against FGSM examples.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) save a grid image of adversarial examples;
                        requires ``source_samples == nb_classes``
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: accepted for interface compatibility; not read
                              by this function
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :param noise_output: if true, the grid shows the perturbation
                         (adversarial minus clean) instead of the example
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])

    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Fast Gradient Method
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a FGSM attack object
    fgsm = FastGradientMethod(model, sess=sess)

    # Choose the clean inputs to attack, together with their true labels.
    if viz_enabled:
        assert source_samples == nb_classes
        # First test example of each class.
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        source_inputs = x_test[idxs]
        source_labels = y_test[idxs]
        # Only column 0 of the grid is ever filled.
        grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')
    else:
        source_inputs = x_test[:source_samples]
        source_labels = y_test[:source_samples]

    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    # Parameters for crafting the evaluation examples. These are kept apart
    # from fgsm_params, which is reused unchanged for adversarial training.
    craft_params = dict(fgsm_params)

    if targeted:
        # Every source image is repeated once per target class.
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in source_inputs],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        # The targets were built but never handed to the attack before, so
        # the "targeted" run was effectively untargeted.
        craft_params['y_target'] = adv_ys
        # Examples crafted per source image, used to index the grid below.
        adv_per_source = nb_classes
    else:
        adv_inputs = source_inputs
        adv_ys = None
        adv_per_source = 1

    def adversarial_success_rate(logits, adv_examples):
        """Score ``adv_examples`` against ``logits`` as the original did."""
        adv_eval_params = {
            'batch_size': np.minimum(nb_classes, source_samples)
        }
        if targeted:
            # Fraction classified as the requested target class.
            return model_eval(sess, x, y, logits, adv_examples, adv_ys,
                              args=adv_eval_params)
        # Fraction no longer classified as the true class.
        return 1 - model_eval(sess, x, y, logits, adv_examples,
                              source_labels, args=adv_eval_params)

    def mean_l2_perturbation(adv_examples):
        return np.mean(
            np.sum((adv_examples - adv_inputs)**2, axis=(1, 2, 3))**.5)

    adv = fgsm.generate_np(adv_inputs, **craft_params)
    adv_accuracy = adversarial_success_rate(preds, adv)

    if viz_enabled:
        for i in range(nb_classes):
            # First adversarial example of source image i. The untargeted
            # case used to index i * nb_classes, which is out of range.
            pick = i * adv_per_source
            if noise_output:
                image = adv[pick] - adv_inputs[pick]
            else:
                image = adv[pick]
            grid_viz_data[i, 0] = image

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(
        mean_l2_perturbation(adv)))

    ###########################################################################
    # Adversarial Training
    ###########################################################################
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack_fgsm(x):
        # Attack the tensor the loss passes in. The previous body ignored
        # ``x`` and attacked the fixed NumPy array of evaluation inputs.
        return fgsm2.generate(x, **fgsm_params)

    preds2 = model2.get_logits(x)
    loss2 = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm)
    train(sess, loss2, x_train, y_train, args=train_params, rng=rng)

    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds2, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy of adversarially trained model on legitimate test '
          'examples: {0}'.format(accuracy))
    # Stored in its own field; this used to overwrite the clean model's
    # clean_train_clean_eval.
    report.adv_train_clean_eval = accuracy
    print("Defined TensorFlow model graph.")

    # Evaluate the adversarially trained model on examples crafted against
    # it. This step used to re-score model1 on the earlier examples and
    # overwrite clean_train_adv_eval with the same number.
    adv2 = fgsm2.generate_np(adv_inputs, **craft_params)
    adv_accuracy2 = adversarial_success_rate(preds2, adv2)

    print('--------------------------------------')
    print('Avg. rate of successful adv. examples {0:.4f}'.format(
        adv_accuracy2))
    report.adv_train_adv_eval = 1. - adv_accuracy2
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(
        mean_l2_perturbation(adv2)))

    # Close TF session
    sess.close()

    def save_visual(data, path):
        """
        Modified version of cleverhans.plot.pyplot: draw ``data`` as a grid
        of images, save the figure to ``path`` and return it.
        """
        figure = plt.figure()

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for row in range(num_rows):
            for col in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (col + 1) + (row * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[col, row, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[col, row, :, :, :])

        # Draw the plot and return
        plt.savefig(path)
        return figure

    # Finally, save a grid of the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/fgsm_mnist_noise.png"
        else:
            image_name = "output/fgsm_mnist.png"
        # Saving fails if the target directory is missing.
        output_dir = os.path.dirname(image_name)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        _ = save_visual(grid_viz_data, image_name)

    return report
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0,
                     test_end=10000, nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                     clean_train=CLEAN_TRAIN, testing=False,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS, num_threads=None,
                     label_smoothing=0.1,
                     adversarial_training=ADVERSARIAL_TRAINING):
    """
    CIFAR10 cleverhans tutorial: clean training, a sweep of Basic Iterative
    Method attacks, then optional adversarial training.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: train and attack a model on clean examples before
                        the adversarial training stage
    :param testing: if true, also record accuracies on the training set
    :param backprop_through_attack: if false, gradients are stopped at the
                                    adversarial examples used for evaluation
    :param nb_filters: filter count forwarded to the model constructor
    :param num_threads: if truthy, the session is created with
                        ``intra_op_parallelism_threads=1``
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :param adversarial_training: if false, return before the adversarial
                                 training stage
    :return: an AccuracyReport object
    """
    report = AccuracyReport()
    tf.set_random_seed(1234)
    set_log_level(logging.DEBUG)

    # Session configuration depends only on whether num_threads is truthy.
    if num_threads:
        session_options = dict(intra_op_parallelism_threads=1)
    else:
        session_options = dict(allow_soft_placement=True)
    sess = tf.Session(config=tf.ConfigProto(**session_options))

    # CIFAR10 data, both as an augmented tf dataset and as plain arrays.
    data = CIFAR10(train_start=train_start, train_end=train_end,
                   test_start=test_start, test_end=test_end)
    dataset_size = data.x_train.shape[0]

    def augment(image, label):
        return random_shift(random_horizontal_flip(image)), label

    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(augment, 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)

    x_train, y_train = data.get_set('train')
    x_test, y_test = data.get_set('test')

    # Image and label dimensions come from the test arrays.
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    bim_params = {
        'eps': 0.5,
        'clip_min': 0.,
        'eps_iter': 0.002,
        'nb_iter': 10,
        'clip_max': 1.,
        'ord': np.inf
    }
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """Store an accuracy on the report; print it when is_adv is given."""
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            return
        report_text = 'adversarial' if is_adv else 'legitimate'
        print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                      input_shape=[32, 32, 3])
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        # The evaluate callback is handed to train along with the dataset.
        train(sess, loss, None, None,
              dataset_train=dataset_train, dataset_size=dataset_size,
              evaluate=evaluate, args=train_params, rng=rng,
              var_list=model.get_params())

        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Twenty attack rounds; eps_iter grows by 0.002 after each round and
        # every round overwrites the same report field.
        for _ in range(20):
            bim = BasicIterativeMethod(model, sess=sess)
            adv_x = bim.generate(x, **bim_params)
            preds_adv = model.get_logits(adv_x)

            print("eps:%0.2f" %
                  (bim_params["eps_iter"] * bim_params['nb_iter']))
            do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
            bim_params["eps_iter"] = bim_params["eps_iter"] + 0.002

        # Training-set accuracy under the last attack of the sweep.
        if testing:
            do_eval(preds_adv, x_train, y_train,
                    'train_clean_train_adv_eval')

    if not adversarial_training:
        return report

    print('Repeating the process, using adversarial training')

    # NOTE(review): bim_params still carries the eps_iter left behind by the
    # sweep above when clean_train ran -- confirm that is intended.
    model2 = ModelAllConvolutional('model2', nb_classes, nb_filters,
                                   input_shape=[32, 32, 3])
    bim2 = BasicIterativeMethod(model2, sess=sess)

    def attack(x):
        return bim2.generate(x, **bim_params)

    # The attack is part of the training loss.
    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # Treat the adversarial examples as constants.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Adversarially trained model on legitimate, then adversarial, inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    train(sess, loss2, None, None,
          dataset_train=dataset_train, dataset_size=dataset_size,
          evaluate=evaluate2, args=train_params, rng=rng,
          var_list=model2.get_params())

    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial using a Keras model: clean training (or
    checkpoint restore), FGSM evaluation, then adversarial training.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, accuracies on the training set are recorded too
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    report = AccuracyReport()
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # The TF session doubles as the Keras backend session.
    sess = tf.Session()
    keras.backend.set_session(sess)

    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def accuracy_of(logits, inputs, labels):
        """Accuracy of ``logits`` on (inputs, labels) at the tutorial batch size."""
        return model_eval(sess, x, y, logits, inputs, labels,
                          args={'batch_size': batch_size})

    def evaluate():
        # Clean model on legitimate test examples
        acc = accuracy_of(preds, x_test, y_test)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    rng = np.random.RandomState([2017, 8, 30])

    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    if ckpt is None:
        ckpt_path = False
    else:
        ckpt_path = ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)

    if testing:
        report.train_clean_train_clean_eval = accuracy_of(
            preds, x_train, y_train)

    # FGSM attack against the clean model, treated as a constant input.
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = tf.stop_gradient(fgsm.generate(x, **fgsm_params))
    preds_adv = model(adv_x)

    acc = accuracy_of(preds_adv, x_test, y_test)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    if testing:
        report.train_clean_train_adv_eval = accuracy_of(
            preds_adv, x_train, y_train)

    print("Repeating the process, using adversarial training")

    # A second, independent model that is trained with the attack in the loss.
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Adversarially trained model on legitimate test inputs
        clean_acc = accuracy_of(preds_2, x_test, y_test)
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)
        report.adv_train_clean_eval = clean_acc

        # ... and on adversarial examples
        adv_acc = accuracy_of(preds_2_adv, x_test, y_test)
        print('Test accuracy on adversarial examples: %0.4f' % adv_acc)
        report.adv_train_adv_eval = adv_acc

    train(sess, loss_2, x_train, y_train, evaluate=evaluate_2,
          args=train_params, rng=rng)

    if testing:
        report.train_adv_train_clean_eval = accuracy_of(
            preds_2, x_train, y_train)
        report.train_adv_train_adv_eval = accuracy_of(
            preds_2_adv, x_train, y_train)

    return report
def pretrain(self, ):
    """Train the stored model, then wrap it in a calibrated ``DkNNModel``.

    Reads ``x_train``, ``y_train``, ``x_test``, ``y_test``, ``preds``,
    ``model``, ``loss``, ``sess``, ``x_sym`` and ``y_sym`` from ``self``.
    The first ``FLAGS.nb_cali`` test examples calibrate the DkNN; the rest
    are stored back on ``self`` as ``test_data`` / ``y_test``, together with
    ``dknn``, ``x`` and ``y``.
    """
    x_train, y_train = self.x_train, self.y_train
    x_test, y_test = self.x_test, self.y_test

    # Image and label dimensions are read off the training arrays.
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    model = self.model
    preds = self.preds
    loss = self.loss

    # Input placeholders
    x = self.x_sym
    y = self.y_sym

    def evaluate():
        # Runs on the full test set; the holdout split happens after training.
        acc = model_eval(self.sess, x, y, preds, x_test, y_test,
                         args={'batch_size': FLAGS.batch_size})
        print('Test accuracy on test examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.lr
    }
    train(self.sess, loss, x_train, y_train, evaluate=evaluate,
          args=train_params, var_list=model.get_params())

    # Layers whose activations the DkNN uses, chosen by VERSION.
    if VERSION == 1:
        layers = ['ReLU1', 'ReLU3', 'ReLU5', 'logits']
    else:
        layers = [
            'ReLU2', 'ResidualWithInstanceNorm4',
            'ResidualWithInstanceNorm6', 'ResidualWithInstanceNorm8',
            'logits'
        ]

    def get_activations(data):
        """Return a dict mapping each layer name to its flattened activations."""
        data_activations = {}
        for layer in layers:
            layer_sym = tf.layers.flatten(model.get_layer(x, layer))
            data_activations[layer] = batch_eval(
                self.sess, [x], [layer_sym], [data],
                args={'batch_size': FLAGS.batch_size})[0]
        return data_activations

    # Split the test set into calibration data and the remaining holdout.
    nb_cali = FLAGS.nb_cali
    train_data = x_train
    train_labels = np.argmax(y_train, axis=1)
    cali_data = x_test[:nb_cali]
    cali_labels = np.argmax(y_test[:nb_cali], axis=1)
    test_data = x_test[nb_cali:]
    holdout_labels = y_test[nb_cali:]

    # Wrap the model into a DkNNModel and calibrate it.
    dknn = DkNNModel(FLAGS.neighbors, layers, get_activations, train_data,
                     train_labels, nb_classes, scope='dknn',
                     number_bits=FLAGS.number_bits)
    dknn.calibrate(cali_data, cali_labels)

    self.dknn = dknn
    self.test_data = test_data
    self.y_test = holdout_labels
    self.x = x
    self.y = y
def model_training(model, file_name, x_train, y_train, x_test, y_test,
                   nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, num_threads=None,
                   label_smoothing=0.1):
    """
    Train ``model`` and save it under ``models/joblibs/``.

    Parameters
    ----------
    model: cleverhans.model.Model
        The cleverhans picklable model
    file_name: str
        Name of the file written below ``models/joblibs/``.
    x_train: numpy.ndarray
        The input array of the train dataset.
    y_train: numpy.ndarray
        The output array of the train dataset.
    x_test: numpy.ndarray
        The input array of the test dataset.
    y_test: numpy.ndarray
        The output array of the test dataset.
    nb_epochs: int, optional
        The number of epochs.
    batch_size: int, optional
        The batch size.
    learning_rate: float, optional
        The learning rate.
    num_threads: int, optional
        If truthy, the session is created with a single intra-op thread.
    label_smoothing: float, optional
        The amount of label smoothing used.
    """
    # NOTE(review): the thread count is fixed at 1 whatever the value of
    # num_threads -- confirm this is intended.
    session_options = {}
    if num_threads:
        session_options = dict(intra_op_parallelism_threads=1)
    session = tf.Session(config=tf.ConfigProto(**session_options))

    # Placeholder shapes are derived from the training arrays.
    img_rows, img_cols, channels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    fit_args = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate
    }
    eval_args = {"batch_size": batch_size}

    logits = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=label_smoothing)

    def train_evaluation():
        """
        Callback handed to ``train``; delegates to the project ``evaluate``.
        """
        evaluate(session, x, y, logits, x_train, y_train, x_test, y_test,
                 eval_args)

    train(session, loss, x_train, y_train, evaluate=train_evaluation,
          args=fit_args, var_list=model.get_params())

    # The model is saved with this session installed as the default one.
    with session.as_default():
        save("models/joblibs/" + file_name, model)
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0,
                     test_end=10000, nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE, architecture=ARCHITECTURE,
                     load_model=LOAD_MODEL, ckpt_dir='None',
                     learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS, num_threads=None,
                     label_smoothing=0.):
    """
    CIFAR10 cleverhans tutorial

    Note that several paths through this function terminate the process
    with ``exit(0)`` instead of returning (after saving alignments, after
    saving PGD samples, and at the end of the clean-training evaluation).

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param architecture: classifier architecture, 'ConvNet' or 'ResNet';
                         any other value raises an Exception
    :param load_model: if True, restore the model from a checkpoint instead
                       of training it
    :param ckpt_dir: directory holding the checkpoint sub-directories; the
                     string 'None' selects './models/'
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: number of filters passed to ModelAllConvolutional
    :param num_threads: if truthy, number of intra-op threads for the
                        TF session
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: the AccuracyReport object created at the start of the function
             (this function does not set any of its fields)
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Seed TF and NumPy from the wall clock, so every run uses different
    # seeds (runs are intentionally NOT reproducible).
    tf.set_random_seed(int(time.time() * 1000) % 2**31)
    np.random.seed(int(time.time() * 1001) % 2**31)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session. Honour the requested thread count; it used to be
    # hard-coded to 1 regardless of the value passed in.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=int(num_threads))
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get CIFAR10 data
    data = CIFAR10(train_start=train_start, train_end=train_end,
                   test_start=test_start, test_end=test_end)
    dataset_size = data.x_train.shape[0]
    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(
        lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)
    x_train, y_train = data.get_set('train')

    # Optionally replace the training arrays by pre-computed samples
    # (clean inputs, labels and their PGD counterparts) loaded from disk.
    pgd_train = None
    if FLAGS.load_pgd_train_samples:
        pgd_path = os.path.expanduser('~/data/advhyp/{}/samples'.format(
            FLAGS.load_pgd_train_samples))
        x_train = np.load(os.path.join(pgd_path, 'train_clean.npy'))
        y_train = np.load(os.path.join(pgd_path, 'train_y.npy'))
        pgd_train = np.load(os.path.join(pgd_path, 'train_pgd.npy'))
        # Move a size-3 axis 1 to the last position.
        if x_train.shape[1] == 3:
            x_train = x_train.transpose((0, 2, 3, 1))
            pgd_train = pgd_train.transpose((0, 2, 3, 1))
        # Expand 1-D integer labels to one-hot rows.
        if len(y_train.shape) == 1:
            y_tmp = np.zeros((len(y_train), np.max(y_train) + 1),
                             y_train.dtype)
            y_tmp[np.arange(len(y_tmp)), y_train] = 1.
            y_train = y_tmp

    x_test, y_test = data.get_set('test')
    pgd_test = None
    if FLAGS.load_pgd_test_samples:
        pgd_path = os.path.expanduser('~/data/advhyp/{}/samples'.format(
            FLAGS.load_pgd_test_samples))
        x_test = np.load(os.path.join(pgd_path, 'test_clean.npy'))
        y_test = np.load(os.path.join(pgd_path, 'test_y.npy'))
        pgd_test = np.load(os.path.join(pgd_path, 'test_pgd.npy'))
        if x_test.shape[1] == 3:
            x_test = x_test.transpose((0, 2, 3, 1))
            pgd_test = pgd_test.transpose((0, 2, 3, 1))
        if len(y_test.shape) == 1:
            y_tmp = np.zeros((len(y_test), np.max(y_test) + 1), y_test.dtype)
            y_tmp[np.arange(len(y_tmp)), y_test] = 1.
            y_test = y_tmp

    # Shuffle the training set, and the first FLAGS.test_size test examples,
    # keeping inputs, labels and PGD samples aligned.
    train_idcs = np.arange(len(x_train))
    np.random.shuffle(train_idcs)
    x_train, y_train = x_train[train_idcs], y_train[train_idcs]
    if pgd_train is not None:
        pgd_train = pgd_train[train_idcs]
    test_idcs = np.arange(len(x_test))[:FLAGS.test_size]
    np.random.shuffle(test_idcs)
    x_test, y_test = x_test[test_idcs], y_test[test_idcs]
    if pgd_test is not None:
        pgd_test = pgd_test[test_idcs]

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Training, evaluation and attack parameters
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    pgd_params = {
        # ord: ,
        'eps': FLAGS.eps,
        'eps_iter': (FLAGS.eps / 5),
        'nb_iter': 10,
        'clip_min': 0,
        'clip_max': 255
    }
    cw_params = {
        'binary_search_steps': FLAGS.cw_search_steps,
        'max_iterations': FLAGS.cw_steps,  # 1000
        'abort_early': True,
        'learning_rate': FLAGS.cw_lr,
        'batch_size': batch_size,
        'confidence': 0,
        'initial_const': FLAGS.cw_c,
        'clip_min': 0,
        'clip_max': 255
    }

    # Madry doesn't divide by 255, so scale everything by 255 to match the
    # clip range used by the attacks above.
    x_train *= 255
    x_test *= 255
    if pgd_train is not None:
        pgd_train *= 255
    if pgd_test is not None:
        pgd_test *= 255

    print('x_train amin={} amax={}'.format(np.amin(x_train),
                                           np.amax(x_train)))
    print('x_test amin={} amax={}'.format(np.amin(x_test), np.amax(x_test)))

    print(
        'clip_min : {}, clip_max : {} >> CHECK WITH WHICH VALUES THE CLASSIFIER WAS PRETRAINED !!! <<'
        .format(pgd_params['clip_min'], pgd_params['clip_max']))

    rng = np.random.RandomState()  # [2017, 8, 30]
    debug_dict = dict() if FLAGS.save_debug_dict else None

    def do_eval(preds, x_set, y_set, report_key, is_adv=None, predictor=None,
                x_adv=None):
        """
        Evaluate accuracy on (x_set, y_set), print it and log it to swriter.

        Without a predictor the accuracy comes from model_eval. With a
        predictor, the plain accuracy is first evaluated (and logged) by a
        recursive call, then the predictor is fed the inputs batch by batch;
        each row it yields is read as (predicted label, detection flag).
        ``report_key`` is accepted for call compatibility but is not used.
        """
        if predictor is None:
            acc = model_eval(sess, x, y, preds, x_set, y_set,
                             args=eval_params)
        else:
            # NOTE(review): the nesting of this branch was reconstructed from
            # a whitespace-collapsed source - confirm against the original.
            do_eval(preds, x_set, y_set, report_key, is_adv=is_adv)
            if x_adv is not None:
                # Materialise the adversarial inputs produced by the graph.
                x_set_adv, = batch_eval(sess, [x], [x_adv], [x_set],
                                        batch_size=batch_size)
                assert x_set.shape == x_set_adv.shape
                x_set = x_set_adv
            n_batches = math.ceil(x_set.shape[0] / batch_size)
            p_set, p_det = np.concatenate([
                predictor.send(x_set[b * batch_size:(b + 1) * batch_size])
                for b in tqdm.trange(n_batches)
            ]).T
            acc = np.equal(p_set, y_set[:len(p_set)].argmax(-1)).mean()
            # if is_adv:
            #     import IPython ; IPython.embed() ; exit(1)
            if FLAGS.save_debug_dict:
                debug_dict['x_set'] = x_set
                debug_dict['y_set'] = y_set
                ddfn = 'logs/debug_dict_{}.pkl'.format(
                    'adv' if is_adv else 'clean')
                # Never overwrite an existing dump.
                if not os.path.exists(ddfn):
                    with open(ddfn, 'wb') as f:
                        pickle.dump(debug_dict, f)
                debug_dict.clear()

        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples %s: %0.4f' %
                  (report_text, 'with correction'
                   if predictor is not None else 'without correction', acc))
        if is_adv is not None:
            label = 'test_acc_{}_{}'.format(
                report_text, 'corrected' if predictor else 'uncorrected')
            swriter.add_scalar(label, acc)
            if predictor is not None:
                # Fraction of inputs whose detection flag equals is_adv.
                detect = np.equal(p_det, is_adv).mean()
                label = 'test_det_{}_{}'.format(
                    report_text, 'corrected' if predictor else 'uncorrected')
                print(label, detect)
                swriter.add_scalar(label, detect)
                # Accuracy restricted to the inputs whose flag equals is_adv.
                label = 'test_dac_{}_{}'.format(
                    report_text, 'corrected' if predictor else 'uncorrected')
                swriter.add_scalar(
                    label,
                    np.equal(p_set, y_set[:len(p_set)].argmax(-1))[np.equal(
                        p_det, is_adv)].mean())

        return acc

    if clean_train:
        if architecture == 'ConvNet':
            model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                          input_shape=[32, 32, 3])
        elif architecture == 'ResNet':
            model = ResNet(scope='ResNet')
        else:
            raise Exception('Specify valid classifier architecture!')
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        if load_model:
            model_name = 'naturally_trained'
            if FLAGS.load_adv_trained:
                model_name = 'adv_trained'
            # Compare by value: an identity test against the literal 'None'
            # only works for interned strings and breaks for an equal string
            # coming from flags or the command line.
            if ckpt_dir != 'None':
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(os.path.expanduser(ckpt_dir), model_name))
            else:
                ckpt = tf.train.get_checkpoint_state('./models/' + model_name)
            ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

            # Restore under variable names with the leading scope component
            # (everything up to the first '/') and the ':N' suffix removed.
            saver = tf.train.Saver(var_list=dict(
                (v.name.split('/', 1)[1].split(':')[0], v)
                for v in tf.global_variables()))
            saver.restore(sess, ckpt_path)
            print('\nMODEL SUCCESSFULLY LOADED from : {}'.format(ckpt_path))
            initialize_uninitialized_global_variables(sess)

        else:

            def evaluate():
                do_eval(preds, x_test, y_test, 'clean_train_clean_eval',
                        False)

            train(sess, loss, None, None, dataset_train=dataset_train,
                  dataset_size=dataset_size, evaluate=evaluate,
                  args=train_params, rng=rng, var_list=model.get_params())

        # Walk back from the logits to the first 'MatMul' op on the path;
        # its two inputs are taken as the latent features and the weights.
        logits_op = preds.op
        while logits_op.type != 'MatMul':
            logits_op = logits_op.inputs[0].op
        latent_x_tensor, weights = logits_op.inputs
        logits_tensor = preds

        nb_classes = weights.shape[-1].value

        if not FLAGS.save_pgd_samples:
            noise_eps = FLAGS.noise_eps.split(',')
            if FLAGS.noise_eps_detect is None:
                FLAGS.noise_eps_detect = FLAGS.noise_eps
            noise_eps_detect = FLAGS.noise_eps_detect.split(',')
            if pgd_train is not None:
                pgd_train = pgd_train[:FLAGS.n_collect]
            if not FLAGS.passthrough:
                predictor = tf_robustify.collect_statistics(
                    x_train[:FLAGS.n_collect],
                    y_train[:FLAGS.n_collect],
                    x,
                    sess,
                    logits_tensor=logits_tensor,
                    latent_x_tensor=latent_x_tensor,
                    weights=weights,
                    nb_classes=nb_classes,
                    p_ratio_cutoff=FLAGS.p_ratio_cutoff,
                    noise_eps=noise_eps,
                    noise_eps_detect=noise_eps_detect,
                    pgd_eps=pgd_params['eps'],
                    pgd_lr=pgd_params['eps_iter'] / pgd_params['eps'],
                    pgd_iters=pgd_params['nb_iter'],
                    save_alignments_dir='logs/stats'
                    if FLAGS.save_alignments else None,
                    load_alignments_dir=os.path.expanduser(
                        '~/data/advhyp/madry/stats')
                    if FLAGS.load_alignments else None,
                    clip_min=pgd_params['clip_min'],
                    clip_max=pgd_params['clip_max'],
                    batch_size=batch_size,
                    num_noise_samples=FLAGS.num_noise_samples,
                    debug_dict=debug_dict,
                    debug=FLAGS.debug,
                    targeted=False,
                    pgd_train=pgd_train,
                    fit_classifier=FLAGS.fit_classifier,
                    clip_alignments=FLAGS.clip_alignments,
                    just_detect=FLAGS.just_detect)
            else:

                def _predictor():
                    # Pass-through predictor: the plain argmax prediction
                    # paired with an all-zero detection flag.
                    _x = yield
                    while (_x is not None):
                        _y = sess.run(preds, {x: _x}).argmax(-1)
                        _x = yield np.stack((_y, np.zeros_like(_y)), -1)

                predictor = _predictor()
                # Prime the generator so that .send() can be used on it.
                next(predictor)
            if FLAGS.save_alignments:
                exit(0)

            # Evaluate the accuracy of the model on clean examples
            acc_clean = do_eval(preds, x_test, y_test,
                                'clean_train_clean_eval', False,
                                predictor=predictor)

        # Initialize the attack object and graph selected by FLAGS.attack
        if FLAGS.attack == 'pgd':
            pgd = MadryEtAl(model, sess=sess)
            adv_x = pgd.generate(x, **pgd_params)
        elif FLAGS.attack == 'cw':
            cw = CarliniWagnerL2(model, sess=sess)
            adv_x = cw.generate(x, **cw_params)
        elif FLAGS.attack == 'mean':
            pgd = MadryEtAl(model, sess=sess)
            mean_eps = FLAGS.mean_eps * FLAGS.eps

            def _attack_mean(x):
                # Attack FLAGS.mean_samples uniformly-noised copies of one
                # input and clip each result back into the eps-box around
                # the original input and into [0, 255].
                x_many = tf.tile(x[None], (FLAGS.mean_samples, 1, 1, 1))
                x_noisy = x_many + tf.random_uniform(x_many.shape, -mean_eps,
                                                     mean_eps)
                x_noisy = tf.clip_by_value(x_noisy, 0, 255)
                x_pgd = pgd.generate(x_noisy, **pgd_params)
                x_clip = tf.minimum(x_pgd, x_many + FLAGS.eps)
                x_clip = tf.maximum(x_clip, x_many - FLAGS.eps)
                x_clip = tf.clip_by_value(x_clip, 0, 255)
                return x_clip

            adv_x = tf.map_fn(_attack_mean, x)
            # Average over the noisy copies of each input.
            adv_x = tf.reduce_mean(adv_x, 1)

        preds_adv = model.get_logits(adv_x)

        if FLAGS.save_pgd_samples:
            # The loop variable is deliberately not called `y`: that would
            # rebind the label placeholder that do_eval closes over.
            for ds, labels, name in ((x_train, y_train, 'train'),
                                     (x_test, y_test, 'test')):
                train_batches = math.ceil(len(ds) / FLAGS.batch_size)
                train_pgd = np.concatenate([
                    sess.run(
                        adv_x, {
                            x:
                            ds[b * FLAGS.batch_size:(b + 1) *
                               FLAGS.batch_size]
                        }) for b in tqdm.trange(train_batches)
                ])
                # Samples are stored divided by 255, undoing the scaling
                # applied above.
                np.save('logs/{}_clean.npy'.format(name), ds / 255.)
                np.save('logs/{}_y.npy'.format(name), labels)
                train_pgd /= 255.
                np.save('logs/{}_pgd.npy'.format(name), train_pgd)
            # NOTE(review): assumed to run once after both sets are saved -
            # confirm against the original indentation.
            exit(0)

        # Evaluate the accuracy of the model on adversarial examples
        if not FLAGS.load_pgd_test_samples:
            acc_pgd = do_eval(preds_adv, x_test, y_test,
                              'clean_train_adv_eval', True,
                              predictor=predictor, x_adv=adv_x)
        else:
            acc_pgd = do_eval(preds, pgd_test, y_test,
                              'clean_train_adv_eval', True,
                              predictor=predictor)
        swriter.add_scalar('test_acc_mean', (acc_clean + acc_pgd) / 2., 0)

        print('Repeating the process, using adversarial training')

        # NOTE(review): assumed to sit inside `if clean_train:` so that
        # adversarial training below is reached only when clean_train is
        # False - confirm against the original indentation.
        exit(0)

    # Create a new model and train it to be robust to MadryEtAl
    if architecture == 'ConvNet':
        model2 = ModelAllConvolutional('model2', nb_classes, nb_filters,
                                       input_shape=[32, 32, 3])
    elif architecture == 'ResNet':
        # Bind model2 (this branch used to bind `model`, leaving model2
        # undefined for the statements below).
        model2 = ResNet()
    else:
        raise Exception('Specify valid classifier architecture!')

    pgd2 = MadryEtAl(model2, sess=sess)

    def attack(x):
        return pgd2.generate(x, **pgd_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For some attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    if load_model:
        if ckpt_dir != 'None':
            ckpt = tf.train.get_checkpoint_state(
                os.path.join(os.path.expanduser(ckpt_dir), 'adv_trained'))
        else:
            ckpt = tf.train.get_checkpoint_state('./models/adv_trained')
        ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

        # Load outside of an `assert` so the model is still loaded when
        # Python runs with -O; the exception type is kept unchanged.
        if not (ckpt_path and tf_model_load(sess, file_path=ckpt_path)):
            raise AssertionError('\nMODEL LOADING FAILED')
        print('\nMODEL SUCCESSFULLY LOADED from : {}'.format(ckpt_path))
        initialize_uninitialized_global_variables(sess)

    else:

        def evaluate2():
            # Accuracy of adversarially trained model on legitimate test
            # inputs
            do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
            # Accuracy of the adversarially trained model on adversarial
            # examples
            do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

        # Perform and evaluate adversarial training
        train(sess, loss2, None, None, dataset_train=dataset_train,
              dataset_size=dataset_size, evaluate=evaluate2,
              args=train_params, rng=rng, var_list=model2.get_params())

    # Evaluate model
    do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
    do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    return report
# For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, x_train, y_train, evaluate=evaluate2, args=train_params, rng=rng, var_list=model2.get_params()) # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   clean_train=CLEAN_TRAIN, testing=False, preprocess='',
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS, num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial

    The data comes from ``get_MNIST_67_preprocess(preprocess=preprocess)``;
    the train/test index parameters are kept for interface compatibility but
    are not used. The labels are pickled under ``../pickle/`` and the trained
    models are saved under ``../models/``. When ``clean_train`` is true the
    process exits after the clean model has been trained and saved.

    :param train_start: index of first training set example (unused)
    :param train_end: index of last training set example (unused)
    :param test_start: index of first test set example (unused)
    :param test_end: index of last test set example (unused)
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param preprocess: value forwarded to get_MNIST_67_preprocess and used
                       in the names of the saved model files
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: number of filters passed to make_basic_picklable_cnn
    :param num_threads: if truthy, number of intra-op threads for the
                        TF session
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session. Honour the requested thread count; it used to be
    # hard-coded to 1 regardless of the value passed in.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=int(num_threads))
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    ### CHANGE DATASET ###
    # Get MNIST data
    # mnist = MNIST_67(train_start=train_start, train_end=train_end,
    #                  test_start=test_start, test_end=test_end)
    # x_train, y_train = mnist.get_set('train')
    # x_test, y_test = mnist.get_set('test')
    x_train, y_train, x_test, y_test = get_MNIST_67_preprocess(
        preprocess=preprocess)

    # Persist the labels alongside the run.
    with open('../pickle/{}_y_train.pickle'.format(FILENAME),
              'wb') as handle:
        pickle.dump(y_train, handle)
    with open('../pickle/{}_y_test.pickle'.format(FILENAME), 'wb') as handle:
        pickle.dump(y_test, handle)

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    ### ADD PARAMETERS ###
    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """
        Evaluate `preds` on (x_set, y_set), store the accuracy on the report
        under `report_key`, and print it when `is_adv` is not None.
        """
        acc = model_eval(sess, x, y, preds, x_set, y_set, save_logit=True,
                         filename=FLAGS.filename + "_" + report_key,
                         args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        ### picklable ###
        #model = ModelBasicCNN('model1', nb_classes, nb_filters)
        model = make_basic_picklable_cnn(nb_filters=nb_filters,
                                         nb_classes=nb_classes)

        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Now, save the graph
        with sess.as_default():
            save("../models/CNN_{}.joblib".format(preprocess), model)

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # NOTE(review): this exit() is assumed to sit at the `if clean_train:`
        # level, which stops the process once the clean model is saved and
        # makes the rest of this branch unreachable - confirm against the
        # original indentation.
        exit()

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    ### picklable ###
    #model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    model2 = make_basic_picklable_cnn(nb_filters=nb_filters,
                                      nb_classes=nb_classes)

    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    # Now, save the graph
    with sess.as_default():
        save("../models/{}_{}.joblib".format(FILENAME, preprocess), model2)

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def mnist_ae(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, num_threads=None, label_smoothing=0.1): report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) rng = np.random.RandomState() source_samples = 10 # Create TF session sess = tf.Session() print("Created TensorFlow session.") set_log_level(logging.DEBUG) if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get CIFAR10 data data = CIFAR10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) dataset_size = data.x_train.shape[0] dataset_train = data.to_tensorflow()[0] dataset_train = dataset_train.map( lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) dataset_train = dataset_train.batch(batch_size) dataset_train = dataset_train.prefetch(16) x_train, y_train = data.get_set('train') x_test, y_test = data.get_set('test') nb_latent_size = 100 # Get MNIST test data # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] print("img_Rows, img_cols, nchannels: ", img_rows, img_cols, nchannels) # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) x_t = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) y_t = tf.placeholder( tf.float32, shape=(None, nb_classes)) #z = tf.placeholder(tf.float32, shape = (None, nb_latent_size)) #z_t = tf.placeholder(tf.float32, shape = (None, nb_latent_size)) ''' save_dir= 'models' model_name = 'cifar10_AE.h5' model_path_ae = os.path.join(save_dir, model_name) ''' #model_ae= ae_model(x, img_rows=img_rows, img_cols=img_cols, # channels=nchannels) #recon = 
model_ae(x) #print("recon: ",recon) wrap_ae = ModelVAE('wrap_ae') recon = wrap_ae.get_layer(x,'RECON') print("Defined TensorFlow model graph.") def evaluate_ae(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': 128} noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(sess, x, x_t,recon, x_train, x_train, args=eval_params) print("reconstruction distance: ", d1) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': train_dir_ae, #'filename': filename } rng = np.random.RandomState([2017, 8, 30]) #if not os.path.exists(train_dir_ae): # os.mkdir(train_dir_ae) #ckpt = tf.train.get_checkpoint_state(train_dir_ae) #print(train_dir_ae, ckpt) #ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path if clean_train_vae==True: print("Training VAE") loss = vae_loss(wrap_ae) train_ae(sess, loss, x_train, x_train, tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), evaluate=evaluate_ae, args=train_params, rng=rng, var_list = wrap_ae.get_params()) saver = tf.train.Saver() saver.save(sess, "train_dir/model_vae_fgsm.ckpt") print("saved model") else: print("Loading VAE") saver = tf.train.Saver() #print(ckpt_path) saver.restore(sess, "train_dir/model_vae.ckpt") evaluate_ae() if(train_further): train_params = { 'nb_epochs': 10, 'batch_size': batch_size, 'learning_rate': 0.0002, } #training with the saved model as starting point loss = SquaredError(wrap_ae) train_ae(sess, loss, x_train, x_train, optimizer = tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), evaluate=evaluate_ae, args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_vae_fgsm.ckpt") evaluate_ae() print("Model loaded and trained for more epochs") num_classes = 10 ''' save_dir= 'models' model_name = 'cifar10_CNN.h5' model_path_cls = os.path.join(save_dir, model_name) ''' cl_model = cnn_cl_model(img_rows=img_rows, img_cols=img_cols, 
channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds_cl = cl_model(x) def do_eval_cls(preds, x_set, y_set, x_tar_set,report_key, is_adv = None): acc = model_eval(sess, x, y, preds, x_t, x_set, y_set, x_tar_set, args=eval_params_cls) def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_cl,x_t, x_test, y_test, x_test,args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) train_params = { 'nb_epochs': 100, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': train_dir_cl, #'filename': filename } rng = np.random.RandomState([2017, 8, 30]) wrap_cl = KerasModelWrapper(cl_model) if clean_train_cl == True: train_params = { 'nb_epochs': 5, 'batch_size': batch_size, 'learning_rate': learning_rate, #'train_dir': train_dir_cl, #'filename': filename } print("Training CNN Classifier") ''' datagen = ImageDataGenerator( rotation_range=15, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True, ) datagen.fit(x_train) ''' loss_cl = CrossEntropy(wrap_cl, smoothing=label_smoothing) #for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size = 128): # train(sess, loss_cl, x_batch, y_batch, tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), evaluate=evaluate, # args=train_params, rng=rng) train(sess, loss_cl, x_train, y_train, evaluate=evaluate, optimizer = tf.train.RMSPropOptimizer(learning_rate = 0.0001, decay = 1e-6), args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_cnn_cl.ckpt") print("saved model at ", "train_dir/model_cnn_cl_fgsm.ckpt") else: print("Loading CNN Classifier") saver = tf.train.Saver() #print(ckpt_path) saver.restore(sess, "train_dir/model_cnn_cl.ckpt") evaluate() if(train_further): train_params = { 'nb_epochs': 10, 'batch_size': batch_size, 
'learning_rate': learning_rate, 'train_dir': train_dir_cl, 'filename': filename } loss_cl = CrossEntropy(wrap_cl, smoothing=label_smoothing) train(sess, loss_cl, x_train, y_train, evaluate=evaluate, optimizer = tf.train.RMSPropOptimizer(learning_rate = 0.0001, decay = 1e-6), args=train_params, rng=rng) saver = tf.train.Saver() saver.save(sess, "train_dir/model_cl_fgsm.ckpt") print("Model loaded and trained further") evaluate() ########################################################################### # Craft adversarial examples using Carlini and Wagner's approach ########################################################################### nb_adv_per_sample = str(nb_classes - 1) if targeted else '1' print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample + ' adversarial examples') print("This could take some time ...") # Instantiate a CW attack object #cw = CarliniWagnerAE(wrap_ae,wrap_cl, sess=sess) if viz_enabled: assert source_samples == nb_classes idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)] if targeted: if viz_enabled: # Initialize our array for grid visualization grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels) grid_viz_data = np.zeros(grid_shape, dtype='f') grid_viz_data_1 = np.zeros(grid_shape, dtype='f') adv_inputs = np.array( [[instance] * (nb_classes-1) for instance in x_test[idxs]], dtype=np.float32) #adv_input_y = np.array([[instance]*(nb_classes-1) for instance in y_test[idxs]]) adv_input_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes-1): targ.append(y_test[idxs[curr_num]]) adv_input_y.append(targ) adv_input_y = np.array(adv_input_y) adv_target_y = [] for curr_num in range(nb_classes): targ = [] for id in range(nb_classes): if(id!=curr_num): targ.append(y_test[idxs[id]]) adv_target_y.append(targ) adv_target_y = np.array(adv_target_y) #print("adv_input_y: \n", adv_input_y) #print("adv_target_y: \n", adv_target_y) adv_input_targets = [] for curr_num in 
range(nb_classes): targ = [] for id in range(nb_classes): if(id!=curr_num): targ.append(x_test[idxs[id]]) adv_input_targets.append(targ) adv_input_targets = np.array(adv_input_targets) adv_inputs = adv_inputs.reshape( (source_samples * (nb_classes-1), img_rows, img_cols, nchannels)) adv_input_targets = adv_input_targets.reshape( (source_samples * (nb_classes-1), img_rows, img_cols, nchannels)) adv_input_y = adv_input_y.reshape(source_samples*(nb_classes-1), 10) adv_target_y = adv_target_y.reshape(source_samples*(nb_classes-1), 10) one_hot = np.zeros((nb_classes, nb_classes)) one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1 adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape((source_samples * nb_classes, nb_classes)) yname = "y_target" fgsm_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1. } fgsm = FastGradientMethodAe(wrap_ae, sess=sess) adv = fgsm.generate(x,x_t, **fgsm_params) adv = sess.run(adv, {x: adv_inputs, x_t: adv_input_targets}) recon_orig = wrap_ae.get_layer(x, 'RECON') recon_orig = sess.run(recon_orig, feed_dict = {x: adv_inputs}) recon_adv = wrap_ae.get_layer(x, 'RECON') recon_adv = sess.run(recon_adv, feed_dict = {x: adv}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x:recon_adv}) #scores1 = cl_model.evaluate(recon_adv, adv_input_y, verbose=1) #scores2 = cl_model.evaluate(recon_adv, adv_target_y, verbose = 1) #acc_1 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_target_y, adv_input_targets, args=eval_params_cls) #acc_2 = model_eval(sess, x, y, pred_adv_recon, x_t, adv_inputs, adv_input_y, adv_input_targets, args=eval_params_cls) shape = np.shape(adv_inputs) noise = np.sum(np.square(adv-adv_inputs))/(np.shape(adv)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_target_y, 
axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. 
L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Finally, block & display a grid of all the adversarial examples if viz_enabled: plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig1') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig2') if adversarial_training: print("starting adversarial training") index_shuf = list(range(len(x_train))) x_train_target = x_train[index_shuf] y_train_target = y_train[index_shuf] # Randomly repeat a few training examples each epoch to avoid # having a too-small batch ''' while len(index_shuf) % batch_size != 0: index_shuf.append(rng.randint(len(x_train))) nb_batches = len(index_shuf) // batch_size rng.shuffle(index_shuf) # Shuffling here versus inside the loop doesn't seem to affect # timing very much, but shuffling here makes the code slightly # easier to read ''' print("len of x_train_target and x_train: ", len(x_train_target), len(x_train)) for ind in range (0, len(x_train)): r_ind = -1 while(np.argmax(y_train_target[ind])==np.argmax(y_train[ind])): r_ind = rng.randint(0,len(x_train)) y_train_target[ind] = y_train[r_ind] if r_ind>-1: x_train_target[ind] = x_train[r_ind] wrap_ae2 = ModelVAE('wrap_ae2') fgsm2 = FastGradientMethodAe(wrap_ae2, sess=sess) adv2 = fgsm.generate(x,x_t, **fgsm_params) adv_set = sess.run(adv2, {x: 
x_train, x_t: x_train_target}) x_train_aim = np.append(x_train, x_train, axis = 0) x_train_app = np.append(x_train, adv_set, axis = 0) loss2 = vae_loss(wrap_ae2) train_params = { 'nb_epochs': 5, 'batch_size': batch_size, 'learning_rate': learning_rate} train_ae(sess, loss2, x_train_app, x_train_aim, tf.train.AdamOptimizer(learning_rate=0.0002, beta1=0.5), args=train_params, rng=rng, var_list = wrap_ae2.get_params()) evaluate_ae() adv3 = fgsm2.generate(x, x_t, **fgsm_params) adv3 = sess.run(adv3, {x: adv_inputs, x_t: adv_input_targets}) recon_orig2 = wrap_ae2.get_layer(x, 'RECON') recon_orig2 = sess.run(recon_orig2, feed_dict = {x: adv_inputs}) recon_adv2 = wrap_ae2.get_layer(x, 'RECON') recon_adv2 = sess.run(recon_adv2, feed_dict = {x: adv3}) pred_adv_recon2 = wrap_cl.get_logits(x) pred_adv_recon2 = sess.run(pred_adv_recon2, {x:recon_adv2}) shape = np.shape(adv_inputs) noise = np.sum(np.square(adv3-adv_inputs))/(np.shape(adv3)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv2-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv2-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon2, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon2, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) #print("recon_adv[0]\n", recon_adv[0,:,:,0]) curr_class = 0 if viz_enabled: for j in range(nb_classes): if targeted: for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig2[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv2[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv3[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv2[i*(nb_classes-1) + j] 
grid_viz_data_1[i,j] = adv3[i*(nb_classes-1)+j] #rint(grid_viz_data.shape) print('--------------------------------------') # Compute the number of adversarial examples that were successfully found # Compute the average distortion introduced by the algorithm percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5) print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed)) # Finally, block & display a grid of all the adversarial examples if viz_enabled: plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fgsm_adv_fig1') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_vae_fgsm_adv_fig2') #return report if binarization: print("----------------") print("BINARIZATION") adv[adv>0.5] = 1.0 adv[adv<=0.5] = 0.0 recon_orig = wrap_ae.get_layer(x, 'RECON') recon_adv = wrap_ae.get_layer(x, 'RECON') #pred_adv = wrap_cl.get_logits(x) recon_orig = sess.run(recon_orig, {x: adv_inputs}) recon_adv = sess.run(recon_adv, {x: adv}) #pred_adv = sess.run(pred_adv, {x: recon_adv}) pred_adv_recon = wrap_cl.get_logits(x) pred_adv_recon = sess.run(pred_adv_recon, {x:recon_adv}) eval_params = {'batch_size': 90} if targeted: noise = np.sum(np.square(adv-adv_inputs))/(np.shape(adv)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = 
np.sum(np.square(recon_adv-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy* num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig1_bin') figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data_1[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig2_bin') if(mean_filtering ==True): print("----------------") print("MEAN FILTERING") adv = uniform_filter(adv, 2) 
recon_orig = wrap_ae.get_layer(x, 'RECON') recon_adv = wrap_ae.get_layer(x, 'RECON') pred_adv_recon = wrap_cl.get_logits(x) recon_orig = sess.run(recon_orig, {x: adv_inputs}) recon_adv = sess.run(recon_adv, {x: adv}) pred_adv_recon = sess.run(pred_adv_recon, {x: recon_adv}) eval_params = {'batch_size': 90} noise = np.sum(np.square(adv-adv_inputs))/(np.shape(adv)[0]) noise = pow(noise,0.5) d1 = np.sum(np.square(recon_adv-adv_inputs))/(np.shape(adv_inputs)[0]) d2 = np.sum(np.square(recon_adv-adv_input_targets))/(np.shape(adv_inputs)[0]) acc_1 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_target_y, axis=-1)))/(np.shape(adv_target_y)[0]) acc_2 = (sum(np.argmax(pred_adv_recon, axis=-1)== np.argmax(adv_input_y, axis=-1)))/(np.shape(adv_target_y)[0]) print("noise: ", noise) print("d1: ", d1) print("d2: ", d2) print("classifier acc_target: ", acc_1) print("classifier acc_true: ", acc_2) curr_class = 0 if viz_enabled: for j in range(nb_classes): for i in range(nb_classes): #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i] if(i==j): grid_viz_data[i,j] = recon_orig[curr_class*9] grid_viz_data_1[i,j] = adv_inputs[curr_class*9] curr_class = curr_class+1 else: if(j>i): grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1] else: grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j] grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j] plt.ioff() figure = plt.figure() figure.canvas.set_window_title('Cleverhans: Grid Visualization') # Add the images to the plot num_cols = grid_viz_data.shape[0] num_rows = grid_viz_data.shape[1] num_channels = grid_viz_data.shape[4] for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy* num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig1_mean') figure = plt.figure() 
figure.canvas.set_window_title('Cleverhans: Grid Visualization') for yy in range(num_rows): for xx in range(num_cols): figure.add_subplot(num_rows, num_cols, (xx + 1) + (yy * num_cols)) plt.axis('off') if num_channels == 1: plt.imshow(grid_viz_data_1[xx, yy, :, :, 0]) else: plt.imshow(grid_viz_data_1[xx, yy, :, :, :]) # Draw the plot and return plt.savefig('cifar10_fgsm_vae_fig2_mean')
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      targeted=TARGETED):
    """
    MNIST tutorial for Carlini and Wagner's attack

    Trains (or reloads from ``model_path``) a basic CNN on MNIST, crafts
    adversarial examples against it with ``CarliniWagnerL2`` and reports the
    clean accuracy, the attack success rate and the mean L2 distortion.

    :param train_start: index of first training set example
    :param train_end: end index of the training range handed to ``MNIST``
                      (presumably exclusive, like ``test_end`` -- confirm
                      against the ``MNIST`` loader)
    :param test_start: index of first test set example
    :param test_end: end index (exclusive) of the test range; the function
                     asserts the test set has ``test_end - test_start`` rows
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        when set, ``source_samples`` must equal the number
                        of classes (one source image per class is used)
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches (also used for the clean
                       test-set evaluation)
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training (the attack itself
                          uses the module-level ``CW_LEARNING_RATE``)
    :param attack_iterations: passed to the attack as ``max_iterations``
    :param model_path: path to the model file; if ``model_path + ".meta"``
                       exists the model is loaded from it, otherwise it is
                       trained and saved there
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object with ``clean_train_clean_eval`` and
             ``clean_train_adv_eval`` filled in
    :raises AssertionError: if the test set size does not match
                            ``test_end - test_start``, or if ``viz_enabled``
                            is set and ``source_samples`` differs from the
                            number of classes
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()

    # Everything that uses the session lives inside the try block so the
    # session is released even when training or the attack raises.
    try:
        print("Created TensorFlow session.")

        set_log_level(logging.DEBUG)

        # Get MNIST test data
        mnist = MNIST(train_start=train_start, train_end=train_end,
                      test_start=test_start, test_end=test_end)
        x_train, y_train = mnist.get_set('train')
        x_test, y_test = mnist.get_set('test')

        # Obtain Image Parameters
        img_rows, img_cols, nchannels = x_train.shape[1:4]
        nb_classes = y_train.shape[1]

        # Define input TF placeholder
        x = tf.placeholder(tf.float32,
                           shape=(None, img_rows, img_cols, nchannels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))
        nb_filters = 64

        # Define TF model graph
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)
        print("Defined TensorFlow model graph.")

        #######################################################################
        # Training the model using TensorFlow
        #######################################################################

        # Train an MNIST model
        train_params = {
            'nb_epochs': nb_epochs,
            'batch_size': batch_size,
            'learning_rate': learning_rate,
            'filename': os.path.split(model_path)[-1]
        }

        rng = np.random.RandomState([2017, 8, 30])
        # check if we've trained before, and if we have, use that
        # pre-trained model
        if os.path.exists(model_path + ".meta"):
            tf_model_load(sess, model_path)
        else:
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            saver = tf.train.Saver()
            saver.save(sess, model_path)

        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                              args=eval_params)
        assert x_test.shape[0] == test_end - test_start, x_test.shape
        print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
        report.clean_train_clean_eval = accuracy

        #######################################################################
        # Craft adversarial examples using Carlini and Wagner's approach
        #######################################################################
        nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
        print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
              ' adversarial examples')
        print("This could take some time ...")

        # Instantiate a CW attack object
        cw = CarliniWagnerL2(model, sess=sess)

        if viz_enabled:
            assert source_samples == nb_classes
            # First test example of each class, so the grid has one row
            # per digit.
            idxs = [
                np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)
            ]
            source_x = x_test[idxs]
            source_y = y_test[idxs]
            # Initialize our array for grid visualization: a full
            # class-by-class grid when targeted, (original, adversarial)
            # pairs otherwise.
            grid_shape = (nb_classes, nb_classes if targeted else 2,
                          img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')
        else:
            source_x = x_test[:source_samples]
            source_y = y_test[:source_samples]

        if targeted:
            # Repeat every source image once per target class and pair the
            # copies with the one-hot encodings of all classes, in order.
            adv_inputs = np.array(
                [[instance] * nb_classes for instance in source_x],
                dtype=np.float32)
            adv_inputs = adv_inputs.reshape(
                (source_samples * nb_classes, img_rows, img_cols, nchannels))

            one_hot = np.zeros((nb_classes, nb_classes))
            one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
            adv_ys = np.array([one_hot] * source_samples,
                              dtype=np.float32).reshape(
                                  (source_samples * nb_classes, nb_classes))
            yname = "y_target"
            cw_params_batch_size = source_samples * nb_classes
        else:
            adv_inputs = source_x
            adv_ys = None
            yname = "y"
            cw_params_batch_size = source_samples

        cw_params = {
            'binary_search_steps': 1,
            yname: adv_ys,
            'max_iterations': attack_iterations,
            'learning_rate': CW_LEARNING_RATE,
            'batch_size': cw_params_batch_size,
            'initial_const': 10
        }

        adv = cw.generate_np(adv_inputs, **cw_params)

        eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
        if targeted:
            # Success means the model predicts the requested target label.
            adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                      args=eval_params)
        else:
            # Success means the model no longer predicts the true label,
            # i.e. one minus the accuracy measured against the true labels.
            true_label_accuracy = model_eval(sess, x, y, preds, adv, source_y,
                                             args=eval_params)
            adv_accuracy = 1 - true_label_accuracy

        if viz_enabled:
            for j in range(nb_classes):
                if targeted:
                    for i in range(nb_classes):
                        grid_viz_data[i, j] = adv[i * nb_classes + j]
                else:
                    grid_viz_data[j, 0] = adv_inputs[j]
                    grid_viz_data[j, 1] = adv[j]

            print(grid_viz_data.shape)

        print('--------------------------------------')

        # Compute the number of adversarial examples that were successfully
        # found
        print('Avg. rate of successful adv. examples {0:.4f}'.format(
            adv_accuracy))
        report.clean_train_adv_eval = 1. - adv_accuracy

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(
            np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
        print('Avg. L_2 norm of perturbations {0:.4f}'.format(
            percent_perturbed))
    finally:
        # Close TF session
        sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report