def __init__(self, model, image_shape_hwc, epsilon=(16. / 255), num_steps=200, batch_size=32, is_debug=False):
    """Build a dedicated graph and session holding a single-image SPSA attack.

    :param model: model to attack; it is wrapped in
        `CleverhansPyfuncModelWrapper` before being handed to SPSA.
    :param image_shape_hwc: tuple appended to a leading batch dimension of 1
        to form the input placeholder shape (presumably height, width,
        channels, judging by the name).
    :param epsilon: forwarded to `SPSA.generate` as `epsilon`.
    :param num_steps: forwarded to `SPSA.generate` as `num_steps`.
    :param batch_size: forwarded to `SPSA.generate` as `batch_size`.
    :param is_debug: forwarded to `SPSA.generate` as `is_debug`.
    """
    self.model = model
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.Session(graph=self.graph)

        # The attack is built for exactly one image and one label at a time.
        single_image_shape = (1,) + image_shape_hwc
        self.x_input = tf.placeholder(tf.float32, shape=single_image_shape)
        self.y_label = tf.placeholder(tf.int32, shape=(1,))

        wrapped_model = CleverhansPyfuncModelWrapper(self.model)
        spsa_attack = SPSA(wrapped_model, sess=self.sess)
        generate_kwargs = dict(
            y=self.y_label,
            epsilon=epsilon,
            num_steps=num_steps,
            early_stop_loss_threshold=-1.,
            batch_size=batch_size,
            is_debug=is_debug,
        )
        self.x_adv = spsa_attack.generate(self.x_input, **generate_kwargs)

    # No further ops may be added to the graph after this point.
    self.graph.finalize()
def test_attack_success(self):
    """Check SPSA creates misclassified images."""
    num_images = 8
    num_classes = 1001
    epsilon = 4. / 255
    batch_shape = (num_images, 299, 299, 3)

    images, labels = load_images(
        FLAGS.input_image_dir, FLAGS.metadata_file_path, batch_shape)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # Graph construction: the attack processes one image per run.
        single_image_shape = (1,) + batch_shape[1:]
        x_input = tf.placeholder(tf.float32, shape=single_image_shape)
        y_label = tf.placeholder(tf.int32, shape=(1,))
        model = InceptionModel(num_classes)

        spsa_kwargs = dict(
            y=y_label,
            epsilon=epsilon,
            num_steps=30,
            early_stop_loss_threshold=-1.,
            spsa_samples=32,
            spsa_iters=16,
            is_debug=True,
        )
        x_adv = SPSA(model).generate(x_input, **spsa_kwargs)

        adv_logits = model.get_logits(x_adv)
        acc = _top_1_accuracy(adv_logits, y_label)

        # Session setup: restore the model variables from the checkpoint.
        saver = tf.train.Saver(slim.get_model_variables())
        scaffold = tf.train.Scaffold(saver=saver)
        session_creator = tf.train.ChiefSessionCreator(
            scaffold=scaffold,
            checkpoint_filename_with_path=FLAGS.checkpoint_path,
            master=FLAGS.master)

        num_correct = 0.
        with tf.train.MonitoredSession(
                session_creator=session_creator) as sess:
            for idx in xrange(num_images):
                feed = {
                    x_input: np.expand_dims(images[idx], axis=0),
                    y_label: np.expand_dims(labels[idx], axis=0),
                }
                acc_val = sess.run(acc, feed_dict=feed)
                tf.logging.info('Accuracy: %s', acc_val)
                num_correct += acc_val

        # The attack should leave fewer than 10% of the images correct.
        assert (num_correct / num_images) < 0.1
def test_attack_bounds(self):
    """Check SPSA respects perturbation limits.

    For each of the loaded images the adversarial example must stay within
    `epsilon` (L-infinity) of the original and inside the [0, 1] pixel range,
    each with a 1e-4 tolerance.
    """
    epsilon = 4. / 255
    input_dir = FLAGS.input_image_dir
    metadata_file_path = FLAGS.metadata_file_path
    num_images = 8
    batch_shape = (num_images, 299, 299, 3)
    images, labels = load_images(input_dir, metadata_file_path, batch_shape)
    nb_classes = 1001

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    with tf.Graph().as_default():
        # Prepare graph
        x_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(1,) + batch_shape[1:])
        y_label = tf.compat.v1.placeholder(tf.int32, shape=(1,))
        model = InceptionModel(nb_classes)

        attack = SPSA(model)
        x_adv = attack.generate(x_input,
                                y=y_label,
                                epsilon=epsilon,
                                num_steps=10,
                                early_stop_loss_threshold=-1.,
                                spsa_samples=32,
                                spsa_iters=1,
                                is_debug=True)

        # Run computation
        saver = tf.compat.v1.train.Saver(slim.get_model_variables())
        session_creator = tf.compat.v1.train.ChiefSessionCreator(
            scaffold=tf.compat.v1.train.Scaffold(saver=saver),
            checkpoint_filename_with_path=FLAGS.checkpoint_path,
            master=FLAGS.master)

        with tf.compat.v1.train.MonitoredSession(
                session_creator=session_creator) as sess:
            for i in xrange(num_images):
                x_expanded = np.expand_dims(images[i], axis=0)
                y_expanded = np.expand_dims(labels[i], axis=0)

                adv_image = sess.run(x_adv,
                                     feed_dict={
                                         x_input: x_expanded,
                                         y_label: y_expanded
                                     })
                diff = adv_image - images[i]
                assert np.max(np.abs(diff)) < epsilon + 1e-4
                # Reduce the pixel values first and compare afterwards.
                # Comparing first and reducing the boolean mask would let
                # the upper-bound check pass as soon as a single pixel is
                # in range.
                assert np.max(adv_image) < 1. + 1e-4
                assert np.min(adv_image) > -1e-4
class TestSPSA(CleverHansTest):
    """Strength test for the SPSA attack against `SimpleModel`."""

    def setUp(self):
        """Create the session, the model and the attack under test."""
        super(TestSPSA, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = SPSA(self.model, sess=self.sess)

    def test_attack_strength(self):
        """Fewer than 10% of attacked samples may keep their fed label."""
        # The symbolic `generate` graph is driven by hand, one sample at a
        # time; `generate_np` could not be made to work for this attack.
        n_samples = 10
        x_val = np.random.rand(n_samples, 2).astype(np.float32)

        # The SPSA attack currently uses non-one-hot labels
        # TODO: change this to use standard cleverhans label conventions
        feed_labs = np.random.randint(0, 2, n_samples)

        x_input = tf.placeholder(tf.float32, shape=(1, 2))
        y_label = tf.placeholder(tf.int32, shape=(1,))

        x_adv_op = self.attack.generate(
            x_input,
            y=y_label,
            epsilon=.5,
            num_steps=100,
            batch_size=64,
            spsa_iters=1,
        )

        def attack_sample(idx):
            """Run the attack on sample `idx` and return the adversarial row."""
            feed = {
                x_input: np.expand_dims(x_val[idx], axis=0),
                y_label: np.expand_dims(feed_labs[idx], axis=0),
            }
            return self.sess.run(x_adv_op, feed_dict=feed)[0]

        all_x_adv = [attack_sample(idx) for idx in range(n_samples)]
        x_adv = np.vstack(all_x_adv)

        adv_outputs = self.sess.run(self.model(x_adv))
        new_labs = np.argmax(adv_outputs, axis=1)
        unchanged_fraction = np.mean(feed_labs == new_labs)
        self.assertTrue(unchanged_fraction < 0.1)
def test_attack_success(self):
    """Check SPSA creates misclassified images."""
    num_images = 8
    num_classes = 1001
    epsilon = 4. / 255
    batch_shape = (num_images, 299, 299, 3)

    images, labels = load_images(
        FLAGS.input_image_dir, FLAGS.metadata_file_path, batch_shape)

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # Graph construction: the attack processes one image per run.
        single_image_shape = (1,) + batch_shape[1:]
        x_input = tf.placeholder(tf.float32, shape=single_image_shape)
        y_label = tf.placeholder(tf.int32, shape=(1,))
        model = InceptionModel(num_classes)

        spsa_kwargs = dict(
            y=y_label,
            epsilon=epsilon,
            num_steps=30,
            early_stop_loss_threshold=-1.,
            batch_size=32,
            spsa_iters=16,
            is_debug=True,
        )
        x_adv = SPSA(model).generate(x_input, **spsa_kwargs)

        adv_logits = model.get_logits(x_adv)
        acc = _top_1_accuracy(adv_logits, y_label)

        # Session setup: restore the model variables from the checkpoint.
        saver = tf.train.Saver(slim.get_model_variables())
        scaffold = tf.train.Scaffold(saver=saver)
        session_creator = tf.train.ChiefSessionCreator(
            scaffold=scaffold,
            checkpoint_filename_with_path=FLAGS.checkpoint_path,
            master=FLAGS.master)

        num_correct = 0.
        with tf.train.MonitoredSession(
                session_creator=session_creator) as sess:
            for idx in xrange(num_images):
                feed = {
                    x_input: np.expand_dims(images[idx], axis=0),
                    y_label: np.expand_dims(labels[idx], axis=0),
                }
                acc_val = sess.run(acc, feed_dict=feed)
                tf.logging.info('Accuracy: %s', acc_val)
                num_correct += acc_val

        # The attack should leave fewer than 10% of the images correct.
        assert (num_correct / num_images) < 0.1
def test_attack_bounds(self):
    """Check SPSA respects perturbation limits.

    For each of the loaded images the adversarial example must stay within
    `epsilon` (L-infinity) of the original and inside the [0, 1] pixel range,
    each with a 1e-4 tolerance.
    """
    epsilon = 4. / 255
    input_dir = FLAGS.input_image_dir
    metadata_file_path = FLAGS.metadata_file_path
    num_images = 8
    batch_shape = (num_images, 299, 299, 3)
    images, labels = load_images(
        input_dir, metadata_file_path, batch_shape)
    num_classes = 1001

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        # Prepare graph
        x_input = tf.placeholder(tf.float32, shape=(1,) + batch_shape[1:])
        y_label = tf.placeholder(tf.int32, shape=(1,))
        model = InceptionModel(num_classes)

        attack = SPSA(model)
        x_adv = attack.generate(
            x_input, y=y_label, epsilon=epsilon, num_steps=10,
            early_stop_loss_threshold=-1., batch_size=32, spsa_iters=1,
            is_debug=True)

        # Run computation
        saver = tf.train.Saver(slim.get_model_variables())
        session_creator = tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(saver=saver),
            checkpoint_filename_with_path=FLAGS.checkpoint_path,
            master=FLAGS.master)

        with tf.train.MonitoredSession(
                session_creator=session_creator) as sess:
            for i in xrange(num_images):
                adv_image = sess.run(x_adv, feed_dict={
                    x_input: np.expand_dims(images[i], axis=0),
                    y_label: np.expand_dims(labels[i], axis=0),
                })
                diff = adv_image - images[i]
                assert np.max(np.abs(diff)) < epsilon + 1e-4
                # Reduce the pixel values first and compare afterwards.
                # Comparing first and reducing the boolean mask would let
                # the upper-bound check pass as soon as a single pixel is
                # in range.
                assert np.max(adv_image) < 1. + 1e-4
                assert np.min(adv_image) > -1e-4
def iterate_through_cwl2_attacks():
    """Run the targeted Carlini-Wagner L2 attack for every parameter set.

    Images, labels and target classes are loaded from
    `FLAGS.input_image_dir` / `FLAGS.metadata_file_path`.  For each parameter
    combination returned by `expand_param_dict` a fresh graph and session are
    created, the Inception checkpoint at `FLAGS.checkpoint_path` is restored,
    adversarial images are generated batch by batch and saved with `np.save`
    under the `saves` directory, and the accuracy and targeted success rate
    of the model on the adversarial images are logged.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    input_dir = FLAGS.input_image_dir
    metadata_file_path = FLAGS.metadata_file_path
    num_images = len(os.listdir(input_dir))
    batch_shape = (num_images, 299, 299, 3)
    num_classes = 1001
    batch_size = attack_name_to_params[ATTACKS.CARLINI_WAGNER]['batch_size']
    images, labels, target_classes = load_images(
        input_dir, metadata_file_path, batch_shape, num_classes)

    list_param_dict = expand_param_dict(
        attack_name_to_params[ATTACKS.CARLINI_WAGNER],
        attack_name_to_configurable_params[ATTACKS.CARLINI_WAGNER]
    )

    save_dir = 'saves'
    os.makedirs(save_dir, exist_ok=True)

    # Ceiling division; identical for every parameter set, so computed once.
    num_batches = -(-num_images // batch_size)

    for idx, params in enumerate(list_param_dict):
        tf.reset_default_graph()

        logger.info('Running attack with parameters: {}'.format(params))
        logger.info('Current index of parameters: {}/{}'.format(
            idx, len(list_param_dict)))

        # Get save path
        adv_imgs_save_path = get_attack_images_filename_prefix(
            attack_name=ATTACKS.CARLINI_WAGNER,
            params=params,
            model='inception',
            targeted_prefix='targeted'
        )
        adv_imgs_save_path = os.path.join(save_dir, adv_imgs_save_path)

        # Run inference.  The session is used as a context manager so that
        # it is closed before the next parameter set allocates a new one.
        graph = tf.Graph()
        with graph.as_default(), tf.Session(graph=graph) as sess:
            # Prepare graph
            x_input = tf.placeholder(
                tf.float32, shape=(batch_size,) + batch_shape[1:])
            y_label = tf.placeholder(
                tf.int32, shape=(batch_size, num_classes))
            y_target = tf.placeholder(
                tf.int32, shape=(batch_size, num_classes))
            model = InceptionModel(num_classes)

            # Hard-wired switch between the two attacks; the SPSA branch is
            # kept for manual experimentation.
            cwl2 = True
            if cwl2:
                attack = CarliniWagnerL2(model=model, sess=sess)
                x_adv = attack.generate(x_input, y_target=y_target, **params)
            else:
                attack = SPSA(model=model)
                x_adv = attack.generate(
                    x_input, y_target=y_label, epsilon=4. / 255, num_steps=30,
                    early_stop_loss_threshold=-1., batch_size=32,
                    spsa_iters=16, is_debug=True)

            # Evaluation ops read `x_input` directly, so adversarial images
            # are fed back through the same placeholder below.
            logits = model.get_logits(x_input)
            acc = _top_k_accuracy(logits, tf.argmax(y_label, axis=1), k=1)
            success_rate = _top_k_accuracy(
                logits, tf.argmax(y_target, axis=1), k=1)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            saver.restore(sess, save_path=FLAGS.checkpoint_path)

            # NOTE(review): the placeholders have a fixed leading dimension
            # of `batch_size`; a final partial batch looks like it would not
            # fit -- confirm that num_images is a multiple of batch_size.
            list_adv_images = []
            for i in tqdm.tqdm(range(num_batches)):
                batch = slice(i * batch_size, (i + 1) * batch_size)
                feed_dict_i = {x_input: images[batch],
                               y_target: target_classes[batch]}
                adv_img = sess.run(x_adv, feed_dict=feed_dict_i)
                list_adv_images.append(adv_img)

            adv_images = np.concatenate((list_adv_images))
            np.save(adv_imgs_save_path, adv_images)

            acc_store = []
            succ_store = []
            for i in tqdm.tqdm(range(num_batches)):
                batch = slice(i * batch_size, (i + 1) * batch_size)
                feed_dict_i = {x_input: adv_images[batch],
                               y_target: target_classes[batch],
                               y_label: labels[batch]}
                succ_batch, acc_batch = sess.run(
                    [success_rate, acc], feed_dict=feed_dict_i)
                acc_store.extend(acc_batch)
                succ_store.extend(succ_batch)

            logger.info('Accuracy is: {:.4f}'.format(np.mean(acc_store)))
            logger.info('Success Rate is: {:.4f}'.format(np.mean(succ_store)))
def run(args, restrict=True):
    """Run an untargeted SPSA attack against the defended model.

    Writes the defended model's accuracy on `N` randomly chosen natural test
    images and on their adversarial counterparts to
    `<dataset>/<epsilon>_<mode>_<K>.txt`.

    :param args: sequence `(dataset, epsilon, mode, K)`; `dataset` must be
        "MNIST" or "CIFAR", `epsilon` is converted with `float` and `K` with
        `int`.
    :param restrict: when true, pin this process to the GPU derived from the
        multiprocessing worker name (worker "...-<n>" uses device `n - 1`).
    :raises ValueError: if `dataset` is not one of the supported names.
    """
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess.
        # Builtin `int` replaces the `np.int` alias, which newer NumPy
        # releases no longer provide.
        worker_id = int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(worker_id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(K)

    # Configure Keras/Tensorflow
    Keras.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))
    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    # Having this before Keras.clear_session() causes it to hang for some
    # reason
    tf.set_random_seed(1)

    # Load Model/Data and setup SPSA placeholders
    N = 50
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    else:
        # Fail here with a clear message instead of on a later, unrelated
        # line.
        raise ValueError("Unknown dataset: " + str(dataset))
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples
    # from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    # Real representations are labelled -1, adversarial ones +1.
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    model_defended = DefendedModel(base_model, x_train, y_train, K)
    defended_logits = model_defended.get_logits(x)

    # Configure the attack
    attack = SPSA(model_defended, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y=y_spsa,
                              epsilon=epsilon,
                              is_targeted=False,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-5.0)

    # Run the attack.  The context manager closes the report even if the
    # attack raises part-way through.
    with open(fname + ".txt", "w") as f:
        sample = np.random.choice(data.test_data.shape[0], N, replace=False)
        x_sample = data.test_data[sample]
        y_sample = np.argmax(data.test_labels[sample], axis=1)

        logits_nat = sess.run(defended_logits, {x: x_sample})
        f.write("Accuracy on Natural Images: " +
                str(np.mean(np.argmax(logits_nat, axis=1) == y_sample)) + "\n")

        # -1 marks samples whose adversarial prediction is not yet computed.
        pred_adv = -1.0 * np.ones((N))
        for i in range(N):
            x_real = x_sample[i].reshape(shape_spsa)
            x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: y_sample[i]})
            pred_adv[i] = np.argmax(sess.run(defended_logits, {x: x_adv}))

        f.write("Accuracy on Adversarial Images: " +
                str(np.mean(pred_adv == y_sample)))
def run(args, restrict=True):
    """Run targeted SPSA attacks against the defended model and pickle results.

    For each of the first `N` test images, a targeted attack is run towards
    every class in `range(10)` other than the undefended model's prediction.
    Results are pickled to `<dataset>/<epsilon>_<mode>_<K>_<bias>` every ten
    images and once more at the end, then passed to `analysis`.

    :param args: sequence `(dataset, epsilon, mode, K, bias)`; `dataset` must
        be "MNIST" or "CIFAR".
    :param restrict: when true, pin this process to the GPU derived from the
        multiprocessing worker name (worker "...-<n>" uses device `n - 1`).
    :raises ValueError: if `dataset` is not one of the supported names.
    """
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess.
        # Builtin `int` replaces the `np.int` alias, which newer NumPy
        # releases no longer provide.
        worker_id = int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(worker_id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])
    bias = float(args[4])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(
        K) + "_" + str(bias)

    # Configure Keras/Tensorflow
    Keras.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))
    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    # Having this before Keras.clear_session() causes it to hang for some
    # reason
    tf.set_random_seed(1)

    # Load Model/Data and setup SPSA placeholders
    N = 1000
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    else:
        # Fail here with a clear message instead of on a later, unrelated
        # line.
        raise ValueError("Unknown dataset: " + str(dataset))
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples
    # from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    # Real representations are labelled -1, adversarial ones +1.
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    model_defended = DefendedModel(base_model, x_train, y_train, K, bias=bias)
    defended_logits = model_defended.get_logits(x)

    # Get the predictions on the original images
    labels = np.argmax(data.test_labels[:N], axis=1)
    logits_real = sess.run(defended_logits, {x: data.test_data[:N]})
    # False positives of the defense
    fp = (np.argmax(logits_real, axis=1) == 10)
    # Original model prediction: argmax with the last logit column removed
    pred_undefended = np.argmax(np.delete(logits_real, -1, axis=1), axis=1)

    # Configure the attack
    attack = SPSA(model_defended, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y_target=y_spsa,
                              epsilon=epsilon,
                              is_targeted=True,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-5.0)

    # -1 marks (image, target) pairs that have not been attacked.
    pred_adv = -1.0 * np.ones((N, 10))

    def save_results():
        """Pickle the current state of the results to `fname`."""
        out = {
            "FP": fp,
            "Labels": labels,
            "UndefendedPrediction": pred_undefended,
            "AdversarialPredictions": pred_adv,
        }
        with open(fname, "wb") as out_file:
            pickle.dump(out, out_file)

    # Run the attack
    for i in range(N):
        if i % 10 == 0:
            # Periodic checkpoint so a long run can be inspected or resumed.
            print(fname, " ", i)
            save_results()

        x_real = data.test_data[i].reshape(shape_spsa)

        # Try a targeted attack for each class other than the original
        # network prediction and the adversarial class
        for y in range(10):
            if y != pred_undefended[i]:
                x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: y})
                pred_adv[i, y] = np.argmax(
                    sess.run(defended_logits, {x: x_adv}))

    save_results()

    analysis(fname)
# SPSA hyper-parameters; the perturbation budget is the first command-line
# argument.
spsa_params = dict(
    eps=float(sys.argv[1]),
    learning_rate=0.01,
    delta=0.01,
    spsa_samples=128,
    spsa_iters=1,
    nb_iter=100,
    clip_min=0.,
    clip_max=1.,
)

# Symbolic attack graph against the wrapped source model.
spsa_attack = SPSA(wrap_source, sess=sess)
x = tf.placeholder(dtype=tf.float32, shape=(None, 32, 32, 3))
y = tf.placeholder(dtype=tf.float32, shape=(None, 10))
x_adv = spsa_attack.generate(x, y, **spsa_params)

# Attack the selected test samples one at a time; the length-1 slice keeps
# the leading batch dimension on the fed arrays.
X_adv_source = np.zeros((len(indices_test), 32, 32, 3))
for i in range(len(indices_test)):
    X_adv_source[i] = sess.run(
        x_adv,
        feed_dict={
            x: X_test[indices_test[i:i + 1]],
            y: Y_test[indices_test[i:i + 1]],
        })

print("metrics source model")
print(metrics(model_source, X_adv_source, X_test, pred_source, indices_test))

print("metrics base model")
print(metrics(model, X_adv_source, X_test, pred_base, indices_test))

# Predicted classes of both models on the adversarial examples.
pred_source_adv = np.argmax(model_source.predict(X_adv_source), axis=1)
pred_adv_basefromsource = np.argmax(model.predict(X_adv_source), axis=1)
def _shift_model_inputs(sess, model, X, Y, dataset, batch_size):
    """Shift `X` up by 0.5 and compensate in the first layer's bias.

    The first convolution (read from the `conv1/kernel:0` and `conv1/bias:0`
    tensors) is applied, with a zero bias, to a constant -0.5 image; the
    response at spatial position (0, 0) is added to the first layer's bias so
    the model can be fed `X + 0.5`.  Model accuracy is printed before and
    after the change.

    :return: `(X + 0.5, original_weights, original_biases)`; the last two
        are what the caller needs to restore the first layer afterwards.
    """
    print(np.max(X), np.min(X))
    print(tf.trainable_variables())

    filters = sess.run('conv1/kernel:0')
    biases = 0.0 * sess.run('conv1/bias:0')

    # Channels-first image shape for the one-layer helper model.
    if dataset == 'mnist':
        input_shape = (1, 28, 28)
    else:
        input_shape = (3, 32, 32)
    shift_model = Sequential()
    shift_model.add(
        Conv2D(32,
               kernel_size=(3, 3),
               activation=None,
               input_shape=input_shape))
    X_input_2 = tf.placeholder(tf.float32, shape=(None,) + X.shape[1:])
    correction_term = shift_model(X_input_2)

    # We will shift the image up by 0.5, so this is the correction
    X_correction = -0.5 * np.ones((1,) + input_shape)
    shift_model.layers[0].set_weights([filters, biases])
    bias_correction_terms = sess.run(correction_term,
                                     feed_dict={X_input_2: X_correction})
    for i in range(32):
        biases[i] = bias_correction_terms[0, i, 0, 0]

    _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
    print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

    first_layer_weights = model.layers[0].get_weights()
    original_weights = first_layer_weights[0]
    original_biases = first_layer_weights[1]
    # Correct model for input shift
    model.layers[0].set_weights(
        [original_weights, original_biases + biases])

    X = X + 0.5  # shift input to make it >= 0
    # Check accuracy post correction of input and model
    _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
    print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
    return X, original_weights, original_biases


def _single_sample_predictor(model, dataset):
    """Return `(placeholder, model output)` for one channels-first image."""
    if dataset == 'mnist':
        shape = [1, 1, 28, 28]
    else:
        shape = [1, 3, 32, 32]
    X_place = tf.placeholder(tf.float32, shape=shape)
    return X_place, model(X_place)


def craft_one_type(sess,
                   model,
                   X,
                   Y,
                   dataset,
                   attack,
                   batch_size,
                   log_path=None,
                   fp_path=None,
                   model_logits=None):
    """Craft adversarial examples with one attack and pickle them.

    The result is written to `Adv_<dataset>_<attack>.p` under `log_path`.
    For attacks whose name contains "adapt", "fp" or "spsa", only samples the
    model misclassifies after the attack are kept, and the matching clean
    inputs are written to `Random_Test_<dataset>_<attack>.p`.

    :param sess: TensorFlow session used to run the attacks.
    :param model: Keras model under attack.
    :param X: clean inputs; the spsa / adapt-pgd branches index them as
        channels-first images.
    :param Y: labels; indexed as `Y[i, :]` and reduced with `argmax`, so
        presumably one-hot.
    :param dataset: dataset name; 'mnist' selects 1x28x28 inputs, anything
        else 3x32x32.
    :param attack: one of 'fgsm', 'adapt-fgsm', 'adapt-bim-b', 'bim-a',
        'bim-b', 'jsma', 'cw-l2', 'cw-fp', 'spsa', 'adapt-pgd'.
    :param batch_size: batch size for the attacks and for `model.evaluate`.
    :param log_path: directory the pickles are written to.
    :param fp_path: forwarded to the adaptive / fingerprint attacks.
    :param model_logits: forwarded to the adaptive FGSM / BIM attacks.
    :return: `(X, X_adv, Y)` when the attack name contains "adapt" or
        "cw-fp", otherwise `(X_adv, Y)`.
    :raises ValueError: if `attack` is not a supported name.
    """
    print("entered")

    if attack == 'fgsm':
        # FGSM attack
        print('Crafting fgsm adversarial samples...')
        X_adv = fast_gradient_sign_method(sess,
                                          model,
                                          X,
                                          Y,
                                          eps=ATTACK_PARAMS[dataset]['eps'],
                                          clip_min=CLIP_MIN,
                                          clip_max=CLIP_MAX,
                                          batch_size=batch_size)
    elif attack == 'adapt-fgsm':
        # Adaptive FGSM attack
        print('Crafting fgsm adversarial samples...')
        X_adv = adaptive_fast_gradient_sign_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size,
            log_dir=fp_path,
            model_logits=model_logits,
            dataset=dataset)
    elif attack == 'adapt-bim-b':
        # Adaptive BIM attack
        print('Crafting %s adversarial samples...' % attack)
        X_adv = adaptive_basic_iterative_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            eps_iter=ATTACK_PARAMS[dataset]['eps_iter'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size,
            log_dir=fp_path,
            model_logits=model_logits,
            dataset=dataset)
    elif attack in ['bim-a', 'bim-b']:
        # BIM attack
        print('Crafting %s adversarial samples...' % attack)
        its, results = basic_iterative_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            eps_iter=ATTACK_PARAMS[dataset]['eps_iter'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size)
        if attack == 'bim-a':
            # BIM-A
            # For each sample, select the time step where that sample first
            # became misclassified
            X_adv = np.asarray([results[its[i], i] for i in range(len(Y))])
        else:
            # BIM-B
            # For each sample, select the very last time step
            X_adv = results[-1]
    elif attack == 'jsma':
        # JSMA attack
        print('Crafting jsma adversarial samples. This may take > 5 hours')
        X_adv = saliency_map_method(sess,
                                    model,
                                    X,
                                    Y,
                                    theta=1,
                                    gamma=0.1,
                                    clip_min=CLIP_MIN,
                                    clip_max=CLIP_MAX)
    elif attack == 'cw-l2':
        # C&W attack
        print(
            'Crafting %s examples. This takes > 5 hours due to internal grid search'
            % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']
        cw_attack = CarliniL2(sess,
                              model,
                              image_size,
                              num_channels,
                              num_labels,
                              batch_size=batch_size)
        X_adv = cw_attack.attack(X, Y)
    elif attack == 'cw-fp':
        # C&W attack to break LID detector
        print(
            'Crafting %s examples. This takes > 5 hours due to internal grid search'
            % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']
        cw_attack = CarliniFP_2vars(sess,
                                    model,
                                    image_size,
                                    num_channels,
                                    num_labels,
                                    batch_size=batch_size,
                                    fp_dir=fp_path)
        X_adv = cw_attack.attack(X, Y)
    elif attack == 'spsa':
        binary_steps = 1
        X_input = tf.placeholder(tf.float32, shape=(1,) + X.shape[1:])
        Y_label = tf.placeholder(tf.int32, shape=(1,))
        alpha = tf.placeholder(tf.float32, shape=(1,))
        num_samples = np.shape(X)[0]

        X, original_weights, original_biases = _shift_model_inputs(
            sess, model, X, Y, dataset, batch_size)

        print('Crafting %s examples. Using Cleverhans' % attack)
        from cleverhans.utils_keras import KerasModelWrapper
        wrapped_model = KerasModelWrapper(model)
        wrapped_model.nb_classes = 10

        spsa = SPSA(wrapped_model, back='tf', sess=sess)
        spsa_params = {
            "epsilon": ATTACK_PARAMS[dataset]['eps'],
            'num_steps': 100,
            'spsa_iters': 1,
            'early_stop_loss_threshold': None,
            'is_targeted': False,
            'is_debug': False
        }
        X_adv_spsa = spsa.generate(X_input,
                                   alpha=alpha,
                                   y=Y_label,
                                   fp_path=fp_path,
                                   **spsa_params)

        # Built once; rebuilding it per sample would add a copy of the model
        # to the graph on every iteration.
        X_place, pred = _single_sample_predictor(model, dataset)

        adv_samples = []
        for i in range(num_samples):
            true_label = np.argmax(Y[i, :])

            # Run attack, bisecting on alpha between lb and ub
            best_res = None
            ALPHA = np.ones(1) * 0.1
            lb = 1.0e-2
            ub = 1.0e2
            for j in range(binary_steps):
                res = sess.run(X_adv_spsa,
                               feed_dict={
                                   X_input: np.expand_dims(X[i], axis=0),
                                   Y_label: np.array([np.argmax(Y[i])]),
                                   alpha: ALPHA
                               })
                model_op = sess.run(pred, feed_dict={X_place: res})
                fooled = np.argmax(model_op) != true_label

                if fooled:
                    lb = ALPHA[0]
                else:
                    ub = ALPHA[0]
                ALPHA[0] = 0.5 * (lb + ub)
                print(ALPHA)

                # Keep the first result, then only results that fool the
                # model.
                if best_res is None or fooled:
                    best_res = res
            adv_samples.append(best_res)
        X_adv = np.concatenate(adv_samples, axis=0)

        _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the adversarial test set: %0.2f%%" %
              (100.0 * acc))
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

        # Revert model to original
        model.layers[0].set_weights([original_weights, original_biases])
        # Revert adv shift
        X_adv = X_adv - 0.5
        X = X - 0.5
    elif attack == 'adapt-pgd':
        binary_steps = 1
        rand_starts = 2
        X_input = tf.placeholder(tf.float32, shape=(1,) + X.shape[1:])
        Y_label = tf.placeholder(tf.int32, shape=(1,))
        alpha = tf.placeholder(tf.float32, shape=(1,))
        num_samples = np.shape(X)[0]

        X, original_weights, original_biases = _shift_model_inputs(
            sess, model, X, Y, dataset, batch_size)

        print('Crafting %s examples. Using Cleverhans' % attack)
        from cleverhans.utils_keras import KerasModelWrapper
        wrapped_model = KerasModelWrapper(model)
        wrapped_model.nb_classes = 10

        pgd = MadryEtAl(wrapped_model, back='tf', sess=sess)
        X_adv_pgd, adv_loss_fp = pgd.generate(X_input,
                                              eps=0.3,
                                              eps_iter=0.02,
                                              clip_min=0.0,
                                              clip_max=1.0,
                                              nb_iter=20,
                                              rand_init=True,
                                              fp_path=fp_path,
                                              alpha=alpha)

        # Built once; rebuilding it per restart would add a copy of the
        # model to the graph on every iteration.
        X_place, pred = _single_sample_predictor(model, dataset)

        adv_samples = []
        for i in range(num_samples):
            true_label = np.argmax(Y[i, :])

            # Run attack, bisecting on alpha between lb and ub
            best_res = None
            best_res_loss = 1000000.0
            ALPHA = np.ones(1) * 0.1
            lb = 1.0e-2
            ub = 1.0e2
            for j in range(binary_steps):
                bin_flag = 0
                for jj in range(rand_starts):
                    [res, res_loss] = sess.run(
                        [X_adv_pgd, adv_loss_fp],
                        feed_dict={
                            X_input: np.expand_dims(X[i], axis=0),
                            Y_label: np.array([np.argmax(Y[i])]),
                            alpha: ALPHA
                        })
                    model_op = sess.run(pred, feed_dict={X_place: res})

                    # NOTE(review): the first result is accepted without
                    # recording its loss or setting bin_flag; kept as in the
                    # original -- confirm this is intended.
                    if best_res is None:
                        best_res = res
                    elif (np.argmax(model_op) != true_label
                          and res_loss < best_res_loss):
                        best_res = res
                        best_res_loss = res_loss
                        bin_flag = 1

                if bin_flag == 1:
                    lb = ALPHA[0]
                else:
                    ub = ALPHA[0]
                ALPHA[0] = 0.5 * (lb + ub)
                print(ALPHA)
            adv_samples.append(best_res)
        X_adv = np.concatenate(adv_samples, axis=0)

        _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the adversarial test set: %0.2f%%" %
              (100.0 * acc))
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

        # Revert model to original
        model.layers[0].set_weights([original_weights, original_biases])
        # Revert adv shift
        X_adv = X_adv - 0.5
        X = X - 0.5
    else:
        # Fail before any output file is created.
        raise ValueError("Unknown attack: %s" % attack)

    if "adapt" in attack or "fp" in attack or "spsa" in attack:
        # Keep only the samples the model misclassifies after the attack.
        [m, _, _, _] = np.shape(X_adv)
        cropped_X_adv = []
        cropped_Y = []
        cropped_X = []
        X_place, pred = _single_sample_predictor(model, dataset)
        for i in range(m):
            logits_op = sess.run(pred,
                                 feed_dict={X_place: X_adv[i:i + 1, :, :, :]})
            if np.argmax(logits_op) != np.argmax(Y[i, :]):
                cropped_Y.append(Y[i, :])
                cropped_X_adv.append(X_adv[i, :, :, :])
                cropped_X.append(X[i, :, :, :])
        X_adv = np.array(cropped_X_adv)
        X = np.array(cropped_X)
        Y = np.array(cropped_Y)

        # Pickle needs a binary file object on Python 3.
        random_test_path = os.path.join(
            log_path, 'Random_Test_%s_%s.p' % (dataset, attack))
        with open(random_test_path, 'wb') as f:
            pickle.dump({"adv_input": X, "adv_labels": Y}, f)

    adv_path = os.path.join(log_path, 'Adv_%s_%s.p' % (dataset, attack))
    with open(adv_path, 'wb') as f:
        pickle.dump({"adv_input": X_adv, "adv_labels": Y}, f)

    _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
    print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
    l2_diff = np.linalg.norm(X_adv.reshape((len(X), -1)) - X.reshape(
        (len(X), -1)),
                             axis=1).mean()
    print("Average L-2 perturbation size of the %s attack: %0.2f" %
          (attack, l2_diff))
    if ("adapt" in attack) or ("cw-fp" in attack):
        return (X, X_adv, Y)
    else:
        print(Y.shape)
        return (X_adv, Y)
def run(args, restrict=True):
    """Measure how often samples flagged by the defended model are recovered.

    Builds a ``DefendedModel`` on top of a pre-trained base classifier,
    draws ``N = 500`` random test samples, and for every sample whose first
    vote is positive (counted as "FP" in the printed report) checks whether
    the projection lands on a usable label and whether it is predicted
    correctly.  Three summary lines are printed; nothing is returned.

    Args:
        args: Sequence ``(dataset, epsilon, mode, K)``.  ``dataset`` must be
            ``"MNIST"`` or ``"CIFAR"``; ``epsilon`` is parsed with ``float``
            and ``K`` with ``int``; ``mode`` selects which representation
            files are loaded from ``../3-Representation/<dataset>/``.
        restrict: When true, set ``CUDA_VISIBLE_DEVICES`` from the number
            embedded in the current multiprocessing process name
            (``"<prefix>-<n>"`` maps to device ``n - 1``).

    Raises:
        ValueError: If ``dataset`` is neither ``"MNIST"`` nor ``"CIFAR"``.
    """
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess.
        # Builtin int replaces np.int, which newer NumPy releases no longer
        # provide.
        worker_index = int(
            multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(worker_index - 1)

    # Load parameters.
    dataset = args[0]
    # Parsed so that a malformed value still fails early; the attack below
    # uses its own fixed epsilon of 0.01.
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])

    # Configure Keras/TensorFlow.
    Keras.clear_session()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))
    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix random seeds.  Seeding TensorFlow before Keras.clear_session()
    # was observed to hang, so it is done afterwards.
    np.random.seed(1)
    tf.set_random_seed(1)

    # Load model/data and set up the SPSA placeholders.
    N = 500
    if dataset == "MNIST":
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        shape_spsa = (1, 28, 28, 1)
    elif dataset == "CIFAR":
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        shape_spsa = (1, 32, 32, 3)
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(
            "Unsupported dataset %r; expected 'MNIST' or 'CIFAR'" % (dataset,))
    x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples
    # from the training set.
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))
    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    # Real examples are labelled -1, adversarial examples +1.
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model.
    defense = DefendedModel(base_model, x_train, y_train, K)
    get_votes = defense.get_votes(x)
    get_logits = defense.get_logits(x)

    # Configure the attack.  The resulting tensor is not run below (the
    # attack-based projection is disabled), but the call is kept because it
    # adds the attack ops to the graph.
    attack = SPSA(defense, back="tf", sess=sess)
    with tf.name_scope("Attack"):
        gen = attack.generate(x_spsa,
                              y=y_spsa,
                              epsilon=0.01,
                              is_targeted=False,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-0.05)

    # Run the test on N distinct, randomly chosen test samples.
    sample = np.random.choice(data.test_data.shape[0], N, replace=False)
    x_sample = data.test_data[sample]
    y_sample = np.argmax(data.test_labels[sample], axis=1)
    votes = sess.run(get_votes, {x: x_sample})

    count = 0
    bound = 0
    correct = 0
    for i in range(N):
        if votes[i, 0] > 0:
            count += 1
            # TODO: this is a projection, not an adversarial example.  The
            # SPSA-based projection (running `gen` on
            # x_sample[i].reshape(shape_spsa)) is currently disabled.
            # NOTE(review): x_sample[i] drops the batch axis that the
            # placeholder `x` declares -- confirm this feed is accepted.
            x_proj = sess.run(get_logits, {x: x_sample[i]})
            projection_labels = np.argmax(x_proj, axis=1)
            # Keep labels that are neither 0 nor 10.
            successful_projections = projection_labels[np.nonzero(
                projection_labels * (projection_labels != 10))]
            # Check if the projection was a success.
            if successful_projections.shape[0] != 0:
                bound += 1
            # Check if the projection is predicted correctly.
            # NOTE(review): x_proj holds logits, yet it is fed back in as
            # the model input -- confirm this is intended.
            if y_sample[i] == np.argmax(sess.run(get_logits, {x: x_proj}),
                                        axis=1)[0]:
                correct += 1

    print("FP Count: ", count)
    # With no flagged samples the ratios are undefined; report NaN instead of
    # raising ZeroDivisionError.
    bound_ratio = bound / count if count else float("nan")
    correct_ratio = correct / count if count else float("nan")
    print("FP Recovery in Bounds: ", bound_ratio)
    print("FP Recovery Accuracy: ", correct_ratio)
def eval_robustness(ARGS, verbose=True):
    """Evaluate a restored model's robustness against a PGD or SPSA attacker.

    Builds a dedicated graph and session, restores the checkpoint at
    ``ARGS.restore_path + '/model.ckpt'``, attaches the selected attack and
    hands the resulting tensors to ``run_pgd_eval`` / ``run_spsa_eval``.

    Args:
        ARGS: Namespace-like object.  Reads ``attack`` (``'PGD'`` or
            ``'SPSA'``), ``restore_path``, ``eps`` and ``nb_iter``; the PGD
            branch also reads ``eps_iter`` and ``rand_init``; the SPSA
            branch also reads ``spsa_samples``, ``spsa_iters``, ``spsa_lr``
            and ``spsa_delta``.  ``ARGS`` is also forwarded to the eval
            helpers.
        verbose: When true, print progress messages.

    Returns:
        Tuple ``(clean, adv_mean, adv_worstcase)`` as produced by the eval
        helper.  For SPSA the helper yields no separate worst case, so
        ``adv_worstcase`` equals ``adv_mean``.

    Raises:
        NotImplementedError: If ``ARGS.attack`` is not ``'PGD'`` or
            ``'SPSA'``.
    """
    # Fail fast: reject an unsupported attack before paying for graph
    # construction and checkpoint restore.
    if ARGS.attack not in ('PGD', 'SPSA'):
        raise NotImplementedError(
            'Unsupported attack: {!r}'.format(ARGS.attack))

    #############################################
    # Load pre-trained model
    #############################################
    if verbose:
        print('\n- Loading pre-trained model...')

    # Build evaluation graph
    eval_graph = tf.Graph()
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(graph=eval_graph, config=config)

    # The session is released on every exit path, including errors.
    try:
        with eval_graph.as_default():
            with tf.device('/gpu:0'):
                # Define placeholders
                with tf.name_scope('Placeholders'):
                    x = tf.placeholder(dtype=tf.float32,
                                       shape=input_shape,
                                       name='inputs')
                    y = tf.placeholder(dtype=tf.float32,
                                       shape=(None, n_classes),
                                       name='labels')
                    is_training = tf.placeholder_with_default(
                        False, shape=(), name='is-training')

                # Define model
                with tf.name_scope('Model'):
                    model = Model(nb_classes=n_classes,
                                  input_shape=input_shape,
                                  is_training=is_training)

                # Define forward-pass
                with tf.name_scope('Logits'):
                    logits = model.get_logits(x)
                with tf.name_scope('Probs'):
                    preds = tf.nn.softmax(logits)

                # Restore the pre-trained model
                with sess.as_default():
                    saver = tf.train.Saver()
                    saver.restore(sess, ARGS.restore_path + '/model.ckpt')

                # Define accuracy ops
                with tf.name_scope('Accuracy'):
                    ground_truth = tf.argmax(y, axis=1)
                    predicted_label = tf.argmax(preds, axis=1)
                    correct_prediction = tf.equal(predicted_label,
                                                  ground_truth)
                    clean_acc = tf.reduce_mean(
                        tf.to_float(correct_prediction), name='accuracy')

                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))

                if ARGS.attack == 'PGD':
                    # Define PGD adversary; eps values are given on a
                    # 0-255 scale and rescaled to the [0, 1] clip range.
                    with tf.name_scope('PGD-Attacker'):
                        pgd_params = {
                            'ord': np.inf,
                            'y': y,
                            'eps': ARGS.eps / 255,
                            'eps_iter': ARGS.eps_iter / 255,
                            'nb_iter': ARGS.nb_iter,
                            'rand_init': ARGS.rand_init,
                            'rand_minmax': ARGS.eps / 255,
                            'clip_min': 0.,
                            'clip_max': 1.,
                            'sanity_checks': True
                        }
                        pgd = ProjectedGradientDescent(model, sess=None)
                        adv_x = pgd.generate(x, **pgd_params)
                else:
                    # Define SPSA adversary ('SPSA' -- validated above).
                    # NOTE(review): the scope is still named 'PGD-Attacker';
                    # kept as-is so op names do not change.
                    with tf.name_scope('PGD-Attacker'):
                        spsa_params = {
                            'y': y,
                            'eps': ARGS.eps / 255,
                            'nb_iter': ARGS.nb_iter,
                            'spsa_samples': ARGS.spsa_samples,
                            'spsa_iters': ARGS.spsa_iters,
                            'clip_min': 0.,
                            'clip_max': 1.,
                            'learning_rate': ARGS.spsa_lr,
                            'delta': ARGS.spsa_delta
                        }
                        spsa = SPSA(model, sess=sess)
                        adv_x = spsa.generate(x, **spsa_params)

                # Forward pass and metrics on the adversarial inputs.
                with tf.name_scope('Logits'):
                    adv_logits = model.get_logits(adv_x)
                with tf.name_scope('Probs'):
                    adv_preds = tf.nn.softmax(adv_logits)
                adv_loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=adv_logits, labels=y)
                adv_predicted_label = tf.argmax(adv_preds, axis=1)
                correct_prediction = tf.equal(adv_predicted_label,
                                              ground_truth)
                adv_accuracy = tf.reduce_mean(
                    tf.to_float(correct_prediction), name='adv-accuracy')
                is_adv_example = tf.not_equal(ground_truth,
                                              adv_predicted_label)

        #############################################
        # Run evaluation
        #############################################
        if verbose:
            print('\n- Running robustness evaluation against {:s} attacker...\n'.
                  format(ARGS.attack))

        if ARGS.attack == 'PGD':
            clean, adv_mean, adv_worstcase = run_pgd_eval(
                x, y, is_training, sess, adv_testloader, clean_acc,
                adv_accuracy, adv_loss, is_adv_example, ARGS,
                save_loss_dist=False, verbose=verbose)
        else:
            clean, adv_mean = run_spsa_eval(
                x, y, is_training, sess, adv_testloader, clean_acc,
                adv_accuracy, adv_loss, is_adv_example, ARGS,
                save_loss_dist=False, verbose=verbose)
            # No separate worst case for SPSA; reuse the mean.
            adv_worstcase = adv_mean

        return clean, adv_mean, adv_worstcase
    finally:
        sess.close()
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False, rbf=False):
    """
    Load a saved Keras model and report its accuracy on legitimate test
    samples and on SPSA adversarial samples.

    The model is read from ``rbfmodels/<model_name>.h5`` when ``rbf`` is true
    (with ``RBFLayer`` registered as a custom object), otherwise from
    ``models/<model_name>.h5``.  Clean accuracy is computed over the whole
    test set; adversarial accuracy is computed over the first 100 test
    samples, each attacked individually and displayed with matplotlib.

    Args:
        sess: TensorFlow session used for every evaluation.
        model_name: Base name of the ``.h5`` file to load.
        X_train: Unused.
        Y_train: Unused.
        X_test: Test inputs.
        Y_test: Test labels; each row is reshaped to ``(1, 10)``.
        cnn: Use True if the model is CNN based; inputs are then shaped
            ``(28, 28, 1)`` instead of ``(784,)``.
        rbf: Load from ``rbfmodels/`` with the ``RBFLayer`` custom object.

    Returns:
        None.  Results are printed.
    """
    # load saved model
    print("Load model ... ")
    if rbf:
        loaded_model = load_model("rbfmodels/{}.h5".format(model_name),
                                  custom_objects={'RBFLayer': RBFLayer})
    else:
        loaded_model = load_model("models/{}.h5".format(model_name))

    # Per-sample input shape, shared by the clean and the attack graph.
    sample_shape = (28, 28, 1) if cnn else (784,)

    # Set placeholders
    x = tf.placeholder(tf.float32, shape=(None,) + sample_shape)
    y = tf.placeholder(tf.float32, shape=(None, 10))
    predictions = loaded_model(x)
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args={"batch_size": 128})
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples with the SPSA attack from
    # cleverhans/attacks.py.
    wrap = KerasModelWrapper(loaded_model)
    spsa = SPSA(wrap, sess=sess)

    # Build the attack graph ONCE on single-sample placeholders and feed each
    # image through it.  Re-creating the attack per image would add a full
    # copy of the attack ops to the graph on every iteration.
    x_in = tf.placeholder(tf.float32, shape=(1,) + sample_shape)
    y_in = tf.placeholder(tf.float32, shape=(1, 10))
    adv_x = spsa.generate(x_in, y_in, eps=0.3, nb_iter=100, clip_min=0,
                          clip_max=1, early_stop_loss_threshold=-1.,
                          spsa_samples=32, spsa_iters=1)
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = loaded_model(adv_x)

    images = 100
    correctImages = 0
    adv_pred = np.zeros((images, 10))
    for i in range(images):
        feed = {
            x_in: X_test[i].reshape((1,) + sample_shape),
            y_in: Y_test[i].reshape(1, 10),
        }
        # Fetch the adversarial image and its prediction in a single run so
        # both come from the same attack execution; separate evaluations
        # would run the attack twice and score a different sample than the
        # one displayed.
        adv_image, adv_probs = sess.run([adv_x, preds_adv], feed_dict=feed)

        plt.imshow(adv_image.reshape(28, 28))
        plt.colorbar()
        plt.show()
        print(type(adv_image))
        print(adv_image.shape)

        adv_pred[i] = adv_probs[0]
        if np.argmax(adv_pred[i]) == np.argmax(Y_test[i]):
            correctImages = correctImages + 1
        # Running accuracy over the images attacked so far.
        accuracy = correctImages / (i + 1)
        print('Test accuracy (' + str(i + 1) + '): ' + str(accuracy))

    # Evaluate the accuracy of the model on adversarial examples
    accuracy = correctImages / images
    print('Test accuracy on adversarial test examples: ' + str(accuracy))
# One SPSA attack object and one keyword-argument set are shared by both code
# paths below; the 'y' entry is filled in per batch.
spsa_op = SPSA(cleverhans_model, sess=sess)
spsa_params = dict(eps=2.5, clip_min=-2.3, clip_max=2.8, nb_iter=40, y=None)
correct = 0

# NumPy path: attack only the first batch of the loader via generate_np.
count = 0
for xs, ys in val_subset_loader:
    count = count + 1
    # Labels are converted to an int32 NumPy array before being passed on.
    ys = ys.numpy().astype(np.int32)
    spsa_params.update(y=ys)
    adv_x = spsa_op.generate_np(xs, **spsa_params)
    break  # first batch only

# Symbolic path: build the attack tensor on x_op and run it in the session,
# again for the first batch only.
count = 0
for xs, ys in val_subset_loader:
    count = count + 1
    print(count)
    # Labels are passed through exactly as the loader yields them.
    spsa_params.update(y=ys)
    adv_x_op = spsa_op.generate(x_op, **spsa_params)
    adv_x = sess.run(adv_x_op, {x_op: xs})
    print(adv_x.shape)
    break  # first batch only
def train(alpha, eps2_ratio, gen_ratio, fgsm_eps, LR, logfile): logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , eps2_ratio \t %d , gen_ratio \t %d \n"%(fgsm_eps, LR, alpha, eps2_ratio, gen_ratio)) ############################# ##Hyper-parameter Setting#### ############################# hk = 256; #number of hidden units at the last layer Delta2 = (14*14+2)*25; #global sensitivity for the first hidden layer Delta3_adv = 2*hk #10*(hk + 1/4 * hk**2) #10*(hk) #global sensitivity for the output layer Delta3_benign = 2*hk #10*(hk); #global sensitivity for the output layer D = 50000; #size of the dataset L = 2499; #batch size image_size = 28; padding = 4; #numHidUnits = 14*14*32 + 7*7*64 + M + 10; #number of hidden units #gen_ratio = 1 epsilon1 = 0.0; #0.175; #epsilon for dpLRP epsilon2 = 0.1*(1 + gen_ratio); #epsilon for the first hidden layer epsilon3 = 0.1*(1); #epsilon for the last hidden layer total_eps = epsilon1 + epsilon2 + epsilon3 print(total_eps) uncert = 0.1; #uncertainty modeling at the output layer infl = 1; #inflation rate in the privacy budget redistribution R_lowerbound = 1e-5; #lower bound of the LRP c = [0, 40, 50, 200] #norm bounds epochs = 200; #number of epochs preT_epochs = 50; #number of epochs T = int(D/L*epochs + 1); #number of steps T pre_T = int(D/L*preT_epochs + 1); step_for_epoch = int(D/L); #number of steps for one epoch broken_ratio = 1 #alpha = 9.0 # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] #eps2_ratio = 10; # [1/10, 1/8, 1/6, 1/4, 1/2, 1, 2, 4, 6, 8, 10] #eps_benign = 1/(1+eps2_ratio)*(2*epsilon2) #eps_adv = eps2_ratio/(1+eps2_ratio)*(2*epsilon2) #fgsm_eps = 0.1 rand_alpha = 0.05 ##Robustness## robustness_T = (fgsm_eps*18*18*L*epsilon2)/Delta2; #### LRPfile = os.getcwd() + '/Relevance_R_0_075.txt'; ############################# mnist = input_data.read_data_sets("MNIST_data/", one_hot = True); ############################# ##Construct the Model######## ############################# #Step 4: Randomly initiate the noise, Compute 1/|L| * Delta3 
for the output layer# #Compute the 1/|L| * Delta3 for the last hidden layer# """eps3_ratio = Delta3_adv/Delta3_benign; eps3_benign = 1/(1+eps3_ratio)*(epsilon3) eps3_adv = eps3_ratio/(1+eps3_ratio)*(epsilon3)""" loc, scale3_benign, scale3_adv = 0., Delta3_benign/(epsilon3*L), Delta3_adv/(epsilon3*L); ### #End Step 4# # Parameters Declarification W_conv1 = weight_variable('W_conv1', [5, 5, 1, 32], collect=[AECODER_VARIABLES]); b_conv1 = bias_variable('b_conv1', [32], collect=[AECODER_VARIABLES]); shape = W_conv1.get_shape().as_list() w_t = tf.reshape(W_conv1, [-1, shape[-1]]) w = tf.transpose(w_t) sing_vals = tf.svd(w, compute_uv=False) sensitivity = tf.reduce_max(sing_vals) gamma = 2*(14*14 + 2)*25/(L*sensitivity) dp_epsilon=1.0 #0.1 delta_r = fgsm_eps*(image_size**2); #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon) W_conv2 = weight_variable('W_conv2', [5, 5, 32, 64], collect=[CONV_VARIABLES]); b_conv2 = bias_variable('b_conv2', [64], collect=[CONV_VARIABLES]); W_fc1 = weight_variable('W_fc1', [4 * 4 * 64, hk], collect=[CONV_VARIABLES]); b_fc1 = bias_variable('b_fc1', [hk], collect=[CONV_VARIABLES]); W_fc2 = weight_variable('W_fc2', [hk, 10], collect=[CONV_VARIABLES]); b_fc2 = bias_variable('b_fc2', [10], collect=[CONV_VARIABLES]); """scale2 = tf.Variable(tf.ones([hk])) beta2 = tf.Variable(tf.zeros([hk])) tf.add_to_collections([CONV_VARIABLES], scale2) tf.add_to_collections([CONV_VARIABLES], beta2)""" params = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] ### #Step 5: Create the model# noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]); adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]); keep_prob = tf.placeholder(tf.float32); x = tf.placeholder(tf.float32, [None, image_size*image_size]); x_image = tf.reshape(x, [-1,image_size,image_size,1]); #perturbFMx = 
np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28) #perturbFMx = np.reshape(perturbFMx, [-1, 28, 28, 1]); # pretrain ### #Enc_Layer1 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) #pretrain = Enc_Layer1.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, epsilon = 2*epsilon2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise) ########### adv_x = tf.placeholder(tf.float32, [None, image_size*image_size]); adv_image = tf.reshape(adv_x, [-1,image_size,image_size,1]); #perturbFMx_adv = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28) #perturbFMx_adv = np.reshape(perturbFMx_adv, [-1, 28, 28, 1]); # pretrain adv ### #perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*32) #perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]); FM_h = tf.placeholder(tf.float32, [None, 14, 14, 32]); Enc_Layer2 = EncLayer(inpt=adv_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = adv_noise, perturbFM_h = FM_h) Enc_Layer3 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise, perturbFM_h = FM_h) ########### x_image += noise; x_image = tf.clip_by_value(x_image, -10, 10) #Clip the values of each input feature. adv_image += adv_noise; adv_image = tf.clip_by_value(adv_image, -10, 10) #Clip the values of each input feature. 
#perturbFM = np.random.laplace(0.0, scale3_benign, hk) #perturbFM = np.reshape(perturbFM, [hk]); perturbFM = np.random.laplace(0.0, scale3_benign, hk * 10) perturbFM = np.reshape(perturbFM, [hk, 10]); y_conv = inference(x_image, perturbFM, hk, FM_h, params); softmax_y_conv = tf.nn.softmax(y_conv) #robust_mask = inference_robust_mask(y_conv, Delta2, L, epsilon2, robustness_T) #perturbFM = np.random.laplace(0.0, scale3_adv, hk) #perturbFM = np.reshape(perturbFM, [hk]); y_adv_conv = inference(adv_image, perturbFM, hk, FM_h, params); #adv_robust_mask = inference_robust_mask(y_adv_conv, Delta2, L, epsilon2, robustness_T) # test model perturbFM_test = np.random.laplace(0.0, 0, hk) perturbFM_test = np.reshape(perturbFM_test, [hk]); x_test = tf.reshape(x, [-1,image_size,image_size,1]); y_test = inference(x_test, perturbFM_test, hk, FM_h, params); #test_robust_mask = inference_robust_mask(y_test, Delta2, L, epsilon2, robustness_T) #Define a place holder for the output label# y_ = tf.placeholder(tf.float32, [None, 10]); adv_y_ = tf.placeholder(tf.float32, [None, 10]); #End Step 5# ############################# ############################# ##Define loss and Optimizer## ############################# ''' Computes differentially private sigmoid cross entropy given `logits`. Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For brevity, let `x = logits`, `z = labels`. 
The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) `logits` and `labels` must have the same type and shape. Let denote neg_abs_logits = -abs(y_conv) = -abs(h_fc1 * W_fc2). By Applying Taylor Expansion, we have: Taylor = max(y_conv, 0) - y_conv * y_ + log(1 + exp(-abs(y_conv))); = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) = F1 + F2 where: F1 = max(h_fc1 * W_fc2, 0) + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) and F2 = - (y_ * h_fc1) * W_fc2 To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the term y_ * h_fc1 * W_fc2. Note that h_fc1 is differentially private, since its computation on top of the DP Affine transformation does not access the original data. Therefore, F1 should be differentially private. 
We need to preserve DP in F2, which reads the groundtruth label y_, as follows: By applying Funtional Mechanism, we perturb (y_ * h_fc1) * W_fc2 as ((y_ * h_fc1) + perturbFM) * W_fc2 = (y_ * h_fc1)*W_fc2 + (perturbFM * W_fc2): perturbFM = np.random.laplace(0.0, scale3, hk * 10) perturbFM = np.reshape(perturbFM/L, [hk, 10]); where scale3 = Delta3/(epsilon3) = 2*hk/(epsilon3); To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow]. Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow ''' ### Taylor for benign x zeros = array_ops.zeros_like(y_conv, dtype=y_conv.dtype) cond = (y_conv >= zeros) relu_logits = array_ops.where(cond, y_conv, zeros) neg_abs_logits = array_ops.where(cond, -y_conv, y_conv) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor_benign = math_ops.add(relu_logits - y_conv * y_, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) - tf.reduce_sum(perturbFM*W_fc2) #Taylor_benign = tf.abs(y_conv - y_) ### Taylor for adv_x zeros_adv = array_ops.zeros_like(y_adv_conv, dtype=y_conv.dtype) cond_adv = (y_adv_conv >= zeros_adv) relu_logits_adv = array_ops.where(cond_adv, y_adv_conv, zeros_adv) neg_abs_logits_adv = array_ops.where(cond_adv, -y_adv_conv, y_adv_conv) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor_adv = math_ops.add(relu_logits_adv - y_adv_conv * adv_y_, math.log(2.0) + 0.5*neg_abs_logits_adv + 1.0/8.0*neg_abs_logits_adv**2) - tf.reduce_sum(perturbFM*W_fc2) #Taylor_adv = tf.abs(y_adv_conv - adv_y_) ### Adversarial training loss adv_loss = (1/(L + L*alpha))*(Taylor_benign + alpha * Taylor_adv) '''Some time, using learning rate decay can help to stablize training process. 
However, use this carefully, since it may affect the convergent speed.''' global_step = tf.Variable(0, trainable=False) pretrain_var_list = tf.get_collection(AECODER_VARIABLES) train_var_list = tf.get_collection(CONV_VARIABLES) #print(pretrain_var_list) #print(train_var_list) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): pretrain_step = tf.train.AdamOptimizer(LR).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list); train_step = tf.train.AdamOptimizer(LR).minimize(adv_loss, global_step=global_step, var_list=train_var_list); sess = tf.InteractiveSession(); # Define the correct prediction and accuracy # This needs to be changed to "Robust Prediction" correct_prediction_x = tf.equal(tf.argmax(y_test,1), tf.argmax(y_,1)); accuracy_x = tf.reduce_mean(tf.cast(correct_prediction_x, tf.float32)); ############# # use these to get predictions wrt to robust conditions """robust_correct_prediction_x = tf.multiply(test_robust_mask, tf.cast(correct_prediction_x, tf.float32)) accuracy_x_robust = tf.reduce_sum(robust_correct_prediction_x) / tf.reduce_sum(test_robust_mask) #certified_utility = 2/(1/accuracy_x_robust + 1/(tf.reduce_sum(test_robust_mask)/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32)))) certified_utility = (1.0*tf.reduce_sum(test_robust_mask))/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))""" ############# # craft adversarial samples from x for training dynamic_eps = tf.placeholder(tf.float32); emsemble_L = int(L/3) softmax_y = tf.nn.softmax(y_test) #c_x_adv = fgsm(x, softmax_y, eps=fgsm_eps, clip_min=0.0, clip_max=1.0) c_x_adv = fgsm(x, softmax_y, eps=(dynamic_eps)/10, clip_min=-1.0, clip_max=1.0) # for I-FGSM x_adv = tf.reshape(c_x_adv, [emsemble_L,image_size*image_size]); #====================== attack ========================= #attack_switch = {'randfgsm':True, 'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 
'stm':True} #attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True} attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':False} #other possible attacks: # ElasticNetMethod # FastFeatureAdversaries # LBFGS # SaliencyMapMethod # VirtualAdversarialMethod # y_test = logits (before softmax) # softmax_y_test = preds (probs, after softmax) softmax_y_test = tf.nn.softmax(y_test) # create saver saver = tf.train.Saver(tf.all_variables()) sess.run(W_conv1.initializer) _gamma = sess.run(gamma) _gamma_x = Delta2/L epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x) print(epsilon2_update/_gamma + epsilon2_update/_gamma_x) print(epsilon2_update) _sensitivityW = sess.run(sensitivity) delta_h = _sensitivityW*(14**2) dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon) ############################# iterativeStep = 100 # load the most recent models _global_step = 0 ckpt = tf.train.get_checkpoint_state(os.getcwd() + './tmp/train') if ckpt and ckpt.model_checkpoint_path: print(ckpt.model_checkpoint_path); saver.restore(sess, ckpt.model_checkpoint_path) _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) else: print('No checkpoint file found') start_time = time.time(); # adv pretrain model (Auto encoder layer) cost = tf.reduce_sum(Enc_Layer2.cost); logfile.write("pretrain: \n") # define cleverhans abstract models for using cleverhans attacks ch_model_logits = CustomCallableModelWrapper(callable_fn=inference_test_input, output_layer='logits', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise) ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise) # rand+fgsm # if attack_switch['randfgsm']: # randfgsm_obj = 
FastGradientMethod(model=ch_model_probs, sess=sess) # x_randfgsm_t = (fgsm_eps - rand_alpha) * randfgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0) # x_rand_t = rand_alpha * tf.sign(tf.random_normal(shape=tf.shape(x), mean=0.0, stddev=1.0)) # define each attack method's tensor mu_alpha = tf.placeholder(tf.float32, [1]); attack_tensor_dict = {} # FastGradientMethod if attack_switch['fgsm']: print('creating attack tensor of FastGradientMethod') fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess) #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now attack_tensor_dict['fgsm'] = x_adv_test_fgsm # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init) # default: eps_iter=0.05, nb_iter=10 if attack_switch['ifgsm']: print('creating attack tensor of BasicIterativeMethod') ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm # Deepfool if attack_switch['deepfool']: print('creating attack tensor of DeepFool') deepfool_obj = DeepFool(model=ch_model_logits, sess=sess) #x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['deepfool'] = x_adv_test_deepfool # MomentumIterativeMethod # default: eps_iter=0.06, nb_iter=10 if attack_switch['mim']: print('creating attack tensor of MomentumIterativeMethod') mim_obj 
= MomentumIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, decay_factor=1.0, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['mim'] = x_adv_test_mim # SPSA # note here the epsilon is the infinity norm instead of precent of perturb # Maybe exclude this method first, since it seems to have some constrain about the data value range if attack_switch['spsa']: print('creating attack tensor of SPSA') spsa_obj = SPSA(model=ch_model_logits, sess=sess) #x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1, ord=2) x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1) attack_tensor_dict['spsa'] = x_adv_test_spsa # CarliniWagnerL2 # confidence=0 is fron their paper # it is said to be slow, maybe exclude first if attack_switch['cwl2']: print('creating attack tensor of CarliniWagnerL2') cwl2_obj = CarliniWagnerL2(model=ch_model_logits, sess=sess) #x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['cwl2'] = x_adv_test_cwl2 # MadryEtAl (Projected Grdient with random init, same as rand+fgsm) # default: eps_iter=0.01, nb_iter=40 if attack_switch['madry']: print('creating attack tensor of MadryEtAl') 
madry_obj = MadryEtAl(model=ch_model_probs, sess=sess) #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['madry'] = x_adv_test_madry # SpatialTransformationMethod # the params are pretty different from on the paper # so I use default # exclude since there's bug if attack_switch['stm']: print('creating attack tensor of SpatialTransformationMethod') stm_obj = SpatialTransformationMethod(model=ch_model_probs, sess=sess) #x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6, ord=2) x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6) attack_tensor_dict['stm'] = x_adv_test_stm #====================== attack ========================= sess.run(tf.initialize_all_variables()); ##perturb h for training perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]); ##perturb h for testing perturbFM_h_test = np.random.laplace(0.0, 0, 14*14*32) perturbFM_h_test = np.reshape(perturbFM_h_test, [-1, 14, 14, 32]); '''for i in range(_global_step, _global_step + pre_T): d_eps = random.random(); batch = mnist.train.next_batch(L); #Get a random batch. 
adv_images = sess.run(x_adv, feed_dict = {x:batch[0], y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps}) for iter in range(0, 9): adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps}) """batch = mnist.train.next_batch(emsemble_L) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]}) batch = mnist.train.next_batch(emsemble_L) adv_images_madry = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]}) train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)""" batch_2 = mnist.train.next_batch(L); pretrain_step.run(feed_dict={adv_x: np.append(adv_images, batch_2[0], axis = 0), adv_noise: AdvLnoise, FM_h: perturbFM_h}); if i % int(5*step_for_epoch) == 0: cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32) logfile.write("step \t %d \t %g \n"%(i, cost_value)) print(cost_value) pre_train_finish_time = time.time() print('pre_train finished in: ' + parse_time(pre_train_finish_time - start_time))''' # train and test model with adv samples max_benign_acc = -1; max_robust_benign_acc = -1 #max_adv_acc = -1; test_size = len(mnist.test.images) AdvLnoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); AdvLnoise_test = generateIdLMNoise(image_size, 0, epsilon2_update, test_size); Lnoise_empty = generateIdLMNoise(image_size, 0, epsilon2_update, L); BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); last_eval_time = -1 accum_time = 0 accum_epoch = 0 max_adv_acc_dict = {} max_robust_adv_acc_dict = {} #max_robust_adv_utility_dict = {} for atk in attack_switch.keys(): if atk not in max_adv_acc_dict: max_adv_acc_dict[atk] = -1 max_robust_adv_acc_dict[atk] = -1 for i in range(_global_step, _global_step + T): # this batch is for generating adv samples batch = mnist.train.next_batch(emsemble_L); #Get a random 
batch. y_adv_batch = batch[1] #The number of epochs we print out the result. Print out the result every 5 epochs. if i % int(10*step_for_epoch) == 0 and i > int(10*step_for_epoch): cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32) print(cost_value) if last_eval_time < 0: last_eval_time = time.time() #===================benign samples===================== predictions_form_argmax = np.zeros([test_size, 10]) #test_bach = mnist.test.next_batch(test_size) softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: BenignLNoise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 1): _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]); for j in range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: (BenignLNoise + _BenignLNoise/2), FM_h: (perturbFM_h + _perturbFM_h/2)}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult) is_robust.append(robustness_from_argmax >= fgsm_eps) acc = np.sum(is_correct)*1.0/test_size robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_utility = np.sum(is_robust)*1.0/test_size max_benign_acc = max(max_benign_acc, acc) max_robust_benign_acc = max(max_robust_benign_acc, robust_acc*robust_utility) 
log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(i, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility) #===================adv samples===================== #log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(i, total_eps) """adv_images_dict = {} for atk in attack_switch.keys(): if attack_switch[atk]: adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_:mnist.test.labels}) print("Done with the generating of Adversarial samples")""" #===================adv samples===================== adv_acc_dict = {} robust_adv_acc_dict = {} robust_adv_utility_dict = {} for atk in attack_switch.keys(): if atk not in adv_acc_dict: adv_acc_dict[atk] = -1 robust_adv_acc_dict[atk] = -1 robust_adv_utility_dict[atk] = -1 if attack_switch[atk]: adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_: mnist.test.labels, adv_noise: AdvLnoise_test, mu_alpha:[fgsm_eps]}) ### PixelDP Robustness ### predictions_form_argmax = np.zeros([test_size, 10]) softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 2000): if n_draws % 1000 == 0: print(n_draws) _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]); for j in range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (perturbFM_h + _perturbFM_h/2)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (BenignLNoise + _BenignLNoise/2), FM_h: perturbFM_h}) #softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (_perturbFM_h)}) * 
softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult) is_robust.append(robustness_from_argmax >= fgsm_eps) adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_size robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_size ############################## for atk in attack_switch.keys(): if attack_switch[atk]: # added robust prediction log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk]) max_adv_acc_dict[atk] = max(max_adv_acc_dict[atk], adv_acc_dict[atk]) max_robust_adv_acc_dict[atk] = max(max_robust_adv_acc_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk]) print(log_str) logfile.write(log_str + '\n') # logfile.write("step \t %d \t %g \t %g \n"%(i, benign_acc, adv_acc)) # print("step \t %d \t %g \t %g"%(i, benign_acc, adv_acc)); # estimate end time """if i > 0 and i % int(10*step_for_epoch) == 0: current_time_interval = time.time() - last_eval_time last_eval_time = time.time() print('during last eval interval, {} epoch takes {}'.format(10, parse_time(current_time_interval))) accum_time += current_time_interval accum_epoch += 10 estimate_time = ((_global_step + T - i) / step_for_epoch) * (accum_time / accum_epoch) print('estimate finish in: {}'.format(parse_time(estimate_time)))""" #print("step \t %d \t adversarial test accuracy \t %g"%(i, 
accuracy_x.eval(feed_dict={x: adv_images, y_: mnist.test.labels, noise: Lnoise_empty}))); """checkpoint_path = os.path.join(os.getcwd() + '/tmp/train', 'model.ckpt') saver.save(sess, checkpoint_path, global_step=i);""" d_eps = random.random(); y_adv = batch[1] adv_images = sess.run(attack_tensor_dict['ifgsm'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) """for iter in range(0, 9): adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})""" batch = mnist.train.next_batch(emsemble_L) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) y_adv = np.append(y_adv, batch[1], axis = 0) batch = mnist.train.next_batch(emsemble_L) adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) y_adv = np.append(y_adv, batch[1], axis = 0) train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0) batch = mnist.train.next_batch(L); #Get a random batch. # train with benign and adv samples pretrain_step.run(feed_dict={adv_x: train_images, x: batch[0], adv_noise: AdvLnoise_test, noise: BenignLNoise, FM_h: perturbFM_h}); train_step.run(feed_dict={x: batch[0], adv_x: train_images, y_: batch[1], adv_y_: y_adv, noise: BenignLNoise, adv_noise: AdvLnoise_test, FM_h: perturbFM_h}); duration = time.time() - start_time; # print(parse_time(duration)); #print running time duration# max_acc_string = "max acc: benign: \t{:.4f} {:.4f}".format(max_benign_acc, max_robust_benign_acc) for atk in attack_switch.keys(): if attack_switch[atk]: max_acc_string += " {}: \t{:.4f} {:.4f}".format(atk, max_adv_acc_dict[atk], max_robust_adv_acc_dict[atk]) logfile.write(max_acc_string + '\n') logfile.write(str(duration) + '\n')