z1[i] = tf.matmul(h1[i - 1], w[i]) + b[i] h1[i] = tf.nn.relu(z1[i]) if i == n - 1: z1[i] = tf.matmul(h1[i - 1], w[i]) + b[i] h1[i] = z1[i] y = classifier(x) yn = classifier_n(x) loss_cls = softmax_loss(label, yn) all_vars = tf.trainable_variables() c_vars = [var for var in all_vars if 'classifier' in var.name] train_op_classifier = GradientDescentOptimizer(learning_rate = learning_rate) \ .minimize(loss_cls, var_list = c_vars, global_step = global_step) jsma = SaliencyMapMethod(classifier, back='tf', sess=sess) x_fgsm = attack.fgsm(x, y, eps=0.2, clip_min=0, clip_max=1) def main(): mnist = input_data.read_data_sets('MNIST_data', one_hot=True) sess.run(tf.global_variables_initializer()) with sess.as_default(): acc = {} print('train classifier') for t in range(1, steps + 1): batch = mnist.train.next_batch(batch_size) f_dict = {x: batch[0], label: batch[1]} sess.run(train_op_classifier, feed_dict=f_dict) if t % 550 == 0: epoch = int(t / 550) acc['benign'] = sess.run(get_acc(x, label),
def model_train(para):
    """Train a fully-connected MNIST classifier whose forward pass injects
    Gaussian noise at every layer, then evaluate it each sampling interval
    against freshly generated and pre-generated FGSM adversarial samples.

    para: dict with key 'std' — per-layer noise standard deviations.
    Relies on module-level globals: layers, random_seed, learning_rate,
    steps, batch_size, sample_rate, attack, weight_variable, bias_variable,
    GradientDescentOptimizer, input_data.
    NOTE(review): reconstructed from a whitespace-mangled source; the
    indentation of the final `check` block was inferred (placed after the
    training loop) — confirm against the original.
    """
    sess = tf.Session()
    tf.set_random_seed(random_seed)
    n = len(layers)
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label
    std = para['std']  # per-layer noise stddevs, indexed std[i] below
    # Weights/biases for layers 1..n-1 (index 0 is the input "layer").
    w, b = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        w[i] = weight_variable([layers[i - 1], layers[i]])
        b[i] = bias_variable([layers[i]])
    # model with noise
    z, h = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(n):
        if i == 0:
            # Input layer: perturb the pixels, keep them in [0, 1].
            z[i] = x
            z[i] += tf.random_normal(shape=tf.shape(z[i]),
                                     mean=0.0,
                                     stddev=std[i],
                                     dtype=tf.float32)
            z[i] = tf.clip_by_value(z[i], 0, 1)
            h[i] = z[i]
        if i > 0 and i < n - 1:
            # Hidden layers: affine -> additive noise -> ReLU.
            z[i] = tf.matmul(h[i - 1], w[i]) + b[i]
            #z[i] = tf.clip_by_norm(z[i], 1, axes = 1)
            z[i] += tf.random_normal(shape=tf.shape(z[i]),
                                     mean=0.0,
                                     stddev=std[i],
                                     dtype=tf.float32)
            h[i] = tf.nn.relu(z[i])
        if i == n - 1:
            # Output layer: affine -> additive noise, no activation (logits).
            z[i] = tf.matmul(h[i - 1], w[i]) + b[i]
            #z[i] = tf.clip_by_norm(z[i], 1000, axes = 1)
            z[i] += tf.random_normal(shape=tf.shape(z[i]),
                                     mean=0.0,
                                     stddev=std[i],
                                     dtype=tf.float32)
            h[i] = z[i]
    y = h[n - 1]  # noisy logits — used for the training loss
    # Sum of squared weights; computed but not added to the loss below.
    w_sum = tf.constant(0, dtype='float32')
    for i in range(1, n):
        w_sum += tf.reduce_sum(tf.square(w[i]))
    # gradient descent
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=y))
    # Per-variable gradients assembled and applied manually.
    gw, gb = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        gw[i] = tf.gradients(loss, w[i])[0]
        gb[i] = tf.gradients(loss, b[i])[0]
    opt = GradientDescentOptimizer(learning_rate=learning_rate)
    gradients = []
    for i in range(1, n):
        gradients.append((gw[i], w[i]))
        gradients.append((gb[i], b[i]))
    train_step = opt.apply_gradients(gradients)
    # model without noise: same weights, deterministic forward pass
    z2, h2 = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(n):
        if i == 0:
            z2[i] = x
            h2[i] = z2[i]
        if i > 0 and i < n - 1:
            z2[i] = tf.matmul(h2[i - 1], w[i]) + b[i]
            h2[i] = tf.nn.relu(z2[i])
        if i == n - 1:
            z2[i] = tf.matmul(h2[i - 1], w[i]) + b[i]
            h2[i] = z2[i]
    y2 = h2[n - 1]  # clean logits — used for the attack and evaluation
    # attack: FGSM built against the noiseless model
    x_adv = attack.fgsm(x, y2, eps=0.3, clip_min=0, clip_max=1)
    #evaluation
    acc = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y2, 1), tf.argmax(label, 1)), tf.float32))
    # data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Pre-generated adversarial test set (file name spells "fsgm").
    x_adv_mnist_fsgm = np.load(os.path.join('data', 'x_adv_mnist_fsgm.npy'))
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={x: batch[0], label: batch[1]})
            # Evaluate every int(1/sample_rate) steps.
            # NOTE(review): int(1/sample_rate) is treated as steps-per-epoch
            # — confirm sample_rate's units.
            if t % int(1 / sample_rate) == 0:
                epoch = int(t / int(1 / sample_rate))
                # Fresh FGSM samples against the current weights.
                x_adv_sample = sess.run(x_adv,
                                        feed_dict={
                                            x: mnist.test.images,
                                            label: mnist.test.labels
                                        })
                acc_benign = sess.run(acc,
                                      feed_dict={
                                          x: mnist.test.images,
                                          label: mnist.test.labels
                                      })
                acc_adv = sess.run(acc,
                                   feed_dict={
                                       x: x_adv_sample,
                                       label: mnist.test.labels
                                   })
                # Accuracy on the pre-generated (transfer) adversarial set.
                acc_pre_adv = sess.run(acc,
                                       feed_dict={
                                           x: x_adv_mnist_fsgm,
                                           label: mnist.test.labels
                                       })
                print(epoch, acc_benign, acc_adv, acc_pre_adv)
        # Sanity check: mean L2 norm of the clean logits on the test set.
        check = tf.reduce_mean(tf.norm(y2, axis=1))
        print(
            sess.run([check],
                     feed_dict={
                         x: mnist.test.images,
                         label: mnist.test.labels
                     }))
def main():
    """Jointly train an MNIST classifier and a 'transformer' perturbation
    network (GAN-style: the transformer is trained to maximize the
    classifier's loss on transformed inputs) and report per-epoch metrics.

    Relies on module-level globals: transformer, classifier, classifier_x,
    attack, sample_Z, SaliencyMapMethod, GradientDescentOptimizer,
    input_data.
    """
    sess = tf.Session()
    train_size, test_size = 55000, 10000
    batch_size = 100
    lr = 0.05
    epochs = 100
    steps = epochs * int(train_size / batch_size)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    x = tf.placeholder(tf.float32, [None, 784])  # input for real images
    x_adv = tf.placeholder(tf.float32, [None, 784])  # pre-generated adversarial images
    y_ = tf.placeholder(tf.float32, [None, 10])  # groundtruth class label
    y_target = tf.placeholder(tf.float32, [None, 10])  # attack target label
    noise = tf.placeholder(tf.float32, [None, 100])  # transformer noise vector
    tx = transformer(x, noise, 3)  # transformed (perturbed) images
    # classifier_x returns (logits, norm); classifier returns logits only.
    yx_2, z_norm = classifier_x(x)
    yx = classifier(x)
    ytx = classifier(tx)
    ytx_2, z_norm2 = classifier_x(tx)
    y_x_adv = classifier(x_adv)
    x_fgsm = attack.fgsm(x, yx_2, eps=0.1, clip_min=0, clip_max=1)
    y_x_fgsm = classifier(x_fgsm)
    # JSMA attack object; jsma_params prepared but not used in this body.
    jsma = SaliencyMapMethod(classifier, back='tf', sess=sess)
    one_hot_target = np.zeros((100, 10), dtype=np.float32)
    one_hot_target[:, 1] = 1  # JSMA target: class 1
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'nb_classes': 10,
        'clip_min': 0.,
        'clip_max': 1.,
        'targets': yx,
        'y_val': one_hot_target
    }
    # Mean L2 perturbation sizes (diagnostics; not fetched below).
    perturb = {}
    perturb['tx'] = tf.reduce_mean(tf.norm(tx - x, axis=1))
    perturb['fgsm'] = tf.reduce_mean(tf.norm(x_fgsm - x, axis=1))
    loss = {}
    loss['cx'] = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=yx_2))
    loss['ctx'] = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=ytx_2))
    loss['cttx'] = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_target, logits=ytx))
    loss['classifier'] = loss['cx']  #+ loss['ctx']
    #loss['transformer'] = loss['cttx']
    # Transformer is adversarial: minimizing -loss['ctx'] maximizes the
    # classifier's loss on transformed inputs.
    loss['transformer'] = -loss['ctx']
    all_vars = tf.trainable_variables()
    c_vars = [var for var in all_vars if 'classifier' in var.name]
    t_vars = [var for var in all_vars if 'transformer' in var.name]
    train_op = {}
    train_op['classifier'] = GradientDescentOptimizer(learning_rate = lr) \
        .minimize(loss['classifier'], var_list = c_vars, global_step = global_step)
    train_op['transformer'] = GradientDescentOptimizer(learning_rate = lr) \
        .minimize(loss['transformer'], var_list = t_vars, global_step = global_step)
    # Accuracy ops for clean, transformed, fresh-FGSM and pre-generated inputs.
    acc = {}
    acc['x'] = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(yx, 1), tf.argmax(y_, 1)), tf.float32))
    acc['tx'] = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(ytx, 1), tf.argmax(y_, 1)), tf.float32))
    acc['x_fgsm'] = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y_x_fgsm, 1), tf.argmax(y_, 1)),
                tf.float32))
    acc['x_adv'] = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y_x_adv, 1), tf.argmax(y_, 1)),
                tf.float32))
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Pre-generated FGSM test set for transfer evaluation.
    x_adv_mnist_fsgm = np.load(os.path.join('data', 'x_fgsm_mnist.npy'))
    tf.set_random_seed(1024)
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            # Fixed attack target: class 0 for every sample.
            y_tar = [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
                     for i in range(batch_size)]
            y_tar = np.array(y_tar, dtype='float32')
            noise_sample = sample_Z(batch_size, 100)
            # Alternate one classifier step and one transformer step.
            sess.run(train_op['classifier'],
                     feed_dict={
                         x: batch[0],
                         y_: batch[1],
                         noise: noise_sample,
                         y_target: y_tar
                     })
            sess.run(train_op['transformer'],
                     feed_dict={
                         x: batch[0],
                         y_: batch[1],
                         noise: noise_sample,
                         y_target: y_tar
                     })
            if t % int(train_size / batch_size) == 0:
                # End of an epoch: evaluate on the full test set.
                epoch = int(t / int(train_size / batch_size))
                noise_sample2 = sample_Z(10000, 100)
                test_batch = mnist.test.next_batch(10000)
                print(test_batch[0].shape)
                var_list = [acc, z_norm]
                res = sess.run(var_list, feed_dict = {x: test_batch[0], y_: test_batch[1], noise : noise_sample2, \
                    x_adv: x_adv_mnist_fsgm, y_target: y_tar})
                print(epoch)
                for r in res:
                    print(r)
def main():
    """Train an MNIST classifier, then train a generator to attack it, and
    dump real / FGSM / JSMA / GAN adversarial samples with their
    accuracies and perturbation sizes.

    Relies on module-level globals: classifier, generator, attack,
    softmax_loss, get_acc, diff, sample_Z, save_images, SaliencyMapMethod,
    GradientDescentOptimizer, input_data.
    """
    random_seed = 1024
    train_size, test_size = 55000, 10000
    batch_size = 100
    learning_rate = 0.05
    epochs = 20
    steps = epochs * 550  # 550 batches of 100 per epoch
    sess = tf.Session()
    global_step = tf.Variable(0, name = 'global_step', trainable = False)
    tf.set_random_seed(random_seed)
    # NOTE(review): created before the model variables exist, this saver
    # captures only global_step; its save() call below is commented out.
    saver = tf.train.Saver()
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label
    noise = tf.placeholder(tf.float32, [None, 100])  # noise vector
    y_target = tf.placeholder(tf.float32, [None, 10])  # target label
    x1 = tf.placeholder(tf.float32, [None, 784])  # operands for diff()
    x2 = tf.placeholder(tf.float32, [None, 784])
    y = classifier(x)
    # gan: the generator produces adversarial images from (x, noise).
    x_gan = generator(x, noise, 4)
    y_gan = classifier(x_gan)
    loss = softmax_loss(label, y)
    # Generator maximizes the classifier's loss on generated images.
    loss_gan = - softmax_loss(label, y_gan)
    all_vars = tf.trainable_variables()
    c_vars = [var for var in all_vars if 'classifier' in var.name]
    g_vars = [var for var in all_vars if 'generator' in var.name]
    train_op_classifier = GradientDescentOptimizer(learning_rate = learning_rate) \
        .minimize(loss, var_list = c_vars, global_step = global_step)
    train_op_generator = GradientDescentOptimizer(learning_rate = 0.05) \
        .minimize(loss_gan, var_list = g_vars, global_step = global_step)
    #fgsm
    x_fgsm = attack.fgsm(x, y, eps = 0.2, clip_min=0, clip_max=1)
    y_fgsm = classifier(x_fgsm)
    # jsma
    jsma = SaliencyMapMethod(classifier, back='tf', sess=sess)
    # train
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # One-hot target class 0, batch-sized and test-set-sized.
    y_target_batch = np.zeros((100, 10), dtype=np.float32)
    y_target_batch[:, 0] = 1.0
    y_target_test = np.zeros((10000, 10), dtype=np.float32)
    y_target_test[:, 0] = 1.0
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        print('train classifier')
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_op_classifier,
                     feed_dict={x: batch[0], label: batch[1]})
            if t % 550 == 0:
                epoch = int(t / 550)
                # NOTE(review): get_acc(x, label) builds new graph ops on
                # every call — works, but grows the graph each epoch.
                acc_benign = sess.run(get_acc(x, label),
                                      feed_dict={x: mnist.test.images,
                                                 label: mnist.test.labels})
                print(epoch, acc_benign)
        print('train gan')
        for t in range(1, 550 * 5 + 1):  # 5 epochs of generator training
            batch = mnist.train.next_batch(batch_size)
            f_dict = {x: batch[0], label: batch[1],
                      noise: sample_Z(batch_size, 100),
                      y_target: y_target_batch}
            sess.run(train_op_generator, feed_dict=f_dict)
            if t % 550 == 0:
                epoch = int(t / 550)
                f_dict = {x: mnist.test.images, label: mnist.test.labels,
                          noise: sample_Z(10000, 100),
                          y_target: y_target_batch}
                x_gan_d = sess.run(x_gan, feed_dict=f_dict)
                f_dict = {x: x_gan_d, label: mnist.test.labels}
                acc_gan = sess.run(get_acc(x, label), feed_dict=f_dict)
                print(epoch, acc_gan)
        checkpoint_path = os.path.join('model', 'basic_model.ckpt')
        #saver.save(sess, checkpoint_path, global_step = 1)
        print('generate adv samples for the first batch of the testing set')
        # real
        x_real_mnist_1 = mnist.test.images[0:100,]
        np.save(os.path.join('data','x_real_mnist_1.npy'), x_real_mnist_1)
        x_real_mnist_1_r = x_real_mnist_1.reshape([100, 28, 28])
        save_images(x_real_mnist_1_r, [10, 10],
                    os.path.join('img', 'x_real_mnist_1.png'))
        # fgsm
        x_fgsm_mnist_1 = sess.run(x_fgsm,
                                  feed_dict = {x: mnist.test.images[0:100,],
                                               label: mnist.test.labels[0:100,]})
        np.save(os.path.join('data','x_fgsm_mnist_1.npy'), x_fgsm_mnist_1)
        x_fgsm_mnist_1_r = x_fgsm_mnist_1.reshape([100, 28, 28])
        save_images(x_fgsm_mnist_1_r, [10, 10],
                    os.path.join('img', 'x_fgsm_mnist_1.png'))
        #jsma
        jsma_params = {'theta': 1., 'gamma': 0.1,'nb_classes': 10, 'clip_min': 0.,'clip_max': 1., 'targets': y,\
            'y_val': y_target_batch}
        x_jsma_mnist_1 = jsma.generate_np(mnist.test.images[0:100,],
                                          **jsma_params)
        np.save(os.path.join('data','x_jsma_mnist_1.npy'), x_jsma_mnist_1)
        acc_jsma_1 = sess.run(get_acc(x, label),
                              feed_dict={x: x_jsma_mnist_1,
                                         label: mnist.test.labels[0:100,]})
        x_jsma_mnist_1_r = x_jsma_mnist_1.reshape([100, 28, 28])
        save_images(x_jsma_mnist_1_r, [10, 10],
                    os.path.join('img', 'x_jsma_mnist_1.png'))
        # gan
        x_gan_mnist_1 = sess.run(x_gan, feed_dict={x: mnist.test.images[0:100,] ,label: mnist.test.labels[0:100,]\
            , noise: sample_Z(batch_size, 100), y_target: y_target_batch})
        np.save(os.path.join('data','x_gan_mnist_1.npy'), x_gan_mnist_1)
        x_gan_mnist_1_r = x_gan_mnist_1.reshape([100, 28, 28])
        save_images(x_gan_mnist_1_r, [10, 10],
                    os.path.join('img', 'x_gan_mnist_1.png'))
        # Mean perturbation size of each attack on the first batch.
        diff_fgsm = sess.run(diff(x1, x2),
                             feed_dict={x1: x_real_mnist_1,
                                        x2: x_fgsm_mnist_1})
        diff_jsma = sess.run(diff(x1, x2),
                             feed_dict={x1: x_real_mnist_1,
                                        x2: x_jsma_mnist_1})
        diff_gan = sess.run(diff(x1, x2),
                            feed_dict={x1: x_real_mnist_1,
                                       x2: x_gan_mnist_1})
        print('perturb: fgsm: {:.3f}, jsma: {:.3f}, gan: {:.3f}'.format(diff_fgsm, diff_jsma, diff_gan))
        acc_benign = sess.run(get_acc(x, label),
                              feed_dict={x: mnist.test.images,
                                         label: mnist.test.labels})
        print('generate adv samples for the entire testing set')
        # fgsm
        x_fgsm_mnist = sess.run(x_fgsm,
                                feed_dict = {x: mnist.test.images,
                                             label: mnist.test.labels})
        np.save(os.path.join('data','x_fgsm_mnist.npy'), x_fgsm_mnist)
        acc_fgsm = sess.run(get_acc(x, label),
                            feed_dict={x: x_fgsm_mnist,
                                       label: mnist.test.labels})
        # gan
        x_gan_mnist = sess.run(x_gan, feed_dict={x: mnist.test.images ,label: mnist.test.labels\
            , noise: sample_Z(10000, 100), y_target: y_target_test})
        np.save(os.path.join('data','x_gan_mnist.npy'), x_gan_mnist)
        # NOTE(review): feeding noise/y_target for a plain accuracy fetch
        # looks redundant but is harmless.
        acc_gan = sess.run(get_acc(x, label), feed_dict={x: x_gan_mnist ,label: mnist.test.labels\
            , noise: sample_Z(10000, 100), y_target: y_target_test})
        print('accuracy: benign: {:.3f}, fgsm: {:.3f}, jsma: {:.3f}, gan: {:.3f}'.format(acc_benign, acc_fgsm, acc_jsma_1, acc_gan))
        '''
        x_fgsm_mnist = np.load(os.path.join('data','x_fgsm_mnist.npy'))
        x_gan_mnist = np.load(os.path.join('data','x_gan_mnist.npy'))
        x_jsma_mnist_1 = np.load(os.path.join('data','x_jsma_mnist_1.npy'))
        sess.run(tf.global_variables_initializer())
        print('train classifier')
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_op_classifier, feed_dict={x: batch[0], label: batch[1]})
            if t % 550 == 0:
                epoch = int(t / 550)
                acc = {}
                acc['benign'] = sess.run(get_acc(x, label), feed_dict = {x: mnist.test.images, label: mnist.test.labels})
                acc['pre fgsm'] = sess.run(get_acc(x, label), feed_dict={x: x_fgsm_mnist, label: mnist.test.labels})
                acc['pre gan'] = sess.run(get_acc(x, label), feed_dict={x: x_gan_mnist, label: mnist.test.labels})
                acc['pre jsma 1'] = sess.run(get_acc(x, label), feed_dict={x: x_jsma_mnist_1, label: mnist.test.labels[0:100,]})
                print(epoch, acc)
        '''
        sess.close()
        return
        # NOTE(review): everything below is unreachable (after `return`) —
        # presumably kept to re-enable full-test-set JSMA generation.
        # jsma
        jsma_params = {'theta': 1., 'gamma': 0.1,'nb_classes': 10, 'clip_min': 0.,'clip_max': 1., 'targets': y,\
            'y_val': y_target_test}
        x_jsma_mnist = jsma.generate_np(mnist.test.images, **jsma_params)
        np.save(os.path.join('data','x_jsma_mnist.npy'), x_jsma_mnist)
def main():
    """Train three independent MNIST models and evaluate each against four
    FGSM attack variants: baseline, random-start, iterative, and targeted
    (toward class 0). Adversarial test sets are saved under data/mnist/.

    Relies on module-level globals: random_seed, learning_rate, steps,
    batch_size, model_1, model_2, model_3, softmax_loss, accuracy, attack,
    GradientDescentOptimizer, input_data.

    Fixes vs. the previous revision:
      * removed a dead `y_target` placeholder that was shadowed by the
        numpy target array before it could ever be fed;
      * accuracy and input-gradient ops are now built once at graph
        construction time instead of calling accuracy()/tf.gradients()
        inside the evaluation loops — each such call added new ops to the
        graph (unbounded graph growth, progressively slower sess.run).
        The fetched values are unchanged.
    """
    sess = tf.Session()
    global_step = tf.Variable(0, name='global_step', trainable=False)
    tf.set_random_seed(random_seed)
    # NOTE(review): created before the model variables exist, this saver
    # captures only global_step; it is never used below.
    saver = tf.train.Saver()
    x = tf.placeholder(tf.float32, [None, 784])  # input images
    label = tf.placeholder(tf.float32, [None, 10])  # true (or target) label
    x1 = tf.placeholder(tf.float32, [None, 784])  # unused in this body
    x2 = tf.placeholder(tf.float32, [None, 784])  # unused in this body
    y_1 = model_1(x)
    y_2 = model_2(x)
    y_3 = model_3(x)
    loss_1 = softmax_loss(label, y_1)
    loss_2 = softmax_loss(label, y_2)
    loss_3 = softmax_loss(label, y_3)
    all_vars = tf.trainable_variables()
    model_1_vars = [var for var in all_vars if 'model_1' in var.name]
    model_2_vars = [var for var in all_vars if 'model_2' in var.name]
    model_3_vars = [var for var in all_vars if 'model_3' in var.name]
    train_op_1 = GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(loss_1,
                                              var_list=model_1_vars,
                                              global_step=global_step)
    train_op_2 = GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(loss_2,
                                              var_list=model_2_vars,
                                              global_step=global_step)
    train_op_3 = GradientDescentOptimizer(
        learning_rate=learning_rate).minimize(loss_3,
                                              var_list=model_3_vars,
                                              global_step=global_step)
    # One-shot FGSM ops (eps 0.3) against each model.
    x_fgsm_1 = attack.fgsm(x, y_1, eps=0.3, clip_min=0, clip_max=1)
    x_fgsm_2 = attack.fgsm(x, y_2, eps=0.3, clip_min=0, clip_max=1)
    x_fgsm_3 = attack.fgsm(x, y_3, eps=0.3, clip_min=0, clip_max=1)
    # Built once, reused for every evaluation below.
    acc_op_1 = accuracy(label, y_1)
    acc_op_2 = accuracy(label, y_2)
    acc_op_3 = accuracy(label, y_3)
    # d(loss)/d(input) for iterative and targeted FGSM.
    grad_op_1 = tf.gradients(loss_1, x)[0]
    grad_op_2 = tf.gradients(loss_2, x)[0]
    grad_op_3 = tf.gradients(loss_3, x)[0]
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # One-hot target labels (class 0) for the targeted attack.
    y_target = np.zeros((10000, 10), dtype=np.float32)
    y_target[:, 0] = 1.0
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        print('train models')
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_op_1, feed_dict={x: batch[0], label: batch[1]})
            sess.run(train_op_2, feed_dict={x: batch[0], label: batch[1]})
            sess.run(train_op_3, feed_dict={x: batch[0], label: batch[1]})
            if t % 550 == 0:  # 550 steps == one epoch at batch_size 100
                epoch = int(t / 550)
                test_feed = {x: mnist.test.images, label: mnist.test.labels}
                acc_1 = sess.run(acc_op_1, feed_dict=test_feed)
                acc_2 = sess.run(acc_op_2, feed_dict=test_feed)
                acc_3 = sess.run(acc_op_3, feed_dict=test_feed)
                print(epoch, acc_1, acc_2, acc_3)
        print('generate adv samples')
        test_feed = {x: mnist.test.images, label: mnist.test.labels}
        # fgsm (baseline, from the clean test images)
        x_fgsm_1_data = sess.run(x_fgsm_1, feed_dict=test_feed)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_1_data.npy'),
                x_fgsm_1_data)
        acc_fgsm_1 = sess.run(acc_op_1,
                              feed_dict={
                                  x: x_fgsm_1_data,
                                  label: mnist.test.labels
                              })
        x_fgsm_2_data = sess.run(x_fgsm_2, feed_dict=test_feed)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_2_data.npy'),
                x_fgsm_2_data)
        acc_fgsm_2 = sess.run(acc_op_2,
                              feed_dict={
                                  x: x_fgsm_2_data,
                                  label: mnist.test.labels
                              })
        x_fgsm_3_data = sess.run(x_fgsm_3, feed_dict=test_feed)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_3_data.npy'),
                x_fgsm_3_data)
        acc_fgsm_3 = sess.run(acc_op_3,
                              feed_dict={
                                  x: x_fgsm_3_data,
                                  label: mnist.test.labels
                              })
        # fgsm from a random start: Gaussian-perturbed, clipped images
        x_data = mnist.test.images
        x_perturb_data = x_data + np.random.normal(
            loc=0.0, scale=0.1, size=[10000, 784])
        x_perturb_data = np.clip(x_perturb_data, 0, 1)
        perturb_feed = {x: x_perturb_data, label: mnist.test.labels}
        x_fgsm_rd_1_data = sess.run(x_fgsm_1, feed_dict=perturb_feed)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_rd_1_data.npy'),
                x_fgsm_rd_1_data)
        acc_fgsm_rd_1 = sess.run(acc_op_1,
                                 feed_dict={
                                     x: x_fgsm_rd_1_data,
                                     label: mnist.test.labels
                                 })
        x_fgsm_rd_2_data = sess.run(x_fgsm_2, feed_dict=perturb_feed)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_rd_2_data.npy'),
                x_fgsm_rd_2_data)
        acc_fgsm_rd_2 = sess.run(acc_op_2,
                                 feed_dict={
                                     x: x_fgsm_rd_2_data,
                                     label: mnist.test.labels
                                 })
        x_fgsm_rd_3_data = sess.run(x_fgsm_3, feed_dict=perturb_feed)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_rd_3_data.npy'),
                x_fgsm_rd_3_data)
        acc_fgsm_rd_3 = sess.run(acc_op_3,
                                 feed_dict={
                                     x: x_fgsm_rd_3_data,
                                     label: mnist.test.labels
                                 })
        # iterative fgsm: 10 ascent steps of size 0.01, clipped each step
        x_fgsm_it_1_data = np.copy(mnist.test.images)
        x_fgsm_it_2_data = np.copy(mnist.test.images)
        x_fgsm_it_3_data = np.copy(mnist.test.images)
        for _ in range(10):
            grad_1 = sess.run(grad_op_1,
                              feed_dict={
                                  x: x_fgsm_it_1_data,
                                  label: mnist.test.labels
                              })
            x_fgsm_it_1_data += np.sign(grad_1) * 0.01
            x_fgsm_it_1_data = np.clip(x_fgsm_it_1_data, 0, 1)
            grad_2 = sess.run(grad_op_2,
                              feed_dict={
                                  x: x_fgsm_it_2_data,
                                  label: mnist.test.labels
                              })
            x_fgsm_it_2_data += np.sign(grad_2) * 0.01
            x_fgsm_it_2_data = np.clip(x_fgsm_it_2_data, 0, 1)
            grad_3 = sess.run(grad_op_3,
                              feed_dict={
                                  x: x_fgsm_it_3_data,
                                  label: mnist.test.labels
                              })
            x_fgsm_it_3_data += np.sign(grad_3) * 0.01
            x_fgsm_it_3_data = np.clip(x_fgsm_it_3_data, 0, 1)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_it_1_data.npy'),
                x_fgsm_it_1_data)
        acc_fgsm_it_1 = sess.run(acc_op_1,
                                 feed_dict={
                                     x: x_fgsm_it_1_data,
                                     label: mnist.test.labels
                                 })
        np.save(os.path.join('data', 'mnist', 'x_fgsm_it_2_data.npy'),
                x_fgsm_it_2_data)
        acc_fgsm_it_2 = sess.run(acc_op_2,
                                 feed_dict={
                                     x: x_fgsm_it_2_data,
                                     label: mnist.test.labels
                                 })
        np.save(os.path.join('data', 'mnist', 'x_fgsm_it_3_data.npy'),
                x_fgsm_it_3_data)
        acc_fgsm_it_3 = sess.run(acc_op_3,
                                 feed_dict={
                                     x: x_fgsm_it_3_data,
                                     label: mnist.test.labels
                                 })
        # targeted fgsm: descend (note the minus sign) toward class 0
        grad_tg_1 = sess.run(grad_op_1,
                             feed_dict={
                                 x: mnist.test.images,
                                 label: y_target
                             })
        x_fgsm_tg_1_data = mnist.test.images - np.sign(grad_tg_1) * 0.3
        x_fgsm_tg_1_data = np.clip(x_fgsm_tg_1_data, 0, 1)
        grad_tg_2 = sess.run(grad_op_2,
                             feed_dict={
                                 x: mnist.test.images,
                                 label: y_target
                             })
        x_fgsm_tg_2_data = mnist.test.images - np.sign(grad_tg_2) * 0.3
        x_fgsm_tg_2_data = np.clip(x_fgsm_tg_2_data, 0, 1)
        grad_tg_3 = sess.run(grad_op_3,
                             feed_dict={
                                 x: mnist.test.images,
                                 label: y_target
                             })
        x_fgsm_tg_3_data = mnist.test.images - np.sign(grad_tg_3) * 0.3
        x_fgsm_tg_3_data = np.clip(x_fgsm_tg_3_data, 0, 1)
        np.save(os.path.join('data', 'mnist', 'x_fgsm_tg_1_data.npy'),
                x_fgsm_tg_1_data)
        acc_fgsm_tg_1 = sess.run(acc_op_1,
                                 feed_dict={
                                     x: x_fgsm_tg_1_data,
                                     label: mnist.test.labels
                                 })
        np.save(os.path.join('data', 'mnist', 'x_fgsm_tg_2_data.npy'),
                x_fgsm_tg_2_data)
        acc_fgsm_tg_2 = sess.run(acc_op_2,
                                 feed_dict={
                                     x: x_fgsm_tg_2_data,
                                     label: mnist.test.labels
                                 })
        np.save(os.path.join('data', 'mnist', 'x_fgsm_tg_3_data.npy'),
                x_fgsm_tg_3_data)
        acc_fgsm_tg_3 = sess.run(acc_op_3,
                                 feed_dict={
                                     x: x_fgsm_tg_3_data,
                                     label: mnist.test.labels
                                 })
        print('Accuracy fgsm bl: {:.4f}, {:.4f}, {:.4f}'.format(
            acc_fgsm_1, acc_fgsm_2, acc_fgsm_3))
        print('Accuracy fgsm rd: {:.4f}, {:.4f}, {:.4f}'.format(
            acc_fgsm_rd_1, acc_fgsm_rd_2, acc_fgsm_rd_3))
        print('Accuracy fgsm it: {:.4f}, {:.4f}, {:.4f}'.format(
            acc_fgsm_it_1, acc_fgsm_it_2, acc_fgsm_it_3))
        print('Accuracy fgsm tg: {:.4f}, {:.4f}, {:.4f}'.format(
            acc_fgsm_tg_1, acc_fgsm_tg_2, acc_fgsm_tg_3))
    return
'learning_rate': 0.001, } model_train(sess, x, y, predictions, X_train, Y_train, args=train_params) # Evaluate the MNIST model eval_params = {'batch_size': 128} accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params) print('Test accuracy on legitimate test examples: ' + str(accuracy)) # Test accuracy on legitimate test examples: 0.9888 # Craft adversarial examples using the Fast Gradient Sign Method from cleverhans.attacks_tf import fgsm from cleverhans.utils_tf import batch_eval adv_x = fgsm(x, predictions, eps=0.3) X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params) accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test, args=eval_params) print('Test accuracy on adversarial examples: ' + str(accuracy)) # Test accuracy on adversarial examples: 0.0837 # Adversarial training model_2 = cnn_model() predictions_2 = model_2(x) adv_x_2 = fgsm(x, predictions_2, eps=0.3) predictions_2_adv = model_2(adv_x_2) model_train(sess, x, y, predictions_2, X_train, Y_train, predictions_adv=predictions_2_adv, args=train_params) # Evaluate the accuracy on legitimate examples
def main():
    """Train an MNIST classifier on Gaussian-perturbed inputs with heavily
    noised logits, then evaluate every epoch against pre-generated, fresh,
    random-start, iterative, and iterative+random-start FGSM attacks.

    Relies on module-level globals: classifier, attack, softmax_loss,
    get_acc, avg_dist, GradientDescentOptimizer, input_data.
    """
    random_seed = 1024
    train_size, test_size = 55000, 10000
    batch_size = 100
    learning_rate = 0.05
    epochs = 10
    steps = epochs * 550  # 550 batches of 100 per epoch
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.8
    sess = tf.InteractiveSession(config=config)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    tf.set_random_seed(random_seed)
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label
    # Gaussian input perturbation (stddev 0.5), clipped to pixel range.
    x_perturb = x + tf.random_normal(
        shape=tf.shape(x), mean=0.0, stddev=0.5, dtype=tf.float32)
    x_perturb = tf.clip_by_value(x_perturb, 0, 1)
    y = classifier(x)
    y_ = classifier(x_perturb)
    # Logits with very large additive noise (stddev 300) — used only for
    # the training loss below.
    # NOTE(review): the noise shape is taken from y rather than y_; both
    # are classifier outputs so the shapes agree — confirm intent.
    y_perturb = y_ + tf.random_normal(
        shape=tf.shape(y), mean=0.0, stddev=300.0, dtype=tf.float32)
    #fgsm
    x_fgsm = attack.fgsm(x, y, eps=0.2, clip_min=0, clip_max=1)
    y_fgsm = classifier(x_fgsm)
    # random-start fgsm: attack computed from a noisy copy of the input
    x_perturb2 = x + tf.random_normal(
        shape=tf.shape(x), mean=0.0, stddev=0.2, dtype=tf.float32)
    y_2 = classifier(x_perturb2)
    x_fgsm_rd = attack.fgsm(x_perturb2, y_2, eps=0.2, clip_min=0,
                            clip_max=1)
    y_fgsm_rd = classifier(x_fgsm_rd)
    # Training loss uses the noisy logits.
    loss_cls = softmax_loss(label, y_perturb)
    all_vars = tf.trainable_variables()
    c_vars = [var for var in all_vars if 'classifier' in var.name]
    train_op_classifier = GradientDescentOptimizer(learning_rate = learning_rate) \
        .minimize(loss_cls, var_list = c_vars, global_step = global_step)
    x1 = tf.placeholder(tf.float32, [None, 784])  # operands for avg_dist()
    x2 = tf.placeholder(tf.float32, [None, 784])
    # train
    saver = tf.train.Saver()
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Pre-generated FGSM test set for transfer evaluation.
    x_pre_fgsm_data = np.load(os.path.join('data', 'x_fgsm_mnist.npy'))
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        print('train classifier')
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            f_dict = {x: batch[0], label: batch[1]}
            sess.run(train_op_classifier, feed_dict=f_dict)
            if t % 550 == 0:
                # Per-epoch evaluation.
                # NOTE(review): get_acc/avg_dist/tf.gradients are invoked
                # here on every epoch, each call adding new ops to the
                # graph — functionally harmless but slow and memory-growing.
                epoch = int(t / 550)
                acc = {}
                dist = {}
                x_data = mnist.test.images
                label_data = mnist.test.labels
                acc['benign'] = sess.run(get_acc(x, label),
                                         feed_dict={
                                             x: mnist.test.images,
                                             label: mnist.test.labels
                                         })
                # Pre-generated (transfer) FGSM samples.
                acc['pre'] = sess.run(get_acc(x, label),
                                      feed_dict={
                                          x: x_pre_fgsm_data,
                                          label: mnist.test.labels
                                      })
                dist['pre'] = sess.run(avg_dist(x1, x2),
                                       feed_dict={
                                           x1: mnist.test.images,
                                           x2: x_pre_fgsm_data
                                       })
                # Fresh FGSM against the current weights.
                x_fgsm_data = sess.run(x_fgsm,
                                       feed_dict={
                                           x: mnist.test.images,
                                           label: mnist.test.labels
                                       })
                acc['fgsm'] = sess.run(get_acc(x, label),
                                       feed_dict={
                                           x: x_fgsm_data,
                                           label: mnist.test.labels
                                       })
                dist['fgsm'] = sess.run(avg_dist(x1, x2),
                                        feed_dict={
                                            x1: mnist.test.images,
                                            x2: x_fgsm_data
                                        })
                # FGSM from a random (numpy-noise) start.
                x_perturb_data = x_data + np.random.normal(
                    loc=0.0, scale=0.2, size=[10000, 784])
                x_perturb_data = np.clip(x_perturb_data, 0, 1)
                x_rd_fgsm_data = sess.run(x_fgsm,
                                          feed_dict={
                                              x: x_perturb_data,
                                              label: mnist.test.labels
                                          })
                acc['rd'] = sess.run(get_acc(x, label),
                                     feed_dict={
                                         x: x_rd_fgsm_data,
                                         label: mnist.test.labels
                                     })
                dist['rd'] = sess.run(avg_dist(x1, x2),
                                      feed_dict={
                                          x1: mnist.test.images,
                                          x2: x_rd_fgsm_data
                                      })
                # Iterative FGSM: 10 ascent steps of size 0.01.
                x_it_fgsm_data = np.copy(mnist.test.images)
                for _ in range(10):
                    grad = sess.run(tf.gradients(loss_cls, x)[0],
                                    feed_dict={
                                        x: x_it_fgsm_data,
                                        label: mnist.test.labels
                                    })
                    x_it_fgsm_data += np.sign(grad) * 0.01
                    x_it_fgsm_data = np.clip(x_it_fgsm_data, 0, 1)
                acc['it'] = sess.run(get_acc(x, label),
                                     feed_dict={
                                         x: x_it_fgsm_data,
                                         label: mnist.test.labels
                                     })
                dist['it'] = sess.run(avg_dist(x1, x2),
                                      feed_dict={
                                          x1: mnist.test.images,
                                          x2: x_it_fgsm_data
                                      })
                # Iterative FGSM from a random start.
                x_it_rd_fgsm_data = np.copy(
                    mnist.test.images) + np.random.normal(
                        loc=0.0, scale=0.2, size=[10000, 784])
                x_it_rd_fgsm_data = np.clip(x_it_rd_fgsm_data, 0, 1)
                for _ in range(10):
                    grad = sess.run(tf.gradients(loss_cls, x)[0],
                                    feed_dict={
                                        x: x_it_rd_fgsm_data,
                                        label: mnist.test.labels
                                    })
                    x_it_rd_fgsm_data += np.sign(grad) * 0.01
                    x_it_rd_fgsm_data = np.clip(x_it_rd_fgsm_data, 0, 1)
                acc['it rd'] = sess.run(get_acc(x, label),
                                        feed_dict={
                                            x: x_it_rd_fgsm_data,
                                            label: mnist.test.labels
                                        })
                dist['it rd'] = sess.run(avg_dist(x1, x2),
                                         feed_dict={
                                             x1: mnist.test.images,
                                             x2: x_it_rd_fgsm_data
                                         })
                print(epoch)
                print(acc)
                print(dist)
def model_train(para):
    """Train an MNIST classifier with Gaussian input noise whose scale is
    derived from differential-privacy parameters (sigma from (eps, delta),
    scaled by the sensitivity), and evaluate against fresh and
    pre-generated FGSM samples.

    para: dict with keys 'eps', 'delta', 'sens'.
    Relies on module-level globals: compute_sigma, layers, random_seed,
    learning_rate, steps, epochs, batch_size, sample_rate, attack,
    weight_variable, bias_variable, GradientDescentOptimizer, input_data.
    """
    sigma = compute_sigma(para['eps'], para['delta'])
    std = sigma * para['sens']  # noise stddev = sigma * sensitivity
    sess = tf.Session()
    tf.set_random_seed(random_seed)
    n = len(layers)
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label
    # Weights/biases for layers 1..n-1 (index 0 is the input "layer").
    w, b = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        w[i] = weight_variable([layers[i - 1], layers[i]])
        b[i] = bias_variable([layers[i]])
    # noisy model: Gaussian noise added to the input only
    z, h = [0 for i in range(n)], [0 for i in range(n)]
    h[0] = x
    h[0] = h[0] + tf.random_normal(
        shape=tf.shape(h[0]), mean=0.0, stddev=std, dtype=tf.float32)
    for i in range(1, n):
        z[i] = tf.matmul(h[i - 1], w[i]) + b[i]
        if i < n - 1:
            h[i] = tf.nn.relu(z[i])
        else:
            h[i] = z[i]  # output layer: raw logits
    y = h[n - 1]  # noisy logits — used for loss, attack, and accuracy
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=label, logits=y))
    # noiseless model: same weights, deterministic forward pass
    # NOTE(review): y2 is built but not referenced anywhere below.
    z2, h2 = [0 for i in range(n)], [0 for i in range(n)]
    h2[0] = x
    for i in range(1, n):
        z2[i] = tf.matmul(h2[i - 1], w[i]) + b[i]
        if i < n - 1:
            h2[i] = tf.nn.relu(z2[i])
        else:
            h2[i] = z2[i]
    y2 = h2[n - 1]
    # FGSM built against the noisy logits.
    x_adv = attack.fgsm(x, y, eps=0.3, clip_min=0, clip_max=1)
    # gradient descent — per-variable gradients applied manually
    gw, gb = [0 for i in range(n)], [0 for i in range(n)]
    for i in range(1, n):
        gw[i] = tf.gradients(loss, w[i])[0]
        gb[i] = tf.gradients(loss, b[i])[0]
    opt = GradientDescentOptimizer(learning_rate=learning_rate)
    gradients = []
    for i in range(1, n):
        gradients.append((gw[i], w[i]))
        gradients.append((gb[i], b[i]))
    train_step = opt.apply_gradients(gradients)
    #evaluation — accuracy of the *noisy* model
    acc = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(y, 1), tf.argmax(label, 1)), tf.float32))
    # data
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Pre-generated adversarial test set (file name spells "fsgm").
    x_adv_mnist_fsgm = np.load(os.path.join('data', 'x_adv_mnist_fsgm.npy'))
    print('sigma: {:.3f}, std: {:.3f}'.format(sigma, std))
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        for t in range(steps):
            batch = mnist.train.next_batch(batch_size)
            sess.run(train_step, feed_dict={x: batch[0], label: batch[1]})
            # Evaluate every int(1/sample_rate) steps and on the last step.
            if t % int(1 / sample_rate) == 0 or t == steps - 1:
                if t < steps - 1:
                    epoch = int(t / int(1 / sample_rate))
                else:
                    epoch = epochs  # final report labelled with epoch count
                # Fresh FGSM samples against the current weights.
                x_adv_sample = sess.run(x_adv,
                                        feed_dict={
                                            x: mnist.test.images,
                                            label: mnist.test.labels
                                        })
                acc_benign = sess.run(acc,
                                      feed_dict={
                                          x: mnist.test.images,
                                          label: mnist.test.labels
                                      })
                acc_adv = sess.run(acc,
                                   feed_dict={
                                       x: x_adv_sample,
                                       label: mnist.test.labels
                                   })
                # Accuracy on the pre-generated (transfer) adversarial set.
                acc_pre_adv = sess.run(acc,
                                       feed_dict={
                                           x: x_adv_mnist_fsgm,
                                           label: mnist.test.labels
                                       })
                print(epoch, acc_benign, acc_adv, acc_pre_adv)
def main():
    """Train an MNIST classifier (with a noisy twin used for the loss) and
    evaluate it against FGSM, randomized FGSM, JSMA, and pre-computed
    adversarial/GAN test sets.

    Relies on module-level helpers: classifier, classifier_n, generator,
    softmax_loss, get_acc, sample_Z, attack.fgsm, SaliencyMapMethod,
    GradientDescentOptimizer, input_data, np, os.
    """
    # Hyperparameters (local to this driver).
    random_seed = 1024
    train_size, test_size = 55000, 10000  # NOTE(review): unused locals
    batch_size = 100
    learning_rate = 0.05
    epochs = 5
    steps = epochs * 550  # 550 batches of 100 ≈ one epoch of 55k images
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 1
    sess = tf.InteractiveSession(config=config)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    tf.set_random_seed(random_seed)
    x = tf.placeholder(tf.float32, [None, 784])  # input
    label = tf.placeholder(tf.float32, [None, 10])  # true label
    noise = tf.placeholder(tf.float32, [None, 100])  # noise vector
    y_target = tf.placeholder(tf.float32, [None, 10])  # target label
    # Randomly perturbed copy of the input (for randomized FGSM below).
    x_perturb = x + tf.random_normal(
        shape=tf.shape(x), mean=0.0, stddev=0.5, dtype=tf.float32)
    x_perturb = tf.clip_by_value(x_perturb, 0, 1)
    x1 = tf.placeholder(tf.float32, [None, 784])  # NOTE(review): unused
    x2 = tf.placeholder(tf.float32, [None, 784])  # NOTE(review): unused
    y_n = classifier_n(x)  # noisy classifier head — used for the training loss
    y = classifier(x)  # clean classifier head — used for attacks/eval
    y_perturb = classifier(x_perturb)
    # gan: generator crafts inputs intended to fool the classifier.
    x_gan = generator(x, noise)
    y_gan = classifier(x_gan)
    loss_cls = softmax_loss(label, y_n)  #+ softmax_loss(label, y_gan)
    loss_gan = -softmax_loss(label, y_gan)  # generator maximizes classifier loss
    # Split trainables by name scope so each optimizer touches its own model.
    all_vars = tf.trainable_variables()
    c_vars = [var for var in all_vars if 'classifier' in var.name]
    g_vars = [var for var in all_vars if 'generator' in var.name]
    train_op_classifier = GradientDescentOptimizer(learning_rate = learning_rate) \
        .minimize(loss_cls, var_list = c_vars, global_step = global_step)
    train_op_generator = GradientDescentOptimizer(learning_rate = 0.05) \
        .minimize(loss_gan, var_list = g_vars, global_step = global_step)
    #fgsm
    x_fgsm = attack.fgsm(x, y, eps=0.2, clip_min=0, clip_max=1)
    y_fgsm = classifier(x_fgsm)  # NOTE(review): unused
    # random fgsm: FGSM applied after the random perturbation above.
    x_fgsm_rd = attack.fgsm(x_perturb, y_perturb, eps=0.2, clip_min=0, clip_max=1)
    y_fgsm_rd = classifier(x_fgsm_rd)  # NOTE(review): unused
    # jsma
    jsma = SaliencyMapMethod(classifier, back='tf', sess=sess)
    # train
    saver = tf.train.Saver()  # NOTE(review): never used to save/restore
    mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
    # Pre-computed adversarial/GAN test sets produced by earlier runs.
    x_fgsm_mnist = np.load(os.path.join('data', 'x_fgsm_mnist.npy'))
    x_gan_mnist = np.load(os.path.join('data', 'x_gan_mnist.npy'))
    x_jsma_mnist_1 = np.load(os.path.join('data', 'x_jsma_mnist_1.npy'))
    # Targeted-attack labels: everything pushed toward class 0.
    y_target_batch = np.zeros((100, 10), dtype=np.float32)
    y_target_batch[:, 0] = 1.0
    y_target_test = np.zeros((10000, 10), dtype=np.float32)
    y_target_test[:, 0] = 1.0
    sess.run(tf.global_variables_initializer())
    with sess.as_default():
        acc = {}
        print('train classifier')
        for t in range(1, steps + 1):
            batch = mnist.train.next_batch(batch_size)
            noise_d = sample_Z(batch_size, 100)
            f_dict = {
                x: batch[0],
                label: batch[1],
                noise: noise_d,
                y_target: y_target_batch
            }
            sess.run(train_op_classifier, feed_dict=f_dict)
            #for j in range(1):
            #sess.run(train_op_generator, feed_dict=f_dict)
            # Report a battery of accuracies once per epoch (550 steps).
            if t % 550 == 0:
                epoch = int(t / 550)
                acc['benign'] = sess.run(get_acc(x, label),
                                         feed_dict={
                                             x: mnist.test.images,
                                             label: mnist.test.labels
                                         })
                acc['pre fgsm'] = sess.run(get_acc(x, label),
                                           feed_dict={
                                               x: x_fgsm_mnist,
                                               label: mnist.test.labels
                                           })
                acc['pre gan'] = sess.run(get_acc(x, label),
                                          feed_dict={
                                              x: x_gan_mnist,
                                              label: mnist.test.labels
                                          })
                # Only the first 100 JSMA samples exist on disk.
                acc['pre jsma 1'] = sess.run(get_acc(x, label),
                                             feed_dict={
                                                 x: x_jsma_mnist_1,
                                                 label: mnist.test.labels[0:100, ]
                                             })
                # White-box FGSM against the current weights.
                x_fgsm_d = sess.run(x_fgsm, feed_dict={
                    x: mnist.test.images,
                    label: mnist.test.labels
                })
                acc['fgsm'] = sess.run(get_acc(x, label),
                                       feed_dict={
                                           x: x_fgsm_d,
                                           label: mnist.test.labels
                                       })
                x_fgsm_rd_d = sess.run(x_fgsm_rd, feed_dict={
                    x: mnist.test.images,
                    label: mnist.test.labels
                })
                acc['fgsm_rd'] = sess.run(get_acc(x, label),
                                          feed_dict={
                                              x: x_fgsm_rd_d,
                                              label: mnist.test.labels
                                          })
                print(epoch, acc)
        # Disabled GAN-training phase kept for reference (no-op string).
        '''
        print('train gan')
        for t in range(1, 550 * 10 + 1):
            batch = mnist.train.next_batch(batch_size)
            f_dict = {x: batch[0], label: batch[1], noise: sample_Z(batch_size, 100), y_target: y_target_batch}
            sess.run(train_op_generator, feed_dict=f_dict)
            if t % 550 == 0:
                epoch = int(t / 550)
                batch = mnist.test.next_batch(batch_size)
                f_dict = {x: batch[0], label: batch[1], noise: sample_Z(batch_size, 100), y_target: y_target_batch}
                x_gan_data = sess.run(x_gan, feed_dict=f_dict)
                acc_gan = sess.run(get_acc(x, label), feed_dict={x: x_gan_data, label: batch[1]})
                print(epoch, acc_gan)
        x_fgsm_d = sess.run(x_fgsm, feed_dict = {x: mnist.test.images, label: mnist.test.labels})
        acc['fgsm'] = sess.run(get_acc(x, label), feed_dict={x: x_fgsm_d, label: mnist.test.labels})
        x_gan_d = sess.run(x_gan, feed_dict={x: mnist.test.images ,label: mnist.test.labels\
            , noise: sample_Z(10000, 100), y_target: y_target_test})
        acc['gan'] = sess.run(get_acc(x, label), feed_dict={x: x_gan_d ,label: mnist.test.labels\
            , noise: sample_Z(10000, 100), y_target: y_target_test})
        '''
        # Final targeted JSMA evaluation on the first 100 test images.
        jsma_params = {'theta': 1., 'gamma': 0.1,'nb_classes': 10, 'clip_min': 0.,'clip_max': 1., 'targets': y,
                       'y_val': y_target_batch}
        x_jsma_1_d = jsma.generate_np(mnist.test.images[0:100, ], **jsma_params)
        acc['jsma 1'] = sess.run(get_acc(x, label),
                                 feed_dict={
                                     x: x_jsma_1_d,
                                     label: mnist.test.labels[0:100, ]
                                 })
        print(acc['jsma 1'])
# --- Evaluate the legitimately-trained model, then adversarially retrain ---
# NOTE(review): this fragment depends on names defined earlier in the file
# (sess_legit, x_reg, x_seq, y, predictions, model, train/val/test dicts,
# train_params, fig_dir, out_dir, log_dir, num_reg, seq_length, fgsm,
# concatenation_model, model_train).
model_train(sess_legit, x_reg, x_seq, y, predictions, train['reg'], train['seq'], train['expr'],
            val['reg'], val['seq'], val['expr'], args=train_params)
# Predict expression on the held-out test set and plot predicted vs observed.
pred=model.predict({'seq_input':test['seq'],'reg_input':test['reg']},batch_size=100,verbose=1)
plt.scatter(pred,test['expr'])
plt.savefig("%s/pred_vs_obs.legit.png"%(fig_dir))
# Persist (observed, predicted) pairs for downstream analysis.
output=np.column_stack((test['expr'], pred[:,0]))
np.savetxt("%s/prediction.legit.txt"%(out_dir), output,delimiter='\t')

# Adversarial training
sess_adv=tf.Session()
model_2=concatenation_model(num_reg,seq_length)
predictions_2 = model_2([x_reg,x_seq])
# FGSM perturbs only the sequence input; the regulator input stays clean.
adv_x_seq_2 = fgsm(x_seq, predictions_2, eps=0.3)
predictions_2_adv = model_2([x_reg,adv_x_seq_2])
train_params = {
    'nb_epochs': 50,
    'batch_size': 100,
    'learning_rate': 0.01,
    'train_dir': log_dir,
    'filename': 'model_adv'
}
# Retrain with adversarial predictions mixed into the loss.
model_train(sess_adv, x_reg, x_seq, y, predictions_2, train['reg'], train['seq'], train['expr'],
            val['reg'], val['seq'], val['expr'], predictions_adv=predictions_2_adv,
            args=train_params)
# --- Load a pretrained WRN-28-10 and sweep FGSM strength over epss ---
# NOTE(review): this fragment depends on names defined earlier in the file
# (parseval, epss, accuracieses, model, ds_test, dirs, Cifar10Loader,
# Dataset, fgsm, batch_eval).
saved_path = dirs.SAVED_MODELS
if parseval:
    saved_path += '/wrn-28-10-p-t--2018-01-24-21-18/ResNet'  # Parseval
else:
    saved_path += '/wrn-28-10-t--2018-01-23-19-13/ResNet'  # vanilla
model.load_state(saved_path)
# Clean-test accuracy first (corresponds to epss[0], i.e. no attack).
cost, ev = model.test(ds_test)
accuracies = [ev['accuracy']]
for eps in epss[1:]:
    print("Creating adversarial examples...")
    # Inputs are mean/std-normalised CIFAR-10 pixels; clip to the normalised
    # range. NOTE(review): clip_min = -clip_max is only an approximation of
    # the true normalised minimum — confirm this is intended.
    clip_max = (255 - np.max(Cifar10Loader.mean)) / np.max(
        Cifar10Loader.std)
    n_fgsm = fgsm(model.nodes.input,
                  model.nodes.probs,
                  eps=eps,
                  clip_min=-clip_max,
                  clip_max=clip_max)
    # Evaluate the FGSM graph batchwise over a 64-batch slice of the test set.
    images_adv, = batch_eval(model._sess, [model.nodes.input], [n_fgsm],
                             [ds_test.images[:model.batch_size * 64]],
                             args={'batch_size': model.batch_size},
                             feed={model._is_training: False})
    adv_ds_test = Dataset(images_adv, ds_test.labels, ds_test.class_count)
    cost, ev = model.test(adv_ds_test)
    accuracies.append(ev['accuracy'])
accuracieses.append(accuracies)
print(accuracies)


# NOTE(review): plot() appears truncated at this chunk boundary — its body
# presumably continues beyond plt.figure().
def plot(epss, curves, names): plt.figure()
def train(hps, data):
    """Training loop.

    Builds a ResNet/VGG classifier (selected by FLAGS.model), optionally with
    FGSM- or BIM-based adversarial training, then runs the session loop with
    a step-based learning-rate schedule and periodic checkpointing.

    Args:
        hps: hyperparameter namedtuple (uses num_classes, batch_size, and
            _asdict for logging).
        data: batch provider with a next(batch_size) method.

    Relies on module-level names: FLAGS, label_smooth, random_flip_left_right,
    resnet_template, vgg_template, fgsm, fgm, get_weights_path, verify,
    truncnorm, json, shutil, os, np.
    """
    images = tf.placeholder(tf.float32,
                            shape=(None, FLAGS.image_size, FLAGS.image_size, FLAGS.channels),
                            name="images")
    labels = tf.placeholder(tf.int64, shape=(None), name="labels")
    labels_onehot = tf.one_hot(labels, depth=hps.num_classes, dtype=tf.float32,
                               name="labels_onehot")
    if FLAGS.label_smooth:
        labels_onehot = label_smooth(labels_onehot)
    lrn_rate = tf.placeholder(tf.float32, shape=(), name="lrn_rate")
    # Log the full configuration for reproducibility.
    tf.logging.info(json.dumps(vars(FLAGS)))
    tf.logging.info(json.dumps(hps._asdict()))
    flipped_images = random_flip_left_right(images)
    # make_template shares variables across the multiple net(...) calls below.
    net = tf.make_template('net', resnet_template, hps=hps) if FLAGS.model == 'resnet' else \
        tf.make_template('net', vgg_template, hps=hps)
    truth = labels
    # In adversarial modes the primary logits run with training=False
    # (presumably to freeze batch-norm statistics while crafting attacks —
    # TODO confirm).
    if FLAGS.adversarial or FLAGS.adversarial_BIM:
        logits = net(flipped_images, training=False)
    else:
        logits = net(flipped_images, training=True)
    probs = tf.nn.softmax(logits)
    predictions = tf.argmax(logits, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                                  labels=labels_onehot))
    # L2 weight decay over all trainables in the 'net' scope.
    weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='net')
    weight_norm = tf.add_n([tf.nn.l2_loss(v) for v in weights])
    cost = cost + 0.0005 * weight_norm
    # Run UPDATE_OPS (e.g. batch-norm moving averages) with each train step.
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = tf.train.MomentumOptimizer(learning_rate=lrn_rate,
                                              momentum=0.9).minimize(cost)
    if FLAGS.adversarial or FLAGS.adversarial_BIM:
        # Per-example random FGSM strength: |N(0, adv_std)|, broadcast NHWC.
        eps = tf.abs(tf.truncated_normal(shape=(tf.shape(images)[0],), mean=0,
                                         stddev=FLAGS.adv_std))
        eps = eps[:, None, None, None]
        adv_x = fgsm(flipped_images, probs, eps=eps, clip_min=0.0, clip_max=255.0)
        # fgm with the true labels (no label leaking via predictions); unit eps
        # per step — iterated below for BIM.
        adv_x_leak = fgm(flipped_images, probs, y=labels_onehot,
                         eps=np.asarray([1])[:, None, None, None],
                         clip_min=0.0, clip_max=255.0)
        adv_logits = net(adv_x, training=False)
        adv_pred = tf.argmax(adv_logits, axis=1)
        adv_precision = tf.reduce_mean(tf.to_float(tf.equal(adv_pred, truth)))
        adv_logits_leak = net(adv_x_leak, training=False)
        adv_pred_leak = tf.argmax(adv_logits_leak, axis=1)
        adv_precision_leak = tf.reduce_mean(tf.to_float(tf.equal(adv_pred_leak, truth)))
        # Mixed batch: first half clean (flipped), second half fed back as
        # adversarial images through the `images` placeholder.
        num_normal = hps.batch_size // 2
        combined_images = tf.concat([flipped_images[:num_normal], images[num_normal:]],
                                    axis=0)
        com_logits = net(combined_images, training=True)
        # Clean and adversarial halves are re-weighted 2.0 : 0.6 (normalised
        # by 1.3) as in Kurakin et al.'s adversarial-training recipe —
        # TODO confirm source of the constants.
        normal_cost = 2.0 / 1.3 * tf.nn.softmax_cross_entropy_with_logits(
            logits=com_logits[:num_normal], labels=labels_onehot[:num_normal])
        adv_cost = 0.6 / 1.3 * tf.nn.softmax_cross_entropy_with_logits(
            logits=com_logits[num_normal:], labels=labels_onehot[num_normal:])
        combined_cost = tf.reduce_mean(tf.concat([normal_cost, adv_cost], axis=0)) + 0.0005 * weight_norm
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            train_op_adv = tf.train.MomentumOptimizer(learning_rate=lrn_rate,
                                                      momentum=0.9).minimize(combined_cost)
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(max_to_keep=3)
        save_path, save_path_ckpt = get_weights_path()
        state = tf.train.get_checkpoint_state(save_path)
        if state and state.model_checkpoint_path:
            # Existing checkpoint: either wipe and retrain, or resume from it.
            ans = verify("Warning: model already trained. Delete files and re-train? (y/n)")
            if ans:
                shutil.rmtree(save_path)
                os.makedirs(save_path)
            else:
                saver_state = tf.train.get_checkpoint_state(save_path)
                saver.restore(sess, saver_state.model_checkpoint_path)
                # raise FileExistsError("Model weight already exists")
        else:
            os.makedirs(save_path, exist_ok=True)
        # Record hyperparameters next to the weights.
        hps_path = os.path.join(save_path, 'hps.txt')
        with open(hps_path, 'w') as fout:
            fout.write(json.dumps(vars(FLAGS)))
            fout.write(json.dumps(hps._asdict()))
        # NOTE(review): `iter` shadows the builtin throughout this loop.
        for iter in range(FLAGS.maxiter):
            try:
                x, y = data.next(hps.batch_size)
            except StopIteration:
                # NOTE(review): only logs — x, y are not refreshed on epoch
                # end, so the previous batch is reused this iteration.
                tf.logging.info("New epoch!")
            # Piecewise-constant learning-rate schedule by global step.
            if iter < 40000:
                lr = 0.1
            elif iter < 60000:
                lr = 0.01
            elif iter < 80000:
                lr = 0.001
            else:
                lr = 0.0001
            if not FLAGS.adversarial and not FLAGS.adversarial_BIM:
                # Plain training.
                _, acc = sess.run([train_op, precision],
                                  feed_dict={
                                      images: x,
                                      labels: y,
                                      lrn_rate: lr
                                  })
                tf.logging.info("Iter: {}, Precision: {:.6f}".format(iter + 1, acc))
            elif FLAGS.adversarial:
                # One-shot FGSM: craft, then train on the half-clean/half-adv batch.
                adv_images, acc, acc_adv = sess.run([adv_x, precision, adv_precision],
                                                    feed_dict={
                                                        images: x,
                                                        labels: y,
                                                    })
                combined_batch = np.concatenate([x[:num_normal], adv_images[num_normal:]],
                                                axis=0)
                _, com_loss = sess.run([train_op_adv, combined_cost],
                                       feed_dict={
                                           images: combined_batch,
                                           labels: y,
                                           lrn_rate: lr
                                       })
                tf.logging.info("Iter: {}, Precision: {:.6f}, Adv precision: {:.6f}, Combined loss: {:.6f}"
                                .format(iter + 1, acc, acc_adv, com_loss))
            elif FLAGS.adversarial_BIM:
                # BIM: random overall budget, iterated unit-eps fgm steps
                # (iteration count per Kurakin et al.'s min(eps+4, 1.25*eps)).
                BIM_eps = np.abs(truncnorm.rvs(a=-2., b=2.) * FLAGS.adv_std)
                attack_iter = int(min(BIM_eps + 4, 1.25 * BIM_eps))
                adv_images = np.copy(x)
                for i in range(attack_iter):
                    adv_images, acc, acc_adv = sess.run([adv_x_leak, precision, adv_precision_leak],
                                                        feed_dict={
                                                            images: adv_images,
                                                            labels: y,
                                                        })
                combined_batch = np.concatenate([x[:num_normal], adv_images[num_normal:]],
                                                axis=0)
                _, com_loss = sess.run([train_op_adv, combined_cost],
                                       feed_dict={
                                           images: combined_batch,
                                           labels: y,
                                           lrn_rate: lr
                                       })
                tf.logging.info("Iter: {}, Precision: {:.6f}, Adv precision: {:.6f}, Combined loss: {:.6f}"
                                .format(iter + 1, acc, acc_adv, com_loss))
            # Checkpoint every 5000 steps.
            if (iter + 1) % 5000 == 0:
                saver.save(sess, save_path_ckpt, global_step=iter + 1)
                tf.logging.info("Model saved! Path: " + save_path)
def train(alpha, eps2_ratio, gen_ratio, fgsm_eps, LR, logfile): logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , eps2_ratio \t %d , gen_ratio \t %d \n"%(fgsm_eps, LR, alpha, eps2_ratio, gen_ratio)) ############################# ##Hyper-parameter Setting#### ############################# hk = 256; #number of hidden units at the last layer Delta2 = (14*14+2)*25; #global sensitivity for the first hidden layer Delta3_adv = 2*hk #10*(hk + 1/4 * hk**2) #10*(hk) #global sensitivity for the output layer Delta3_benign = 2*hk #10*(hk); #global sensitivity for the output layer D = 50000; #size of the dataset L = 2499; #batch size image_size = 28; padding = 4; #numHidUnits = 14*14*32 + 7*7*64 + M + 10; #number of hidden units #gen_ratio = 1 epsilon1 = 0.0; #0.175; #epsilon for dpLRP epsilon2 = 0.1*(1 + gen_ratio); #epsilon for the first hidden layer epsilon3 = 0.1*(1); #epsilon for the last hidden layer total_eps = epsilon1 + epsilon2 + epsilon3 print(total_eps) uncert = 0.1; #uncertainty modeling at the output layer infl = 1; #inflation rate in the privacy budget redistribution R_lowerbound = 1e-5; #lower bound of the LRP c = [0, 40, 50, 200] #norm bounds epochs = 200; #number of epochs preT_epochs = 50; #number of epochs T = int(D/L*epochs + 1); #number of steps T pre_T = int(D/L*preT_epochs + 1); step_for_epoch = int(D/L); #number of steps for one epoch broken_ratio = 1 #alpha = 9.0 # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] #eps2_ratio = 10; # [1/10, 1/8, 1/6, 1/4, 1/2, 1, 2, 4, 6, 8, 10] #eps_benign = 1/(1+eps2_ratio)*(2*epsilon2) #eps_adv = eps2_ratio/(1+eps2_ratio)*(2*epsilon2) #fgsm_eps = 0.1 rand_alpha = 0.05 ##Robustness## robustness_T = (fgsm_eps*18*18*L*epsilon2)/Delta2; #### LRPfile = os.getcwd() + '/Relevance_R_0_075.txt'; ############################# mnist = input_data.read_data_sets("MNIST_data/", one_hot = True); ############################# ##Construct the Model######## ############################# #Step 4: Randomly initiate the noise, Compute 1/|L| * Delta3 
for the output layer# #Compute the 1/|L| * Delta3 for the last hidden layer# """eps3_ratio = Delta3_adv/Delta3_benign; eps3_benign = 1/(1+eps3_ratio)*(epsilon3) eps3_adv = eps3_ratio/(1+eps3_ratio)*(epsilon3)""" loc, scale3_benign, scale3_adv = 0., Delta3_benign/(epsilon3*L), Delta3_adv/(epsilon3*L); ### #End Step 4# # Parameters Declarification W_conv1 = weight_variable('W_conv1', [5, 5, 1, 32], collect=[AECODER_VARIABLES]); b_conv1 = bias_variable('b_conv1', [32], collect=[AECODER_VARIABLES]); shape = W_conv1.get_shape().as_list() w_t = tf.reshape(W_conv1, [-1, shape[-1]]) w = tf.transpose(w_t) sing_vals = tf.svd(w, compute_uv=False) sensitivity = tf.reduce_max(sing_vals) gamma = 2*(14*14 + 2)*25/(L*sensitivity) dp_epsilon=1.0 #0.1 delta_r = fgsm_eps*(image_size**2); #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon) W_conv2 = weight_variable('W_conv2', [5, 5, 32, 64], collect=[CONV_VARIABLES]); b_conv2 = bias_variable('b_conv2', [64], collect=[CONV_VARIABLES]); W_fc1 = weight_variable('W_fc1', [4 * 4 * 64, hk], collect=[CONV_VARIABLES]); b_fc1 = bias_variable('b_fc1', [hk], collect=[CONV_VARIABLES]); W_fc2 = weight_variable('W_fc2', [hk, 10], collect=[CONV_VARIABLES]); b_fc2 = bias_variable('b_fc2', [10], collect=[CONV_VARIABLES]); """scale2 = tf.Variable(tf.ones([hk])) beta2 = tf.Variable(tf.zeros([hk])) tf.add_to_collections([CONV_VARIABLES], scale2) tf.add_to_collections([CONV_VARIABLES], beta2)""" params = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2] ### #Step 5: Create the model# noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]); adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]); keep_prob = tf.placeholder(tf.float32); x = tf.placeholder(tf.float32, [None, image_size*image_size]); x_image = tf.reshape(x, [-1,image_size,image_size,1]); #perturbFMx = 
np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28) #perturbFMx = np.reshape(perturbFMx, [-1, 28, 28, 1]); # pretrain ### #Enc_Layer1 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) #pretrain = Enc_Layer1.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, epsilon = 2*epsilon2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise) ########### adv_x = tf.placeholder(tf.float32, [None, image_size*image_size]); adv_image = tf.reshape(adv_x, [-1,image_size,image_size,1]); #perturbFMx_adv = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28) #perturbFMx_adv = np.reshape(perturbFMx_adv, [-1, 28, 28, 1]); # pretrain adv ### #perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*32) #perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]); FM_h = tf.placeholder(tf.float32, [None, 14, 14, 32]); Enc_Layer2 = EncLayer(inpt=adv_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = adv_noise, perturbFM_h = FM_h) Enc_Layer3 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu) pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise, perturbFM_h = FM_h) ########### x_image += noise; x_image = tf.clip_by_value(x_image, -10, 10) #Clip the values of each input feature. adv_image += adv_noise; adv_image = tf.clip_by_value(adv_image, -10, 10) #Clip the values of each input feature. 
#perturbFM = np.random.laplace(0.0, scale3_benign, hk) #perturbFM = np.reshape(perturbFM, [hk]); perturbFM = np.random.laplace(0.0, scale3_benign, hk * 10) perturbFM = np.reshape(perturbFM, [hk, 10]); y_conv = inference(x_image, perturbFM, hk, FM_h, params); softmax_y_conv = tf.nn.softmax(y_conv) #robust_mask = inference_robust_mask(y_conv, Delta2, L, epsilon2, robustness_T) #perturbFM = np.random.laplace(0.0, scale3_adv, hk) #perturbFM = np.reshape(perturbFM, [hk]); y_adv_conv = inference(adv_image, perturbFM, hk, FM_h, params); #adv_robust_mask = inference_robust_mask(y_adv_conv, Delta2, L, epsilon2, robustness_T) # test model perturbFM_test = np.random.laplace(0.0, 0, hk) perturbFM_test = np.reshape(perturbFM_test, [hk]); x_test = tf.reshape(x, [-1,image_size,image_size,1]); y_test = inference(x_test, perturbFM_test, hk, FM_h, params); #test_robust_mask = inference_robust_mask(y_test, Delta2, L, epsilon2, robustness_T) #Define a place holder for the output label# y_ = tf.placeholder(tf.float32, [None, 10]); adv_y_ = tf.placeholder(tf.float32, [None, 10]); #End Step 5# ############################# ############################# ##Define loss and Optimizer## ############################# ''' Computes differentially private sigmoid cross entropy given `logits`. Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For brevity, let `x = logits`, `z = labels`. 
The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) `logits` and `labels` must have the same type and shape. Let denote neg_abs_logits = -abs(y_conv) = -abs(h_fc1 * W_fc2). By Applying Taylor Expansion, we have: Taylor = max(y_conv, 0) - y_conv * y_ + log(1 + exp(-abs(y_conv))); = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) = F1 + F2 where: F1 = max(h_fc1 * W_fc2, 0) + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) and F2 = - (y_ * h_fc1) * W_fc2 To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the term y_ * h_fc1 * W_fc2. Note that h_fc1 is differentially private, since its computation on top of the DP Affine transformation does not access the original data. Therefore, F1 should be differentially private. 
We need to preserve DP in F2, which reads the groundtruth label y_, as follows: By applying Funtional Mechanism, we perturb (y_ * h_fc1) * W_fc2 as ((y_ * h_fc1) + perturbFM) * W_fc2 = (y_ * h_fc1)*W_fc2 + (perturbFM * W_fc2): perturbFM = np.random.laplace(0.0, scale3, hk * 10) perturbFM = np.reshape(perturbFM/L, [hk, 10]); where scale3 = Delta3/(epsilon3) = 2*hk/(epsilon3); To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow]. Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow ''' ### Taylor for benign x zeros = array_ops.zeros_like(y_conv, dtype=y_conv.dtype) cond = (y_conv >= zeros) relu_logits = array_ops.where(cond, y_conv, zeros) neg_abs_logits = array_ops.where(cond, -y_conv, y_conv) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor_benign = math_ops.add(relu_logits - y_conv * y_, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) - tf.reduce_sum(perturbFM*W_fc2) #Taylor_benign = tf.abs(y_conv - y_) ### Taylor for adv_x zeros_adv = array_ops.zeros_like(y_adv_conv, dtype=y_conv.dtype) cond_adv = (y_adv_conv >= zeros_adv) relu_logits_adv = array_ops.where(cond_adv, y_adv_conv, zeros_adv) neg_abs_logits_adv = array_ops.where(cond_adv, -y_adv_conv, y_adv_conv) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor_adv = math_ops.add(relu_logits_adv - y_adv_conv * adv_y_, math.log(2.0) + 0.5*neg_abs_logits_adv + 1.0/8.0*neg_abs_logits_adv**2) - tf.reduce_sum(perturbFM*W_fc2) #Taylor_adv = tf.abs(y_adv_conv - adv_y_) ### Adversarial training loss adv_loss = (1/(L + L*alpha))*(Taylor_benign + alpha * Taylor_adv) '''Some time, using learning rate decay can help to stablize training process. 
However, use this carefully, since it may affect the convergent speed.''' global_step = tf.Variable(0, trainable=False) pretrain_var_list = tf.get_collection(AECODER_VARIABLES) train_var_list = tf.get_collection(CONV_VARIABLES) #print(pretrain_var_list) #print(train_var_list) update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): pretrain_step = tf.train.AdamOptimizer(LR).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list); train_step = tf.train.AdamOptimizer(LR).minimize(adv_loss, global_step=global_step, var_list=train_var_list); sess = tf.InteractiveSession(); # Define the correct prediction and accuracy # This needs to be changed to "Robust Prediction" correct_prediction_x = tf.equal(tf.argmax(y_test,1), tf.argmax(y_,1)); accuracy_x = tf.reduce_mean(tf.cast(correct_prediction_x, tf.float32)); ############# # use these to get predictions wrt to robust conditions """robust_correct_prediction_x = tf.multiply(test_robust_mask, tf.cast(correct_prediction_x, tf.float32)) accuracy_x_robust = tf.reduce_sum(robust_correct_prediction_x) / tf.reduce_sum(test_robust_mask) #certified_utility = 2/(1/accuracy_x_robust + 1/(tf.reduce_sum(test_robust_mask)/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32)))) certified_utility = (1.0*tf.reduce_sum(test_robust_mask))/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))""" ############# # craft adversarial samples from x for training dynamic_eps = tf.placeholder(tf.float32); emsemble_L = int(L/3) softmax_y = tf.nn.softmax(y_test) #c_x_adv = fgsm(x, softmax_y, eps=fgsm_eps, clip_min=0.0, clip_max=1.0) c_x_adv = fgsm(x, softmax_y, eps=(dynamic_eps)/10, clip_min=-1.0, clip_max=1.0) # for I-FGSM x_adv = tf.reshape(c_x_adv, [emsemble_L,image_size*image_size]); #====================== attack ========================= #attack_switch = {'randfgsm':True, 'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 
'stm':True} #attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True} attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':False} #other possible attacks: # ElasticNetMethod # FastFeatureAdversaries # LBFGS # SaliencyMapMethod # VirtualAdversarialMethod # y_test = logits (before softmax) # softmax_y_test = preds (probs, after softmax) softmax_y_test = tf.nn.softmax(y_test) # create saver saver = tf.train.Saver(tf.all_variables()) sess.run(W_conv1.initializer) _gamma = sess.run(gamma) _gamma_x = Delta2/L epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x) print(epsilon2_update/_gamma + epsilon2_update/_gamma_x) print(epsilon2_update) _sensitivityW = sess.run(sensitivity) delta_h = _sensitivityW*(14**2) dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon) ############################# iterativeStep = 100 # load the most recent models _global_step = 0 ckpt = tf.train.get_checkpoint_state(os.getcwd() + './tmp/train') if ckpt and ckpt.model_checkpoint_path: print(ckpt.model_checkpoint_path); saver.restore(sess, ckpt.model_checkpoint_path) _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]) else: print('No checkpoint file found') start_time = time.time(); # adv pretrain model (Auto encoder layer) cost = tf.reduce_sum(Enc_Layer2.cost); logfile.write("pretrain: \n") # define cleverhans abstract models for using cleverhans attacks ch_model_logits = CustomCallableModelWrapper(callable_fn=inference_test_input, output_layer='logits', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise) ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise) # rand+fgsm # if attack_switch['randfgsm']: # randfgsm_obj = 
FastGradientMethod(model=ch_model_probs, sess=sess) # x_randfgsm_t = (fgsm_eps - rand_alpha) * randfgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0) # x_rand_t = rand_alpha * tf.sign(tf.random_normal(shape=tf.shape(x), mean=0.0, stddev=1.0)) # define each attack method's tensor mu_alpha = tf.placeholder(tf.float32, [1]); attack_tensor_dict = {} # FastGradientMethod if attack_switch['fgsm']: print('creating attack tensor of FastGradientMethod') fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess) #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now attack_tensor_dict['fgsm'] = x_adv_test_fgsm # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init) # default: eps_iter=0.05, nb_iter=10 if attack_switch['ifgsm']: print('creating attack tensor of BasicIterativeMethod') ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm # Deepfool if attack_switch['deepfool']: print('creating attack tensor of DeepFool') deepfool_obj = DeepFool(model=ch_model_logits, sess=sess) #x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['deepfool'] = x_adv_test_deepfool # MomentumIterativeMethod # default: eps_iter=0.06, nb_iter=10 if attack_switch['mim']: print('creating attack tensor of MomentumIterativeMethod') mim_obj 
= MomentumIterativeMethod(model=ch_model_probs, sess=sess) #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, decay_factor=1.0, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['mim'] = x_adv_test_mim # SPSA # note here the epsilon is the infinity norm instead of precent of perturb # Maybe exclude this method first, since it seems to have some constrain about the data value range if attack_switch['spsa']: print('creating attack tensor of SPSA') spsa_obj = SPSA(model=ch_model_logits, sess=sess) #x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1, ord=2) x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1) attack_tensor_dict['spsa'] = x_adv_test_spsa # CarliniWagnerL2 # confidence=0 is fron their paper # it is said to be slow, maybe exclude first if attack_switch['cwl2']: print('creating attack tensor of CarliniWagnerL2') cwl2_obj = CarliniWagnerL2(model=ch_model_logits, sess=sess) #x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['cwl2'] = x_adv_test_cwl2 # MadryEtAl (Projected Grdient with random init, same as rand+fgsm) # default: eps_iter=0.01, nb_iter=40 if attack_switch['madry']: print('creating attack tensor of MadryEtAl') 
madry_obj = MadryEtAl(model=ch_model_probs, sess=sess) #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2) x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0) attack_tensor_dict['madry'] = x_adv_test_madry # SpatialTransformationMethod # the params are pretty different from on the paper # so I use default # exclude since there's bug if attack_switch['stm']: print('creating attack tensor of SpatialTransformationMethod') stm_obj = SpatialTransformationMethod(model=ch_model_probs, sess=sess) #x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6, ord=2) x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6) attack_tensor_dict['stm'] = x_adv_test_stm #====================== attack ========================= sess.run(tf.initialize_all_variables()); ##perturb h for training perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]); ##perturb h for testing perturbFM_h_test = np.random.laplace(0.0, 0, 14*14*32) perturbFM_h_test = np.reshape(perturbFM_h_test, [-1, 14, 14, 32]); '''for i in range(_global_step, _global_step + pre_T): d_eps = random.random(); batch = mnist.train.next_batch(L); #Get a random batch. 
adv_images = sess.run(x_adv, feed_dict = {x:batch[0], y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps}) for iter in range(0, 9): adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps}) """batch = mnist.train.next_batch(emsemble_L) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]}) batch = mnist.train.next_batch(emsemble_L) adv_images_madry = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]}) train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)""" batch_2 = mnist.train.next_batch(L); pretrain_step.run(feed_dict={adv_x: np.append(adv_images, batch_2[0], axis = 0), adv_noise: AdvLnoise, FM_h: perturbFM_h}); if i % int(5*step_for_epoch) == 0: cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32) logfile.write("step \t %d \t %g \n"%(i, cost_value)) print(cost_value) pre_train_finish_time = time.time() print('pre_train finished in: ' + parse_time(pre_train_finish_time - start_time))''' # train and test model with adv samples max_benign_acc = -1; max_robust_benign_acc = -1 #max_adv_acc = -1; test_size = len(mnist.test.images) AdvLnoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); AdvLnoise_test = generateIdLMNoise(image_size, 0, epsilon2_update, test_size); Lnoise_empty = generateIdLMNoise(image_size, 0, epsilon2_update, L); BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); last_eval_time = -1 accum_time = 0 accum_epoch = 0 max_adv_acc_dict = {} max_robust_adv_acc_dict = {} #max_robust_adv_utility_dict = {} for atk in attack_switch.keys(): if atk not in max_adv_acc_dict: max_adv_acc_dict[atk] = -1 max_robust_adv_acc_dict[atk] = -1 for i in range(_global_step, _global_step + T): # this batch is for generating adv samples batch = mnist.train.next_batch(emsemble_L); #Get a random 
batch. y_adv_batch = batch[1] #The number of epochs we print out the result. Print out the result every 5 epochs. if i % int(10*step_for_epoch) == 0 and i > int(10*step_for_epoch): cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32) print(cost_value) if last_eval_time < 0: last_eval_time = time.time() #===================benign samples===================== predictions_form_argmax = np.zeros([test_size, 10]) #test_bach = mnist.test.next_batch(test_size) softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: BenignLNoise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 1): _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]); for j in range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: (BenignLNoise + _BenignLNoise/2), FM_h: (perturbFM_h + _perturbFM_h/2)}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult) is_robust.append(robustness_from_argmax >= fgsm_eps) acc = np.sum(is_correct)*1.0/test_size robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_utility = np.sum(is_robust)*1.0/test_size max_benign_acc = max(max_benign_acc, acc) max_robust_benign_acc = max(max_robust_benign_acc, robust_acc*robust_utility) 
log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(i, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility) #===================adv samples===================== #log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(i, total_eps) """adv_images_dict = {} for atk in attack_switch.keys(): if attack_switch[atk]: adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_:mnist.test.labels}) print("Done with the generating of Adversarial samples")""" #===================adv samples===================== adv_acc_dict = {} robust_adv_acc_dict = {} robust_adv_utility_dict = {} for atk in attack_switch.keys(): if atk not in adv_acc_dict: adv_acc_dict[atk] = -1 robust_adv_acc_dict[atk] = -1 robust_adv_utility_dict[atk] = -1 if attack_switch[atk]: adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_: mnist.test.labels, adv_noise: AdvLnoise_test, mu_alpha:[fgsm_eps]}) ### PixelDP Robustness ### predictions_form_argmax = np.zeros([test_size, 10]) softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) for n_draws in range(0, 2000): if n_draws % 1000 == 0: print(n_draws) _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L); _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32) _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]); for j in range(test_size): pred = argmax_predictions[j] predictions_form_argmax[j, pred] += 1; softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (perturbFM_h + _perturbFM_h/2)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (BenignLNoise + _BenignLNoise/2), FM_h: perturbFM_h}) #softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (_perturbFM_h)}) * 
softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h}) argmax_predictions = np.argmax(softmax_predictions, axis=1) final_predictions = predictions_form_argmax; is_correct = [] is_robust = [] for j in range(test_size): is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j])) robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult) is_robust.append(robustness_from_argmax >= fgsm_eps) adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_size robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust) robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_size ############################## for atk in attack_switch.keys(): if attack_switch[atk]: # added robust prediction log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk]) max_adv_acc_dict[atk] = max(max_adv_acc_dict[atk], adv_acc_dict[atk]) max_robust_adv_acc_dict[atk] = max(max_robust_adv_acc_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk]) print(log_str) logfile.write(log_str + '\n') # logfile.write("step \t %d \t %g \t %g \n"%(i, benign_acc, adv_acc)) # print("step \t %d \t %g \t %g"%(i, benign_acc, adv_acc)); # estimate end time """if i > 0 and i % int(10*step_for_epoch) == 0: current_time_interval = time.time() - last_eval_time last_eval_time = time.time() print('during last eval interval, {} epoch takes {}'.format(10, parse_time(current_time_interval))) accum_time += current_time_interval accum_epoch += 10 estimate_time = ((_global_step + T - i) / step_for_epoch) * (accum_time / accum_epoch) print('estimate finish in: {}'.format(parse_time(estimate_time)))""" #print("step \t %d \t adversarial test accuracy \t %g"%(i, 
accuracy_x.eval(feed_dict={x: adv_images, y_: mnist.test.labels, noise: Lnoise_empty}))); """checkpoint_path = os.path.join(os.getcwd() + '/tmp/train', 'model.ckpt') saver.save(sess, checkpoint_path, global_step=i);""" d_eps = random.random(); y_adv = batch[1] adv_images = sess.run(attack_tensor_dict['ifgsm'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) """for iter in range(0, 9): adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})""" batch = mnist.train.next_batch(emsemble_L) adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) y_adv = np.append(y_adv, batch[1], axis = 0) batch = mnist.train.next_batch(emsemble_L) adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]}) y_adv = np.append(y_adv, batch[1], axis = 0) train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0) batch = mnist.train.next_batch(L); #Get a random batch. # train with benign and adv samples pretrain_step.run(feed_dict={adv_x: train_images, x: batch[0], adv_noise: AdvLnoise_test, noise: BenignLNoise, FM_h: perturbFM_h}); train_step.run(feed_dict={x: batch[0], adv_x: train_images, y_: batch[1], adv_y_: y_adv, noise: BenignLNoise, adv_noise: AdvLnoise_test, FM_h: perturbFM_h}); duration = time.time() - start_time; # print(parse_time(duration)); #print running time duration# max_acc_string = "max acc: benign: \t{:.4f} {:.4f}".format(max_benign_acc, max_robust_benign_acc) for atk in attack_switch.keys(): if attack_switch[atk]: max_acc_string += " {}: \t{:.4f} {:.4f}".format(atk, max_adv_acc_dict[atk], max_robust_adv_acc_dict[atk]) logfile.write(max_acc_string + '\n') logfile.write(str(duration) + '\n')
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False, rbf=False):
    """
    Load a saved Keras model and report its accuracy on legitimate test
    samples and on adversarial samples crafted with FGSM and JSMA.

    The model is loaded from models/{model_name}.h5 (or, when rbf=True,
    from rbfmodels/{model_name}.h5 with the custom RBFLayer registered).

    NOTE(review): this function shadows the builtin `eval`; renaming it
    would change the public interface for existing callers, so it is only
    flagged here rather than fixed.

    Args:
        sess: active TensorFlow session used for evaluation and attacks.
        model_name: base filename (no extension) of the saved .h5 model.
        X_train, Y_train: training data/labels — accepted but unused here.
        X_test, Y_test: test images and one-hot labels used for all three
            accuracy measurements.
        cnn: if True, use an image-shaped placeholder (None, 28, 28, 1);
            otherwise a flat (None, 784) placeholder.
        rbf: if True, load the RBF variant with its custom layer class.

    Side effects:
        Prints three accuracy figures and closes `sess` before returning.
    """
    # Load the saved model from disk.
    print("Load model ... ")
    '''
    json = open('models/{}.json'.format(model_name), 'r')
    model = json.read()
    json.close()
    loaded_model = model_from_json(model)
    loaded_model.load_weights("models/{}_weights.h5".format(model_name))
    '''
    if rbf:
        # RBF models contain a custom layer, which Keras can only
        # deserialize when the class is passed via custom_objects.
        loaded_model = load_model("rbfmodels/{}.h5".format(model_name), custom_objects={'RBFLayer': RBFLayer})
    else:
        loaded_model = load_model("models/{}.h5".format(model_name))

    # Set placeholders; the input layout depends on whether the model is CNN-based.
    if cnn:
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 784))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    predictions = loaded_model(x)

    # Baseline accuracy on clean (legitimate) test samples.
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args={"batch_size": 128})
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    # Using functions from /cleverhans/attacks_tf.py
    # Will be deprecated next year
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)

    # Using functions from /cleverhans/attacks.py (as specified by creators)
    # Does not work at the moment
    '''
    wrap = KerasModelWrapper(loaded_model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
                   #'y': y}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = tf.stop_gradient(adv_x)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)
    predictions_adv = loaded_model(adv_x)
    '''

    # Evaluate the accuracy of the MNIST model on the FGSM adversarial examples
    # that were materialized above with batch_eval.
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test, args={"batch_size": 128})
    print('Test accuracy on adversarial test examples: ' + str(accuracy))

    # Craft adversarial examples using Jacobian-based Saliency Map Approach (JSMA)
    wrap = KerasModelWrapper(loaded_model)
    jsma = SaliencyMapMethod(wrap, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
    adv_x = jsma.generate(x, **jsma_params)
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = loaded_model(adv_x)
    # Unlike the FGSM case, JSMA examples are not materialized first:
    # model_eval feeds clean X_test into the symbolic adv_x -> preds_adv
    # graph, so the attack runs on the fly during evaluation.
    accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={"batch_size": 512})
    print('Test accuracy on adversarial test examples: ' + str(accuracy))

    '''
    report = AccuracyReport()
    viz_enabled=VIZ_ENABLED
    source_samples=SOURCE_SAMPLES
    img_rows, img_cols, nchannels = 28, 28, 1
    nb_classes = 10
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) + ' adversarial examples')
    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')
    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')
    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')
    # Instantiate a SaliencyMapMethod attack object
    wrap = KerasModelWrapper(loaded_model)
    jsma = SaliencyMapMethod(wrap, sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]
        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)
        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))
        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)
            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            # Check if success was achieved
            res = int(model_argmax(sess, x, predictions, adv_x) == target)
            # Computer number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]
            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)
            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))
            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb
    print('--------------------------------------')
    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate
    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))
    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))
    # Close TF session
    sess.close()
    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    #adv_x = jsma(sess, x, predictions, 10, X_test, Y_test, 0, 0.5, 0, 1)
    #X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)
    #accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test, args={ "batch_size" : 128 })
    '''
    # NOTE(review): closing the session here means a caller cannot reuse
    # `sess` after this function returns — confirm that is intended.
    sess.close()