def attack(X, y, batch_size=128, thresh=0.3, target=-1):
    """Evaluate a restored generator against the target model and plot samples.

    Builds the generator and target-model graph, restores both from their
    checkpoints, prints labels and predictions for the first 32 samples,
    prints the target model's mean accuracy on perturbed inputs, and shows a
    2x2 figure with two originals next to their perturbed versions.

    Args:
        X: image array; axes 1-3 of its shape size the input placeholder.
        y: label array whose last axis is the number of classes (argmax over
            axis 1 is printed as the label, so presumably one-hot -- confirm).
        batch_size: number of samples per evaluation batch. Samples that do
            not fill a whole batch are skipped; at least one full batch is
            required, otherwise the final average divides by zero.
        thresh: the generator output is clipped to [-thresh, thresh].
        target: class index. When it lies in [0, n_classes) accuracy is
            measured against that class for every sample; otherwise it is
            measured against the labels returned by ``next_batch``.
    """
    n_classes = y.shape[-1]

    # image placeholder
    x_pl = tf.placeholder(tf.float32, [None, X.shape[1], X.shape[2], X.shape[3]])
    # target placeholder, sized from the labels rather than a fixed 10 classes
    t = tf.placeholder(tf.float32, [None, n_classes])
    is_training = tf.placeholder(tf.bool, [])

    is_targeted = target in range(0, n_classes)

    # bounded perturbation, then clip the perturbed image back into [0, 1]
    perturb = tf.clip_by_value(generator(x_pl, is_training), -thresh, thresh)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    f = target_model()
    f_real_logits, f_real_probs = f.ModelC(x_pl)
    f_fake_logits, f_fake_probs = f.ModelC(x_perturbed)

    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'ModelC' in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='g_weights')

    sess = tf.Session()

    # no initializer is run: every variable used below comes from a checkpoint
    f_saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    f_saver.restore(sess, "./weights/target_model/model.ckpt")
    g_saver.restore(sess, tf.train.latest_checkpoint("./weights/generator/"))

    rawpert, pert, fake_l, real_l = sess.run(
        [perturb, x_perturbed, f_fake_probs, f_real_probs],
        feed_dict={x_pl: X[:32], is_training: False})

    print('LA: ' + str(np.argmax(y[:32], axis=1)))
    print('OG: ' + str(np.argmax(real_l, axis=1)))
    print('PB: ' + str(np.argmax(fake_l, axis=1)))

    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1), tf.argmax(t, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    accs = []
    total_batches_test = int(X.shape[0] / batch_size)
    for i in range(total_batches_test):
        batch_x, batch_y = next_batch(X, y, i, batch_size)

        if is_targeted:
            # score against the attack target instead of the true labels
            targets = np.full((batch_y.shape[0],), target)
            batch_y = np.eye(n_classes)[targets]

        acc, fake_l, x_pert = sess.run(
            [accuracy, f_fake_probs, x_perturbed],
            feed_dict={x_pl: batch_x, t: batch_y, is_training: False})
        accs.append(acc)

    print('accuracy of test set: {}'.format(sum(accs) / len(accs)))

    # separate name for the figure so the target model `f` is not clobbered
    fig, axarr = plt.subplots(2, 2)
    axarr[0, 0].imshow(np.squeeze(X[3]), cmap='Greys_r')
    axarr[0, 1].imshow(np.squeeze(pert[3]), cmap='Greys_r')
    axarr[1, 0].imshow(np.squeeze(X[4]), cmap='Greys_r')
    axarr[1, 1].imshow(np.squeeze(pert[4]), cmap='Greys_r')
    plt.show()
def attack(X, y):
    """Run X through the restored generator and print the result's shape.

    The full generator / discriminator / target-model graph is built and all
    variables are initialised, but only the generator checkpoint is restored.
    The perturbed inputs are then computed for the whole of X in one call.

    Args:
        X: batch fed to a ``[None, 28, 28, 1]`` float placeholder.
        y: not used by this function.
    """
    # image placeholder
    images = tf.placeholder(tf.float32, [None, 28, 28, 1])

    delta = generator(images)
    adv_images = images + delta

    d_perturb_logits, d_perturb_probs = discriminator(adv_images)

    model = target_model()
    f_real_logits, f_real_probs = model.ModelC(images)
    f_fake_logits, f_fake_probs = model.ModelC(adv_images)

    # bucket the trainable variables by the name fragments each sub-model uses
    f_vars, d_vars, g_vars = [], [], []
    for variable in tf.trainable_variables():
        if 'ModelC' in variable.name:
            f_vars.append(variable)
        if 'd_' in variable.name:
            d_vars.append(variable)
        if 'g_' in variable.name:
            g_vars.append(variable)

    init_op = tf.global_variables_initializer()
    session = tf.Session()
    session.run(init_op)

    target_saver = tf.train.Saver(f_vars)
    generator_saver = tf.train.Saver(g_vars)
    discriminator_saver = tf.train.Saver(d_vars)

    # only the generator weights are loaded; the target model and the
    # discriminator keep the values produced by the initializer above
    generator_saver.restore(session, "./weights/generator/gen.ckpt")

    perturbed = session.run(adv_images, feed_dict={images: X})
    print(perturbed.shape)
def AdvGAN(x_train, y_train, x_test, y_test, t_mu, t_cov, target=-1, epochs=50, batch_size=32):
    """Train the AdvGAN generator and discriminator against a frozen target model.

    The generator loss is the sum of an adversarial loss on the target model,
    a GAN loss from the discriminator, an L1 distance to samples drawn from a
    normal distribution N(t_mu, t_cov), and a perturbation-size penalty.
    Generator and discriminator checkpoints are written every 10 epochs and
    once more after training. After training the function prints predictions
    for the first 32 test samples and the target model's accuracy on the
    perturbed test set.

    Args:
        x_train, y_train: training samples and labels; the last axis of each
            sizes the corresponding placeholder.
        x_test, y_test: held-out samples and labels used for the final report.
        t_mu, t_cov: mean and covariance passed to
            ``np.random.multivariate_normal`` to draw the reference samples
            (drawn once per epoch, clipped to [0, 1]).
        target: class index for a targeted attack; -1 means untargeted.
        epochs: number of passes over the training data.
        batch_size: samples per step. Both the training set and the test set
            must hold at least one full batch, otherwise the averages divide
            by zero.

    Exits the process with status 1 if the target-model checkpoint cannot be
    restored.
    """
    # placeholder definitions
    x_pl = tf.placeholder(tf.float32, [None, x_train.shape[-1]])
    y_pl = tf.placeholder(tf.float32, [None, y_train.shape[-1]])
    is_training = tf.placeholder(tf.bool, [])
    target_is_training = tf.placeholder(tf.bool, [])

    # -----------------------------------------------------------------------
    # MODEL DEFINITIONS
    is_targeted = target != -1

    # gather target model
    f = target_model(n_input=x_train.shape[-1], n_classes=y_train.shape[-1])

    # generate perturbation, add to original input(s)
    perturb, logit_perturb = generator.generator(x_pl, is_training)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # pass real and perturbed samples to the discriminator
    d_real_logits, d_real_probs = discriminator.discriminator(
        x_pl, is_training)
    d_fake_logits, d_fake_probs = discriminator.discriminator(
        x_perturbed, is_training)

    # pass real and perturbed samples to the model we are trying to fool
    f_real_logits, f_real_probs = f.Model(x_pl, target_is_training)
    f_fake_logits, f_fake_probs = f.Model(x_perturbed, target_is_training)

    # generate labels for discriminator (optionally smooth labels for stability)
    smooth = 0.0
    d_labels_real = tf.ones_like(d_real_probs) * (1 - smooth)
    d_labels_fake = tf.zeros_like(d_fake_probs)

    # -----------------------------------------------------------------------
    # LOSS DEFINITIONS
    # discriminator loss
    d_loss_real = tf.losses.mean_squared_error(predictions=d_real_probs,
                                               labels=d_labels_real)
    d_loss_fake = tf.losses.mean_squared_error(predictions=d_fake_probs,
                                               labels=d_labels_fake)
    d_loss = d_loss_real + d_loss_fake

    # generator loss
    g_loss_fake = tf.losses.mean_squared_error(
        predictions=d_fake_probs, labels=tf.ones_like(d_fake_probs))

    # perturbation loss (minimize overall perturbation)
    l_perturb = perturb_loss(perturb, 1.0)

    # adversarial loss (encourage misclassification)
    l_adv = adv_loss(f_fake_probs, y_pl, is_targeted)

    # loss minimizing L1 distance between target class average and perturbed
    # vector; this is used to encourage realism of the sample
    target_normal = tf.placeholder(tf.float32, [None, x_train.shape[-1]])
    l_tar_dist = tf.reduce_mean(
        tf.norm(target_normal - x_perturbed, axis=1, ord=1))

    # weights for generator loss function
    alpha = 1.0
    beta = 1.0
    g_loss = l_adv + alpha * g_loss_fake + l_tar_dist + beta * l_perturb

    # -----------------------------------------------------------------------
    # gather variables for training/restoring
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if "Model_A" in var.name]
    d_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="discriminator")
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="generator")

    # define optimizers for discriminator and generator
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        d_opt = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(
            d_loss, var_list=d_vars)
        g_opt = tf.train.AdamOptimizer(learning_rate=0.0002).minimize(
            g_loss, var_list=g_vars)

    # create saver objects for the target model, generator, and discriminator
    saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    d_saver = tf.train.Saver(d_vars)

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    # load the pretrained target model
    try:
        saver.restore(
            sess,
            tf.train.latest_checkpoint("./weights/target_model/Model_A/"))
    except Exception as err:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts,
        # and the underlying cause is shown instead of being discarded
        print("make sure to train the target model first...")
        print("restore failed: %s" % (err,))
        sys.exit(1)

    n_batches = int(len(y_train) / batch_size)

    for epoch in range(epochs):
        # shuffle training data
        x_train, y_train = utils.shuffle(x_train, y_train)

        loss_D = 0.0
        loss_G_fake = 0.0
        loss_perturb = 0.0
        loss_adv = 0.0
        loss_target_norm = 0.0

        # one draw of reference samples is reused for every batch of the epoch
        target_normal_np = np.random.multivariate_normal(
            t_mu, t_cov, (batch_size))
        target_normal_np = np.clip(target_normal_np, 0, 1)

        for i in range(n_batches):
            # extract batch
            batch_x, batch_y = utils.next_batch(x_train, y_train, batch_size, i)

            # if targeted, create one hot vectors of the target
            if is_targeted:
                targets = np.full((batch_y.shape[0], ), target)
                batch_y = np.eye(y_train.shape[-1])[targets]

            # train the discriminator first n times
            for _ in range(1):
                _, loss_D_batch = sess.run(
                    [d_opt, d_loss],
                    feed_dict={
                        x_pl: batch_x,
                        target_normal: target_normal_np,
                        is_training: True
                    })

            # train the generator n times
            for _ in range(1):
                _, loss_G_fake_batch, loss_adv_batch, loss_perturb_batch, loss_target_batch = \
                    sess.run([g_opt, g_loss_fake, l_adv, l_perturb, l_tar_dist],
                             feed_dict={
                                 x_pl: batch_x,
                                 y_pl: batch_y,
                                 target_normal: target_normal_np,
                                 is_training: True,
                                 target_is_training: False
                             })

            loss_D += loss_D_batch
            loss_G_fake += loss_G_fake_batch
            loss_perturb += loss_perturb_batch
            loss_adv += loss_adv_batch
            loss_target_norm += loss_target_batch

        loss_D /= n_batches
        loss_G_fake /= n_batches
        loss_perturb /= n_batches
        loss_adv /= n_batches
        loss_target_norm /= n_batches

        print("epoch %d:" % (epoch + 1))
        print("  loss_D: %.3f, loss_G_fake: %.3f" % (loss_D, loss_G_fake))
        print("  loss_perturb: %.3f, loss_adv: %.3f" % (loss_perturb, loss_adv))
        print("  loss_target_norm: %.3f" % (loss_target_norm))
        print()

        if epoch % 10 == 0:
            g_saver.save(sess, "weights/generator/gen.ckpt")
            d_saver.save(sess, "weights/discriminator/disc.ckpt")

    # quick sample to see some outputs
    rawpert, pert, fake_l, real_l = sess.run(
        [perturb, x_perturbed, f_fake_probs, f_real_probs],
        feed_dict={
            x_pl: x_test[:32],
            is_training: False,
            target_is_training: False
        })
    print("Original Labels:")
    print(np.argmax(y_test[:32], axis=1))
    print("Original Predictions:")
    print(np.argmax(real_l, axis=1))
    print("Perturbed Predictions:")
    print(np.argmax(fake_l, axis=1))

    # evaluate the test set
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1),
                                  tf.argmax(y_pl, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    scores = []
    total_batches_test = int(len(y_test) / batch_size)
    for i in range(total_batches_test):
        batch_x, batch_y = utils.next_batch(x_test, y_test, batch_size, i)
        score, x_pert = sess.run(
            [accuracy, x_perturbed],
            feed_dict={
                x_pl: batch_x,
                y_pl: batch_y,
                is_training: False,
                target_is_training: False
            })
        scores.append(score)

    print("test accuracy: %0.3f" % (sum(scores) / len(scores)))

    print("finished training, saving weights")
    g_saver.save(sess, "weights/generator/gen.ckpt")
    d_saver.save(sess, "weights/discriminator/disc.ckpt")
def AdvGAN(X, y, X_test, y_test, epochs=50, batch_size=128, target=-1):
    """Train the image AdvGAN generator and discriminator against ModelC.

    The generator loss combines an adversarial loss on the target model, a
    GAN loss from the discriminator and a weighted perturbation penalty.
    Generator and discriminator checkpoints are written every 10 epochs and
    once more after training. After training the target model's accuracy on
    perturbed test inputs is printed and a 2x2 figure of originals next to
    their perturbed versions (taken from the last test batch) is shown.

    Args:
        X, y: training images and labels. Axes 1-3 of ``X.shape`` size the
            image placeholder; the last axis of ``y`` is the class count.
        X_test, y_test: images and labels used for the final evaluation.
        epochs: number of passes over the training data.
        batch_size: samples per step. Both sets must hold at least one full
            batch, otherwise the averages divide by zero.
        target: class index; when it lies in [0, n_classes) the attack is
            targeted at that class, otherwise it is untargeted.

    Exits the process with status 1 if the target-model checkpoint cannot be
    restored.
    """
    # placeholder definitions
    x_pl = tf.placeholder(
        tf.float32, [None, X.shape[1], X.shape[2], X.shape[3]])  # image placeholder
    t = tf.placeholder(tf.float32, [None, y.shape[-1]])  # target placeholder
    is_training = tf.placeholder(tf.bool, [])

    # -----------------------------------------------------------------------
    # MODEL DEFINITIONS
    is_targeted = target in range(0, y.shape[-1])

    # gather target model
    f = target_model()

    thresh = 0.3

    # generate perturbation, add to original input image(s)
    perturb = tf.clip_by_value(generator(x_pl, is_training), -thresh, thresh)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # pass real and perturbed image to discriminator and the target model
    d_real_logits, d_real_probs = discriminator(x_pl, is_training)
    d_fake_logits, d_fake_probs = discriminator(x_perturbed, is_training)

    # pass real and perturbed images to the model we are trying to fool
    f_real_logits, f_real_probs = f.ModelC(x_pl)
    f_fake_logits, f_fake_probs = f.ModelC(x_perturbed)

    # generate labels for discriminator (optionally smooth labels for stability)
    smooth = 0.0
    d_labels_real = tf.ones_like(d_real_probs) * (1 - smooth)
    d_labels_fake = tf.zeros_like(d_fake_probs)

    # -----------------------------------------------------------------------
    # LOSS DEFINITIONS
    # discriminator loss
    d_loss_real = tf.losses.mean_squared_error(predictions=d_real_probs,
                                               labels=d_labels_real)
    d_loss_fake = tf.losses.mean_squared_error(predictions=d_fake_probs,
                                               labels=d_labels_fake)
    d_loss = d_loss_real + d_loss_fake

    # generator loss
    g_loss_fake = tf.losses.mean_squared_error(
        predictions=d_fake_probs, labels=tf.ones_like(d_fake_probs))

    # perturbation loss (minimize overall perturbation)
    l_perturb = perturb_loss(perturb, thresh)

    # adversarial loss (encourage misclassification)
    l_adv = adv_loss(f_fake_probs, t, is_targeted)

    # weights for generator loss function
    alpha = 1.0
    beta = 5.0
    g_loss = l_adv + alpha * g_loss_fake + beta * l_perturb

    # -----------------------------------------------------------------------
    # gather variables for training/restoring
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'ModelC' in var.name]
    d_vars = [var for var in t_vars if 'd_' in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='g_weights')

    # define optimizers for discriminator and generator
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        d_opt = tf.train.AdamOptimizer().minimize(d_loss, var_list=d_vars)
        g_opt = tf.train.AdamOptimizer(learning_rate=0.001).minimize(
            g_loss, var_list=g_vars)

    # create saver objects for the target model, generator, and discriminator
    saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    d_saver = tf.train.Saver(d_vars)

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    # load the pretrained target model
    try:
        saver.restore(sess, "./weights/target_model/model.ckpt")
    except Exception as err:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts,
        # and the underlying cause is shown instead of being discarded
        print("make sure to train the target model first...")
        print("restore failed: %s" % (err,))
        sys.exit(1)

    total_batches = int(X.shape[0] / batch_size)

    for epoch in range(0, epochs):

        X, y = shuffle(X, y)
        loss_D_sum = 0.0
        loss_G_fake_sum = 0.0
        loss_perturb_sum = 0.0
        loss_adv_sum = 0.0

        for i in range(total_batches):

            batch_x, batch_y = next_batch(X, y, i, batch_size)

            # if targeted, create one hot vectors of the target
            if is_targeted:
                targets = np.full((batch_y.shape[0], ), target)
                batch_y = np.eye(y.shape[-1])[targets]

            # train the discriminator first n times
            for _ in range(1):
                _, loss_D_batch = sess.run(
                    [d_opt, d_loss],
                    feed_dict={x_pl: batch_x, is_training: True})

            # train the generator n times
            for _ in range(1):
                _, loss_G_fake_batch, loss_adv_batch, loss_perturb_batch = \
                    sess.run([g_opt, g_loss_fake, l_adv, l_perturb],
                             feed_dict={x_pl: batch_x,
                                        t: batch_y,
                                        is_training: True})

            loss_D_sum += loss_D_batch
            loss_G_fake_sum += loss_G_fake_batch
            loss_perturb_sum += loss_perturb_batch
            loss_adv_sum += loss_adv_batch

        print("epoch %d:\nloss_D: %.3f, loss_G_fake: %.3f, "
              "\nloss_perturb: %.3f, loss_adv: %.3f, \n" %
              (epoch + 1, loss_D_sum / total_batches,
               loss_G_fake_sum / total_batches,
               loss_perturb_sum / total_batches,
               loss_adv_sum / total_batches))

        if epoch % 10 == 0:
            g_saver.save(sess, "weights/generator/gen.ckpt")
            d_saver.save(sess, "weights/discriminator/disc.ckpt")

    # evaluate the test set
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1), tf.argmax(t, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accs = []
    total_batches_test = int(X_test.shape[0] / batch_size)
    for i in range(total_batches_test):
        batch_x, batch_y = next_batch(X_test, y_test, i, batch_size)
        acc, x_pert = sess.run([accuracy, x_perturbed],
                               feed_dict={
                                   x_pl: batch_x,
                                   t: batch_y,
                                   is_training: False
                               })
        accs.append(acc)

    print('accuracy of test set: {}'.format(sum(accs) / len(accs)))

    # plot some images and their perturbed counterparts; the figure gets its
    # own name so the target model `f` is not clobbered
    fig, axarr = plt.subplots(2, 2)
    axarr[0, 0].imshow(np.squeeze(batch_x[2]), cmap='Greys_r')
    axarr[0, 1].imshow(np.squeeze(x_pert[2]), cmap='Greys_r')
    axarr[1, 0].imshow(np.squeeze(batch_x[5]), cmap='Greys_r')
    axarr[1, 1].imshow(np.squeeze(x_pert[5]), cmap='Greys_r')
    plt.show()

    print('finished training, saving weights')
    g_saver.save(sess, "weights/generator/gen.ckpt")
    d_saver.save(sess, "weights/discriminator/disc.ckpt")
def AdvGAN(X, y, batch_size=128):
    """Train the generator and discriminator for 50 alternating rounds.

    Each round performs one discriminator step on a half-real / half-generated
    batch, followed by five generator steps on a full random batch. Losses are
    printed every tenth round, and both networks are checkpointed once the
    rounds are finished. The target model is restored from
    ``./weights/target_model/model.ckpt`` and is not trained here.

    Args:
        X: images; rows are sampled at random (with replacement) and fed to
            ``[None, 28, 28, 1]`` placeholders.
        y: labels indexed with the same random rows and fed to a
            ``[None, 10]`` placeholder.
        batch_size: generator batch size; the discriminator sees
            ``int(batch_size / 2)`` real and as many generated images.
    """
    x_real_pl = tf.placeholder(tf.float32, [None, 28, 28, 1])  # real images
    x_fake_pl = tf.placeholder(tf.float32, [None, 28, 28, 1])  # images to perturb
    d_labels_pl = tf.placeholder(tf.float32, [None, 1])
    y_hinge_pl = tf.placeholder(tf.float32, [None, 28, 28, 1])
    t = tf.placeholder(tf.float32, [None, 10])  # target placeholder

    # -----------------------------------------------------------------------
    # MODEL DEFINITIONS
    target_net = target_model()

    # perturbation added onto the input image(s)
    perturb = generator(x_fake_pl)
    x_perturbed = x_fake_pl + perturb

    # real images stacked in front of perturbed ones for the discriminator
    disc_batch_x = tf.concat([x_real_pl, x_perturbed], axis=0)

    d_out_logits, d_out_probs = discriminator(disc_batch_x)
    d_perturb_logits, d_perturb_probs = discriminator(x_perturbed)
    f_out_logits, f_out_probs = target_net.ModelC(x_perturbed)

    # -----------------------------------------------------------------------
    # LOSS DEFINITIONS
    d_loss = mse_loss(d_out_probs, d_labels_pl)
    l_adv = adv_loss(f_out_probs, t)
    l_hinge = hinge_loss(perturb, y_hinge_pl, 0.3)

    alpha = 1
    beta = 1
    g_loss = mse_loss(d_perturb_probs, d_labels_pl) + alpha * l_adv + beta * l_hinge

    # -----------------------------------------------------------------------
    # split the trainable variables by the name fragment of each sub-model
    f_vars, d_vars, g_vars = [], [], []
    for variable in tf.trainable_variables():
        if 'ModelC' in variable.name:
            f_vars.append(variable)
        if 'd_' in variable.name:
            d_vars.append(variable)
        if 'g_' in variable.name:
            g_vars.append(variable)

    d_opt = tf.train.AdamOptimizer().minimize(d_loss, var_list=d_vars)
    g_opt = tf.train.AdamOptimizer().minimize(g_loss, var_list=g_vars)

    saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    d_saver = tf.train.Saver(d_vars)

    init_op = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init_op)

    saver.restore(sess, "./weights/target_model/model.ckpt")

    half_batch = int(batch_size / 2)

    for i in range(50):
        should_log = i % 10 == 0

        # -------------------------------------------------------------------
        # discriminator step on real and generated images
        real_rows = np.random.randint(0, X.shape[0], size=half_batch)
        real_image_inp = X[real_rows, :, :, :]
        fake_rows = np.random.randint(0, X.shape[0], size=half_batch)
        fake_image_inp = X[fake_rows, :, :, :]

        # first half of the batch is real (label 1), second half generated (0)
        disc_batch_y = np.zeros([batch_size, 1])
        disc_batch_y[0:half_batch] = 1

        d_feed = {
            x_real_pl: real_image_inp,
            x_fake_pl: fake_image_inp,
            d_labels_pl: disc_batch_y,
        }
        _, dl = sess.run([d_opt, d_loss], feed_dict=d_feed)

        if should_log:
            print('discriminator loss: ' + str(dl))

        # -------------------------------------------------------------------
        # generator trained 5x per round, using the discriminator,
        # adversarial and hinge losses
        for _ in range(5):
            random_samples = np.random.randint(0, X.shape[0], size=int(batch_size))
            g_feed = {
                x_fake_pl: X[random_samples, ...],
                d_labels_pl: np.ones([batch_size, 1]),
                y_hinge_pl: np.zeros((batch_size, 28, 28, 1)),
                t: y[random_samples],
            }
            _, gl = sess.run([g_opt, g_loss], feed_dict=g_feed)

        if should_log:
            print('generator loss: ' + str(gl))

    g_saver.save(sess, "weights/generator/gen.ckpt")
    d_saver.save(sess, "weights/discriminator/disc.ckpt")
def perturb_advgan(x, y, target=-1, batch_size=32, output_dir='.'):
    """Compute generator perturbations for every sample in ``x``.

    Restores the target model and the generator from
    ``<output_dir>/target_model/`` and ``<output_dir>/generator/``, runs the
    data through the generator batch by batch, and prints the target model's
    mean accuracy on the perturbed samples.

    Args:
        x: samples; the last axis sizes the input placeholder.
        y: labels; the last axis is the number of classes.
        target: class index for a targeted attack. With the default -1 the
            accuracy is measured against the labels in ``y``; otherwise it is
            measured against ``target`` for every sample.
        batch_size: samples per batch; the number of batches is rounded up.
        output_dir: directory holding the two checkpoint sub-directories.

    Returns:
        The per-batch perturbations stacked row-wise and then transposed.
    """
    x_pl = tf.placeholder(tf.float32, [None, x.shape[-1]])
    y_pl = tf.placeholder(tf.float32, [None, y.shape[-1]])
    is_training = tf.placeholder(tf.bool, [])
    is_training_target = tf.placeholder(tf.bool, [])

    is_targeted = target != -1

    # generate perturbation, add to original, clip to valid expression level
    p, logit_perturb = generator.generator(x_pl, is_training)
    x_perturbed = p + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # instantiate target model, create graphs for original and perturbed data
    f = target_model(n_input=x.shape[-1], n_classes=y.shape[-1])
    f_real_logits, f_real_probs = f.Model(x_pl, is_training_target)
    f_fake_logits, f_fake_probs = f.Model(x_perturbed, is_training_target)

    # get variables
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'Model_A' in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='generator')

    sess = tf.Session()

    # load checkpoints
    f_saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    f_saver.restore(
        sess, tf.train.latest_checkpoint('%s/target_model/' % (output_dir)))
    g_saver.restore(
        sess, tf.train.latest_checkpoint('%s/generator/' % (output_dir)))

    # calculate accuracy of target model on perturbed data
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1),
                                  tf.argmax(y_pl, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # generate perturbed samples from original samples
    n_batches = math.ceil(len(x) / batch_size)
    scores = []
    perturbations = []

    for i in range(n_batches):
        batch_x, batch_y = utils.next_batch(x, y, batch_size, i)

        if is_targeted:
            targets = np.full((batch_y.shape[0], ), target)
            # size the one-hot matrix from the numpy labels, not from the
            # placeholder's TF dimension object
            batch_y_pert = np.eye(y.shape[-1])[targets]
        else:
            # untargeted: score against the true labels (previously this
            # branch left batch_y_pert undefined and raised NameError)
            batch_y_pert = batch_y

        score, _, batch_x_pert, batch_p = sess.run(
            [accuracy, f_fake_probs, x_perturbed, p],
            feed_dict={
                x_pl: batch_x,
                y_pl: batch_y_pert,
                is_training: False,
                is_training_target: False
            })
        scores.append(score)
        perturbations.append(batch_p)

    print('perturbation accuracy: %0.3f' % (sum(scores) / len(scores)))

    # return matrix of perturbations, transposed
    return np.vstack(perturbations).T
def attack(x_train, y_train, target=-1, batch_size=64):
    """Perturb the training set with the restored generator and save the result.

    Restores the target model and the generator from their checkpoint
    directories, perturbs the data batch by batch, prints one sample
    (original, perturbation, perturbed) from the last batch, saves all
    perturbed samples to ``perturbed_<target>.npy`` and prints the target
    model's mean accuracy on them.

    Args:
        x_train: samples; the last axis sizes the input placeholder.
        y_train: labels; the last axis is the number of classes.
        target: class index for a targeted attack. With the default -1 the
            accuracy is measured against the true labels; otherwise it is
            measured against ``target`` for every sample.
        batch_size: samples per batch. Samples that do not fill a whole batch
            are skipped, and at least one full batch is required.

    Relies on a module-level ``classes`` sequence to print the class name.
    """
    x_pl = tf.placeholder(tf.float32, [None, x_train.shape[-1]])
    y_pl = tf.placeholder(tf.float32, [None, y_train.shape[-1]])
    is_training = tf.placeholder(tf.bool, [])
    is_training_target = tf.placeholder(tf.bool, [])

    is_targeted = target != -1

    # generate perturbation, add to original, clip to valid expression level
    perturb, logit_perturb = generator.generator(x_pl, is_training)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # instantiate target model, create graphs for original and perturbed data
    f = target_model(n_input=x_train.shape[-1], n_classes=y_train.shape[-1])
    f_real_logits, f_real_probs = f.Model(x_pl, is_training_target)
    f_fake_logits, f_fake_probs = f.Model(x_perturbed, is_training_target)

    # get variables
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if "Model_A" in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")

    sess = tf.Session()

    # load checkpoints
    f_saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    f_saver.restore(sess, tf.train.latest_checkpoint("./weights/target_model/Model_A/"))
    g_saver.restore(sess, tf.train.latest_checkpoint("./weights/generator/"))

    # calculate accuracy of target model on perturbed data
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1), tf.argmax(y_pl, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

    scores = []
    x_pert = []
    n_batches = int(len(x_train) / batch_size)
    for i in range(n_batches):
        batch_x, batch_y_og = utils.next_batch(x_train, y_train, batch_size, i)

        if is_targeted:
            targets = np.full((batch_y_og.shape[0],), target)
            # size the one-hot matrix from the numpy labels, not from the
            # placeholder's TF dimension object
            batch_y = np.eye(y_train.shape[-1])[targets]
        else:
            # untargeted: score against the true labels (previously this
            # branch left batch_y undefined and raised NameError)
            batch_y = batch_y_og

        score, fake_l, x_p, p = sess.run(
            [accuracy, f_fake_probs, x_perturbed, perturb],
            feed_dict={
                x_pl: batch_x,
                y_pl: batch_y,
                is_training: False,
                is_training_target: False
            })
        scores.append(score)
        x_pert.append(x_p)

    # print a sample original, perturbation, and original + perturbation
    # (first row of the last batch processed)
    np.set_printoptions(precision=4, suppress=True)
    print("original class is: %s" % (classes[np.argmax(batch_y_og, axis=1)[0]]))
    print(batch_x[0])
    print(p[0])
    print(x_p[0])

    np.save("perturbed_%s.npy" % (target), np.vstack(x_pert))

    print("test accuracy: %0.3f" % (sum(scores) / len(scores)))
def attack_source_target(x, y, classes, source, target, target_mu):
    """Perturb all samples of one source class and save a single example.

    Selects the rows of ``x``/``y`` whose argmax label equals ``source``,
    runs them through the restored generator in one call, prints the first
    sample before and after perturbation together with ``target_mu``, and
    saves those four vectors to ``<source>_to_<target>.npy``.

    Args:
        x: samples; the last axis sizes the input placeholder.
        y: labels; argmax over axis 1 is compared with ``source``.
        classes: sequence of class names indexed by class id.
        source: class id whose samples are perturbed. At least one sample of
            this class must be present, since row 0 of the selection is used.
        target: class id the attack aims at; -1 is treated as untargeted.
        target_mu: vector stacked as the last row of the saved array.
    """
    source_indices = np.where(np.argmax(y, axis=1) == source)
    x_source = x[source_indices]
    y_source = y[source_indices]

    x_pl = tf.placeholder(tf.float32, [None, x_source.shape[-1]])
    y_pl = tf.placeholder(tf.float32, [None, y_source.shape[-1]])
    is_training = tf.placeholder(tf.bool, [])
    is_training_target = tf.placeholder(tf.bool, [])

    is_targeted = target != -1

    # generate perturbation, add to original, clip to valid expression level
    perturb, logit_perturb = generator.generator(x_pl, is_training)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # instantiate target model, create graphs for original and perturbed data
    f = target_model(n_input=x.shape[-1], n_classes=y.shape[-1])
    f_real_logits, f_real_probs = f.Model(x_pl, is_training_target)
    f_fake_logits, f_fake_probs = f.Model(x_perturbed, is_training_target)

    # get variables
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if "Model_A" in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")

    sess = tf.Session()

    # load checkpoints
    f_saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    f_saver.restore(sess, tf.train.latest_checkpoint("./weights/target_model/Model_A/"))
    g_saver.restore(sess, tf.train.latest_checkpoint("./weights/generator/"))

    if is_targeted:
        targets = np.full((y_source.shape[0],), target)
        # size the one-hot matrix from the numpy labels, not from the
        # placeholder's TF dimension object
        batch_y = np.eye(y_source.shape[-1])[targets]
    else:
        # untargeted: feed the true labels (previously this branch left
        # batch_y undefined and raised NameError)
        batch_y = y_source

    x_pert, p = sess.run(
        [x_perturbed, perturb],
        feed_dict={
            x_pl: x_source,
            y_pl: batch_y,
            is_training: False,
            is_training_target: False
        })

    print("source class is: %s" % (classes[source]))
    print("X:")
    print(x_source[0])
    print("P:")
    print(p[0])
    print("X_adv:")
    print(x_pert[0])
    print("target_mu:")
    print(target_mu)

    # save the results in X, P, X_adv, target_mu order
    results = np.vstack([x_source[0], p[0], x_pert[0], target_mu])

    source_class = cleanse_label(classes[source])
    # NOTE(review): with target == -1 this indexes classes[-1], so the file
    # name carries the last class -- confirm whether untargeted use is intended
    target_class = cleanse_label(classes[target])
    np.save("%s_to_%s.npy" % (source_class, target_class), results)