def save_adv_examples(sess, model, data_iter, save_folder,
                      fgm_settings={np.inf: [0.1]}):
    '''fgm_settings maps an FGM norm order (e.g. np.inf, 2) to the list of
    epsilon values to generate adversarial examples for.'''
    examples_folder = os.path.join(save_folder, 'adv_examples')
    if not os.path.isdir(examples_folder):
        os.makedirs(examples_folder)
    # Untargeted FGM sweep over every (norm, eps) setting.
    for norm in fgm_settings:
        for eps in fgm_settings[norm]:
            data_iter.reset()
            adv_examples, labels, _ = gen_adv_examples(
                sess, model, data_iter,
                fgm(model.input, model.output, eps=eps, ord=norm))
            _save_adv_examples(examples_folder, adv_examples, labels, eps,
                               norm)
    # Targeted FGM sweep over the same settings.
    for norm in fgm_settings:
        for eps in fgm_settings[norm]:
            data_iter.reset()
            adv_examples, labels, targets = gen_adv_examples(
                sess, model, data_iter,
                fgm_target(model.input, model.output, model.label,
                           eps=eps, ord=norm))
            _save_adv_examples(examples_folder, adv_examples, labels, eps,
                               norm, targets)
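# Hypothetical usage sketch for save_adv_examples; `sess`, `model`, and
# `data_iter` are assumed to be constructed elsewhere in this repo, and the
# output folder name is illustrative. The settings dict maps a norm order to
# the epsilons to sweep:
#
# import numpy as np
# fgm_settings = {
#     np.inf: [0.05, 0.1, 0.2],  # L-infinity FGM at several budgets
#     2: [0.5, 1.0],             # L2 FGM
# }
# save_adv_examples(sess, model, data_iter, 'runs/demo', fgm_settings)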
def __init__(self, sess, resnet):
    '''
    images: placeholder for raw input images (pixel values in [0, 255]);
        feed your images into this variable in most cases. For the CW
        attack, images are rescaled to [-0.5, 0.5] internally.
    '''
    self.sess = sess
    self.images = tf.placeholder(
        tf.float32,
        (None, FLAGS.image_size, FLAGS.image_size, FLAGS.channels),
        name="images")
    # Combined attack for feature squeezing: attack the network behind the
    # median-filter preprocessing.
    if FLAGS.feature_squeeze:
        smoothed_images = median_filtering_2x2(self.images,
                                               dataset=FLAGS.dataset)
    self.labels = tf.placeholder(tf.int64, (None,), name="labels")
    self.resnet = resnet
    self.logits = resnet(self.images) if not FLAGS.feature_squeeze \
        else resnet(smoothed_images)
    self.softmax = tf.nn.softmax(self.logits)
    self.eps = tf.placeholder(tf.float32, (), name="fgsm_eps")

    self.cw_model = types.SimpleNamespace()
    self.cw_model.image_size = FLAGS.image_size
    self.cw_model.num_channels = FLAGS.channels
    self.cw_model.predict = self.resnet
    self.cw_model.num_labels = 10

    labels_onehot = tf.one_hot(self.labels, depth=self.cw_model.num_labels)
    self.adv_image = fgm(self.images, self.softmax, y=labels_onehot,
                         eps=self.eps, clip_min=0.0, clip_max=255.0)

    saver = tf.train.Saver()
    save_path, save_path_ckpt = get_weights_path()
    try:
        ckpt_state = tf.train.get_checkpoint_state(save_path)
    except tf.errors.OutOfRangeError as e:
        raise AssertionError('Cannot restore checkpoint: %s' % e)
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
        raise FileNotFoundError('No model to eval yet at %s' % save_path)
    tf.logging.info('Loading checkpoint %s',
                    ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Gradient of a single logit (selected by self.index) w.r.t. the input.
    self.index = tf.placeholder(tf.int32, (), name="index")
    self.grad = tf.gradients(self.logits[:, self.index], [self.images],
                             name="grad")[0]
    # Images in the CW attack are rescaled to [-0.5, 0.5].
    self.cw_attacker = CarliniLi(self.sess, self.cw_model, targeted=False)
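# Minimal usage sketch for the wrapper above; `Model` is a stand-in name for
# the class (not shown in this snippet), and `batch_images` / `batch_labels`
# are assumed numpy arrays matching the placeholders:
#
# model = Model(sess, resnet)
# adv_batch = sess.run(model.adv_image, feed_dict={
#     model.images: batch_images,  # raw pixels in [0, 255]
#     model.labels: batch_labels,
#     model.eps: 8.0,              # FGSM step size in pixel units
# })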
def fgs_eval(sess, model, data_iter, fgm_eps, norm=np.inf, logger=None):
    '''Returns (untargeted_fgs_acc, targeted_fgs_acc,
    targeted_atk_success_rate).'''
    untarget_num_correct = 0.0
    target_num_correct = 0.0
    target_atk_success = 0.0
    total_count = 0
    iter_ = tqdm(data_iter)
    fgm_attack = fgm(model.input, model.output, eps=fgm_eps, ord=norm)
    targeted_fgm_attack = fgm_target(model.input, model.output, model.label,
                                     eps=fgm_eps, ord=norm)
    for batch in iter_:
        target_labels = permute_labels(batch["label"])
        # Untargeted perturbation with cleverhans FGM.
        perturbed_imgs_fgm = sess.run(fgm_attack,
                                      {model.input: batch["img"]})
        # Targeted perturbation towards the permuted labels.
        targeted_fgm_imgs = sess.run(targeted_fgm_attack, {
            model.input: batch["img"],
            model.label: target_labels
        })
        y_untarget = sess.run(model.output,
                              {model.input: perturbed_imgs_fgm})
        y_targeted = sess.run(model.output,
                              {model.input: targeted_fgm_imgs})
        untarget_pred_label = np.argmax(y_untarget, axis=1)
        target_pred_label = np.argmax(y_targeted, axis=1)
        untarget_num_correct += np.sum(
            np.equal(untarget_pred_label, batch["label"]).astype(float))
        target_num_correct += np.sum(
            np.equal(target_pred_label, batch["label"]).astype(float))
        target_atk_success += np.sum(
            np.equal(target_pred_label, target_labels).astype(float))
        total_count += untarget_pred_label.size
    untargeted_pred_acc = untarget_num_correct / total_count
    targeted_pred_acc = target_num_correct / total_count
    targeted_success_rate = target_atk_success / total_count
    if logger is not None:
        logger.log_adv_stats(norm, fgm_eps, untargeted_pred_acc,
                             targeted_pred_acc, targeted_success_rate)
    return (untargeted_pred_acc, targeted_pred_acc, targeted_success_rate)
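# `permute_labels` is used above but not defined in this snippet. A minimal
# sketch of one common choice (shift each label by one class, so the target
# is guaranteed to differ from the truth); the class count is an assumption:
import numpy as np

def permute_labels(labels, num_classes=10):
    """Return target labels guaranteed to differ from the true labels."""
    return (np.asarray(labels) + 1) % num_classes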
def __init__(self, sess, model_input, model_result, global_threshold, ord):
    self.sess = sess
    # Turn the scalar model output into synthetic two-class logits:
    # class 0 scores "below the global threshold", class 1 "above" it.
    self.model_result_2classes = tf.concat(
        (global_threshold - model_result, model_result - global_threshold),
        axis=1)
    self.model_input = model_input
    self.eps = tf.placeholder(dtype=tf.float32)
    self.labels = tf.placeholder(dtype=tf.float32)
    self.adv = fgm(self.model_input,
                   self.model_result_2classes,
                   self.labels,
                   eps=self.eps,
                   clip_min=0,
                   clip_max=255,
                   ord=ord,
                   targeted=True)
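# Hedged usage sketch for the wrapper above (`attacker` stands in for an
# instance of this class, `batch` for an input array): to push the scalar
# model output below the global threshold, target class 0 of the synthetic
# two-class logits.
#
# target = np.zeros((batch.shape[0], 2), dtype=np.float32)
# target[:, 0] = 1.0  # class 0 means "below threshold"
# adv_batch = attacker.sess.run(attacker.adv, feed_dict={
#     attacker.model_input: batch,
#     attacker.labels: target,
#     attacker.eps: 4.0,
# })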
def test_fgm_gradient_max(self):
    input_dim = 2
    nb_classes = 3
    batch_size = 4
    rng = np.random.RandomState([2017, 8, 23])
    x = tf.placeholder(tf.float32, [batch_size, input_dim])
    weights = tf.placeholder(tf.float32, [input_dim, nb_classes])
    logits = tf.matmul(x, weights)
    probs = tf.nn.softmax(logits)
    adv_x = fgm(x, probs)
    random_example = rng.randint(batch_size)
    random_feature = rng.randint(input_dim)
    output = tf.slice(adv_x, [random_example, random_feature], [1, 1])
    (dx,) = tf.gradients(output, x)
    # The following line catches GitHub issue #243
    self.assertIsNotNone(dx)
    dx = self.sess.run(dx, feed_dict=random_feed_dict(rng, [x, weights]))
    ground_truth = np.zeros((batch_size, input_dim))
    ground_truth[random_example, random_feature] = 1.0
    self.assertClose(dx, ground_truth)
def test_fgm_gradient_max():
    input_dim = 2
    num_classes = 3
    batch_size = 4
    loss_type = KEYWORDS.CE
    rng = np.random.RandomState([2017, 8, 23])
    x = tf.placeholder(tf.float32, [batch_size, input_dim])
    weights = tf.placeholder(tf.float32, [input_dim, num_classes])
    logits = tf.matmul(x, weights)
    probs = tf.nn.softmax(logits)
    adv_x = fgm(x, probs, loss_type=loss_type)
    random_example = rng.randint(batch_size)
    random_feature = rng.randint(input_dim)
    output = tf.slice(adv_x, [random_example, random_feature], [1, 1])
    dx, = tf.gradients(output, x)
    # The following line catches GitHub issue #243
    assert dx is not None
    sess = tf.Session()
    dx = sess.run(dx, feed_dict=random_feed_dict(rng, [x, weights]))
    ground_truth = np.zeros((batch_size, input_dim))
    ground_truth[random_example, random_feature] = 1.
    assert np.allclose(dx, ground_truth), (dx, ground_truth)
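# Both tests above rely on `random_feed_dict`, which is not shown in this
# snippet. A plausible sketch, assuming it fills each placeholder with
# standard-normal values of the placeholder's (fully defined) shape:
def random_feed_dict(rng, placeholders):
    return {p: rng.randn(*p.get_shape().as_list()).astype('float32')
            for p in placeholders}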
        [mc_preds_tensor, entropy_mean_tensor, bald_tensor])

    # Plot entropy and mutual information (BALD) as eps increases.
    entropies = []
    balds = []
    accs = []
    preds_tensor = K.mean(mc_preds_tensor, axis=0)
    for i, ep in enumerate(eps):
        print("iteration", i, "of", len(eps), "epsilon", ep)
        sys.stdout.flush()
        adv_tensor = fgm(x, preds_tensor, eps=ep, clip_min=0, clip_max=1,
                         ord=norm)
        b_entropies = []
        b_balds = []
        b_accs = []
        batches = U.batches_generator(tst, tsty, batch_size=500)
        for j, (bx, by) in enumerate(batches):
            print(' batch', j)
            sys.stdout.flush()  # in case we are writing to a log file, not stdout
            adv = adv_tensor.eval(session=K.get_session(),
                                  feed_dict={x: bx})
            mc_samples, e_adv, b_adv = get_output([adv])
            b_entropies.append(e_adv.mean())  # mean across the batch
            b_balds.append(b_adv.mean())      # ditto
            preds = mc_samples.mean(axis=0)   # mean across the MC samples per point
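# For reference, a NumPy sketch of how predictive entropy and BALD (mutual
# information) are conventionally computed from MC-dropout samples of shape
# (n_mc, batch, classes); the graph tensors above are assumed to implement
# the same quantities:
import numpy as np

def entropy_and_bald(mc_preds, eps=1e-10):
    mean_p = mc_preds.mean(axis=0)                               # (batch, classes)
    pred_entropy = -(mean_p * np.log(mean_p + eps)).sum(axis=1)  # H[E[p]]
    mean_entropy = -(mc_preds * np.log(mc_preds + eps)).sum(axis=2).mean(axis=0)  # E[H[p]]
    return pred_entropy, pred_entropy - mean_entropy             # entropy, BALD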
def evaluate(hps, data_X, data_y, eval_once=True):
    """Eval loop."""
    images = tf.placeholder(tf.float32,
                            shape=(None, args.image_size, args.image_size,
                                   args.channels))
    labels_onehot = tf.placeholder(tf.int32,
                                   shape=(None, args.num_classes))
    labels = tf.argmax(labels_onehot, axis=1)
    if args.classifier == "madry":
        net = tf.make_template('net', madry_template)
        logits = net(images, training=False)
    elif args.classifier == 'aditi':
        net = tf.make_template('net', aditi_template)
        logits = net(images, training=False)
    elif args.classifier == 'zico':
        net = tf.make_template('net', zico_template)
        logits = net(images, training=False)
    else:
        net = tf.make_template('net', resnet_template, hps=hps) \
            if args.classifier == 'resnet' else \
            tf.make_template('net', vgg_template, hps=hps)
        logits = net(images, training=False)
    pred = tf.argmax(logits, axis=1)
    probs = tf.nn.softmax(logits)
    cost = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                   labels=labels_onehot)
    # One FGM step of size eps/10; iterated with projection below, this
    # amounts to an L-infinity PGD attack.
    adv_image = fgm(images, tf.nn.softmax(logits), y=labels_onehot,
                    eps=args.eps / 10, clip_min=0.0, clip_max=1.0)
    top_5 = tf.nn.in_top_k(predictions=logits, targets=labels, k=5)

    saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='net'))
    if args.classifier == 'madry' and not args.trained:
        # Strip the "net/" prefix and ":0" suffix from variable names.
        saver = tf.train.Saver({
            v.name[4:-2]: v
            for v in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope="net")
        })
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    best_precision = 0.0
    save_path, save_path_ckpt = get_weights_path(args)
    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(save_path)
        except tf.errors.OutOfRangeError as e:
            print('[!] Cannot restore checkpoint: %s' % e)
            break
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            print('[!] No model to eval yet at %s' % save_path)
            break
        print('[*] Loading checkpoint %s' % ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        total_prediction, correct_prediction = 0, 0
        adv_prediction = 0
        total_loss = 0
        all_preds = []
        batch_size = args.batch_size
        num_batch = len(data_X) // batch_size
        bad_images = []
        bad_labels = []
        confidences = []
        adv_images = []
        cls_preds = []
        true_labels = []
        for batch in range(num_batch):
            x = data_X[batch * batch_size:(batch + 1) * batch_size]
            x = x.astype(np.float32)
            y = data_y[batch * batch_size:(batch + 1) * batch_size]
            y = y.astype(np.int32)
            if not args.top5:
                (loss, predictions, conf) = sess.run(
                    [cost, pred, probs],
                    feed_dict={images: x, labels_onehot: y})
                all_preds.extend(predictions)
                confidences.extend(conf[np.arange(conf.shape[0]),
                                        predictions])
                # 100-step PGD: repeat the FGM step, then project back into
                # the eps-ball around x and the valid pixel range.
                img_np = np.copy(x)
                for i in range(100):
                    img_np = sess.run(adv_image,
                                      feed_dict={images: img_np,
                                                 labels_onehot: y})
                    img_np = np.clip(img_np, x - args.eps, x + args.eps)
                    img_np = np.clip(img_np, 0.0, 1.0)
                adv_images.extend(img_np)
                adv_pred_np = pred.eval(session=sess,
                                        feed_dict={images: img_np,
                                                   labels_onehot: y})
                cls_preds.extend(adv_pred_np)
                true_labels.extend(np.argmax(y, axis=1))
            else:
                (loss, in_top5) = sess.run(
                    [cost, top_5],
                    feed_dict={images: x, labels_onehot: y})
            total_loss += np.sum(loss)
            y = np.argmax(y, axis=1)
            if not args.top5:
                correct_prediction += np.sum(y == predictions)
                bad_images.extend(x[y != predictions])
                bad_labels.extend(predictions[y != predictions])
                adv_prediction += np.sum(y == adv_pred_np)
            else:
                correct_prediction += np.sum(in_top5)
            total_prediction += loss.shape[0]

        precision = 1.0 * correct_prediction / total_prediction
        loss = 1.0 * total_loss / total_prediction
        best_precision = max(precision, best_precision)
        average_conf = np.mean(np.asarray(confidences))
        adv_images = np.asarray(adv_images)
        cls_preds = np.asarray(cls_preds)
        true_labels = np.asarray(true_labels)
        if not args.top5:
            print('[*] loss: %.6f, precision: %.6f, PGD precision: %.6f, '
                  'Confidence: %.6f' %
                  (loss, precision, adv_prediction / total_prediction,
                   average_conf))
            folder_format = '/atlas/u/yangsong/generative_adversary/{}_{}_pgd/'
            np.savez(os.path.join(
                check_folder(folder_format.format(args.dataset,
                                                  args.classifier)),
                'eps_{:.3f}.npz'.format(args.eps)),
                     adv_images=adv_images,
                     cls_preds=cls_preds,
                     true_labels=true_labels)
        else:
            print('[*] loss: %.6f, top 5 accuracy: %.6f, '
                  'best top 5 accuracy: %.6f' %
                  (loss, precision, best_precision))
        bad_images = np.asarray(bad_images)
        bad_images = (255. * bad_images).astype(np.uint8)
        bad_labels = np.asarray(bad_labels).astype(np.uint8)
        if len(bad_images) > 10:
            bad_images = bad_images[:10]
            bad_labels = bad_labels[:10]
        bad_images = np.reshape(bad_images,
                                (len(bad_images) * args.image_size,
                                 args.image_size, args.channels))
        bad_images = np.squeeze(bad_images)
        imsave(os.path.join(check_folder('tmp'), 'bad_images.png'),
               bad_images)
        print("bad_labels:\n{}".format(bad_labels))
        if eval_once:
            break
        time.sleep(60)
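# The attack loop in `evaluate` above is an L-infinity PGD, distilled here
# for reference (with `x`, `y` the clean batch and `sess` / `adv_image` as
# defined above; each iteration is one FGM step of size eps/10 followed by
# projection):
#
# img_np = np.copy(x)
# for _ in range(100):
#     img_np = sess.run(adv_image, feed_dict={images: img_np,
#                                             labels_onehot: y})
#     img_np = np.clip(img_np, x - args.eps, x + args.eps)  # eps-ball projection
#     img_np = np.clip(img_np, 0.0, 1.0)                    # valid pixel range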
    ds_test, _ = get_data('test', args)

    # Untargeted BIM
    from tensorpack.dataflow import dataset
    pp_mean = dataset.Cifar10('train').get_per_pixel_mean()

    stepsize_ph = tf.placeholder(tf.float32, [])
    orig_input_ph = tf.placeholder(tf.float32,
                                   image_ph.get_shape().as_list())
    # Label-leaking alternative:
    # adv_inp = fgm(image_ph, probs, y=tf.one_hot(label_ph, depth=10),
    #               eps=tf.to_float(1))
    pp_mean_sym = tf.tile(tf.constant(pp_mean[None]),
                          [tf.shape(image_ph)[0], 1, 1, 1])
    adv_inp = fgm(
        image_ph,
        probs,
        # y=tf.one_hot(label_ph, depth=10)
        eps=tf.to_float(stepsize_ph))
    # Clip to the valid (mean-subtracted) pixel range, then to the
    # step-size ball around the original inputs.
    adv_inp = tf.clip_by_value(adv_inp, -pp_mean_sym, 255 - pp_mean_sym)
    adv_inp = tf.clip_by_value(adv_inp, orig_input_ph - stepsize_ph,
                               orig_input_ph + stepsize_ph)

    for EPSILON in [0, 1, 2, 4, 8, 16]:
        ds_test.reset_state()
        crts, ents = [], []
        with tqdm(total=10000) as pbar:
            for i, (img, lbl) in tqdm(enumerate(ds_test.get_data())):
                if img.shape[0] != 128:
                    break
                fd = {
                    image_ph: img,
                    label_ph: lbl,
def train(hps, data):
    """Training loop."""
    images = tf.placeholder(tf.float32,
                            shape=(None, FLAGS.image_size, FLAGS.image_size,
                                   FLAGS.channels),
                            name="images")
    labels = tf.placeholder(tf.int64, shape=(None), name="labels")
    labels_onehot = tf.one_hot(labels, depth=hps.num_classes,
                               dtype=tf.float32, name="labels_onehot")
    if FLAGS.label_smooth:
        labels_onehot = label_smooth(labels_onehot)
    lrn_rate = tf.placeholder(tf.float32, shape=(), name="lrn_rate")
    tf.logging.info(json.dumps(vars(FLAGS)))
    tf.logging.info(json.dumps(hps._asdict()))

    flipped_images = random_flip_left_right(images)
    net = tf.make_template('net', resnet_template, hps=hps) \
        if FLAGS.model == 'resnet' else \
        tf.make_template('net', vgg_template, hps=hps)
    truth = labels
    if FLAGS.adversarial or FLAGS.adversarial_BIM:
        logits = net(flipped_images, training=False)
    else:
        logits = net(flipped_images, training=True)
    probs = tf.nn.softmax(logits)
    predictions = tf.argmax(logits, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                labels=labels_onehot))
    weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope='net')
    weight_norm = tf.add_n([tf.nn.l2_loss(v) for v in weights])
    cost = cost + 0.0005 * weight_norm
    with tf.control_dependencies(
            tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        train_op = tf.train.MomentumOptimizer(
            learning_rate=lrn_rate, momentum=0.9).minimize(cost)

    if FLAGS.adversarial or FLAGS.adversarial_BIM:
        # Per-example random eps for FGSM adversarial training.
        eps = tf.abs(tf.truncated_normal(shape=(tf.shape(images)[0],),
                                         mean=0, stddev=FLAGS.adv_std))
        eps = eps[:, None, None, None]
        adv_x = fgsm(flipped_images, probs, eps=eps,
                     clip_min=0.0, clip_max=255.0)
        # Label-leaking variant for BIM: feed the true labels to fgm.
        adv_x_leak = fgm(flipped_images, probs, y=labels_onehot,
                         eps=np.asarray([1])[:, None, None, None],
                         clip_min=0.0, clip_max=255.0)
        adv_logits = net(adv_x, training=False)
        adv_pred = tf.argmax(adv_logits, axis=1)
        adv_precision = tf.reduce_mean(
            tf.to_float(tf.equal(adv_pred, truth)))
        adv_logits_leak = net(adv_x_leak, training=False)
        adv_pred_leak = tf.argmax(adv_logits_leak, axis=1)
        adv_precision_leak = tf.reduce_mean(
            tf.to_float(tf.equal(adv_pred_leak, truth)))

        # Train on half clean, half adversarial examples.
        num_normal = hps.batch_size // 2
        combined_images = tf.concat(
            [flipped_images[:num_normal], images[num_normal:]], axis=0)
        com_logits = net(combined_images, training=True)
        normal_cost = 2.0 / 1.3 * tf.nn.softmax_cross_entropy_with_logits(
            logits=com_logits[:num_normal],
            labels=labels_onehot[:num_normal])
        adv_cost = 0.6 / 1.3 * tf.nn.softmax_cross_entropy_with_logits(
            logits=com_logits[num_normal:],
            labels=labels_onehot[num_normal:])
        combined_cost = tf.reduce_mean(
            tf.concat([normal_cost, adv_cost],
                      axis=0)) + 0.0005 * weight_norm
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            train_op_adv = tf.train.MomentumOptimizer(
                learning_rate=lrn_rate,
                momentum=0.9).minimize(combined_cost)

    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        tf.global_variables_initializer().run()
        saver = tf.train.Saver(max_to_keep=3)
        save_path, save_path_ckpt = get_weights_path()
        state = tf.train.get_checkpoint_state(save_path)
        if state and state.model_checkpoint_path:
            ans = verify("Warning: model already trained. "
                         "Delete files and re-train? (y/n)")
            if ans:
                shutil.rmtree(save_path)
                os.makedirs(save_path)
            else:
                saver_state = tf.train.get_checkpoint_state(save_path)
                saver.restore(sess, saver_state.model_checkpoint_path)
                # raise FileExistsError("Model weight already exists")
        else:
            os.makedirs(save_path, exist_ok=True)
        hps_path = os.path.join(save_path, 'hps.txt')
        with open(hps_path, 'w') as fout:
            fout.write(json.dumps(vars(FLAGS)))
            fout.write(json.dumps(hps._asdict()))

        for iter in range(FLAGS.maxiter):
            try:
                x, y = data.next(hps.batch_size)
            except StopIteration:
                tf.logging.info("New epoch!")
            # Piecewise-constant learning-rate schedule.
            if iter < 40000:
                lr = 0.1
            elif iter < 60000:
                lr = 0.01
            elif iter < 80000:
                lr = 0.001
            else:
                lr = 0.0001
            if not FLAGS.adversarial and not FLAGS.adversarial_BIM:
                _, acc = sess.run([train_op, precision],
                                  feed_dict={images: x, labels: y,
                                             lrn_rate: lr})
                tf.logging.info("Iter: {}, Precision: {:.6f}".format(
                    iter + 1, acc))
            elif FLAGS.adversarial:
                adv_images, acc, acc_adv = sess.run(
                    [adv_x, precision, adv_precision],
                    feed_dict={images: x, labels: y})
                combined_batch = np.concatenate(
                    [x[:num_normal], adv_images[num_normal:]], axis=0)
                _, com_loss = sess.run([train_op_adv, combined_cost],
                                       feed_dict={images: combined_batch,
                                                  labels: y, lrn_rate: lr})
                tf.logging.info(
                    "Iter: {}, Precision: {:.6f}, Adv precision: {:.6f}, "
                    "Combined loss: {:.6f}".format(iter + 1, acc, acc_adv,
                                                   com_loss))
            elif FLAGS.adversarial_BIM:
                # Iteration count follows the BIM heuristic
                # min(eps + 4, 1.25 * eps).
                BIM_eps = np.abs(truncnorm.rvs(a=-2., b=2.) * FLAGS.adv_std)
                attack_iter = int(min(BIM_eps + 4, 1.25 * BIM_eps))
                adv_images = np.copy(x)
                for i in range(attack_iter):
                    adv_images, acc, acc_adv = sess.run(
                        [adv_x_leak, precision, adv_precision_leak],
                        feed_dict={images: adv_images, labels: y})
                combined_batch = np.concatenate(
                    [x[:num_normal], adv_images[num_normal:]], axis=0)
                _, com_loss = sess.run([train_op_adv, combined_cost],
                                       feed_dict={images: combined_batch,
                                                  labels: y, lrn_rate: lr})
                tf.logging.info(
                    "Iter: {}, Precision: {:.6f}, Adv precision: {:.6f}, "
                    "Combined loss: {:.6f}".format(iter + 1, acc, acc_adv,
                                                   com_loss))
            if (iter + 1) % 5000 == 0:
                saver.save(sess, save_path_ckpt, global_step=iter + 1)
                tf.logging.info("Model saved! Path: " + save_path)