def _get_pert(self, X, Y, eps: float, model):
    """Return PGD perturbations (adversarial minus clean) for a batch.

    Builds fresh input/label placeholders, wraps `model` for CleverHans,
    runs a ProjectedGradientDescent attack, and evaluates the difference
    `adv_x - x` in `self.sess`.

    Args:
        X: batch of clean inputs, shape (batch,) + self.n_features.
        Y: one-hot labels, shape (batch, self.n_classes).
        eps: attack budget; for small eps (< 0.05) the per-step size is
            pinned to eps as well so the step never exceeds the budget.
        model: Keras model to attack.

    Returns:
        numpy array of perturbations with the same shape as X.
    """
    x_in = tf.placeholder(tf.float32, shape=([None] + list(self.n_features)))
    y_in = tf.placeholder(tf.float32, shape=(None, self.n_classes))
    attack = ProjectedGradientDescent(KerasModelWrapper(model),
                                      ord=self.ord, sess=self.sess)
    gen_kwargs = {'y': y_in, 'eps': eps}
    # For tiny budgets the default eps_iter would overshoot eps, so
    # clamp the per-iteration step to the full budget.
    if eps < 0.05:
        gen_kwargs['eps_iter'] = eps
    x_adv = tf.stop_gradient(attack.generate(x_in, **gen_kwargs))
    delta = x_adv - x_in
    return delta.eval(feed_dict={x_in: X, y_in: Y}, session=self.sess)
class PGDAttack(AdversarialAttack):
    """Projected Gradient Descent attack wrapped in the project's attack API.

    Thin adapter around CleverHans' ProjectedGradientDescent: the attack
    graph is constructed once in ``__init__`` inside ``self.graph``, and
    ``attack_method`` emits the adversarial-example tensor for a batch of
    labels (targeted or untargeted, or label-free when ``labels`` is None).
    """

    def __init__(self, model, targeted=False, step_size_iter=0.05,
                 max_perturbation=0.3, n_iterations=10, norm_order=np.inf,
                 rand_init=None, rand_minmax=0.3, clip_min=None,
                 clip_max=None, sanity_checks=True):
        super().__init__(model=model, clip_min=clip_min, clip_max=clip_max)
        # Record every hyper-parameter on the instance so subclasses /
        # callers can inspect the configured attack.
        self._targeted = targeted
        self._step_size_iter = step_size_iter
        self._max_perturbation = max_perturbation
        self._n_iterations = n_iterations
        self._norm_order = norm_order
        self._rand_init = rand_init
        self._rand_minmax = rand_minmax
        self._sanity_checks = sanity_checks
        # Build the CleverHans attack inside the session's graph.
        with self.graph.as_default():
            self._method = ProjectedGradientDescent(
                self._model,
                sess=self.session,
                eps=self._max_perturbation,
                eps_iter=self._step_size_iter,
                nb_iter=self._n_iterations,
                ord=self._norm_order,
                rand_init=self._rand_init,
                clip_min=self._clip_min,
                clip_max=self._clip_max,
                sanity_checks=self._sanity_checks)

    def attack_method(self, labels):
        """Return the adversarial-example tensor for ``self._x_clean``.

        When ``labels`` is given they are passed as the attack target
        (``y_target``) for targeted attacks or as the true labels (``y``)
        otherwise; with no labels the attack infers labels itself.
        """
        generate_kwargs = {'rand_minmax': self._rand_minmax}
        if labels is not None:
            label_key = 'y_target' if self._targeted else 'y'
            generate_kwargs[label_key] = labels
        return self._method.generate(x=self._x_clean, **generate_kwargs)
def get_at_loss(sess, x, y, model, eps, eps_iter, iterations):
    """Build the adversarial-training cross-entropy loss.

    Constructs an L-inf PGD attack (budgets given in pixel units and
    rescaled by /255), records an image summary of the adversarial batch,
    and returns the mean softmax cross-entropy on the adversarial logits.

    Args:
        sess: TF session handed to the CleverHans attack.
        x: clean-input placeholder/tensor.
        y: one-hot label placeholder/tensor.
        model: CleverHans-compatible model exposing ``get_logits``.
        eps: total perturbation budget in [0, 255] pixel units.
        eps_iter: per-step budget in [0, 255] pixel units.
        iterations: number of PGD steps.

    Returns:
        (adv_loss, adv_logits): scalar mean loss tensor and the logits on
        the adversarial examples.
    """
    attacker = ProjectedGradientDescent(model, sess=sess)
    adv_x = attacker.generate(
        x,
        ord=np.inf,
        y=y,
        eps=eps / 255,
        eps_iter=eps_iter / 255,
        nb_iter=iterations,
        rand_init=True,
        rand_minmax=eps / 255,
        clip_min=0.,
        clip_max=1.,
        sanity_checks=True)
    adv_logits = model.get_logits(adv_x)

    # Add summary for adversarial training images.
    with tf.device('/gpu:0'), tf.name_scope('Adversarial-Image-Summaries'):
        tf.summary.image('adv-input',
                         adv_x,
                         max_outputs=2,
                         family='Adversarial-Training',
                         collections=['training'])

    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=adv_logits, labels=y)
    adv_loss = tf.reduce_mean(per_example_loss)
    return adv_loss, adv_logits
def attack_images(model, tfrecords_dirpath, attack_type='PGD', attack_kwargs=default_attack_kwargs):
    '''
    Attack images (batch = 1 for now).

    Reads images labeled `attack_kwargs['y']` from the tfrecords in
    `tfrecords_dirpath`, runs a targeted attack toward
    `attack_kwargs['y_target']`, and returns the adversarial images for
    which the attack succeeded (stacked into one array, or an empty list
    if none succeeded).

    Args:
        model: CleverHans-compatible model to attack.
        tfrecords_dirpath: directory containing the tfrecord shards.
        attack_type: 'PGD' or 'FGM'.
        attack_kwargs: dict with 'y' (true label), 'y_target' (target
            label) plus any extra kwargs forwarded to `attack.generate`.
    '''
    # BUG FIX: the default `attack_kwargs` is a shared module-level dict;
    # the original code mutated it in place (del 'y', insert 'y_target'),
    # corrupting every subsequent call. Work on a shallow copy instead.
    attack_kwargs = dict(attack_kwargs)

    # Get the true label and the attack target, then drop 'y' so only
    # generate()-compatible kwargs remain.
    true_label = attack_kwargs['y']
    attack_label = attack_kwargs['y_target']
    del attack_kwargs['y']

    # Define tfrecords input iterator (batch size fixed at 1).
    tfrecord_filepaths = glob(os.path.join(tfrecords_dirpath, '*'))
    tf_dataset = tfutils.make_dataset(tfrecord_filepaths,
                                      batch_size=1,
                                      filter_label=true_label,
                                      preprocessing_fn=preprocess_input)
    iterator = tf_dataset.make_one_shot_iterator()
    x, y = iterator.get_next()

    attacked_imgs = []
    with tf.Session() as sess:
        # Select the attack; fail loudly on unknown types instead of the
        # NameError the original would raise further down.
        if attack_type == "PGD":
            attack = ProjectedGradientDescent(model, sess=sess)
        elif attack_type == "FGM":
            attack = FastGradientMethod(model, sess=sess)
        else:
            raise ValueError('Unsupported attack_type: %s' % attack_type)

        target_one_hot_encoded = get_one_hot_encoded_targets(attack_label)
        attack_kwargs['y_target'] = target_one_hot_encoded

        # Build the attack graph, then drain the dataset, keeping only
        # images the attack successfully pushed to the target class.
        x_adv = attack.generate(x, **attack_kwargs)
        pbar = tqdm(unit='imgs')
        try:
            while True:
                attacked_img = sess.run(x_adv)
                predicted_class = get_predictions(model, attacked_img)
                print(predicted_class, attack_label)
                if predicted_class == attack_label:
                    attacked_imgs.append(attacked_img)
                pbar.update()
        except tf.errors.OutOfRangeError:
            # Dataset exhausted — normal termination.
            pass
        finally:
            # BUG FIX: close the progress bar (was leaked).
            pbar.close()

    if len(attacked_imgs) > 0:
        attacked_imgs = np.vstack(attacked_imgs)
    return attacked_imgs
def pgd_attack():
    """Evaluate the (module-level) PyTorch `model` under a white-box PGD attack.

    Converts the PyTorch model to a TF-callable, builds a CleverHans PGD
    attack graph, and streams `test_loader` batches through it, printing
    running and final adversarial accuracy.

    NOTE(review): relies on module-level names `model`, `args` (with
    attributes eps/ss/ns), and `test_loader` — defined elsewhere in the file.
    """
    # Use tf for evaluation on adversarial data
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    # Input/label placeholders; shape suggests CIFAR-10 in NCHW layout
    # (batch, 3, 32, 32) — confirm against the wrapped model.
    x_op = tf.placeholder(tf.float32, shape=(
        None,
        3,
        32,
        32,
    ))
    y_op = tf.placeholder(tf.float32, shape=(None, 10))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an PGD attack
    pgd = ProjectedGradientDescent(cleverhans_model, sess=sess)
    # eps: total budget, eps_iter: step size, nb_iter: number of steps;
    # supplying y makes the attack use the true labels (untargeted).
    pgd_params = {
        'eps': args.eps,
        'eps_iter': args.ss,
        'nb_iter': args.ns,
        'clip_min': 0.,
        'clip_max': 1.,
        'y': y_op
    }
    adv_x_op = pgd.generate(x_op, **pgd_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Evaluation against PGD attacks
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        # Labels are one-hot encoded on the fly for the attack graph.
        adv_preds = sess.run(adv_preds_op,
                             feed_dict={
                                 x_op: inputs,
                                 y_op: torch.nn.functional.one_hot(targets, 10)
                             })
        correct += (np.argmax(adv_preds, axis=1) == targets.numpy()).sum()
        total += len(inputs)
        # Overwrite the same console line with running accuracy.
        sys.stdout.write("\rWhite-box PGD attack... Acc: %.3f%% (%d/%d)" %
                         (100. * correct / total, correct, total))
        sys.stdout.flush()
    print('Accuracy under PGD attack: %.3f%%' % (100. * correct / total))
def _get_pert(self, X, Y, eps):
    """Return PGD perturbations (adversarial minus clean) for a batch.

    Args:
        X: batch of clean inputs matching `self.wrap.input`'s shape.
        Y: labels matching `self.y`'s shape.
        eps: attack budget; 0 short-circuits to a zero perturbation.

    Returns:
        numpy array of perturbations, same shape as X.
    """
    if eps == 0:
        # No budget — the perturbation is identically zero.
        return np.zeros_like(X)
    with self.sess.as_default():
        self.x = self.wrap.input
        # BUG FIX: the attack must be constructed with the wrapped model
        # (`self.wrap`), not the input placeholder `self.x` — CleverHans
        # attacks take a Model instance as their first argument.
        pgd = ProjectedGradientDescent(self.wrap, sess=self.sess)
        adv_x = pgd.generate(self.x,
                             y=self.y,
                             eps=eps,
                             ord=self.ord,
                             eps_iter=0.01)
        adv_x = tf.stop_gradient(adv_x)
        pert_x = adv_x - self.x
        feed_dict = {self.x: X, self.y: Y}
        ret = pert_x.eval(feed_dict=feed_dict)
    return ret
def evaluate_checkpoint(filename):
    """Restore a checkpoint and report accuracy on adversarial examples.

    Builds the attack graph selected by the module-level `attack_method`
    ('BIM', 'FGM' or 'PGD'), restores the model variables from
    `filename`, and prints the adversarial test accuracy and timing.

    NOTE(review): depends on module-level `attack_method`, `model`,
    `x_image`, `y`, `batch_size`, `X_test`, `Y_test` defined elsewhere.
    """
    # Build the adversarial-example tensor for the configured attack.
    if attack_method == 'BIM':
        attacker = BasicIterativeMethod(model)
        adv_x = attacker.generate(x_image,
                                  eps=0.3,
                                  clip_min=0.,
                                  clip_max=1.,
                                  nb_iter=50,
                                  eps_iter=.01)
    elif attack_method == 'FGM':
        attacker = FastGradientMethod(model)
        adv_x = attacker.generate(x_image, eps=0.3, clip_min=0., clip_max=1.)
    elif attack_method == 'PGD':
        attacker = ProjectedGradientDescent(model)
        adv_x = attacker.generate(x_image,
                                  eps=0.09,
                                  clip_min=0.,
                                  clip_max=1.,
                                  nb_iter=40,
                                  eps_iter=.01)
    preds_adv = model.get_probs(adv_x)

    with tf.Session() as sess:
        # Restore the checkpoint
        saver = tf.train.Saver(var_list=model.all_variables)
        saver.restore(sess, filename)
        eval_par = {'batch_size': batch_size}
        t1 = time.time()
        acc = model_eval(sess,
                         x_image,
                         y,
                         preds_adv,
                         X_test,
                         Y_test,
                         args=eval_par)
        t2 = time.time()
        print("Took", t2 - t1, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
def get_alp_loss(sess, x, y, logits, adv_logits, model, eps, eps_iter, iterations):
    """Build the Adversarial Logit Pairing (ALP) loss.

    Pairs clean logits with adversarial logits via mean squared error.
    If `adv_logits` is not supplied, an L-inf PGD attack (budgets given
    in pixel units, rescaled by /255) is built here to produce them.

    Args:
        sess: TF session for the CleverHans attack.
        x, y: clean-input and one-hot-label tensors.
        logits: logits on the clean inputs.
        adv_logits: precomputed adversarial logits, or None to build them.
        model: CleverHans-compatible model exposing ``get_logits``.
        eps, eps_iter: total/per-step budgets in [0, 255] pixel units.
        iterations: number of PGD steps.

    Returns:
        Scalar MSE tensor between clean and adversarial logits.
    """
    if adv_logits is None:
        attacker = ProjectedGradientDescent(model, sess=sess)
        adv_x = attacker.generate(
            x,
            ord=np.inf,
            y=y,
            eps=eps / 255,
            eps_iter=eps_iter / 255,
            nb_iter=iterations,
            rand_init=True,
            rand_minmax=eps / 255,
            clip_min=0.,
            clip_max=1.,
            sanity_checks=True)
        adv_logits = model.get_logits(adv_x)
    return tf.losses.mean_squared_error(logits, adv_logits)
def save_pgd_attacked_images(original_class, target_class, attack_strength, nb_iter=50, seed=1000):
    """Generate targeted L2-PGD adversarial images and save successes to HDF5.

    Loads benign images of `original_class`, attacks each toward
    `target_class` with an L2 PGD attack of budget `attack_strength`, and
    writes only the successfully attacked images (plus their source
    indices) to a new HDF5 file.

    NOTE(review): `output_file` is never explicitly closed here — relies
    on interpreter/h5py cleanup; confirm against the rest of the file.
    """
    # Seed every RNG involved so the attack is reproducible.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    eps = attack_strength
    labels_by_name = load_labels_by_name()
    target_label = labels_by_name[target_class].lucid_label

    # Resolve input/output dataset paths; refuse to overwrite results.
    benign_dataset_path = DataPaths.get_benign_images_datapath(original_class)
    assert benign_dataset_path.exists()
    attacked_dataset_path = DataPaths.get_attacked_images_datapath(
        original_class, target_class, attack_name='pgd', attack_strength=eps)
    assert not attacked_dataset_path.exists()
    print('Saving attacked images to %s' % attacked_dataset_path)

    img_dataset = hdf5utils.load_image_dataset_from_file(benign_dataset_path)
    output_file = h5py.File(attacked_dataset_path, 'w')
    out_dataset = hdf5utils.create_image_dataset(output_file,
                                                 dataset_name='images')
    # Parallel dataset recording the benign index of each saved image.
    indices_dataset = hdf5utils.create_dataset(output_file,
                                               data_shape=(1, ),
                                               dataset_name='indices')

    graph = tf.Graph()
    with graph.as_default():
        model = InceptionV1Model()
        x = model.default_input_placeholder
        y_pred = model.get_predicted_class(x)
        with tf.Session(graph=graph) as sess:
            # Targeted L2 PGD; clip range [-1, 1] matches the model's
            # expected input scaling; step size is eps/5.
            attack = ProjectedGradientDescent(model, sess=sess)
            target_one_hot_encoded = get_one_hot_encoded_targets(target_label)
            x_adv = attack.generate(x,
                                    eps=eps,
                                    nb_iter=nb_iter,
                                    clip_min=-1,
                                    clip_max=1,
                                    eps_iter=(eps / 5),
                                    ord=2,
                                    y_target=target_one_hot_encoded)
            num_attack_success = 0
            pbar = tqdm(unit='imgs', total=len(img_dataset))
            try:
                for i, img in enumerate(img_dataset):
                    ben_img = np.array(img)
                    # Attack a single image, then classify the result.
                    adv_img = sess.run(x_adv, feed_dict={x: [ben_img]})
                    attack_pred = sess.run(y_pred, feed_dict={x: adv_img})
                    adv_img = adv_img[0]
                    attack_pred = attack_pred[0]
                    # Guard against numerically broken outputs.
                    assert not np.any(np.isnan(adv_img))
                    assert not np.isnan(attack_pred)
                    # Keep only images the attack pushed to the target.
                    if attack_pred == target_label:
                        index = np.array([i])
                        num_attack_success += 1
                        hdf5utils.add_image_to_dataset(adv_img, out_dataset)
                        hdf5utils.add_item_to_dataset(index, indices_dataset)
                        pbar.set_postfix(num_attack_success=num_attack_success)
                    pbar.update()
            except tf.errors.OutOfRangeError:
                # Dataset iterator exhausted — normal termination.
                pass
# Build a CleverHans PGD attack around a PyTorch classifier `clf`, then
# iterate the training set attacking one batch at a time.
# NOTE(review): this fragment uses names defined elsewhere in the file
# (clf, sess, x_op, onehot_op, device, cudnn, train_loader, train_set).
tf_model_fn = convert_pytorch_model_to_tf(clf)
cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')
# pgd_op = MadryEtAl(cleverhans_model, sess=sess)
pgd_op = ProjectedGradientDescent(cleverhans_model,
                                  sess=sess,
                                  default_rand_init=True)
# L-inf PGD: budget 16/255, step 2/255, 10 iterations, inputs in [0, 1].
pgd_params = {
    'eps': 16 / 255.0,
    'eps_iter': 2 / 255.0,
    'nb_iter': 10,
    'clip_min': 0.0,
    'clip_max': 1.0
}
adv_x_op = pgd_op.generate(x_op, y=onehot_op, **pgd_params)
clean_logits_op = tf_model_fn(x_op)
adv_logits_op = tf_model_fn(adv_x_op)
cudnn.benchmark = True
# NOTE(review): `total` is never advanced in this visible fragment, so the
# printed filename is always train_set.imgs[0] unless it is incremented
# further down — confirm against the rest of the file.
total = -1
for step, (images, labels) in enumerate(train_loader):
    print('To be attacked: {}th, {}'.format(
        total + 1, os.path.basename(train_set.imgs[total + 1][0])))
    start = time.time()
    images = images.to(device)
    labels = labels.to(device)
def train(ARGS):
    """Build the full training graph and train the model.

    Constructs placeholders, the model forward pass, the configured loss
    terms (cross-entropy, adversarial training, weight decay, logit
    pairing, logit squeezing — each gated by an ARGS flag), a PGD
    adversary for adversarial evaluation, the Adam optimizer, and
    TensorBoard summaries; then runs `model_train` and optionally saves
    the result.

    NOTE(review): relies on module-level names (input_shape, n_classes,
    Model, get_at_loss, get_alp_loss, get_clp_loss, get_lsq_loss,
    variable_summaries, model_train, testloader, adv_testloader,
    trainloader) defined elsewhere in the file.
    """

    # Define helper function for evaluating on test data during training
    def eval(epoch):
        from train_utils import clean_eval
        test_accuracy, test_loss, _ = clean_eval(sess, x, y, is_training,
                                                 testloader, n_classes,
                                                 logits, preds)
        # Write tensorboard summary (accuracy)
        acc_summary = tf.Summary()
        acc_summary.value.add(tag='Evaluation/accuracy/test',
                              simple_value=test_accuracy)
        writer_test.add_summary(acc_summary, epoch)
        # Write tensorboard summary (error = 1 - accuracy)
        err_summary = tf.Summary()
        err_summary.value.add(tag='Evaluation/error/test',
                              simple_value=1.0 - test_accuracy)
        writer_test.add_summary(err_summary, epoch)
        # Write tensorboard summary (loss)
        loss_summary = tf.Summary()
        loss_summary.value.add(tag='Evaluation/loss/test',
                               simple_value=test_loss)
        writer_test.add_summary(loss_summary, epoch)

    # Define helper function for evaluating on adversarial test data
    # during training
    def adv_eval(epoch):
        from train_utils import adversarial_eval
        adv_accuracy, adv_loss = adversarial_eval(sess, x, y, is_training,
                                                  adv_testloader, n_classes,
                                                  preds, adv_preds,
                                                  eval_all=True)
        # Write tensorboard summary (adversarial accuracy)
        acc_summary = tf.Summary()
        acc_summary.value.add(tag='Evaluation/adversarial-accuracy/test',
                              simple_value=adv_accuracy)
        writer_test.add_summary(acc_summary, epoch)
        # Write tensorboard summary (adversarial error)
        err_summary = tf.Summary()
        err_summary.value.add(tag='Evaluation/adversarial-error/test',
                              simple_value=1.0 - adv_accuracy)
        writer_test.add_summary(err_summary, epoch)
        # Write tensorboard summary (adversarial loss)
        loss_summary = tf.Summary()
        loss_summary.value.add(tag='Evaluation/adversarial-loss/test',
                               simple_value=adv_loss)
        writer_test.add_summary(loss_summary, epoch)

    # Define computational graph
    with tf.Graph().as_default() as g:
        # Define placeholders
        with tf.device('/gpu:0'):
            with tf.name_scope('Placeholders'):
                x = tf.placeholder(dtype=tf.float32,
                                   shape=input_shape,
                                   name='inputs')
                # Paired inputs used only by the clean-logit-pairing loss.
                x_pair1 = tf.placeholder(dtype=tf.float32,
                                         shape=input_shape,
                                         name='x-pair1')
                x_pair2 = tf.placeholder(dtype=tf.float32,
                                         shape=input_shape,
                                         name='x-pair2')
                y = tf.placeholder(dtype=tf.float32,
                                   shape=(None, n_classes),
                                   name='labels')
                # Defaults to training mode; overridden at eval time.
                is_training = tf.placeholder_with_default(True,
                                                          shape=(),
                                                          name='is-training')

        # Define TF session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(graph=g, config=config)

        # Define model
        with tf.name_scope('Model'):
            with tf.device('/gpu:0'):
                model = Model(nb_classes=n_classes,
                              input_shape=input_shape,
                              is_training=is_training)

                # Define forward-pass
                with tf.name_scope('Logits'):
                    logits = model.get_logits(x)
                with tf.name_scope('Probs'):
                    preds = tf.nn.softmax(logits)

                with tf.name_scope('Accuracy'):
                    ground_truth = tf.argmax(y, axis=1)
                    predicted_label = tf.argmax(preds, axis=1)
                    correct_prediction = tf.equal(predicted_label,
                                                  ground_truth)
                    acc = tf.reduce_mean(tf.to_float(correct_prediction),
                                         name='accuracy')
                    tf.add_to_collection('accuracies', acc)
                    err = tf.identity(1.0 - acc, name='error')
                    tf.add_to_collection('accuracies', err)

                # Define losses — each term defaults to 0.0 and is only
                # built when its ARGS flag is set.
                with tf.name_scope('Losses'):
                    ce_loss, wd_loss, clp_loss, lsq_loss, at_loss, alp_loss = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
                    adv_logits = None
                    if ARGS.ct:
                        with tf.name_scope('Cross-Entropy-Loss'):
                            ce_loss = tf.reduce_mean(
                                tf.nn.softmax_cross_entropy_with_logits(
                                    logits=logits, labels=y),
                                name='cross-entropy-loss')
                            tf.add_to_collection('losses', ce_loss)
                    if ARGS.at:
                        # Adversarial training loss; also yields the
                        # adversarial logits reused by ALP below.
                        with tf.name_scope('Adversarial-Cross-Entropy-Loss'):
                            at_loss, adv_logits = get_at_loss(
                                sess, x, y, model, ARGS.eps, ARGS.eps_iter,
                                ARGS.nb_iter)
                            at_loss = tf.identity(at_loss, name='at-loss')
                            tf.add_to_collection('losses', at_loss)
                    with tf.name_scope('Regularizers'):
                        if ARGS.wd:
                            with tf.name_scope('Weight-Decay'):
                                for var in tf.trainable_variables():
                                    if 'beta' in var.op.name:
                                        # Do not regularize bias of batch
                                        # normalization
                                        continue
                                    # print('regularizing: ', var.op.name)
                                    wd_loss += tf.nn.l2_loss(var)
                                reg_loss = tf.identity(wd_loss,
                                                       name='wd-loss')
                                tf.add_to_collection('losses', reg_loss)
                    if ARGS.alp:
                        with tf.name_scope('Adversarial-Logit-Pairing'):
                            alp_loss = get_alp_loss(sess, x, y, logits,
                                                    adv_logits, model,
                                                    ARGS.eps, ARGS.eps_iter,
                                                    ARGS.nb_iter)
                            alp_loss = tf.identity(alp_loss, name='alp-loss')
                            tf.add_to_collection('losses', alp_loss)
                    if ARGS.clp:
                        with tf.name_scope('Clean-Logit-Pairing'):
                            clp_loss = get_clp_loss(x_pair1, x_pair2, model)
                            clp_loss = tf.identity(clp_loss, name='clp-loss')
                            tf.add_to_collection('losses', clp_loss)
                    if ARGS.lsq:
                        with tf.name_scope('Logit-Squeezing'):
                            lsq_loss = get_lsq_loss(x, model)
                            lsq_loss = tf.identity(lsq_loss, name='lsq-loss')
                            tf.add_to_collection('losses', lsq_loss)
                    with tf.name_scope('Total-Loss'):
                        # Define objective function: weighted sum of all
                        # enabled loss terms.
                        total_loss = (ARGS.ct_lambda * ce_loss) + (
                            ARGS.at_lambda * at_loss) + (
                                ARGS.wd_lambda * wd_loss) + (
                                    ARGS.clp_lambda * clp_loss) + (
                                        ARGS.lsq_lambda * lsq_loss) + (
                                            ARGS.alp_lambda * alp_loss)
                        total_loss = tf.identity(total_loss,
                                                 name='total-loss')
                        tf.add_to_collection('losses', total_loss)

                # Define PGD adversary used for adversarial evaluation
                # (budgets given in pixel units, rescaled by /255).
                with tf.name_scope('PGD-Attacker'):
                    pgd_params = {
                        'ord': np.inf,
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'eps_iter': ARGS.eps_iter / 255,
                        'nb_iter': ARGS.nb_iter,
                        'rand_init': True,
                        'rand_minmax': ARGS.eps / 255,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'sanity_checks': True
                    }
                    pgd = ProjectedGradientDescent(model, sess=sess)
                    adv_x = pgd.generate(x, **pgd_params)
                    with tf.name_scope('Logits'):
                        adv_logits = model.get_logits(adv_x)
                    with tf.name_scope('Probs'):
                        adv_preds = tf.nn.softmax(adv_logits)

        # Define optimizer
        with tf.device('/gpu:0'):
            with tf.name_scope('Optimizer'):
                # Define global step variable
                global_step = tf.get_variable(
                    name='global_step',
                    shape=[],  # scalar
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                    trainable=False)
                optimizer = tf.train.AdamOptimizer(learning_rate=ARGS.lr,
                                                   beta1=0.9,
                                                   beta2=0.999,
                                                   epsilon=1e-6,
                                                   use_locking=False,
                                                   name='Adam')
                trainable_vars = tf.trainable_variables()
                # This collection stores the moving_mean and
                # moving_variance ops for batch normalization; running
                # them as control dependencies keeps BN stats updated.
                update_bn_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_bn_ops):
                    grads_and_vars = optimizer.compute_gradients(
                        total_loss, trainable_vars)
                    train_step = optimizer.apply_gradients(
                        grads_and_vars, global_step=global_step)

        # Add Tensorboard summaries
        with tf.device('/gpu:0'):
            # Create file writers
            writer_train = tf.summary.FileWriter(ARGS.log_dir + '/train',
                                                 graph=g)
            writer_test = tf.summary.FileWriter(ARGS.log_dir + '/test')
            # Add summary for input images
            with tf.name_scope('Image-Summaries'):
                # Create image summary ops
                tf.summary.image('input',
                                 x,
                                 max_outputs=2,
                                 collections=['training'])
            # Add summaries for the training losses
            losses = tf.get_collection('losses')
            for entry in losses:
                tf.summary.scalar(entry.name, entry, collections=['training'])
            # Add summaries for the training accuracies
            accs = tf.get_collection('accuracies')
            for entry in accs:
                tf.summary.scalar(entry.name, entry, collections=['training'])
            # Add summaries for all trainable vars
            for var in trainable_vars:
                tf.summary.histogram(var.op.name,
                                     var,
                                     collections=['training'])
                var_norm = tf.norm(var, ord='euclidean')
                tf.summary.scalar(var.op.name + '/l2norm',
                                  var_norm,
                                  collections=['training'])
            # Add summaries for variable gradients
            for grad, var in grads_and_vars:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients',
                                         grad,
                                         collections=['training'])
                    grad_norm = tf.norm(grad, ord='euclidean')
                    tf.summary.scalar(var.op.name + '/gradients/l2norm',
                                      grad_norm,
                                      collections=['training'])
            # Add summaries for the logits and model predictions
            with tf.name_scope('Logits-Summaries'):
                variable_summaries(tf.identity(logits, name='logits'),
                                   name='logits',
                                   collections=['training', 'test'],
                                   histo=True)
            with tf.name_scope('Predictions-Summaries'):
                variable_summaries(tf.identity(preds, name='predictions'),
                                   name='predictions',
                                   collections=['training', 'test'],
                                   histo=True)

        # Initialize all variables
        with sess.as_default():
            tf.global_variables_initializer().run()

        # Collect training params
        train_params = {
            'epochs': ARGS.epochs,
            'eval_step': ARGS.eval_step,
            'adv_eval_step': ARGS.adv_eval_step,
            'n_classes': n_classes,
            'clp': ARGS.clp
        }

        # Start training loop
        model_train(sess,
                    x,
                    y,
                    x_pair1,
                    x_pair2,
                    is_training,
                    trainloader,
                    train_step,
                    args=train_params,
                    evaluate=eval,
                    adv_evaluate=adv_eval,
                    writer_train=writer_train)

        # Save the trained model
        if ARGS.save:
            save_path = os.path.join(ARGS.save_dir, ARGS.filename)
            saver = tf.train.Saver(var_list=tf.global_variables())
            saver.save(sess, save_path)
            print("Saved model at {:s}".format(str(ARGS.save_dir)))
def attack(self, path, session):
    """Attack a few-shot model's context set, one context image per class.

    Loads model weights from `path`, wraps a single-context-point forward
    pass for CleverHans, then for each test task perturbs one context
    image per class with PGD and logs the per-class and whole-task
    accuracy drop.

    NOTE(review): `model_wrapper` closes over `context_images`,
    `context_labels`, `target_images` and `class_index`, which are
    reassigned inside the loops below — the attack graph therefore uses
    whatever values those names hold at run time, by design.
    """
    print_and_log(self.logfile, "")  # add a blank line
    print_and_log(self.logfile, 'Attacking model {0:}: '.format(path))
    self.model = self.init_model()
    self.model.load_state_dict(torch.load(path))
    pgd_parameters = self.pgd_params()
    class_index = 0
    context_images, target_images, context_labels, target_labels, context_images_np = None, None, None, None, None

    def model_wrapper(context_point_x):
        # Insert context_point at correct spot
        context_images_attack = torch.cat([
            context_images[0:class_index], context_point_x,
            context_images[class_index + 1:]
        ],
                                          dim=0)
        target_logits = self.model(context_images_attack, context_labels,
                                   target_images)
        return target_logits[0]

    tf_model_conv = convert_pytorch_model_to_tf(model_wrapper,
                                                out_dims=self.args.way)
    tf_model = cleverhans.model.CallableModelWrapper(tf_model_conv, 'logits')
    pgd = ProjectedGradientDescent(tf_model,
                                   sess=session,
                                   dtypestr='float32')

    for item in self.test_set:
        for t in range(self.args.attack_tasks):
            task_dict = self.dataset.get_test_task(item, session)
            context_images, target_images, context_labels, target_labels, context_images_np = self.prepare_task(
                task_dict, shuffle=False)
            # Detach shares storage with the original tensor, which isn't
            # what we want.
            context_images_attack_all = context_images.clone()
            # Is require_grad true here, for context_images?
            for c in torch.unique(context_labels):
                # Adversarial input context image
                class_index = extract_class_indices(context_labels,
                                                    c)[0].item()
                context_x = np.expand_dims(context_images_np[class_index], 0)
                # Input to the model wrapper is automatically converted to
                # Torch tensor for us
                x = tf.placeholder(tf.float32, shape=context_x.shape)
                adv_x_op = pgd.generate(x, **pgd_parameters)
                preds_adv_op = tf_model.get_logits(adv_x_op)
                feed_dict = {x: context_x}
                adv_x, preds_adv = session.run((adv_x_op, preds_adv_op),
                                               feed_dict=feed_dict)
                context_images_attack_all[class_index] = torch.from_numpy(
                    adv_x)
                save_image(adv_x,
                           os.path.join(self.checkpoint_dir, 'adv.png'))
                save_image(context_x,
                           os.path.join(self.checkpoint_dir, 'in.png'))
                # Target accuracy with this single context point attacked.
                acc_after = torch.mean(
                    torch.eq(
                        target_labels,
                        torch.argmax(torch.from_numpy(preds_adv).to(
                            self.device),
                                     dim=-1)).float()).item()
                with torch.no_grad():
                    logits = self.model(context_images, context_labels,
                                        target_images)
                    acc_before = torch.mean(
                        torch.eq(target_labels,
                                 torch.argmax(logits,
                                              dim=-1)).float()).item()
                    del logits
                diff = acc_before - acc_after
                print_and_log(
                    self.logfile,
                    "Task = {}, Class = {} \t Diff = {}".format(t, c, diff))
                # NOTE(review): message says "before" but logs acc_after —
                # likely a mislabeled log line; confirm intent.
                print_and_log(self.logfile,
                              "Accuracy before {}".format(acc_after))
            # Accuracy with every class's context point attacked at once.
            logits = self.model(context_images_attack_all, context_labels,
                                target_images)
            acc_all_attack = torch.mean(
                torch.eq(target_labels,
                         torch.argmax(logits, dim=-1)).float()).item()
            print_and_log(self.logfile,
                          "Accuracy after {}".format(acc_all_attack))
def main(argv):
    """Attack a SHIELD ensemble with targeted PGD and report metrics.

    Loads an eval ensemble (and, unless --eval_only, an attack ensemble),
    builds a least-likely-class targeted PGD attack, streams tfrecord
    batches through the graph, and logs model accuracy and attack
    success rate.
    """
    del argv  # unused
    args_keys = [
        'namespace', 'seed', 'num_images', 'batch_size', 'attack_models',
        'eval_models', 'epsilon', 'eps_iter', 'nb_iter',
        'attack_differentiable_slq', 'eval_only'
    ]
    # Snapshot just the flags this job cares about, and log them.
    args_dict = {
        k: v
        for k, v in FLAGS.flag_values_dict().items() if k in args_keys
    }
    logging.info('')
    for k in args_keys:
        logging.info('%-20s = %s' % (k, args_dict[k]))
    logging.info('')

    with JobbyJob(args_dict, namespace=FLAGS.namespace) as job:
        tf.set_random_seed(FLAGS.seed)
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=1.,
                                      allow_growth=True)))
        keras.backend.set_session(sess)
        # Inference mode for all Keras layers (no dropout/BN updates).
        keras.backend.set_learning_phase(0)

        with tf.name_scope('TFRecordsLoader'):
            dataset = load_tfrecords_dataset(TFRECORDS_FILENAMES)
            dataset = dataset.take(FLAGS.num_images)
            dataset = dataset.batch(FLAGS.batch_size)
            iterator = dataset.make_one_shot_iterator()
            next_element = iterator.get_next()
            X, y_true = next_element

        with sess.as_default():
            attack_model_paths = [
                MODEL_NAME_TO_CKPT_PATH_MAP[m] for m in FLAGS.attack_models
            ]
            eval_model_paths = [
                MODEL_NAME_TO_CKPT_PATH_MAP[m] for m in FLAGS.eval_models
            ]
            eval_model = EvalSHIELDModel(
                load_jpeg_trained_ensemble(FLAGS.eval_models,
                                           eval_model_paths))
            perform_attack = not FLAGS.eval_only
            if perform_attack:
                attack_model = AttackSHIELDModel(
                    load_jpeg_trained_ensemble(FLAGS.attack_models,
                                               attack_model_paths),
                    attack_differentiable_slq=FLAGS.
                    attack_differentiable_slq)
                # Targeted attack toward the least likely class.
                y_target = attack_model.get_least_likely_prediction(X)
                y_target_one_hot = tf.one_hot(y_target, 1000, axis=-1)
                attack = ProjectedGradientDescent(attack_model, sess=sess)
                attack_kwargs = {
                    'y_target': y_target_one_hot,
                    'eps': FLAGS.epsilon,
                    'eps_iter': FLAGS.eps_iter,
                    'nb_iter': FLAGS.nb_iter
                }
                X_adv = attack.generate(X, **attack_kwargs)
                y_pred_shield = eval_model.get_predicted_class(X_adv)
            else:
                # Eval-only mode: negate labels so "attack success" can
                # never match a real prediction.
                y_target = y_true * -1
                y_pred_shield = eval_model.get_predicted_class(X)

            writer = tf.summary.FileWriter(LOGS_DIR, sess.graph)
            writer.close()

            model_accuracy = AccuracyMeter()
            attack_success = AccuracyMeter()
            # Drain the dataset; OutOfRangeError marks the end.
            with tqdm(total=FLAGS.num_images, unit='imgs') as pbar:
                while True:
                    try:
                        y_true_np, y_target_np, y_pred_shield_np = \
                            sess.run([y_true, y_target, y_pred_shield])
                        model_accuracy.offer(y_pred_shield_np, y_true_np)
                        attack_success.offer(y_pred_shield_np, y_target_np)
                        pbar.set_postfix(
                            model_accuracy=model_accuracy.evaluate(),
                            attack_success=attack_success.evaluate())
                        pbar.update(y_true_np.shape[0])
                    except tf.errors.OutOfRangeError:
                        break

            job.update_output(model_accuracy=model_accuracy.evaluate(),
                              attack_success=attack_success.evaluate())
            logging.info('')
            logging.info('model_accuracy = %.04f' %
                         model_accuracy.evaluate())
            logging.info('attack_success = %.04f' %
                         attack_success.evaluate())
def PGD(torch_model, dataset, eps_list, opt, c, h, w, clip_min, clip_max):
    """Evaluate or generate PGD adversarial examples for a PyTorch model.

    For each eps in `eps_list`, converts `torch_model` to a TF-callable,
    builds a CleverHans PGD attack over inputs of shape (c, h, w), then:
      - opt == 'evaluate': returns a list of adversarial accuracies;
      - opt == 'generate': returns, per eps, a list of adversarial
        examples that flipped an originally-correct prediction (batch
        size 1 assumed, capped at 500 successes).

    NOTE(review): a new TF session/graph is created per eps and never
    closed; `device` is a module-level name defined elsewhere.
    """
    if opt == 'evaluate':
        acclist = []
        for eps in eps_list:
            sess = tf.Session()
            x_op = tf.placeholder(tf.float32, shape=(
                None,
                c,
                h,
                w,
            ))
            # Convert pytorch model to a tf_model and wrap it in cleverhans
            tf_model_fn = convert_pytorch_model_to_tf(torch_model)
            cleverhans_model = CallableModelWrapper(tf_model_fn,
                                                    output_layer='logits')
            # Create an FGSM attack
            atk_op = ProjectedGradientDescent(cleverhans_model, sess=sess)
            atk_params = {
                'eps': eps,
                'clip_min': clip_min,
                'clip_max': clip_max
            }
            adv_x_op = atk_op.generate(x_op, **atk_params)
            adv_preds_op = tf_model_fn(adv_x_op)
            # Run an evaluation of our model against fgsm
            total = 0
            correct = 0
            for xs, ys in dataset:
                xs, ys = xs.to(device), ys.to(device)
                adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
                correct += (np.argmax(
                    adv_preds, axis=1) == ys.cpu().detach().numpy()).sum()
                total += dataset.batch_size
            acc = float(correct) / total
            print('Adv accuracy: {:.3f}'.format(acc * 100))
            acclist.append(acc)
        return acclist
    elif opt == 'generate':
        advpacklist = []
        for eps in eps_list:
            advlist = []
            sess = tf.Session()
            x_op = tf.placeholder(tf.float32, shape=(
                None,
                c,
                h,
                w,
            ))
            # Convert pytorch model to a tf_model and wrap it in cleverhans
            tf_model_fn = convert_pytorch_model_to_tf(torch_model)
            cleverhans_model = CallableModelWrapper(tf_model_fn,
                                                    output_layer='logits')
            # Create an FGSM attack
            atk_op = ProjectedGradientDescent(cleverhans_model, sess=sess)
            atk_params = {
                'eps': eps,
                'clip_min': clip_min,
                'clip_max': clip_max
            }
            adv_x_op = atk_op.generate(x_op, **atk_params)
            total = 0
            # Run an evaluation of our model against fgsm
            for xs, ys in dataset:
                xs, ys = xs.to(device), ys.to(device)
                adv = torch.from_numpy(
                    sess.run(adv_x_op, feed_dict={x_op: xs}))
                # Keep only examples that were originally classified
                # correctly and are misclassified after the attack.
                if ys == np.argmax(torch_model(xs).data.cpu().numpy()):
                    pred = np.argmax(torch_model(adv).data.cpu().numpy())
                    if ys != pred:
                        print('OK')
                        total += 1
                        print(total)
                        adv = adv.numpy()
                        advlist.append(adv)
                        # Stop after 500 successful examples per eps.
                        if total == 500:
                            break
            print(len(advlist))
            advpacklist.append(advlist)
        return advpacklist
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False, rbf=False, fgsm=False, jsma=False, df=False, bim=False): """ Load model saved in model_name.json and model_name_weights.h5 and evaluate its accuracy on legitimate test samples and adversarial samples. Use cnn=True if the model is CNN based. """ # open text file and output accuracy results to it text_file = open("mnist_results.txt", "w") # load saved model print("Load model ... ") ''' json = open('models/{}.json'.format(model_name), 'r') model = json.read() json.close() loaded_model = model_from_json(model) loaded_model.load_weights("models/{}_weights.h5".format(model_name)) ''' if rbf: loaded_model = load_model("rbfmodels/{}.h5".format(model_name), custom_objects={'RBFLayer': RBFLayer}) text_file.write('Evaluating on rbfmodels/{}.h5\n\n'.format(model_name)) else: loaded_model = load_model("models/{}.h5".format(model_name)) text_file.write('Evaluating on models/{}.h5\n\n'.format(model_name)) # Set placeholders if cnn: x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) else: x = tf.placeholder(tf.float32, shape=(None, 784)) y = tf.placeholder(tf.float32, shape=(None, 10)) predictions = loaded_model(x) accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args={ "batch_size" : 128 }) text_file.write('Test accuracy on legitimate test examples: {0}\n'.format(str(accuracy))) #print('Test accuracy on legitimate test examples: ' + str(accuracy)) # Craft adversarial examples depending on the input parameters wrap = KerasModelWrapper(loaded_model) # FGSM if fgsm: fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3} adv_x = fgsm.generate(x, **fgsm_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128}) text_file.write('Test accuracy on fgsm adversarial test examples: {0}\n'.format(str(accuracy))) 
#print('Test accuracy on fgsm adversarial test examples: ' + str(accuracy)) # JSMA if jsma: jsma = SaliencyMapMethod(wrap, sess=sess) jsma_params = {'theta': 2., 'gamma': 0.145, 'clip_min': 0., 'clip_max': 1., 'y_target': None} adv_x = jsma.generate(x, **jsma_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128}) text_file.write('Test accuracy on jsma adversarial test examples: {0}\n'.format(str(accuracy))) #print('Test accuracy on jsma adversarial test examples: ' + str(accuracy)) # DeepFool if df: df = DeepFool(wrap, sess=sess) df_params = {'nb_candidate': 10, 'max_iter': 50} adv_x = df.generate(x, **df_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128}) text_file.write('Test accuracy on df adversarial test examples: {0}\n'.format(str(accuracy))) #print('Test accuracy on df adversarial test examples: ' + str(accuracy)) # Basic Iterative Method if bim: bim = ProjectedGradientDescent(wrap, sess=sess) bim_params = {'eps': 0.3} adv_x = bim.generate(x, **bim_params) adv_x = tf.stop_gradient(adv_x) preds_adv = loaded_model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128}) text_file.write('Test accuracy on bim adversarial test examples: {0}\n'.format(str(accuracy))) #print('Test accuracy on bim adversarial test examples: ' + str(accuracy)) print('Accuracy results outputted to mnist_results.txt') text_file.close() # Close TF session sess.close()
clip_max = (1.0 + 1E-6 - min(mean)) / std[0] params = {'eps': eps, 'clip_min': clip_min, 'clip_max': clip_max, 'eps_iter': 0.005, 'nb_iter': 100, 'rand_init': False} elif args.attack == 'FGSM': attk = FastGradientMethod(ch_model, sess=sess) clip_min = (0.0 - 1E-6 - max(mean)) / std[0] clip_max = (1.0 + 1E-6 - min(mean)) / std[0] params = {'eps': eps, 'clip_min': clip_min, 'clip_max': clip_max} adv_x = attk.generate(x_op, **params) adv_preds_op = tf_model(adv_x) stime = time.time() tot_clean_err, tot_adv_err, tot = 0.0, 0.0, 0 # tot_adv_input_err = 0.0 clean_detail = list() detail = list() for i, (xs, ys) in enumerate(test_loader): ys = ys.numpy() clean_preds = model(xs.cuda()).detach().cpu().numpy()
def main(train_method, dataset, model_name, params):
    """Train and evaluate a model under one of three regimes.

    train_method: 'clean' (standard training), 'adv' (manual PGD adversarial
        training; test-phase robustness measured with the cleverhans PGD
        implementation), or 'certadv' (load a certified adversarially trained
        checkpoint and only evaluate it under PGD).
    dataset: 'mnist' / 'cifar10' / 'imagenet' dataset key.
    model_name: model architecture key; 'G' triggers special layerwise training.
    params: dict of hyperparameters (batch_size, optimizer, eps, nb_iter, ...).

    Side effects: writes per-batch logs to {SAVE_PATH}/{save_name}_{train,test}.log
    and checkpoints to {SAVE_PATH}/{save_name}_chkpt/.
    """
    # prepare dataset and normalize settings
    normalize = None
    if params.get('normalized', False):
        if dataset == 'mnist':
            normalize = (_MNIST_MEAN, _MNIST_STDDEV)
        elif dataset == 'cifar10':
            normalize = (_CIFAR10_MEAN, _CIFAR10_STDDEV)
        elif dataset == 'imagenet':
            normalize = (_IMAGENET_MEAN, _IMAGENET_STDDEV)
    train_set = get_dataset(dataset, 'train', normalize)
    test_set = get_dataset(dataset, 'test', normalize)
    # read input shape (c, h, w)
    input_shape = get_input_shape(dataset)
    # read params
    batch_size = params['batch_size']
    optimizer_name = params.get('optimizer', 'sgd')
    if optimizer_name == 'sgd':
        lr = params.get('learning_rate', 0.1)
        momentum = params.get('momentum', 0.1)
        weight_decay = params.get('weight_decay', 5e-4)
    elif optimizer_name == 'adam':
        lr = params.get('learning_rate', 0.1)
    else:
        raise NotImplementedError
    cur_lr = lr
    print('default learning rate =', cur_lr, file=stderr)
    start_epoch = 0
    epochs = params.get('epochs', 0)
    # eps is in raw pixel units; normed_eps is rescaled when inputs are normalized.
    eps = normed_eps = params['eps']
    if train_method == 'adv':
        # Note: for adversarial training, in training phase, we use the manual
        # implementation version for precision, and use the clearhans
        # implementation in test phase for precision
        eps_iter_coef = params['eps_iter_coef']
        clip_min = params['clip_min']
        clip_max = params['clip_max']
        if normalize is not None:
            # Map pixel-space clip bounds / eps into normalized input space;
            # the +-1e-6 slack avoids boundary clipping artifacts.
            mean, std = normalize
            clip_min = (clip_min - max(mean)) / min(std) - 1e-6
            clip_max = (clip_max - min(mean)) / min(std) + 1e-6
            normed_eps = eps / min(std)
        nb_iter = params['nb_iter']
        rand_init = params['rand_init']
        adv_params = {
            'eps': normed_eps,
            'clip_min': clip_min,
            'clip_max': clip_max,
            'eps_iter': eps_iter_coef * eps,
            'nb_iter': nb_iter,
            'rand_init': rand_init
        }
    elif train_method == 'certadv':
        # Note: for certified adversarially trained models, we test its
        # accuracy still using PGD attack
        eps_iter_coef = params['eps_iter_coef']
        clip_min = params['clip_min']
        clip_max = params['clip_max']
        if normalize is not None:
            mean, std = normalize
            clip_min = (clip_min - max(mean)) / min(std) - 1e-6
            clip_max = (clip_max - min(mean)) / min(std) + 1e-6
            normed_eps = eps / min(std)
        nb_iter = params['nb_iter']
        rand_init = params['rand_init']
        adv_params = {
            'eps': normed_eps,
            'clip_min': clip_min,
            'clip_max': clip_max,
            'eps_iter': eps_iter_coef * eps,
            'nb_iter': nb_iter,
            'rand_init': rand_init
        }
        # NOTE(review): original indentation was lost; this print sits inside the
        # 'certadv' branch here because at function level it would raise
        # NameError for train_method == 'clean' (adv_params undefined) — confirm.
        print(adv_params, file=stderr)
    # prepare loader
    train_loader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size, shuffle=True, pin_memory=True)
    # stats
    train_tot = len(train_set)
    test_tot = len(test_set)
    best_acc = 0.0
    best_robacc = 0.0
    # load model
    m = model.load_model('exp', dataset, model_name).cuda()
    print(m)
    if train_method == 'adv' and params['retrain']:
        # retrain from the best clean model
        clean_model_name = f'{dataset}_{model_name}_clean_0_best'
        new_m, stats = try_load_weight(m, clean_model_name)
        assert stats == True, "Could not load pretrained clean model."
        if isinstance(new_m[0], NormalizeLayer):
            # squeeze the normalize layer out
            new_m = new_m[1]
        m = new_m
    elif train_method == 'certadv':
        # Resolve the certified-training config file (eps encoded in its name)
        # and load the corresponding pre-trained checkpoint; training is skipped
        # (start_epoch = epochs - 1 runs only the final evaluation epoch).
        configdir = params['configpath']
        ds_mapping = {'cifar10': 'cifar', 'mnist': 'mnist'}
        ds_multiplier = {'cifar10': 255., 'mnist': 10.}
        configfilename = f'exp_{ds_mapping[dataset]}{int(round(eps * ds_multiplier[dataset]))}.json'
        with open(os.path.join(configdir, configfilename), 'r') as f:
            real_config = json.load(f)
        epochs = real_config['training_params']['epochs']
        start_epoch = epochs - 1
        model_path = os.path.join(
            os.path.join(real_config['path_prefix'], real_config['models_path']),
            f'{model_name}_best.pth')
        d = torch.load(model_path)
        print(f'certadv load from {model_path}', file=stderr)
        m.load_state_dict(d['state_dict'])
    # open file handler
    # NOTE(review): `ds` and `now_method` are not defined in this function —
    # presumably module-level globals; verify (they look like they should be
    # `dataset` and `train_method`).
    save_name = f'{ds}_{model_name}_{now_method}_{eps}'
    mode = 'a'
    if os.path.exists(f'{SAVE_PATH}/{save_name}_train.log') or os.path.exists(
            f'{SAVE_PATH}/{save_name}_test.log'):
        choice = getpass.getpass(f'Log exists. Do you want to rewrite it? (Y/others) ')
        if choice == 'Y':
            mode = 'w'
            print('Rewrite log', file=stderr)
        else:
            mode = 'a'
    train_log = open(f'{SAVE_PATH}/{save_name}_train.log', mode)
    test_log = open(f'{SAVE_PATH}/{save_name}_test.log', mode)
    # special treatment for model G - layerwise training
    if model_name == 'G' and train_method == 'adv':
        new_last_layer = nn.Linear(1024, 10)
    # start
    for epoch in range(start_epoch, epochs):
        if epoch % LR_REDUCE == 0 and epoch > 0:
            # learning rate reduced to LR_REDUCE_RATE every LR_REDUCE epochs
            cur_lr *= LR_REDUCE_RATE
            print(f' reduce learning rate to {cur_lr}', file=stderr)
        # special treatment for model G - layerwise training
        if model_name == 'G' and train_method == 'adv':
            # Rebuild a truncated copy of the model: keep layers up to the
            # (epoch/5 + 1)-th 1024-unit Linear, then attach ReLU + the shared
            # new_last_layer. NOTE(review): nesting reconstructed from a
            # flattened source — confirm against the original repository.
            new_m = list()
            tmp_cnt = 0
            for l in m:
                new_m.append(l)
                if isinstance(l, nn.Linear) and l.out_features == 1024:
                    tmp_cnt += 1
                    if tmp_cnt > epoch / 5:
                        if l.out_features == 1024:
                            new_m.append(nn.ReLU())
                            new_m.append(new_last_layer)
                        break
            new_m = nn.Sequential(*new_m).cuda()
            # Swap: train the truncated model, keep the full model in new_m.
            m, new_m = new_m, m
            print(m, file=stderr)
            cur_lr = lr
            print(f' learning rate restored to {cur_lr}', file=stderr)
        # init optimizer (recreated every epoch so cur_lr takes effect)
        if optimizer_name == 'adam':
            opt = optim.Adam(m.parameters(), lr=cur_lr)
        elif optimizer_name == 'sgd':
            opt = optim.SGD(m.parameters(), lr=cur_lr, momentum=momentum, weight_decay=weight_decay)
        else:
            raise Exception("Fail to create the optimizer")
        cur_idx = 0
        cur_acc = 0.0
        cur_robacc = 0.0
        batch_tot = 0
        batch_acc_tot = 0
        batch_robacc_tot = 0
        clean_ce = 0.0
        adv_ce = 0.0
        # now eps: linear warmup of the perturbation budget over the first
        # EPS_WARMUP_EPOCHS epochs
        now_eps = normed_eps * min((epoch + 1) / EPS_WARMUP_EPOCHS, 1.0)
        # =========== Training ===========
        print(f'Epoch {epoch}: training', file=stderr)
        if train_method != 'clean':
            print(f' Training eps={now_eps:.3f}', file=stderr)
        m.train()
        for i, (X, y) in enumerate(train_loader):
            if DEBUG and i > 10:
                break
            start_t = time.time()
            X_clean, y_clean = X.cuda(), y.cuda().long()
            clean_out = m(Variable(X_clean))
            clean_ce = nn.CrossEntropyLoss()(clean_out, Variable(y_clean))
            batch_tot = X.size(0)
            batch_acc_tot = (clean_out.data.max(1)[1] == y_clean).float().sum().item()
            if train_method == 'clean':
                opt.zero_grad()
                clean_ce.backward()
                opt.step()
            elif train_method == 'adv':
                # Manual PGD: iterate sign-gradient steps, then project back
                # into the eps-ball and clip to the valid input range.
                X_pgd = Variable(X, requires_grad=True)
                for _ in range(nb_iter):
                    # NOTE(review): opt_pgd is created but never used (dead
                    # code), and opt.zero_grad() here clears the *model*
                    # optimizer inside the attack loop — confirm intent.
                    opt_pgd = optim.Adam([X_pgd], lr=1e-3)
                    opt.zero_grad()
                    loss = nn.CrossEntropyLoss()(m(X_pgd.cuda()), Variable(y_clean))
                    loss.backward()
                    eta = now_eps * eps_iter_coef * X_pgd.grad.data.sign()
                    X_pgd = Variable(X_pgd.data + eta, requires_grad=True)
                    eta = torch.clamp(X_pgd.data - X, -now_eps, now_eps)
                    X_pgd.data = X + eta
                    X_pgd.data = torch.clamp(X_pgd.data, clip_min, clip_max)
                # print(X_pgd.data, la.norm((X_pgd.data - X).numpy().reshape(-1), np.inf), file=stderr)
                adv_out = m(Variable(X_pgd.data).cuda())
                adv_ce = nn.CrossEntropyLoss()(adv_out, Variable(y_clean))
                # NOTE(review): missing .item() here (clean counterpart has it),
                # so batch_robacc_tot is a 0-dim tensor — works arithmetically
                # but is inconsistent; confirm.
                batch_robacc_tot = (adv_out.data.max(1)[1] == y_clean).float().sum()
                opt.zero_grad()
                adv_ce.backward()
                opt.step()
            elif train_method == 'certadv':
                # no action to do for training
                adv_ce = torch.Tensor([0.0]).cuda()
                pass
            end_t = time.time()
            clean_ce = clean_ce.detach().cpu().item()
            if train_method != 'clean':
                adv_ce = adv_ce.detach().cpu().item()
            runtime = end_t - start_t
            # Running averages weighted by number of processed samples.
            cur_acc = (cur_acc * cur_idx + batch_acc_tot) / (cur_idx + batch_tot)
            if train_method != 'clean':
                cur_robacc = (cur_robacc * cur_idx + batch_robacc_tot) / (cur_idx + batch_tot)
            cur_idx += batch_tot
            print(
                f'{epoch} {cur_idx} {cur_acc} {cur_robacc} {batch_acc_tot/batch_tot:.3f} {batch_robacc_tot/batch_tot:.3f}'
                f' {clean_ce:.3f} {adv_ce:.3f} {runtime:.3f}',
                file=train_log)
            if i % STEP == 0 or cur_idx == train_tot:
                print(
                    f' [train] {epoch}/{cur_idx} acc={cur_acc:.3f}({batch_acc_tot/batch_tot:.3f}) '
                    f'robacc={cur_robacc:.3f}({batch_robacc_tot/batch_tot:.3f}) ce={clean_ce:.3f} adv_ce={adv_ce:.3f} time={runtime:.3f}',
                    file=stderr)
                train_log.flush()
        # =========== Testing ===========
        print(f'Epoch {epoch}: testing', file=stderr)
        m.eval()
        torch.set_grad_enabled(False)
        cur_idx = 0
        cur_acc = 0.0
        cur_robacc = 0.0
        batch_tot = 0
        batch_acc_tot = 0
        batch_robacc_tot = 0
        clean_ce = 0.0
        adv_ce = 0.0
        if train_method in ['adv', 'certadv']:
            # Build the cleverhans PGD attack graph against the current model
            # for the robustness evaluation (rebuilt each epoch).
            tf_model = convert_pytorch_model_to_tf(m)
            ch_model = CallableModelWrapper(tf_model, output_layer='logits')
            x_op = tf.placeholder(tf.float32, shape=(None, ) + tuple(input_shape))
            sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=0.5)))
            attk = ProjectedGradientDescent(ch_model, sess=sess)
            adv_x = attk.generate(x_op, **adv_params)
            adv_preds_op = tf_model(adv_x)
        for i, (X, y) in enumerate(test_loader):
            if DEBUG and i >= 10:
                break
            start_t = time.time()
            X_clean, y_clean = X.cuda(), y.cuda().long()
            clean_out = m(Variable(X_clean))
            clean_ce = nn.CrossEntropyLoss()(clean_out, Variable(y_clean))
            batch_tot = X.size(0)
            batch_acc_tot = (clean_out.data.max(1)[1] == y_clean).float().sum().item()
            if train_method in ['adv', 'certadv']:
                (adv_preds, ) = sess.run((adv_preds_op, ), feed_dict={x_op: X})
                adv_preds = torch.Tensor(adv_preds)
                adv_ce = nn.CrossEntropyLoss()(adv_preds, Variable(y))
                batch_robacc_tot = (adv_preds.data.max(1)[1] == y).float().sum().item()
            # elif train_method == 'certadv':
            #     adv_ce, robust_err = robust_loss(m, eps,
            #         Variable(X_clean), Variable(y_clean),
            #         proj=50, norm_type='l1_median', bounded_input=True)
            #     batch_robacc_tot = (1.0 - robust_err) * batch_tot
            end_t = time.time()
            clean_ce = clean_ce.detach().cpu().item()
            if train_method != 'clean':
                adv_ce = adv_ce.detach().cpu().item()
            runtime = end_t - start_t
            cur_acc = (cur_acc * cur_idx + batch_acc_tot) / (cur_idx + batch_tot)
            if train_method != 'clean':
                cur_robacc = (cur_robacc * cur_idx + batch_robacc_tot) / (cur_idx + batch_tot)
            cur_idx += batch_tot
            print(
                f'{epoch} {cur_idx} {cur_acc} {cur_robacc} {batch_acc_tot / batch_tot:.3f} {batch_robacc_tot / batch_tot:.3f}'
                f' {clean_ce} {adv_ce} {runtime:.3f}',
                file=test_log)
            # NOTE(review): this compares against train_tot in the *test* loop —
            # looks like it should be test_tot; confirm.
            if i % STEP == 0 or cur_idx == train_tot:
                print(
                    f' [test] {epoch}/{cur_idx} acc={cur_acc:.3f}({batch_acc_tot / batch_tot:.3f}) '
                    f'robacc={cur_robacc:.3f}({batch_robacc_tot / batch_tot:.3f}) time={runtime:.3f}',
                    file=stderr)
        torch.set_grad_enabled(True)
        if model_name == 'G' and train_method == 'adv':
            # switch back
            m, new_m = new_m, m

        def save_with_configs(m, path):
            # Checkpoint helper: bundles weights with the epoch's metrics and
            # normalization metadata.
            torch.save(
                {
                    'state_dict': m.state_dict(),
                    'acc': cur_acc,
                    'robacc': cur_robacc,
                    'epoch': epoch,
                    'normalized': normalize is not None,
                    'dataset': dataset
                }, path)

        if not os.path.exists(f'{SAVE_PATH}/{save_name}_chkpt'):
            os.makedirs(f'{SAVE_PATH}/{save_name}_chkpt')
        save_with_configs(
            m, f'{SAVE_PATH}/{save_name}_chkpt/{save_name}_ep_{epoch:03d}.pth')
        # Keep the best checkpoint: by clean accuracy for clean training, by
        # robust accuracy otherwise.
        if (train_method == 'clean' and cur_acc > best_acc) or (train_method != 'clean' and cur_robacc > best_robacc):
            save_with_configs(m, f'{SAVE_PATH}/{save_name}_best.pth')
            print(
                f" Updated, acc {best_acc:.3f} => {cur_acc:.3f} robacc {best_robacc:.3f} => {cur_robacc:.3f}",
                file=stderr)
            best_acc = cur_acc
            best_robacc = cur_robacc
        test_log.flush()
        # memory clean after each batch
        torch.cuda.empty_cache()
        # NOTE(review): session is only closed for 'adv' — the 'certadv' branch
        # also creates a session above and appears to leak it; confirm.
        if train_method == 'adv':
            sess.close()
    train_log.close()
    test_log.close()
def evaluate_trans(loader, dataset, model, epoch, epsilon, ref_model, clip_min=0., clip_max=1., eps_iter=0.005, nb_iter=100, rand_init=False, verbose=20): batch_time = AverageMeter() losses = AverageMeter() errors = AverageMeter() params = { 'eps': epsilon, 'clip_min': clip_min, 'clip_max': clip_max, 'eps_iter': eps_iter, 'nb_iter': nb_iter, 'rand_init': rand_init } sess = tf.Session(config=config) x_op = tf.placeholder(tf.float32, shape=( None, 1, 28, 28, )) model.eval() ref_model.eval() tf_model = convert_pytorch_model_to_tf(ref_model) cleverhans_model = CallableModelWrapper(tf_model, output_layer='logits') attk = ProjectedGradientDescent(cleverhans_model, sess=sess) adv_x_op = attk.generate(x_op, **params) end = time.time() for i, (X, y) in enumerate(loader): X_adv = sess.run((adv_x_op), feed_dict={x_op: X}) X, y = Variable(torch.tensor(X_adv)).cuda(), y.cuda() out = model(Variable(X)) ce = nn.CrossEntropyLoss()(out, Variable(y)) err = (out.data.max(1)[1] != y).float().sum() / X.size(0) # measure accuracy and record loss losses.update(ce.item(), X.size(0)) errors.update(err.item(), X.size(0)) # measure elapsed time batch_time.update(time.time() - end) end = time.time() endline = '\n' if i % verbose == 0 else '\r' print('Adv test: [{0}/{1}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Error {error.val:.3f} ({error.avg:.3f})'.format( i, len(loader), batch_time=batch_time, loss=losses, error=errors), end=endline) if DEBUG and i == 10: break print('\n * Error {error.avg:.3f}'.format(error=errors)) return losses.avg, errors.avg
def trans_train(loader, model, opt, epoch, epsilon, ref_model, clip_min=0., clip_max=1., eps_iter=0.005, nb_iter=100, rand_init=False, verbose=20): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() errors = AverageMeter() model.train() params = { 'eps': epsilon, 'clip_min': clip_min, 'clip_max': clip_max, 'eps_iter': eps_iter, 'nb_iter': nb_iter, 'rand_init': rand_init } sess = tf.Session(config=config) x_op = tf.placeholder(tf.float32, shape=( None, 1, 28, 28, )) tf_model = convert_pytorch_model_to_tf(ref_model) cleverhans_model = CallableModelWrapper(tf_model, output_layer='logits') attk = ProjectedGradientDescent(cleverhans_model, sess=sess) adv_x_op = attk.generate(x_op, **params) end = time.time() for i, (X, y) in enumerate(loader): X_adv = sess.run((adv_x_op), feed_dict={x_op: X}) X, y = Variable(torch.tensor(X_adv)).cuda(), y.cuda() data_time.update(time.time() - end) out = model(Variable(X)) ce = nn.CrossEntropyLoss()(out, Variable(y)) err = (out.data.max(1)[1] != y).float().sum() / X.size(0) opt.zero_grad() ce.backward() opt.step() batch_time.update(time.time() - end) end = time.time() losses.update(ce.item(), X.size(0)) errors.update(err.item(), X.size(0)) if verbose and i % verbose == 0: print('Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Error {errors.val:.3f} ({errors.avg:.3f})'.format( epoch, i, len(loader), batch_time=batch_time, data_time=data_time, loss=losses, errors=errors)) if DEBUG and i == 10: break return losses.avg, errors.avg
class PGDAdaptor(BasicAdaptor): """ ** Not a real attack ** For PGD attack, which only provides the lower bound for the robust radius """ def __init__(self, dataset, model): super(PGDAdaptor, self).__init__(dataset, model) self.config = tf.ConfigProto(gpu_options=tf.GPUOptions( per_process_gpu_memory_fraction=0.5)) self.config.gpu_options.allow_growth = True self.graph = tf.Graph() self.sess = tf.Session(graph=self.graph, config=self.config) input_shape = get_input_shape(dataset) with self.sess.graph.as_default(): with self.sess.as_default(): self.tf_model = convert_pytorch_model_to_tf(self.model) self.ch_model = CallableModelWrapper(self.tf_model, output_layer='logits') self.x_op = tf.placeholder(tf.float32, shape=( None, input_shape[0], input_shape[1], input_shape[2], )) self.attk = ProjectedGradientDescent(self.ch_model, sess=self.sess) self.adv_preds_ops = dict() def verify(self, input, label, norm_type, radius): # only support Linfty norm assert norm_type == 'inf' xs = input.unsqueeze(0) clean_preds = self.model(xs.cuda()).detach().cpu().numpy() clean_pred = np.argmax(clean_preds[0]) if clean_pred != label: return False if radius == 0: return True with self.sess.graph.as_default(): with self.sess.as_default(): if radius not in self.adv_preds_ops: params = { 'eps': radius, 'clip_min': 0.0, 'clip_max': 1.0, 'eps_iter': radius / 50.0, 'nb_iter': 100, 'rand_init': False } adv_x = self.attk.generate(self.x_op, **params) self.adv_preds_ops[radius] = self.tf_model(adv_x) (adv_preds, ) = self.sess.run((self.adv_preds_ops[radius], ), feed_dict={self.x_op: xs}) adv_pred = np.argmax(adv_preds[0]) return adv_pred == label
def eval_robustness(ARGS, verbose=True):
    """Restore a pre-trained model and evaluate its robustness under PGD or SPSA.

    ARGS: namespace with restore_path, attack ('PGD'|'SPSA'), eps, eps_iter,
        nb_iter, rand_init, and (for SPSA) spsa_samples/spsa_iters/spsa_lr/spsa_delta.
    Returns (clean accuracy, mean adversarial accuracy, worst-case adversarial
    accuracy) — for SPSA the worst case equals the mean.

    NOTE(review): relies on module-level globals `input_shape`, `n_classes`,
    `Model`, `adv_testloader`, `run_pgd_eval`, `run_spsa_eval` — confirm they
    are defined elsewhere in this file.
    """
    #############################################
    # Load pre-trained model
    #############################################
    if verbose:
        print('\n- Loading pre-trained model...')
    # Build evaluation graph
    eval_graph = tf.Graph()
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(graph=eval_graph, config=config)
    # Define input TF placeholder
    with eval_graph.as_default():
        with tf.device('/gpu:0'):
            # Define placeholders
            with tf.name_scope('Placeholders'):
                x = tf.placeholder(dtype=tf.float32,
                                   shape=input_shape,
                                   name='inputs')
                y = tf.placeholder(dtype=tf.float32,
                                   shape=(None, n_classes),
                                   name='labels')
                is_training = tf.placeholder_with_default(False,
                                                          shape=(),
                                                          name='is-training')
            # Define model
            with tf.name_scope('Model'):
                model = Model(nb_classes=n_classes,
                              input_shape=input_shape,
                              is_training=is_training)
            # Define forward-pass
            with tf.name_scope('Logits'):
                logits = model.get_logits(x)
            with tf.name_scope('Probs'):
                preds = tf.nn.softmax(logits)
            # Restore the pre-trained model
            with sess.as_default():
                saver = tf.train.Saver()
                saver.restore(sess, ARGS.restore_path + '/model.ckpt')
            # Define accuracy ops
            with tf.name_scope('Accuracy'):
                ground_truth = tf.argmax(y, axis=1)
                predicted_label = tf.argmax(preds, axis=1)
                correct_prediction = tf.equal(predicted_label, ground_truth)
                clean_acc = tf.reduce_mean(tf.to_float(correct_prediction),
                                           name='accuracy')
            # Define PGD adversary
            if ARGS.attack == 'PGD':
                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))
                with tf.name_scope('PGD-Attacker'):
                    # eps values are given in 0-255 pixel units on the CLI and
                    # rescaled here to the [0, 1] input range.
                    pgd_params = {
                        'ord': np.inf,
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'eps_iter': ARGS.eps_iter / 255,
                        'nb_iter': ARGS.nb_iter,
                        'rand_init': ARGS.rand_init,
                        'rand_minmax': ARGS.eps / 255,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'sanity_checks': True
                    }
                    # sess=None: PGD is purely graph-based and needs no session.
                    pgd = ProjectedGradientDescent(model, sess=None)
                    adv_x = pgd.generate(x, **pgd_params)
            # Define SPSA adversary
            elif ARGS.attack == 'SPSA':
                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))
                # NOTE(review): name scope says 'PGD-Attacker' for SPSA too —
                # harmless but probably a copy-paste leftover.
                with tf.name_scope('PGD-Attacker'):
                    spsa_params = {
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'nb_iter': ARGS.nb_iter,
                        'spsa_samples': ARGS.spsa_samples,
                        'spsa_iters': ARGS.spsa_iters,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'learning_rate': ARGS.spsa_lr,
                        'delta': ARGS.spsa_delta
                    }
                    spsa = SPSA(model, sess=sess)
                    adv_x = spsa.generate(x, **spsa_params)
            else:
                raise NotImplementedError
            # Forward pass on the adversarial inputs plus the metric ops used
            # by the evaluation loops below.
            with tf.name_scope('Logits'):
                adv_logits = model.get_logits(adv_x)
            with tf.name_scope('Probs'):
                adv_preds = tf.nn.softmax(adv_logits)
            adv_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=adv_logits, labels=y)
            adv_predicted_label = tf.argmax(adv_preds, axis=1)
            correct_prediction = tf.equal(adv_predicted_label, ground_truth)
            adv_accuracy = tf.reduce_mean(tf.to_float(correct_prediction),
                                          name='adv-accuracy')
            is_adv_example = tf.not_equal(ground_truth, adv_predicted_label)
    #############################################
    # Run evaluation
    #############################################
    if verbose:
        print('\n- Running robustness evaluation against {:s} attacker...\n'.
              format(ARGS.attack))
    if ARGS.attack == 'PGD':
        clean, adv_mean, adv_worstcase = run_pgd_eval(x,
                                                      y,
                                                      is_training,
                                                      sess,
                                                      adv_testloader,
                                                      clean_acc,
                                                      adv_accuracy,
                                                      adv_loss,
                                                      is_adv_example,
                                                      ARGS,
                                                      save_loss_dist=False,
                                                      verbose=verbose)
    elif ARGS.attack == 'SPSA':
        clean, adv_mean = run_spsa_eval(x,
                                        y,
                                        is_training,
                                        sess,
                                        adv_testloader,
                                        clean_acc,
                                        adv_accuracy,
                                        adv_loss,
                                        is_adv_example,
                                        ARGS,
                                        save_loss_dist=False,
                                        verbose=verbose)
        # SPSA evaluation reports no separate worst case; reuse the mean.
        adv_worstcase = adv_mean
    else:
        raise NotImplementedError
    return clean, adv_mean, adv_worstcase
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. 
:param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) #x########################################################## #Tensor("Placeholder:0", shape=(?, 28, 28, 1), dtype=float32) #<class 'tensorflow.python.framework.ops.Tensor'> ########################################################### # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) return acc if clean_train: model = ModelBasicCNN( 'model1', nb_classes, nb_filters ) # <cleverhans_tutorials.tutorial_models.ModelBasicCNN object at 0x7f81feaae240> 
preds = model.get_logits( x ) # Tensor("model1_1/dense/BiasAdd:0", shape=(?, 10), dtype=float32) loss = CrossEntropy( model, smoothing=label_smoothing ) # <cleverhans.loss.CrossEntropy object at 0x7f819466b470> def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = ProjectedGradientDescent( model, sess=sess ) # TODO # <cleverhans.attacks.FastGradientMethod object at 0x7feabc77ce80> start = time.time() adv_x = fgsm.generate( x, **fgsm_params ) # Tensor("Identity_1:0", shape=(?, 28, 28, 1), dtype=float32) #imagetest = np.squeeze(adv_x) #plt.imshow(imagetest) preds_adv = model.get_logits( adv_x ) # Tensor("model1_5/dense/BiasAdd:0", shape=(?, 10), dtype=float32) end = time.time() a = end - start print("Attack time = ") print(a) print("") #Tensor("Identity_1:0", shape=(?, 28, 28, 1), dtype=float32) #Tensor("model1_5/dense/BiasAdd:0", shape=(?, 10), dtype=float32) # Evaluate the accuracy of the MNIST model on adversarial examples start = time.time() acc_result = do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) end = time.time() b = end - start print("") print("Inference function time = ") print(b) print("") values = [b, acc_result * 100, 0, 0, 0] x_labels = [ 'Time(s)', 'Accuracy(%)', '', 'Method2 Time(s)', 'Method2 Accuracy(%)' ] plt.bar(x_labels, values) plt.show() # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print("END!")
def main(argv): del argv if FLAGS.debug: logging.info('Running in debug mode!!!') random.seed(FLAGS.seed) np.random.seed(FLAGS.seed) tf.set_random_seed(FLAGS.seed) TFRECORDS_DIR = FLAGS.tfrecords_dir HDF5_DATA_PATH = FLAGS.hdf5_data_path tfrecord_filepaths = glob(os.path.join(TFRECORDS_DIR, '*')) tf_dataset = tfutils.make_dataset(tfrecord_filepaths, batch_size=1, filter_label=FLAGS.label, preprocessing_fn=preprocess_input) hdf5_dataset = None if not FLAGS.debug: hdf5_file = h5py.File(HDF5_DATA_PATH, 'a') hdf5_group = get_attack_group_name(O_ATTACK_NAME, FLAGS.label) hdf5_dataset = hdf5utils.create_image_dataset(hdf5_file, group=hdf5_group, attrs={ 'seed': FLAGS.seed, 'eps': FLAGS.eps, 'ord': FLAGS.ord, 'eps_iter': FLAGS.eps_iter, 'nb_iter': FLAGS.nb_iter, 'target': FLAGS.target }) model = InceptionV1Model() iterator = tf_dataset.make_one_shot_iterator() x, y = iterator.get_next() with tf.Session() as sess: attack = ProjectedGradientDescent(model, sess=sess) target_one_hot_encoded = get_one_hot_encoded_targets(FLAGS.target) x_adv = attack.generate( x, eps=FLAGS.eps, nb_iter=FLAGS.nb_iter, eps_iter=FLAGS.eps_iter, ord=(int(FLAGS.ord) if FLAGS.ord != 'inf' else np.inf), y_target=target_one_hot_encoded) pbar = tqdm(unit='imgs') try: while True: attacked_imgs = sess.run(x_adv) if not FLAGS.debug: hdf5utils.add_images_to_dataset(attacked_imgs, hdf5_dataset) pbar.update() except tf.errors.OutOfRangeError: pass
def test_transferability_subset(loader, attack_method, epsilon, torch_model1, torch_model2, verbose, batch_size):
    """Measure bidirectional adversarial transferability between two models.

    Adversarial examples are crafted against each model ('CW', 'PGD' or
    'FGSM') and evaluated on the other. Transfer rates are computed only over
    the subset of examples that fool their *source* model.

    :param loader: iterable of (xs, ys) CPU batches (MNIST-shaped: Nx1x28x28)
    :param attack_method: 'CW', 'PGD' or 'FGSM'
    :param epsilon: perturbation budget for PGD/FGSM
    :param verbose: print a progress line every `verbose` batches (0 disables)
    :return: (avg transfer error model1->model2, avg transfer error model2->model1)
    :raises Exception: for an unknown attack_method
    """
    batch_time = AverageMeter()
    err12s = AverageMeter()
    err21s = AverageMeter()
    end = time.time()
    sess = tf.Session(config=config)
    x_op = tf.placeholder(tf.float32, shape=(
        None,
        1,
        28,
        28,
    ))
    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn1 = convert_pytorch_model_to_tf(torch_model1)
    tf_model_fn2 = convert_pytorch_model_to_tf(torch_model2)
    # Attack Parameters
    if attack_method == 'CW':
        params = {
            'binary_search_steps': 1,
            # 'y': None,
            'max_iterations': CW_ATTACK_ITERATIONS,
            'learning_rate': CW_LEARNING_RATE,
            'batch_size': batch_size,
            'initial_const': 10
        }
    elif attack_method == 'PGD':
        params = {
            'eps': epsilon,
            'clip_min': 0.,
            'clip_max': 1.,
            'eps_iter': 0.005,
            'nb_iter': 100,
            'rand_init': False
        }
    elif attack_method == 'FGSM':
        params = {'eps': epsilon, 'clip_min': 0., 'clip_max': 1.}
    else:
        # Fix: the original used '%s'.format(...), which leaves the literal
        # '%s' in the message instead of interpolating the method name.
        raise Exception('Unknown attack method {}'.format(attack_method))
    # Model1 --> Model2
    cleverhans_model1 = CallableModelWrapper(tf_model_fn1, output_layer='logits')
    cleverhans_model2 = CallableModelWrapper(tf_model_fn2, output_layer='logits')
    # Create one attack per model (dispatch table replaces the repeated
    # if-chains; unknown methods were already rejected above).
    attack_classes = {
        'CW': CarliniWagnerL2,
        'PGD': ProjectedGradientDescent,
        'FGSM': FastGradientMethod
    }
    attack_class = attack_classes[attack_method]
    attk1 = attack_class(cleverhans_model1, sess=sess)
    attk2 = attack_class(cleverhans_model2, sess=sess)
    adv_x_op1 = attk1.generate(x_op, **params)
    adv_x_op2 = attk2.generate(x_op, **params)
    # Test on model1 and model2
    adv_preds_op11 = tf_model_fn1(adv_x_op1)
    adv_preds_op12 = tf_model_fn2(adv_x_op1)
    adv_preds_op21 = tf_model_fn1(adv_x_op2)
    adv_preds_op22 = tf_model_fn2(adv_x_op2)
    for i, (xs, ys) in enumerate(loader):
        (adv_preds11, adv_preds12) = sess.run((adv_preds_op11, adv_preds_op12),
                                              feed_dict={x_op: xs})
        (adv_preds21, adv_preds22) = sess.run((adv_preds_op21, adv_preds_op22),
                                              feed_dict={x_op: xs})
        # Count examples that fool their source model; transfer rate is the
        # fraction of those that also fool the other model.
        cnt11 = int((np.argmax(adv_preds11, axis=1) != ys).sum())
        cnt22 = int((np.argmax(adv_preds22, axis=1) != ys).sum())
        if cnt11 > 0:
            err12 = float(
                ((np.argmax(adv_preds12, axis=1) != ys) *
                 (np.argmax(adv_preds11, axis=1) != ys)).sum()) / float(cnt11)
            err12s.update(err12, cnt11)
        if cnt22 > 0:
            err21 = float(
                ((np.argmax(adv_preds22, axis=1) != ys) *
                 (np.argmax(adv_preds21, axis=1) != ys)).sum()) / float(cnt22)
            err21s.update(err21, cnt22)
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if verbose:
            endline = '\n' if i % verbose == 0 else '\r'
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'error 1->2 {err12.val:.3f} ({err12.avg:.3f})\t'
                  'error 2->1 {err21.val:.3f} ({err21.avg:.3f})\t'.format(
                      i, len(loader), batch_time=batch_time, err12=err12s,
                      err21=err21s),
                  end=endline)
    sess.close()
    return err12s.avg, err21s.avg