Example 1
    def __init__(self,
                 model,
                 image_shape_hwc,
                 epsilon=(16. / 255),
                 num_steps=200,
                 batch_size=32,
                 is_debug=False):
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)

            self.x_input = tf.placeholder(tf.float32,
                                          shape=(1, ) + image_shape_hwc)
            self.y_label = tf.placeholder(tf.int32, shape=(1, ))

            self.model = model
            attack = SPSA(CleverhansPyfuncModelWrapper(self.model),
                          sess=self.sess)
            self.x_adv = attack.generate(self.x_input,
                                         y=self.y_label,
                                         epsilon=epsilon,
                                         num_steps=num_steps,
                                         early_stop_loss_threshold=-1.,
                                         batch_size=batch_size,
                                         is_debug=is_debug)

        self.graph.finalize()
    def test_attack_success(self):
        """Check SPSA creates misclassified images."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(input_dir, metadata_file_path,
                                     batch_shape)
        num_classes = 1001

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(1, ) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(1, ))
            model = InceptionModel(num_classes)

            attack = SPSA(model)
            x_adv = attack.generate(x_input,
                                    y=y_label,
                                    epsilon=epsilon,
                                    num_steps=30,
                                    early_stop_loss_threshold=-1.,
                                    spsa_samples=32,
                                    spsa_iters=16,
                                    is_debug=True)

            logits = model.get_logits(x_adv)
            acc = _top_1_accuracy(logits, y_label)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            num_correct = 0.
            with tf.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    acc_val = sess.run(acc,
                                       feed_dict={
                                           x_input:
                                           np.expand_dims(images[i], axis=0),
                                           y_label:
                                           np.expand_dims(labels[i], axis=0),
                                       })
                    tf.logging.info('Accuracy: %s', acc_val)
                    num_correct += acc_val
                assert (num_correct / num_images) < 0.1
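
The __init__ above only builds and finalizes the SPSA graph; a minimal sketch of actually running the attack op (not from the original source: the class name SpsaAttack and the image/label variables are assumptions) could look like this:

import numpy as np

# Hypothetical driver for the wrapper defined above.
attacker = SpsaAttack(model, image_shape_hwc=(299, 299, 3))
adv_image = attacker.sess.run(attacker.x_adv,
                              feed_dict={
                                  attacker.x_input: np.expand_dims(image, axis=0),
                                  attacker.y_label: np.array([label], dtype=np.int32),
                              })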
Example 3
    def test_attack_bounds(self):
        """Check SPSA respects perturbation limits."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(input_dir, metadata_file_path,
                                     batch_shape)
        nb_classes = 1001

        tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(1, ) + batch_shape[1:])
            y_label = tf.compat.v1.placeholder(tf.int32, shape=(1, ))
            model = InceptionModel(nb_classes)

            attack = SPSA(model)
            x_adv = attack.generate(x_input,
                                    y=y_label,
                                    epsilon=epsilon,
                                    num_steps=10,
                                    early_stop_loss_threshold=-1.,
                                    spsa_samples=32,
                                    spsa_iters=1,
                                    is_debug=True)

            # Run computation
            saver = tf.compat.v1.train.Saver(slim.get_model_variables())
            session_creator = tf.compat.v1.train.ChiefSessionCreator(
                scaffold=tf.compat.v1.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            with tf.compat.v1.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    x_expanded = np.expand_dims(images[i], axis=0)
                    y_expanded = np.expand_dims(labels[i], axis=0)

                    adv_image = sess.run(x_adv,
                                         feed_dict={
                                             x_input: x_expanded,
                                             y_label: y_expanded
                                         })
                    diff = adv_image - images[i]
                    assert np.max(np.abs(diff)) < epsilon + 1e-4
                    assert np.max(adv_image) < 1. + 1e-4
                    assert np.min(adv_image) > -1e-4
    def test_attack_success(self):
        """Check SPSA creates misclassified images."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(
            input_dir, metadata_file_path, batch_shape)
        num_classes = 1001

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(1,) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(1,))
            model = InceptionModel(num_classes)

            attack = SPSA(model)
            x_adv = attack.generate(
                x_input, y=y_label, epsilon=epsilon, num_steps=30,
                early_stop_loss_threshold=-1., batch_size=32, spsa_iters=16,
                is_debug=True)

            logits = model.get_logits(x_adv)
            acc = _top_1_accuracy(logits, y_label)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            num_correct = 0.
            with tf.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    acc_val = sess.run(acc, feed_dict={
                        x_input: np.expand_dims(images[i], axis=0),
                        y_label: np.expand_dims(labels[i], axis=0),
                    })
                    tf.logging.info('Accuracy: %s', acc_val)
                    num_correct += acc_val
                assert (num_correct / num_images) < 0.1
Example 5
def cleverhans_spsa(model, data_iter, attack_size):
    count = 0  # moved out of the loop so the image counter actually advances
    while True:
        try:
            image, label = next(data_iter)
            label = np.array([label])
        except StopIteration:
            break
        print('Start attacking image {}'.format(count))
        count += 1
        # The original snippet left the range() bounds empty; iterating over
        # every valid top-left sticker location (CHW layout assumed) is one
        # plausible reading.
        for x in range(image.shape[-2] - attack_size[0] + 1):
            for y in range(image.shape[-1] - attack_size[1] + 1):
                print("location {}".format((x, y)))
                subimg = get_subimgs(image, (x, y), attack_size)
                #Build model
                tic = time.time()
                subimg_op = Input(shape=(3, attack_size[0], attack_size[1]))
                adv_img_op = ApplyStickerLayer(image, attack_size,
                                               (x, y))(subimg_op)
                wrapped_logits_op = model(adv_img_op)
                wrapped_model = Model(inputs=subimg_op,
                                      outputs=wrapped_logits_op)
                tac = time.time()
                print('{}s to build graph for attack'.format(tac - tic))
                wrapper = CallableModelWrapper(wrapped_model, "logits")
                wrapper.nb_classes = 1000
                attack = SPSA(wrapper, sess=keras.backend.get_session())
                spsa_params = {
                    'eps': 2.5,
                    'clip_min': -2.3,
                    'clip_max': 2.7,
                    'nb_iter': 40,
                    'y': label.astype(np.int32)
                }
                print('Start attacking...')
                tic = time.time()
                adv = attack.generate_np(subimg, **spsa_params)
                tac = time.time()
                print("Attack Time: {}s".format(tac - tic))

                # Evaluate adversarial sticker
                adv_logits = wrapped_model.predict(adv)
                print("Adversarial image: top-5 prediction: {}, label: {}".
                      format(np.argsort(adv_logits, axis=1)[:, -5:], label))
Example 6
def init_attack(model, attack_params_dict, sess):
    """
    Initialize the adversarial attack using the cleverhans toolbox

    Parameters
    ----------
    model : Keras Model
        The model to attack

    attack_params_dict : dict
        Self-defined dictionary specifying the attack and its parameters

    sess : Session
        The current tf session

    Returns
    -------
    attack : cleverhans Attack
        The Attack object

    attack_params
        Dictionary with the value of the attack parameters, valid to generate
        adversarial examples with cleverhans.
    """

    # Wrapper for the Keras model
    model_wrap = KerasModelWrapper(model)

    # Initialize attack
    batch_size = None
    if attack_params_dict['attack'] == 'fgsm':
        attack = FastGradientMethod(model_wrap, sess=sess)
        attack_params = {'eps': attack_params_dict['eps'], 'clip_min': 0., 
                         'clip_max': 1.}
    elif attack_params_dict['attack'] == 'spsa':
        attack = SPSA(model_wrap, sess=sess)
        attack_params = {'epsilon': attack_params_dict['eps'], 
                         'num_steps': attack_params_dict['n_steps']}
        batch_size = 1
    elif attack_params_dict['attack'] == 'deepfool':
        attack = DeepFool(model_wrap, sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif attack_params_dict['attack'] == 'pgd':
        attack = ProjectedGradientDescent(model_wrap, sess=sess)
        attack_params = {'eps': attack_params_dict['eps'], 
                         'eps_iter': attack_params_dict['eps_iter'],
                         'nb_iter': attack_params_dict['n_steps'],
                         'clip_min': 0., 'clip_max': 1.}
    elif attack_params_dict['attack'] == 'carlini':
        attack = CarliniWagnerL2(model_wrap, sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    else:
        raise NotImplementedError()

    return attack, attack_params, batch_size
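
A minimal sketch of calling init_attack for the SPSA branch (hedged: model, sess, and the integer-labeled arrays x_test/y_test are assumed to exist and are not from the source):

import numpy as np

# Hypothetical driver; SPSA only supports batch_size=1, hence the loop.
attack_params_dict = {'attack': 'spsa', 'eps': 4. / 255, 'n_steps': 100}
attack, attack_params, batch_size = init_attack(model, attack_params_dict, sess)
adv = np.concatenate([
    attack.generate_np(x_test[i:i + 1],
                       y=y_test[i:i + 1].astype(np.int32),
                       **attack_params)
    for i in range(len(x_test))
])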
    def test_attack_bounds(self):
        """Check SPSA respects perturbation limits."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(
            input_dir, metadata_file_path, batch_shape)
        num_classes = 1001

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(1,) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(1,))
            model = InceptionModel(num_classes)

            attack = SPSA(model)
            x_adv = attack.generate(
                x_input, y=y_label, epsilon=epsilon, num_steps=10,
                early_stop_loss_threshold=-1., batch_size=32, spsa_iters=1,
                is_debug=True)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            with tf.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    adv_image = sess.run(x_adv, feed_dict={
                        x_input: np.expand_dims(images[i], axis=0),
                        y_label: np.expand_dims(labels[i], axis=0),
                    })
                    diff = adv_image - images[i]
                    assert np.max(np.abs(diff)) < epsilon + 1e-4
                    assert np.max(adv_image) < 1. + 1e-4
                    assert np.min(adv_image) > -1e-4
Example 8
def spsa_attack():
    # Use tf for evaluation on adversarial data
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(model, out_dims=10)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')
    cleverhans_model.nb_classes = 10

    # Create an SPSA attack
    spsa = SPSA(cleverhans_model, sess=sess)
    spsa_params = {
        'eps': args.eps,
        'nb_iter': args.ns,
        'clip_min': 0.,
        'clip_max': 1.,
        'spsa_samples': args.spsa_samples,  # in this case, the batch_size is equal to spsa_samples
        'spsa_iters': 1,
        'early_stop_loss_threshold': 0
    }

    # Evaluation against SPSA attacks
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        advs = spsa.generate_np(inputs.numpy(),
                                y=targets.numpy().astype(np.int32),
                                **spsa_params)
        with torch.no_grad():
            # topk(1)[1] has shape (N, 1); squeeze it before comparing to targets
            preds = model(torch.tensor(advs).cuda()).topk(1)[1].squeeze(1).cpu()
            correct += preds.eq(targets).sum().item()
        total += len(inputs)

        sys.stdout.write("\rBlack-box SPSA attack... Acc: %.3f%% (%d/%d)" %
                         (100. * correct / total, correct, total))
        sys.stdout.flush()

    print('Accuracy under SPSA attack: %.3f%%' % (100. * correct / total))
Example 9
class TestSPSA(CleverHansTest):
    def setUp(self):
        super(TestSPSA, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = SPSA(self.model, sess=self.sess)

    def test_attack_strength(self):
        # This uses the existing input structure for SPSA. Tom tried for ~40
        # minutes to get generate_np to work correctly but could not.

        n_samples = 10
        x_val = np.random.rand(n_samples, 2)
        x_val = np.array(x_val, dtype=np.float32)

        # The SPSA attack currently uses non-one-hot labels
        # TODO: change this to use standard cleverhans label conventions
        feed_labs = np.random.randint(0, 2, n_samples)

        x_input = tf.placeholder(tf.float32, shape=(1, 2))
        y_label = tf.placeholder(tf.int32, shape=(1, ))

        x_adv_op = self.attack.generate(
            x_input,
            y=y_label,
            epsilon=.5,
            num_steps=100,
            batch_size=64,
            spsa_iters=1,
        )

        all_x_adv = []
        for i in range(n_samples):
            x_adv_np = self.sess.run(x_adv_op,
                                     feed_dict={
                                         x_input:
                                         np.expand_dims(x_val[i], axis=0),
                                         y_label:
                                         np.expand_dims(feed_labs[i], axis=0),
                                     })
            all_x_adv.append(x_adv_np[0])

        x_adv = np.vstack(all_x_adv)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(feed_labs == new_labs) < 0.1)
Example 10
def spsa_max_confidence_recipe(sess, model, x, y, nb_classes, eps,
                               clip_min, clip_max, nb_iter,
                               report_path,
                               spsa_samples=SPSA.DEFAULT_SPSA_SAMPLES,
                               spsa_iters=SPSA.DEFAULT_SPSA_ITERS,
                               eval_batch_size=BATCH_SIZE):
  """Runs the MaxConfidence attack using SPSA as the underlying optimizer.

  Even though this runs only one attack, it must be implemented as a bundler
  because SPSA supports only batch_size=1. The cleverhans.attacks.MaxConfidence
  attack internally multiplies the batch size by nb_classes, so it can't take
  SPSA as a base attacker. Instead, we must bundle batch_size=1 calls using
  cleverhans.attack_bundling.MaxConfidence.

  References:
  https://openreview.net/forum?id=H1g0piA9tQ

  :param sess: tf.Session
  :param model: cleverhans.model.Model
  :param x: numpy array containing clean example inputs to attack
  :param y: numpy array containing true labels
  :param nb_classes: int, number of classes
  :param eps: float, maximum size of perturbation (measured by max norm)
  :param clip_min: float, minimum allowed input value
  :param clip_max: float, maximum allowed input value
  :param nb_iter: int, number of SPSA optimization steps per attack
  :param spsa_samples: int, number of inputs evaluated at once per SPSA
    gradient estimate
  :param spsa_iters: int, number of model evaluations before each update
  :param report_path: str, the path that the report will be saved to.
  :param eval_batch_size: int, batch size for evaluation (as opposed to making attacks)
  """
  spsa = SPSA(model, sess)
  spsa_params = {"eps": eps, "clip_min" : clip_min, "clip_max" : clip_max,
                 "nb_iter": nb_iter, "spsa_samples": spsa_samples,
                 "spsa_iters": spsa_iters}
  attack_configs = []
  dev_batch_size = 1 # The only batch size supported by SPSA
  batch_size = num_devices
  ones = tf.ones(dev_batch_size, tf.int32)
  for cls in range(nb_classes):
    cls_params = copy.copy(spsa_params)
    cls_params['y_target'] = tf.to_float(tf.one_hot(ones * cls, nb_classes))
    cls_attack_config = AttackConfig(spsa, cls_params, "spsa_" + str(cls))
    attack_configs.append(cls_attack_config)
  new_work_goal = {config: 1 for config in attack_configs}
  goals = [MaxConfidence(t=1., new_work_goal=new_work_goal)]
  bundle_attacks(sess, model, x, y, attack_configs, goals, report_path,
                 attack_batch_size=batch_size, eval_batch_size=eval_batch_size)
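
A hedged usage sketch of the recipe above; the epsilon, iteration count, and report path are placeholders rather than values from the source:

# Hypothetical invocation; sess, model, x_test and y_test are assumed to exist.
spsa_max_confidence_recipe(sess, model, x_test, y_test,
                           nb_classes=10, eps=0.3,
                           clip_min=0., clip_max=1., nb_iter=100,
                           report_path='spsa_max_conf_report.joblib')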
Example 11
    def setUp(self):
        super(TestSPSA, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = SPSA(self.model, sess=self.sess)
Example 12
val_subset_indices = image_partition(42, 100)[0]
val_subset_loader = torch.utils.data.DataLoader(imagenet_val,
                                                batch_size=1,
                                                num_workers=4,
                                                sampler=torch.utils.data.sampler.SubsetRandomSampler(val_subset_indices))

# We use tf for evaluation on adversarial data
sess = tf.Session()
x_op = tf.placeholder(tf.float32, shape=(None, 3, 224, 224,))
# Convert pytorch model to a tf_model and wrap it in cleverhans
tf_model_fn = convert_pytorch_model_to_tf(model)
cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')
cleverhans_model.nb_classes = 1000

spsa_op = SPSA(cleverhans_model, sess=sess)
spsa_params = {'eps': 2.5,
               'clip_min': -2.3,
               'clip_max': 2.8,
               'nb_iter': 40,
               'y': None}

correct = 0
count = 0
for xs, ys in val_subset_loader:
    count += 1
    ys = ys.numpy().astype(np.int32)
    # Create an SPSA attack
    spsa_params['y'] = ys
    adv_x = spsa_op.generate_np(xs, **spsa_params)
    break
Example 13
def eval(sess,
         model_name,
         X_train,
         Y_train,
         X_test,
         Y_test,
         cnn=False,
         rbf=False):
    """ Load model saved in model_name.json and model_name_weights.h5 and 
    evaluate its accuracy on legitimate test samples and adversarial samples.
    Use cnn=True if the model is CNN based.
    """

    # load saved model
    print("Load model ... ")
    '''
    json = open('models/{}.json'.format(model_name), 'r')
    model = json.read()
    json.close()
    loaded_model = model_from_json(model)
    loaded_model.load_weights("models/{}_weights.h5".format(model_name))
    '''
    if rbf:
        loaded_model = load_model("rbfmodels/{}.h5".format(model_name),
                                  custom_objects={'RBFLayer': RBFLayer})
    else:
        loaded_model = load_model("models/{}.h5".format(model_name))

    # Set placeholders
    if cnn:
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 784))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    predictions = loaded_model(x)

    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args={"batch_size": 128})
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    # Using functions from /cleverhans/attacks_tf.py
    # Will be deprecated next year
    # adv_x = fgsm(x, predictions, eps=0.3)
    # X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)

    # Using functions from /cleverhans/attacks.py (as specified by creators)

    wrap = KerasModelWrapper(loaded_model)
    spsa = SPSA(wrap, sess=sess)

    images = 100

    correctImages = 0
    adv_pred = np.zeros((images, 10))

    for i in range(images):
        tensorpls = X_test[i].reshape(1, 784)
        tensorpls2 = Y_test[i].reshape(1, 10)

        x_in = tf.convert_to_tensor(tensorpls, tf.float32)
        y_in = tf.convert_to_tensor(tensorpls2, tf.float32)

        adv_x = spsa.generate(x_in,
                              y_in,
                              eps=0.3,
                              nb_iter=100,
                              clip_min=0,
                              clip_max=1,
                              early_stop_loss_threshold=-1.,
                              spsa_samples=32,
                              spsa_iters=1)
        adv_x = tf.stop_gradient(adv_x)

        test2 = adv_x.eval(session=sess)
        test3 = test2.reshape(28, 28)
        plt.imshow(test3)
        plt.colorbar()
        plt.show()

        print(type(test2))
        print(test2.shape)

        preds_adv = loaded_model(adv_x)

        test = preds_adv.eval(session=sess)

        for j in range(10):
            adv_pred[i][j] = test[0][j]

        if np.argmax(adv_pred[i]) == np.argmax(Y_test[i]):
            correctImages = correctImages + 1

        accuracy = correctImages / (i + 1)
        print('Test accuracy (' + str(i + 1) + '): ' + str(accuracy))

    # Evaluate the accuracy of the MNIST model on adversarial examples
    #accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128 })

    accuracy = correctImages / images
    print('Test accuracy on adversarial test examples: ' + str(accuracy))
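
A minimal call sketch for eval (hedged: the session and the MNIST arrays are assumed to be loaded already, and the model name is illustrative; note that the function name shadows Python's built-in eval):

# Hypothetical call on a dense MNIST model saved as models/mnist_mlp.h5.
eval(sess, 'mnist_mlp', X_train, Y_train, X_test, Y_test, cnn=False, rbf=False)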
Example 14
def run(args, restrict=True):
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess
        # np.int is removed in recent NumPy; plain int also avoids shadowing id()
        proc_id = int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(proc_id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])
    bias = float(args[4])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(
        K) + "_" + str(bias)

    # Configure Keras/Tensorflow
    Keras.clear_session()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    # Having this before Keras.clear_session() causes it to hang for some reason
    tf.set_random_seed(1)

    # Load Model/Data and setup SPSA placeholders
    N = 1000
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    #print("Bounds ", np.max(np.abs(x_train)))
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    model_defended = DefendedModel(base_model, x_train, y_train, K, bias=bias)
    defended_logits = model_defended.get_logits(x)

    # Get the predictions on the original images
    labels = np.argmax(data.test_labels[:N], axis=1)
    logits_real = sess.run(defended_logits, {x: data.test_data[:N]})
    fp = (np.argmax(logits_real, axis=1) == 10)  # False positives of the defense
    pred_undefended = np.argmax(np.delete(logits_real, -1, axis=1),
                                axis=1)  # Original model prediction

    # Configure the attack
    attack = SPSA(model_defended, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y_target=y_spsa,
                              epsilon=epsilon,
                              is_targeted=True,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-5.0)

    # Run the attack
    pred_adv = -1.0 * np.ones((N, 10))
    for i in range(N):
        if i % 10 == 0:
            print(fname, " ", i)
            out = {}
            out["FP"] = fp
            out["Labels"] = labels
            out["UndefendedPrediction"] = pred_undefended
            out["AdversarialPredictions"] = pred_adv
            file = open(fname, "wb")
            pickle.dump(out, file)
            file.close()

        x_real = data.test_data[i].reshape(shape_spsa)

        # Try a targeted attack for each class other than the original network prediction and the adversarial class
        for y in range(10):
            if y != pred_undefended[i]:
                x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: y})
                pred_adv[i, y] = np.argmax(
                    sess.run(defended_logits, {x: x_adv}))

    out = {}
    out["FP"] = fp
    out["Labels"] = labels
    out["UndefendedPrediction"] = pred_undefended
    out["AdversarialPredictions"] = pred_adv
    file = open(fname, "wb")
    pickle.dump(out, file)
    file.close()

    analysis(fname)
#SPSA
print("\n\n")
print("SPSA")

spsa_params = {
    'eps': float(sys.argv[1]),
    'learning_rate': 0.01,
    'delta': 0.01,
    'spsa_samples': 128,
    'spsa_iters': 1,
    'nb_iter': 100,
    'clip_min': 0.,
    'clip_max': 1.
}

spsa_attack = SPSA(wrap_source, sess=sess)
x = tf.placeholder(dtype=tf.float32, shape=(None, 32, 32, 3))
y = tf.placeholder(dtype=tf.float32, shape=(None, 10))
x_adv = spsa_attack.generate(x, y, **spsa_params)
X_adv_source = np.zeros((len(indices_test), 32, 32, 3))
for i in range(0, len(indices_test)):
    X_adv_source[i] = sess.run(x_adv,
                               feed_dict={
                                   x: X_test[indices_test[i:(i + 1)]],
                                   y: Y_test[indices_test[i:(i + 1)]]
                               })

print("metrics source model")
print(metrics(model_source, X_adv_source, X_test, pred_source, indices_test))
print("metrics base model")
print(metrics(model, X_adv_source, X_test, pred_base, indices_test))
Example 16
def craft_one_type(sess,
                   model,
                   X,
                   Y,
                   dataset,
                   attack,
                   batch_size,
                   log_path=None,
                   fp_path=None,
                   model_logits=None):
    """
    TODO
    :param sess:
    :param model:
    :param X:
    :param Y:
    :param dataset:
    :param attack:
    :param batch_size:
    :return:
    """
    print("entered")
    if log_path is not None:
        PATH_DATA = log_path

    if attack == 'fgsm':
        # FGSM attack
        print('Crafting fgsm adversarial samples...')
        X_adv = fast_gradient_sign_method(sess,
                                          model,
                                          X,
                                          Y,
                                          eps=ATTACK_PARAMS[dataset]['eps'],
                                          clip_min=CLIP_MIN,
                                          clip_max=CLIP_MAX,
                                          batch_size=batch_size)
    elif attack == 'adapt-fgsm':
        # Adaptive FGSM attack
        print('Crafting fgsm adversarial samples...')

        X_adv = adaptive_fast_gradient_sign_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size,
            log_dir=fp_path,
            model_logits=model_logits,
            dataset=dataset)
    elif attack == 'adapt-bim-b':
        # BIM attack
        print('Crafting %s adversarial samples...' % attack)
        X_adv = adaptive_basic_iterative_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            eps_iter=ATTACK_PARAMS[dataset]['eps_iter'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size,
            log_dir=fp_path,
            model_logits=model_logits,
            dataset=dataset)
    elif attack in ['bim-a', 'bim-b']:
        # BIM attack
        print('Crafting %s adversarial samples...' % attack)
        its, results = basic_iterative_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            eps_iter=ATTACK_PARAMS[dataset]['eps_iter'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size)
        if attack == 'bim-a':
            # BIM-A
            # For each sample, select the time step where that sample first
            # became misclassified
            X_adv = np.asarray([results[its[i], i] for i in range(len(Y))])
        else:
            # BIM-B
            # For each sample, select the very last time step
            X_adv = results[-1]
    elif attack == 'jsma':
        # JSMA attack
        print('Crafting jsma adversarial samples. This may take > 5 hours')
        X_adv = saliency_map_method(sess,
                                    model,
                                    X,
                                    Y,
                                    theta=1,
                                    gamma=0.1,
                                    clip_min=CLIP_MIN,
                                    clip_max=CLIP_MAX)
    elif attack == 'cw-l2':
        # C&W attack
        print(
            'Crafting %s examples. This takes > 5 hours due to internal grid search'
            % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']
        cw_attack = CarliniL2(sess,
                              model,
                              image_size,
                              num_channels,
                              num_labels,
                              batch_size=batch_size)
        X_adv = cw_attack.attack(X, Y)
    elif attack == 'cw-fp':
        # C&W attack to break LID detector
        print(
            'Crafting %s examples. This takes > 5 hours due to internal grid search'
            % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']
        cw_attack = CarliniFP_2vars(sess,
                                    model,
                                    image_size,
                                    num_channels,
                                    num_labels,
                                    batch_size=batch_size,
                                    fp_dir=fp_path)
        X_adv = cw_attack.attack(X, Y)

    elif attack == 'spsa':
        binary_steps = 1
        batch_shape = X.shape
        X_input = tf.placeholder(tf.float32, shape=(1, ) + batch_shape[1:])
        Y_label = tf.placeholder(tf.int32, shape=(1, ))
        alpha = tf.placeholder(tf.float32, shape=(1, ))

        num_samples = np.shape(X)[0]
        # X = (X - np.argmin(X))/(np.argmax(X)-np.argmin(X))
        _min = np.min(X)
        _max = np.max(X)
        print(_max, _min)
        print(tf.trainable_variables())
        filters = sess.run('conv1/kernel:0')
        biases = 0.0 * sess.run('conv1/bias:0')
        shift_model = Sequential()
        if (dataset == 'mnist'):
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(1, 28, 28)))
        else:
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(3, 32, 32)))

        X_input_2 = tf.placeholder(tf.float32,
                                   shape=(None, ) + batch_shape[1:])

        correction_term = shift_model(X_input_2)
        if (dataset == 'mnist'):
            X_correction = -0.5 * np.ones(
                (1, 1, 28, 28)
            )  # We will shift the image up by 0.5, so this is the correction
        else:
            X_correction = -0.5 * np.ones(
                (1, 3, 32, 32)
            )  # We will shift the image up by 0.5, so this is the correction

        # for PGD

        shift_model.layers[0].set_weights([filters, biases])
        bias_correction_terms = (sess.run(correction_term,
                                          feed_dict={X_input_2: X_correction}))
        for i in range(32):
            biases[i] = bias_correction_terms[0, i, 0, 0]
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        original_biases = model.layers[0].get_weights()[1]
        original_weights = model.layers[0].get_weights()[0]
        model.layers[0].set_weights(
            [original_weights, original_biases + biases])
        #Correct model for input shift

        X = X + 0.5  #shift input to make it >=0
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        # check accuracy post correction of input and model
        print('Crafting %s examples. Using Cleverhans' % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']

        from cleverhans.utils_keras import KerasModelWrapper
        wrapped_model = KerasModelWrapper(model)

        if dataset == "mnist":
            wrapped_model.nb_classes = 10
        elif dataset == "cifar":
            wrapped_model.nb_classes = 10
        else:
            wrapped_model.nb_classes = 10

        real_batch_size = X.shape[0]
        X_adv = None

        spsa = SPSA(wrapped_model, back='tf', sess=sess)
        spsa_params = {
            "epsilon": ATTACK_PARAMS[dataset]['eps'],
            'num_steps': 100,
            'spsa_iters': 1,
            'early_stop_loss_threshold': None,
            'is_targeted': False,
            'is_debug': False
        }
        X_adv_spsa = spsa.generate(X_input,
                                   alpha=alpha,
                                   y=Y_label,
                                   fp_path=fp_path,
                                   **spsa_params)

        for i in range(num_samples):

            # rescale to format TF wants

            #X_i_norm = (X[i] - _min)/(_max-_min)

            X_i_norm = X[i]
            # Run attack
            best_res = None
            ALPHA = np.ones(1) * 0.1
            lb = 1.0e-2
            ub = 1.0e2
            for j in range(binary_steps):
                res = sess.run(X_adv_spsa,
                               feed_dict={
                                   X_input: np.expand_dims(X_i_norm, axis=0),
                                   Y_label: np.array([np.argmax(Y[i])]),
                                   alpha: ALPHA
                               })
                if (dataset == 'mnist'):
                    X_place = tf.placeholder(tf.float32, shape=[1, 1, 28, 28])
                else:
                    X_place = tf.placeholder(tf.float32, shape=[1, 3, 32, 32])
                pred = model(X_place)
                model_op = sess.run(pred, feed_dict={X_place: res})

                if (not np.argmax(model_op) == np.argmax(Y[i, :])):
                    lb = ALPHA[0]
                else:
                    ub = ALPHA[0]
                ALPHA[0] = 0.5 * (lb + ub)
                print(ALPHA)
                # Keep the first result, then prefer any misclassified one
                if best_res is None or np.argmax(model_op) != np.argmax(Y[i, :]):
                    best_res = res

            # Rescale result back to our scale

            if (i == 0):
                X_adv = best_res
            else:
                X_adv = np.concatenate((X_adv, best_res), axis=0)

        _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the adversarial test set: %0.2f%%" %
              (100.0 * acc))
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

        #Revert model to original
        model.layers[0].set_weights([original_weights, original_biases])
        #Revert adv shift
        X_adv = X_adv - 0.5
        X = X - 0.5  #Not used but just for logging purposes
    elif attack == 'adapt-pgd':
        binary_steps = 1
        rand_starts = 2
        batch_shape = X.shape
        X_input = tf.placeholder(tf.float32, shape=(1, ) + batch_shape[1:])
        Y_label = tf.placeholder(tf.int32, shape=(1, ))
        alpha = tf.placeholder(tf.float32, shape=(1, ))

        num_samples = np.shape(X)[0]
        # X = (X - np.argmin(X))/(np.argmax(X)-np.argmin(X))
        _min = np.min(X)
        _max = np.max(X)
        print(_max, _min)
        print(tf.trainable_variables())
        filters = sess.run('conv1/kernel:0')
        biases = 0.0 * sess.run('conv1/bias:0')
        shift_model = Sequential()
        if (dataset == 'mnist'):
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(1, 28, 28)))
        else:
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(3, 32, 32)))

        X_input_2 = tf.placeholder(tf.float32,
                                   shape=(None, ) + batch_shape[1:])

        correction_term = shift_model(X_input_2)
        if (dataset == 'mnist'):
            X_correction = -0.5 * np.ones(
                (1, 1, 28, 28)
            )  # We will shift the image up by 0.5, so this is the correction
        else:
            X_correction = -0.5 * np.ones(
                (1, 3, 32, 32)
            )  # We will shift the image up by 0.5, so this is the correction

        # for PGD

        shift_model.layers[0].set_weights([filters, biases])
        bias_correction_terms = (sess.run(correction_term,
                                          feed_dict={X_input_2: X_correction}))
        for i in range(32):
            biases[i] = bias_correction_terms[0, i, 0, 0]
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        original_biases = model.layers[0].get_weights()[1]
        original_weights = model.layers[0].get_weights()[0]
        model.layers[0].set_weights(
            [original_weights, original_biases + biases])
        #Correct model for input shift

        X = X + 0.5  #shift input to make it >=0

        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        # check accuracy post correction of input and model
        print('Crafting %s examples. Using Cleverhans' % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']

        from cleverhans.utils_keras import KerasModelWrapper
        wrapped_model = KerasModelWrapper(model)

        if dataset == "mnist":
            wrapped_model.nb_classes = 10
        elif dataset == "cifar":
            wrapped_model.nb_classes = 10
        else:
            wrapped_model.nb_classes = 10

        real_batch_size = X.shape[0]
        X_adv = None

        pgd = MadryEtAl(wrapped_model, back='tf', sess=sess)
        X_adv_pgd, adv_loss_fp = pgd.generate(X_input,
                                              eps=0.3,
                                              eps_iter=0.02,
                                              clip_min=0.0,
                                              clip_max=1.0,
                                              nb_iter=20,
                                              rand_init=True,
                                              fp_path=fp_path,
                                              alpha=alpha)

        for i in range(num_samples):
            # rescale to format TF wants

            #X_i_norm = (X[i] - _min)/(_max-_min)

            X_i_norm = X[i]
            # Run attack
            best_res = None
            best_res_loss = 1000000.0
            ALPHA = np.ones(1) * 0.1
            lb = 1.0e-2
            ub = 1.0e2
            for j in range(binary_steps):
                bin_flag = 0
                for jj in range(rand_starts):

                    [res, res_loss] = sess.run(
                        [X_adv_pgd, adv_loss_fp],
                        feed_dict={
                            X_input: np.expand_dims(X[i], axis=0),
                            Y_label: np.array([np.argmax(Y[i])]),
                            alpha: ALPHA
                        })

                    if (dataset == 'mnist'):
                        X_place = tf.placeholder(tf.float32,
                                                 shape=[1, 1, 28, 28])
                    else:
                        X_place = tf.placeholder(tf.float32,
                                                 shape=[1, 3, 32, 32])

                    pred = model(X_place)
                    model_op = sess.run(pred, feed_dict={X_place: res})

                    # Keep the first result, then prefer a misclassified one
                    # with lower loss
                    if best_res is None:
                        best_res = res
                    elif (np.argmax(model_op) != np.argmax(Y[i, :])
                          and res_loss < best_res_loss):
                        best_res = res
                        best_res_loss = res_loss
                        bin_flag = 1
                if (bin_flag == 1):
                    lb = ALPHA[0]
                else:
                    ub = ALPHA[0]
                ALPHA[0] = 0.5 * (lb + ub)
                print(ALPHA)
            # Rescale result back to our scale

            if (i == 0):
                X_adv = best_res
            else:
                X_adv = np.concatenate((X_adv, best_res), axis=0)

        _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the adversarial test set: %0.2f%%" %
              (100.0 * acc))
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

        #Revert model to original
        model.layers[0].set_weights([original_weights, original_biases])
        #Revert adv shift
        X_adv = X_adv - 0.5
        X = X - 0.5  #Not used but just for logging purposes

        pass

    if ("adapt" in attack or "fp" in attack or "spsa" in attack):
        [m, _, _, _] = (np.shape(X_adv))
        cropped_X_adv = []
        cropped_Y = []
        cropped_X = []
        if (dataset == 'mnist'):
            X_place = tf.placeholder(tf.float32, shape=[1, 1, 28, 28])
            pred = model(X_place)
        else:
            X_place = tf.placeholder(tf.float32, shape=[1, 3, 32, 32])
            pred = model(X_place)
        for i in range(m):
            logits_op = sess.run(pred,
                                 feed_dict={X_place: X_adv[i:i + 1, :, :, :]})
            if (not np.argmax(logits_op) == np.argmax(Y[i, :])):
                cropped_Y.append(Y[i, :])
                cropped_X_adv.append(X_adv[i, :, :, :])
                cropped_X.append(X[i, :, :, :])
        X_adv = np.array(cropped_X_adv)
        X = np.array(cropped_X)
        Y = np.array(cropped_Y)

        f = open(
            os.path.join(log_path, 'Random_Test_%s_%s.p' % (dataset, attack)),
            'wb')  # pickle requires a binary-mode file

        pickle.dump({"adv_input": X, "adv_labels": Y}, f)
        f.close()

    #np.save(os.path.join(PATH_DATA, 'Adv_%s_%s.npy' % (dataset, attack)), X_adv)
    f = open(os.path.join(log_path, 'Adv_%s_%s.p' % (dataset, attack)), 'wb')

    pickle.dump({"adv_input": X_adv, "adv_labels": Y}, f)
    f.close()
    _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
    print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
    l2_diff = np.linalg.norm(
        X_adv.reshape((len(X), -1)) - X.reshape((len(X), -1)),
        axis=1).mean()
    print("Average L-2 perturbation size of the %s attack: %0.2f" %
          (attack, l2_diff))
    if (("adapt" in attack) or ("cw-fp" in attack)):
        return (X, X_adv, Y)
    else:
        print(Y.shape)
        return (X_adv, Y)
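
A hedged sketch of invoking craft_one_type for the SPSA branch; the paths and batch size below are placeholders, and for 'spsa' the function returns the adversarial samples with their labels:

# Hypothetical call; sess, model and the test arrays are assumed to exist.
X_adv, Y_adv = craft_one_type(sess, model, X_test, Y_test,
                              dataset='mnist', attack='spsa',
                              batch_size=128, log_path='./logs',
                              fp_path='./fp', model_logits=None)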
X_adv = np.zeros((len(indices),32,32,3))
for i in range(len(indices)):
    ind = indices[i]
    X_adv[i] = zoo.attack(X_test[ind:(ind+1)], Y_test[ind:(ind+1)])    
print("results on source model: ")
results = metrics(model, X_adv, X_test, y_test, indices)
print(results)    
print("results on target model: ")
results = metrics(model_target, X_adv, X_test, y_test, indices)
print(results)  

#####SPSA
spsa_params = {'eps': 0.03,
               'learning_rate': 0.01,
               'delta': 0.01,
               'spsa_samples': 128,
               'spsa_iters': 1,
               'nb_iter': 100,
               'clip_min': 0.,
               'clip_max': 1.
               }
spsa = SPSA(wrap, sess=sess)
X_adv = np.zeros((len(indices_spsa),32,32,3))
for i in range(0, len(indices_spsa), batch_attack):
    X_adv[i:i + batch_attack] = spsa.generate_np(
        X_test[indices_spsa[i:(i + batch_attack)]], **spsa_params)
print("results on source model: ")
results = metrics(model, X_adv, X_test, y_test, indices)
print(results)    
print("results on target model: ")
results = metrics(model_target, X_adv, X_test, y_test, indices)
print(results)   
Example 18
def train(alpha, eps2_ratio, gen_ratio, fgsm_eps, LR, logfile):
    logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , eps2_ratio \t %d , gen_ratio \t %d \n"%(fgsm_eps, LR, alpha, eps2_ratio, gen_ratio))
    #############################
    ##Hyper-parameter Setting####
    #############################
    hk = 256; #number of hidden units at the last layer
    Delta2 = (14*14+2)*25; #global sensitivity for the first hidden layer
    Delta3_adv = 2*hk #10*(hk + 1/4 * hk**2) #10*(hk) #global sensitivity for the output layer
    Delta3_benign = 2*hk #10*(hk); #global sensitivity for the output layer
    D = 50000; #size of the dataset
    L = 2499; #batch size
    image_size = 28;
    padding = 4;
    #numHidUnits = 14*14*32 + 7*7*64 + M + 10; #number of hidden units
    #gen_ratio = 1
    epsilon1 = 0.0; #0.175; #epsilon for dpLRP
    epsilon2 = 0.1*(1 + gen_ratio); #epsilon for the first hidden layer
    epsilon3 = 0.1*(1); #epsilon for the last hidden layer
    total_eps = epsilon1 + epsilon2 + epsilon3
    print(total_eps)
    uncert = 0.1; #uncertainty modeling at the output layer
    infl = 1; #inflation rate in the privacy budget redistribution
    R_lowerbound = 1e-5; #lower bound of the LRP
    c = [0, 40, 50, 200] #norm bounds
    epochs = 200; #number of epochs
    preT_epochs = 50; #number of epochs
    T = int(D/L*epochs + 1); #number of steps T
    pre_T = int(D/L*preT_epochs + 1);
    step_for_epoch = int(D/L); #number of steps for one epoch
    
    broken_ratio = 1
    #alpha = 9.0 # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    #eps2_ratio = 10; # [1/10, 1/8, 1/6, 1/4, 1/2, 1, 2, 4, 6, 8, 10]
    #eps_benign = 1/(1+eps2_ratio)*(2*epsilon2)
    #eps_adv = eps2_ratio/(1+eps2_ratio)*(2*epsilon2)
    
    #fgsm_eps = 0.1
    rand_alpha = 0.05
    
    ##Robustness##
    robustness_T = (fgsm_eps*18*18*L*epsilon2)/Delta2;
    ####
    
    LRPfile = os.getcwd() + '/Relevance_R_0_075.txt';
    #############################
    mnist = input_data.read_data_sets("MNIST_data/", one_hot = True);

    #############################
    ##Construct the Model########
    #############################
    #Step 4: Randomly initiate the noise, Compute 1/|L| * Delta3 for the output layer#

    #Compute the 1/|L| * Delta3 for the last hidden layer#
    """eps3_ratio = Delta3_adv/Delta3_benign;
    eps3_benign = 1/(1+eps3_ratio)*(epsilon3)
    eps3_adv = eps3_ratio/(1+eps3_ratio)*(epsilon3)"""
    loc, scale3_benign, scale3_adv = 0., Delta3_benign/(epsilon3*L), Delta3_adv/(epsilon3*L);
    ###
    #End Step 4#
    # Parameters Declarification
    W_conv1 = weight_variable('W_conv1', [5, 5, 1, 32], collect=[AECODER_VARIABLES]);
    b_conv1 = bias_variable('b_conv1', [32], collect=[AECODER_VARIABLES]);

    shape     = W_conv1.get_shape().as_list()
    w_t       = tf.reshape(W_conv1, [-1, shape[-1]])
    w         = tf.transpose(w_t)
    sing_vals = tf.svd(w, compute_uv=False)
    sensitivity = tf.reduce_max(sing_vals)
    gamma = 2*(14*14 + 2)*25/(L*sensitivity)
    
    dp_epsilon=1.0 #0.1
    delta_r = fgsm_eps*(image_size**2);
    #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used
    #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon)
    
    W_conv2 = weight_variable('W_conv2', [5, 5, 32, 64], collect=[CONV_VARIABLES]);
    b_conv2 = bias_variable('b_conv2', [64], collect=[CONV_VARIABLES]);

    W_fc1 = weight_variable('W_fc1', [4 * 4 * 64, hk], collect=[CONV_VARIABLES]);
    b_fc1 = bias_variable('b_fc1', [hk], collect=[CONV_VARIABLES]);

    W_fc2 = weight_variable('W_fc2', [hk, 10], collect=[CONV_VARIABLES]);
    b_fc2 = bias_variable('b_fc2', [10], collect=[CONV_VARIABLES]);

    """scale2 = tf.Variable(tf.ones([hk]))
    beta2 = tf.Variable(tf.zeros([hk]))
    tf.add_to_collections([CONV_VARIABLES], scale2)
    tf.add_to_collections([CONV_VARIABLES], beta2)"""

    params = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2]
    ###


    #Step 5: Create the model#
    noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]);
    adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]);

    keep_prob = tf.placeholder(tf.float32);
    x = tf.placeholder(tf.float32, [None, image_size*image_size]);
    x_image = tf.reshape(x, [-1,image_size,image_size,1]);

    #perturbFMx = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28)
    #perturbFMx = np.reshape(perturbFMx, [-1, 28, 28, 1]);

    # pretrain ###
    #Enc_Layer1 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu)
    #pretrain = Enc_Layer1.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, epsilon = 2*epsilon2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise)
    ###########

    adv_x = tf.placeholder(tf.float32, [None, image_size*image_size]);
    adv_image = tf.reshape(adv_x, [-1,image_size,image_size,1]);

    #perturbFMx_adv = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28)
    #perturbFMx_adv = np.reshape(perturbFMx_adv, [-1, 28, 28, 1]);

    # pretrain adv ###
    #perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*32)
    #perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]);
    FM_h = tf.placeholder(tf.float32, [None, 14, 14, 32]);
    Enc_Layer2 = EncLayer(inpt=adv_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu)
    pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = adv_noise, perturbFM_h = FM_h)
    Enc_Layer3 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu)
    pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise, perturbFM_h = FM_h)
    ###########
    
    x_image += noise;
    x_image = tf.clip_by_value(x_image, -10, 10) #Clip the values of each input feature.
    
    adv_image += adv_noise;
    adv_image = tf.clip_by_value(adv_image, -10, 10) #Clip the values of each input feature.

    #perturbFM = np.random.laplace(0.0, scale3_benign, hk)
    #perturbFM = np.reshape(perturbFM, [hk]);
    perturbFM = np.random.laplace(0.0, scale3_benign, hk * 10)
    perturbFM = np.reshape(perturbFM, [hk, 10]);
    
    y_conv = inference(x_image, perturbFM, hk, FM_h, params);
    softmax_y_conv = tf.nn.softmax(y_conv)
    #robust_mask = inference_robust_mask(y_conv, Delta2, L, epsilon2, robustness_T)

    #perturbFM = np.random.laplace(0.0, scale3_adv, hk)
    #perturbFM = np.reshape(perturbFM, [hk]);
    y_adv_conv = inference(adv_image, perturbFM, hk, FM_h, params);
    #adv_robust_mask = inference_robust_mask(y_adv_conv, Delta2, L, epsilon2, robustness_T)

    # test model
    perturbFM_test = np.random.laplace(0.0, 0, hk)
    perturbFM_test = np.reshape(perturbFM_test, [hk]);
    x_test = tf.reshape(x, [-1,image_size,image_size,1]);
    y_test = inference(x_test, perturbFM_test, hk, FM_h, params);
    #test_robust_mask = inference_robust_mask(y_test, Delta2, L, epsilon2, robustness_T)

    #Define a place holder for the output label#
    y_ = tf.placeholder(tf.float32, [None, 10]);
    adv_y_ = tf.placeholder(tf.float32, [None, 10]);
    #End Step 5#
    #############################

    #############################
    ##Define loss and Optimizer##
    #############################
    '''
        Computes differentially private sigmoid cross entropy given `logits`.
        
        Measures the probability error in discrete classification tasks in which each
        class is independent and not mutually exclusive.
        
        For brevity, let `x = logits`, `z = labels`.  The logistic loss is
        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
        = (1 - z) * x + log(1 + exp(-x))
        = x - x * z + log(1 + exp(-x))
        
        For x < 0, to avoid overflow in exp(-x), we reformulate the above as
        
        x - x * z + log(1 + exp(-x))
        = log(exp(x)) - x * z + log(1 + exp(-x))
        = - x * z + log(1 + exp(x))
        
        Hence, to ensure stability and avoid overflow, the implementation uses the
        equivalent formulation
        
        max(x, 0) - x * z + log(1 + exp(-abs(x)))
        
        `logits` and `labels` must have the same type and shape. Let neg_abs_logits denote -abs(y_conv) = -abs(h_fc1 * W_fc2). By applying a Taylor expansion, we have:
        
        Taylor = max(y_conv, 0) - y_conv * y_ + log(1 + exp(-abs(y_conv)));
        = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2)
        = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2)
        = F1 + F2
        where: F1 = max(h_fc1 * W_fc2, 0) + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) and F2 = - (y_ * h_fc1) * W_fc2
        
        To ensure that Taylor is differentially private, we need to perturb all of its coefficients, including the term y_ * h_fc1 * W_fc2.
        Note that h_fc1 is differentially private, since its computation on top of the DP affine transformation does not access the original data.
        Therefore, F1 is differentially private. We still need to preserve DP in F2, which reads the ground-truth label y_, as follows:
        
        By applying the Functional Mechanism, we perturb (y_ * h_fc1) * W_fc2 as ((y_ * h_fc1) + perturbFM) * W_fc2 = (y_ * h_fc1)*W_fc2 + (perturbFM * W_fc2):
        
        perturbFM = np.random.laplace(0.0, scale3, hk * 10)
        perturbFM = np.reshape(perturbFM/L, [hk, 10]);
        
        where scale3 = Delta3/epsilon3 = 2*hk/epsilon3;
        
        To allow computing gradients at zero, we define custom versions of the max and abs functions [TensorFlow].
        
        Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow
    '''
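    # A minimal numpy sanity check (commented out, not part of the graph) of the
    # 2nd-order Taylor polynomial used below: for t = -abs(x), log(1 + exp(t))
    # is approximated by log(2) + t/2 + t**2/8. `_x_demo`/`_z_demo` are
    # illustrative values only.
    #   _x_demo = np.linspace(-2.0, 2.0, 5); _z_demo = np.ones_like(_x_demo)
    #   _exact = np.maximum(_x_demo, 0) - _x_demo * _z_demo + np.log1p(np.exp(-np.abs(_x_demo)))
    #   _t = -np.abs(_x_demo)
    #   _taylor = np.maximum(_x_demo, 0) - _x_demo * _z_demo + (math.log(2.0) + 0.5 * _t + _t ** 2 / 8.0)
    #   print(np.max(np.abs(_exact - _taylor)))  # small near x = 0, grows with |x|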
    ### Taylor for benign x
    zeros = array_ops.zeros_like(y_conv, dtype=y_conv.dtype)
    cond = (y_conv >= zeros)
    relu_logits = array_ops.where(cond, y_conv, zeros)
    neg_abs_logits = array_ops.where(cond, -y_conv, y_conv)
    #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits)))
    Taylor_benign = math_ops.add(relu_logits - y_conv * y_, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) - tf.reduce_sum(perturbFM*W_fc2)
    #Taylor_benign = tf.abs(y_conv - y_)

    ### Taylor for adv_x
    zeros_adv = array_ops.zeros_like(y_adv_conv, dtype=y_conv.dtype)
    cond_adv = (y_adv_conv >= zeros_adv)
    relu_logits_adv = array_ops.where(cond_adv, y_adv_conv, zeros_adv)
    neg_abs_logits_adv = array_ops.where(cond_adv, -y_adv_conv, y_adv_conv)
    #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits)))
    Taylor_adv = math_ops.add(relu_logits_adv - y_adv_conv * adv_y_, math.log(2.0) + 0.5*neg_abs_logits_adv + 1.0/8.0*neg_abs_logits_adv**2) - tf.reduce_sum(perturbFM*W_fc2)
    #Taylor_adv = tf.abs(y_adv_conv - adv_y_)

    ### Adversarial training loss
    adv_loss = (1/(L + L*alpha))*(Taylor_benign + alpha * Taylor_adv)
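    # The 1/(L + L*alpha) factor averages the summed Taylor losses over the
    # effective batch: L benign examples plus the alpha-weighted adversarial ones.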

    '''Sometimes, using learning rate decay can help to stabilize the training process. However, use it carefully, since it may affect the convergence speed.'''
    global_step = tf.Variable(0, trainable=False)
    pretrain_var_list = tf.get_collection(AECODER_VARIABLES)
    train_var_list = tf.get_collection(CONV_VARIABLES)
    #print(pretrain_var_list)
    #print(train_var_list)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        pretrain_step = tf.train.AdamOptimizer(LR).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list);
        train_step = tf.train.AdamOptimizer(LR).minimize(adv_loss, global_step=global_step, var_list=train_var_list);
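    # The two optimizers update disjoint variable collections: pretrain_step only
    # touches AECODER_VARIABLES, while train_step only touches CONV_VARIABLES, so
    # auto-encoder pretraining and adversarial training do not overwrite each other.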
    sess = tf.InteractiveSession();

    # Define the correct prediction and accuracy
    # This needs to be changed to "Robust Prediction"
    correct_prediction_x = tf.equal(tf.argmax(y_test,1), tf.argmax(y_,1));
    accuracy_x = tf.reduce_mean(tf.cast(correct_prediction_x, tf.float32));

    #############
    # use these to get predictions w.r.t. the robustness conditions
    """robust_correct_prediction_x = tf.multiply(test_robust_mask, tf.cast(correct_prediction_x, tf.float32))
    accuracy_x_robust = tf.reduce_sum(robust_correct_prediction_x) / tf.reduce_sum(test_robust_mask)
    #certified_utility = 2/(1/accuracy_x_robust + 1/(tf.reduce_sum(test_robust_mask)/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))))
    certified_utility = (1.0*tf.reduce_sum(test_robust_mask))/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))"""
    #############

    # craft adversarial samples from x for training
    dynamic_eps = tf.placeholder(tf.float32);
    emsemble_L = int(L/3)
    softmax_y = tf.nn.softmax(y_test)
    #c_x_adv = fgsm(x, softmax_y, eps=fgsm_eps, clip_min=0.0, clip_max=1.0)
    c_x_adv = fgsm(x, softmax_y, eps=(dynamic_eps)/10, clip_min=-1.0, clip_max=1.0) # for I-FGSM
    x_adv = tf.reshape(c_x_adv, [emsemble_L,image_size*image_size]);

    #====================== attack =========================
    #attack_switch = {'randfgsm':True, 'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True}
    #attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True}
    attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':False}
    #other possible attacks:
        # ElasticNetMethod
        # FastFeatureAdversaries
        # LBFGS
        # SaliencyMapMethod
        # VirtualAdversarialMethod

    # y_test = logits (before softmax)
    # softmax_y_test = preds (probs, after softmax)
    softmax_y_test = tf.nn.softmax(y_test)

    # create saver
    saver = tf.train.Saver(tf.all_variables())
    
    sess.run(W_conv1.initializer)
    _gamma = sess.run(gamma)
    _gamma_x = Delta2/L
    epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x)
    print(epsilon2_update/_gamma + epsilon2_update/_gamma_x)
    print(epsilon2_update)
    _sensitivityW = sess.run(sensitivity)
    delta_h = _sensitivityW*(14**2)
    dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon)
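    # dp_mult combines the privacy budgets of the input-layer and hidden-layer
    # Laplace mechanisms into a single multiplier; it is used below to rescale the
    # certified robustness size returned by robustness.robustness_size_argmax.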
    #############################
    
    iterativeStep = 100
    
    # load the most recent models
    _global_step = 0
    ckpt = tf.train.get_checkpoint_state(os.getcwd() + '/tmp/train')
    if ckpt and ckpt.model_checkpoint_path:
        print(ckpt.model_checkpoint_path);
        saver.restore(sess, ckpt.model_checkpoint_path)
        _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
        print('No checkpoint file found')

    start_time = time.time();

    # adv pretrain model (Auto encoder layer)
    cost = tf.reduce_sum(Enc_Layer2.cost);
    logfile.write("pretrain: \n")
    
    # define cleverhans abstract models for using cleverhans attacks
    ch_model_logits = CustomCallableModelWrapper(callable_fn=inference_test_input, output_layer='logits', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise)
    ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise)

    # rand+fgsm
    # if attack_switch['randfgsm']:
    #     randfgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
    #     x_randfgsm_t = (fgsm_eps - rand_alpha) * randfgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0)
    #     x_rand_t = rand_alpha * tf.sign(tf.random_normal(shape=tf.shape(x), mean=0.0, stddev=1.0))

    # define each attack method's tensor
    mu_alpha = tf.placeholder(tf.float32, [1]);
    attack_tensor_dict = {}
    # FastGradientMethod
    if attack_switch['fgsm']:
        print('creating attack tensor of FastGradientMethod')
        fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now
        x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now
        attack_tensor_dict['fgsm'] = x_adv_test_fgsm

    # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init)
    # default: eps_iter=0.05, nb_iter=10
    if attack_switch['ifgsm']:
        print('creating attack tensor of BasicIterativeMethod')
        ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm

    # Deepfool
    if attack_switch['deepfool']:
        print('creating attack tensor of DeepFool')
        deepfool_obj = DeepFool(model=ch_model_logits, sess=sess)
        #x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['deepfool'] = x_adv_test_deepfool

    # MomentumIterativeMethod
    # default: eps_iter=0.06, nb_iter=10
    if attack_switch['mim']:
        print('creating attack tensor of MomentumIterativeMethod')
        mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, decay_factor=1.0, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['mim'] = x_adv_test_mim

    # SPSA
    # note: here epsilon is an L-infinity bound, not a percentage of perturbation
    # Maybe exclude this method at first, since it seems to have some constraints on the data value range
    if attack_switch['spsa']:
        print('creating attack tensor of SPSA')
        spsa_obj = SPSA(model=ch_model_logits, sess=sess)
        #x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1, ord=2)
        x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1)
        attack_tensor_dict['spsa'] = x_adv_test_spsa

    # CarliniWagnerL2
    # confidence=0 is from their paper
    # it is said to be slow, so maybe exclude it at first
    if attack_switch['cwl2']:
        print('creating attack tensor of CarliniWagnerL2')
        cwl2_obj = CarliniWagnerL2(model=ch_model_logits, sess=sess)
        #x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['cwl2'] = x_adv_test_cwl2

    # MadryEtAl (Projected Gradient with random init, same as rand+fgsm)
    # default: eps_iter=0.01, nb_iter=40
    if attack_switch['madry']:
        print('creating attack tensor of MadryEtAl')
        madry_obj = MadryEtAl(model=ch_model_probs, sess=sess)
        #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['madry'] = x_adv_test_madry

    # SpatialTransformationMethod
    # the params are pretty different from those in the paper,
    # so the defaults are used here
    # excluded for now since there is a bug
    if attack_switch['stm']:
        print('creating attack tensor of SpatialTransformationMethod')
        stm_obj = SpatialTransformationMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6, ord=2)
        x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6)
        attack_tensor_dict['stm'] = x_adv_test_stm
    #====================== attack =========================
    
    sess.run(tf.initialize_all_variables());

    ##perturb h for training
    perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32)
    perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]);

    ##perturb h for testing
    perturbFM_h_test = np.random.laplace(0.0, 0, 14*14*32)
    perturbFM_h_test = np.reshape(perturbFM_h_test, [-1, 14, 14, 32]);

    '''for i in range(_global_step, _global_step + pre_T):
        d_eps = random.random();
        
        batch = mnist.train.next_batch(L); #Get a random batch.
        adv_images = sess.run(x_adv, feed_dict = {x:batch[0], y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})
        for iter in range(0, 9):
            adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})
        """batch = mnist.train.next_batch(emsemble_L)
        adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]})
        batch = mnist.train.next_batch(emsemble_L)
        adv_images_madry = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]})
        train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)"""

        batch_2 = mnist.train.next_batch(L);
        pretrain_step.run(feed_dict={adv_x: np.append(adv_images, batch_2[0], axis = 0), adv_noise: AdvLnoise, FM_h: perturbFM_h});
        if i % int(5*step_for_epoch) == 0:
            cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32)
            logfile.write("step \t %d \t %g \n"%(i, cost_value))
            print(cost_value)

    pre_train_finish_time = time.time()
    print('pre_train finished in: ' + parse_time(pre_train_finish_time - start_time))'''

    # train and test model with adv samples
    max_benign_acc = -1;
    max_robust_benign_acc = -1
    #max_adv_acc = -1;

    test_size = len(mnist.test.images)
    AdvLnoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
    AdvLnoise_test = generateIdLMNoise(image_size, 0, epsilon2_update, test_size);

    Lnoise_empty = generateIdLMNoise(image_size, 0, epsilon2_update, L);
    BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
    last_eval_time = -1
    accum_time = 0
    accum_epoch = 0
    max_adv_acc_dict = {}
    max_robust_adv_acc_dict = {}
    #max_robust_adv_utility_dict = {}
    for atk in attack_switch.keys():
        if atk not in max_adv_acc_dict:
            max_adv_acc_dict[atk] = -1
            max_robust_adv_acc_dict[atk] = -1

    for i in range(_global_step, _global_step + T):
        # this batch is for generating adv samples
        batch = mnist.train.next_batch(emsemble_L); #Get a random batch.
        y_adv_batch = batch[1]
        #The interval at which we print out the result: every 10 epochs.
        if i % int(10*step_for_epoch) == 0 and i > int(10*step_for_epoch):
            cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32)
            print(cost_value)
            
            if last_eval_time < 0:
                last_eval_time = time.time()
            #===================benign samples=====================
            predictions_form_argmax = np.zeros([test_size, 10])
            #test_bach = mnist.test.next_batch(test_size)
            softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: BenignLNoise, FM_h: perturbFM_h})
            argmax_predictions = np.argmax(softmax_predictions, axis=1)
            for n_draws in range(0, 1):
                _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
                _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32)
                _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]);
                for j in range(test_size):
                    pred = argmax_predictions[j]
                    predictions_form_argmax[j, pred] += 1;
                softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: (BenignLNoise + _BenignLNoise/2), FM_h: (perturbFM_h + _perturbFM_h/2)})
                argmax_predictions = np.argmax(softmax_predictions, axis=1)
            final_predictions = predictions_form_argmax;
            is_correct = []
            is_robust = []
            for j in range(test_size):
                is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j]))
                robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult)
                is_robust.append(robustness_from_argmax >= fgsm_eps)
            acc = np.sum(is_correct)*1.0/test_size
            robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust)
            robust_utility = np.sum(is_robust)*1.0/test_size
            max_benign_acc = max(max_benign_acc, acc)
            max_robust_benign_acc = max(max_robust_benign_acc, robust_acc*robust_utility)
            log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(i, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility)
            #===================adv samples=====================
            #log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(i, total_eps)
            """adv_images_dict = {}
            for atk in attack_switch.keys():
                if attack_switch[atk]:
                    adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_:mnist.test.labels})
            print("Done with the generating of Adversarial samples")"""
            #===================adv samples=====================
            adv_acc_dict = {}
            robust_adv_acc_dict = {}
            robust_adv_utility_dict = {}
            for atk in attack_switch.keys():
                if atk not in adv_acc_dict:
                    adv_acc_dict[atk] = -1
                    robust_adv_acc_dict[atk] = -1
                    robust_adv_utility_dict[atk] = -1
                if attack_switch[atk]:
                    adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_: mnist.test.labels, adv_noise: AdvLnoise_test, mu_alpha:[fgsm_eps]})
                    ### PixelDP Robustness ###
                    predictions_form_argmax = np.zeros([test_size, 10])
                    softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: perturbFM_h})
                    argmax_predictions = np.argmax(softmax_predictions, axis=1)
                    for n_draws in range(0, 2000):
                        if n_draws % 1000 == 0:
                            print(n_draws)
                        _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
                        _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32)
                        _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]);
                        for j in range(test_size):
                            pred = argmax_predictions[j]
                            predictions_form_argmax[j, pred] += 1;
                        softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (perturbFM_h + _perturbFM_h/2)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (BenignLNoise + _BenignLNoise/2), FM_h: perturbFM_h})
                        #softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (_perturbFM_h)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h})
                        argmax_predictions = np.argmax(softmax_predictions, axis=1)
                    final_predictions = predictions_form_argmax;
                    is_correct = []
                    is_robust = []
                    for j in range(test_size):
                        is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j]))
                        robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult)
                        is_robust.append(robustness_from_argmax >= fgsm_eps)
                    adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_size
                    robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust)
                    robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_size
                    ##############################
            for atk in attack_switch.keys():
                if attack_switch[atk]:
                    # added robust prediction
                    log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk])
                    max_adv_acc_dict[atk] = max(max_adv_acc_dict[atk], adv_acc_dict[atk])
                    max_robust_adv_acc_dict[atk] = max(max_robust_adv_acc_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk])
            print(log_str)
            logfile.write(log_str + '\n')

            # logfile.write("step \t %d \t %g \t %g \n"%(i, benign_acc, adv_acc))
            # print("step \t %d \t %g \t %g"%(i, benign_acc, adv_acc));

            # estimate end time
            """if i > 0 and i % int(10*step_for_epoch) == 0:
                current_time_interval = time.time() - last_eval_time
                last_eval_time = time.time()
                print('during last eval interval, {} epoch takes {}'.format(10, parse_time(current_time_interval)))
                accum_time += current_time_interval
                accum_epoch += 10
                estimate_time = ((_global_step + T - i) / step_for_epoch) * (accum_time / accum_epoch)
                print('estimate finish in: {}'.format(parse_time(estimate_time)))"""

            #print("step \t %d \t adversarial test accuracy \t %g"%(i, accuracy_x.eval(feed_dict={x: adv_images, y_: mnist.test.labels, noise: Lnoise_empty})));
            """checkpoint_path = os.path.join(os.getcwd() + '/tmp/train', 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=i);"""

        d_eps = random.random();
        y_adv = batch[1]
        adv_images = sess.run(attack_tensor_dict['ifgsm'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]})
        """for iter in range(0, 9):
            adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})"""
        batch = mnist.train.next_batch(emsemble_L)
        adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]})
        y_adv = np.append(y_adv, batch[1], axis = 0)
        batch = mnist.train.next_batch(emsemble_L)
        adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]})
        y_adv = np.append(y_adv, batch[1], axis = 0)
        train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)
        
        batch = mnist.train.next_batch(L); #Get a random batch.
        # train with benign and adv samples
        pretrain_step.run(feed_dict={adv_x: train_images, x: batch[0], adv_noise: AdvLnoise_test, noise: BenignLNoise, FM_h: perturbFM_h});
        train_step.run(feed_dict={x: batch[0], adv_x: train_images, y_: batch[1], adv_y_: y_adv, noise: BenignLNoise, adv_noise: AdvLnoise_test, FM_h: perturbFM_h});
    duration = time.time() - start_time;
    # print(parse_time(duration)); #print running time duration#

    max_acc_string = "max acc: benign: \t{:.4f} {:.4f}".format(max_benign_acc, max_robust_benign_acc)
    for atk in attack_switch.keys():
        if attack_switch[atk]:
            max_acc_string += " {}: \t{:.4f} {:.4f}".format(atk, max_adv_acc_dict[atk], max_robust_adv_acc_dict[atk])
    logfile.write(max_acc_string + '\n')
    logfile.write(str(duration) + '\n')
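
# generateIdLMNoise is not shown in this snippet. A minimal sketch consistent
# with its call sites above (hypothetical, not necessarily the project's actual
# helper): one image-sized Laplace plane, broadcast across the batch.
def generateIdLMNoise(image_size, Delta2, epsilon2, L):
    # Scale Delta2/(epsilon2*L); Delta2 = 0 yields a zero (noiseless) plane,
    # as used for the test-time feeds above.
    noise = np.random.laplace(0.0, Delta2 / (epsilon2 * L), image_size ** 2)
    return np.reshape(noise, [1, image_size, image_size, 1]).astype(np.float32)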
Example #19
def iterate_through_cwl2_attacks():
    tf.logging.set_verbosity(tf.logging.INFO)
    input_dir = FLAGS.input_image_dir
    metadata_file_path = FLAGS.metadata_file_path
    num_images = len(os.listdir(input_dir))
    batch_shape = (num_images, 299, 299, 3)
    num_classes = 1001
    batch_size = attack_name_to_params[ATTACKS.CARLINI_WAGNER]['batch_size']
    images, labels, target_classes = load_images(input_dir, metadata_file_path, batch_shape,
                                                 num_classes)

    list_param_dict = expand_param_dict(
        attack_name_to_params[ATTACKS.CARLINI_WAGNER],
        attack_name_to_configurable_params[ATTACKS.CARLINI_WAGNER]
    )

    save_dir = 'saves'
    os.makedirs(save_dir, exist_ok=True)

    for idx, params in enumerate(list_param_dict):
        tf.reset_default_graph()

        logger.info('Running attack with parameters: {}'.format(params))
        logger.info('Current index of parameters: {}/{}'.format(idx, len(list_param_dict)))

        # Get save path
        adv_imgs_save_path = get_attack_images_filename_prefix(
            attack_name=ATTACKS.CARLINI_WAGNER,
            params=params,
            model='inception',
            targeted_prefix='targeted'
        )
        adv_imgs_save_path = os.path.join(save_dir, adv_imgs_save_path)

        # Run inference
        graph = tf.Graph()
        with graph.as_default():
            sess = tf.Session(graph=graph)
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(batch_size,) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(batch_size, num_classes))
            y_target = tf.placeholder(tf.int32, shape=(batch_size, num_classes))
            model = InceptionModel(num_classes)

            cwl2 = True
            if cwl2:
                attack = CarliniWagnerL2(model=model, sess=sess)
                x_adv = attack.generate(x_input, y_target=y_target, **params)
            else:
                attack = SPSA(model=model)
                x_adv = attack.generate(x_input, y_target=y_label, epsilon=4. / 255, num_steps=30,
                                        early_stop_loss_threshold=-1., batch_size=32, spsa_iters=16,
                                        is_debug=True)

            logits = model.get_logits(x_input)
            acc = _top_k_accuracy(logits, tf.argmax(y_label, axis=1), k=1)
            success_rate = _top_k_accuracy(logits, tf.argmax(y_target, axis=1), k=1)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            saver.restore(sess, save_path=FLAGS.checkpoint_path)

            list_adv_images = []

            if num_images % batch_size == 0:
                num_batches = int(num_images / batch_size)
            else:
                num_batches = int(num_images / batch_size + 1)
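            # i.e. ceiling division: num_batches = ceil(num_images / batch_size)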

            for i in tqdm.tqdm(range(num_batches)):
                feed_dict_i = {x_input: images[i * batch_size:(i + 1) * batch_size],
                               y_target: target_classes[i * batch_size:(i + 1) * batch_size]}
                adv_img = sess.run(x_adv, feed_dict=feed_dict_i)
                list_adv_images.append(adv_img)

            adv_images = np.concatenate(list_adv_images)
            np.save(adv_imgs_save_path, adv_images)

            acc_store = []
            succ_store = []
            for i in tqdm.tqdm(range(num_batches)):
                feed_dict_i = {x_input: adv_images[i * batch_size:(i + 1) * batch_size],
                               y_target: target_classes[i * batch_size:(i + 1) * batch_size],
                               y_label: labels[i * batch_size:(i + 1) * batch_size]}
                succ_batch, acc_batch = sess.run([success_rate, acc],
                                                 feed_dict=feed_dict_i)
                acc_store.extend(acc_batch)
                succ_store.extend(succ_batch)

            logger.info('Accuracy is: {:.4f}'.format(np.mean(acc_store)))
            logger.info('Success Rate is: {:.4f}'.format(np.mean(succ_store)))
Example #20
def run(args, restrict=True):
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess
        id = np.int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(K)

    # Configure Keras/Tensorflow
    Keras.clear_session()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    tf.set_random_seed(
        1
    )  #Having this before Keras.clear_session() causes it to hang for some reason

    # Load Model/Data and setup SPSA placeholders
    N = 50
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    #print("Bounds ", np.max(np.abs(x_train)))
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    model_defended = DefendedModel(base_model, x_train, y_train, K)
    defended_logits = model_defended.get_logits(x)

    # Configure the attack
    attack = SPSA(model_defended, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y=y_spsa,
                              epsilon=epsilon,
                              is_targeted=False,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-5.0)

    # Run the attack
    f = open(fname + ".txt", "w")

    sample = np.random.choice(data.test_data.shape[0], N, replace=False)
    x_sample = data.test_data[sample]
    y_sample = np.argmax(data.test_labels[sample], axis=1)

    logits_nat = sess.run(defended_logits, {x: x_sample})
    f.write("Accuracy on Natural Images: " +
            str(np.mean(np.argmax(logits_nat, axis=1) == y_sample)) + "\n")

    pred_adv = -1.0 * np.ones((N))
    for i in range(N):
        x_real = x_sample[i].reshape(shape_spsa)
        x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: y_sample[i]})
        pred_adv[i] = np.argmax(sess.run(defended_logits, {x: x_adv}))

    f.write("Accuracy on Adversarial Images: " +
            str(np.mean(pred_adv == y_sample)))
    f.close()
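
# Hypothetical driver for run(): the CUDA_VISIBLE_DEVICES trick above assumes
# worker processes are named like "ForkPoolWorker-<i>", so each subprocess pins
# itself to GPU (i - 1). Job tuples and pool size are illustrative only.
if __name__ == "__main__":
    import multiprocessing
    jobs = [("MNIST", "0.1", "hidden", "16"), ("CIFAR", "0.1", "hidden", "16")]
    pool = multiprocessing.Pool(processes=2)
    pool.map(run, jobs)
    pool.close()
    pool.join()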
Example #21
def run(args, restrict=True):
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess
        id = np.int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(K)

    # Configure Keras/Tensorflow
    Keras.clear_session()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    tf.set_random_seed(
        1
    )  #Having this before Keras.clear_session() causes it to hang for some reason

    # Load Model/Data and setup SPSA placeholders
    N = 500
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    defense = DefendedModel(base_model, x_train, y_train, K)
    get_votes = defense.get_votes(
        x)  # Should this be get_votes, introducing a separate method?
    get_logits = defense.get_logits(x)

    # Configure the attack
    attack = SPSA(defense, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y=y_spsa,
                              epsilon=0.01,
                              is_targeted=False,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-0.05)

    # Run the test
    sample = np.random.choice(data.test_data.shape[0], N, replace=False)
    x_sample = data.test_data[sample]
    y_sample = np.argmax(data.test_labels[sample], axis=1)

    votes = sess.run(get_votes, {x: x_sample})

    count = 0
    bound = 0
    correct = 0
    for i in range(N):
        if votes[i, 0] > 0:
            count += 1
            # Project via an adversarial attack on the votes
            #x_real = x_sample[i].reshape(shape_spsa)
            #x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: 0}) #TODO: not adv, is projected
            x_proj = sess.run(get_logits, {x: x_sample[i:i + 1]})
            projection_labels = np.argmax(x_proj, axis=1)
            successful_projections = projection_labels[np.nonzero(
                projection_labels * (projection_labels != 10))]

            # Check if the projection was a success
            if successful_projections.shape[0] != 0:
                bound += 1

            # Check if the projection is predicted correctly
            if y_sample[i] == np.argmax(sess.run(get_logits, {x: x_proj}),
                                        axis=1)[0]:
                correct += 1

    print("FP Count: ", count)
    print("FP Recovery in Bounds: ", bound / count)
    print("FP Recovery Accuracy: ", correct / count)
Example #22
def eval_robustness(ARGS, verbose=True):
    #############################################
    # Load pre-trained model
    #############################################

    if verbose:
        print('\n- Loading pre-trained model...')

    # Build evaluation graph
    eval_graph = tf.Graph()
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(graph=eval_graph, config=config)

    # Define input TF placeholder
    with eval_graph.as_default():
        with tf.device('/gpu:0'):
            # Define placeholders
            with tf.name_scope('Placeholders'):
                x = tf.placeholder(dtype=tf.float32,
                                   shape=input_shape,
                                   name='inputs')
                y = tf.placeholder(dtype=tf.float32,
                                   shape=(None, n_classes),
                                   name='labels')
                is_training = tf.placeholder_with_default(False,
                                                          shape=(),
                                                          name='is-training')

            # Define model
            with tf.name_scope('Model'):
                model = Model(nb_classes=n_classes,
                              input_shape=input_shape,
                              is_training=is_training)

            # Define forward-pass
            with tf.name_scope('Logits'):
                logits = model.get_logits(x)
            with tf.name_scope('Probs'):
                preds = tf.nn.softmax(logits)

            # Restore the pre-trained model
            with sess.as_default():
                saver = tf.train.Saver()
                saver.restore(sess, ARGS.restore_path + '/model.ckpt')

            # Define accuracy ops
            with tf.name_scope('Accuracy'):
                ground_truth = tf.argmax(y, axis=1)
                predicted_label = tf.argmax(preds, axis=1)
                correct_prediction = tf.equal(predicted_label, ground_truth)
                clean_acc = tf.reduce_mean(tf.to_float(correct_prediction),
                                           name='accuracy')

            # Define PGD adversary
            if ARGS.attack == 'PGD':
                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))

                with tf.name_scope('PGD-Attacker'):
                    pgd_params = {
                        'ord': np.inf,
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'eps_iter': ARGS.eps_iter / 255,
                        'nb_iter': ARGS.nb_iter,
                        'rand_init': ARGS.rand_init,
                        'rand_minmax': ARGS.eps / 255,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'sanity_checks': True
                    }

                    pgd = ProjectedGradientDescent(model, sess=None)
                    adv_x = pgd.generate(x, **pgd_params)

            # Define SPSA adversary
            elif ARGS.attack == 'SPSA':
                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))

                with tf.name_scope('SPSA-Attacker'):
                    spsa_params = {
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'nb_iter': ARGS.nb_iter,
                        'spsa_samples': ARGS.spsa_samples,
                        'spsa_iters': ARGS.spsa_iters,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'learning_rate': ARGS.spsa_lr,
                        'delta': ARGS.spsa_delta
                    }

                    spsa = SPSA(model, sess=sess)
                    adv_x = spsa.generate(x, **spsa_params)
            else:
                raise NotImplementedError

            with tf.name_scope('Logits'):
                adv_logits = model.get_logits(adv_x)
            with tf.name_scope('Probs'):
                adv_preds = tf.nn.softmax(adv_logits)

            adv_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=adv_logits, labels=y)
            adv_predicted_label = tf.argmax(adv_preds, axis=1)
            correct_prediction = tf.equal(adv_predicted_label, ground_truth)
            adv_accuracy = tf.reduce_mean(tf.to_float(correct_prediction),
                                          name='adv-accuracy')
            is_adv_example = tf.not_equal(ground_truth, adv_predicted_label)

    #############################################
    # Run evaluation
    #############################################

    if verbose:
        print('\n- Running robustness evaluation against {:s} attacker...\n'.
              format(ARGS.attack))

    if ARGS.attack == 'PGD':
        clean, adv_mean, adv_worstcase = run_pgd_eval(x,
                                                      y,
                                                      is_training,
                                                      sess,
                                                      adv_testloader,
                                                      clean_acc,
                                                      adv_accuracy,
                                                      adv_loss,
                                                      is_adv_example,
                                                      ARGS,
                                                      save_loss_dist=False,
                                                      verbose=verbose)

    elif ARGS.attack == 'SPSA':
        clean, adv_mean = run_spsa_eval(x,
                                        y,
                                        is_training,
                                        sess,
                                        adv_testloader,
                                        clean_acc,
                                        adv_accuracy,
                                        adv_loss,
                                        is_adv_example,
                                        ARGS,
                                        save_loss_dist=False,
                                        verbose=verbose)
        adv_worstcase = adv_mean
    else:
        raise NotImplementedError

    return clean, adv_mean, adv_worstcase
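
# Hypothetical invocation of eval_robustness(); the Namespace fields mirror the
# ARGS attributes read above, and all values are illustrative only.
#   from argparse import Namespace
#   ARGS = Namespace(attack='PGD', eps=8.0, eps_iter=2.0, nb_iter=40,
#                    rand_init=True, spsa_samples=128, spsa_iters=1,
#                    spsa_lr=0.01, spsa_delta=0.01, restore_path='./checkpoints')
#   clean_acc, adv_mean, adv_worst = eval_robustness(ARGS, verbose=True)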
Example #23
def attack_classifier(sess, x, model, x_test, attack_method="fgsm", target=None, batch_size=128):

    if attack_method == "fgsm":
        from cleverhans.attacks import FastGradientMethod
        params = {'eps': 8/255,
                  'clip_min': 0.,
                  'clip_max': 1.
                 }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = FastGradientMethod(model, sess=sess)

    elif attack_method == "basic_iterative":
        from cleverhans.attacks import BasicIterativeMethod
        params = {'eps': 8./255,
                  'eps_iter': 1./255,
                  'nb_iter': 10,
                  'clip_min': 0.,
                  'clip_max': 1.,
                  'ord': np.inf
                 }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = BasicIterativeMethod(model,sess = sess)

    elif attack_method == "momentum_iterative":
        from cleverhans.attacks import MomentumIterativeMethod
        params = {'eps':8/255,
                  'eps_iter':1/255,
                  'nb_iter': 10,
                  'clip_min': 0.,
                  'clip_max': 1.
                 }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = MomentumIterativeMethod(model,sess = sess)

    elif attack_method == "saliency":
        from cleverhans.attacks import SaliencyMapMethod
        params = {'theta':8/255,
                  'gamma':0.1,
                  'clip_min': 0.,
                  'clip_max': 1.
                 }
        assert target is None
        method = SaliencyMapMethod(model,sess = sess)

    elif attack_method == "virtual":
        from cleverhans.attacks import VirtualAdversarialMethod
        params = {'eps':8/255,
                  'num_iterations':10,
                  'xi' :1e-6,
                  'clip_min': 0.,
                  'clip_max': 1.
                 }
        assert target is None
        method = VirtualAdversarialMethod(model,sess = sess)

    elif attack_method == "cw":
        from cleverhans.attacks import CarliniWagnerL2
        params = {
            "confidence":0,
            "batch_size":128,
            "learning_rate":1e-4,
            "binary_search_steps":10,
            "max_iterations":1000,
            "abort_early": True,
            "initial_const":1e-2,
            "clip_min":0,
            "clip_max":1
        }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = CarliniWagnerL2(model,sess = sess)

    elif attack_method == "elastic_net":
        from cleverhans.attacks import ElasticNetMethod
        params = {
            "fista": "FISTA",
            "beta": 0.1,
            "decision_rule":"EN",
            "confidence":0,
            "batch_size":128,
            "learning_rate":1e-4,
            "binary_search_steps":10,
            "max_iterations":1000,
            "abort_early": True,
            "initial_const":1e-2,
            "clip_min":0,
            "clip_max":1
        }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = ElasticNetMethod(model,sess = sess)

    elif attack_method == "deepfool":
        from cleverhans.attacks import DeepFool
        params = {
            "nb_candidate":10,
            "overshoot":1e-3,
            "max_iter":100,
            "nb_classes":10,
            "clip_min":0,
            "clip_max":1
        }
        assert target is None
        method = DeepFool(model,sess = sess)

    elif attack_method == "lbfgs":
        from cleverhans.attacks import LBFGS
        params = {
            'batch_size':128,
            "binary_search_steps":10,
            "max_iterations":1000,
            "initial_const":1e-2,
            'clip_min': 0.,
            'clip_max': 1.
        }
        assert target is not None
        params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = LBFGS(model,sess = sess)

    elif attack_method == "madry":
        from cleverhans.attacks import MadryEtAl
        params = {'eps':8/255,
                  'eps_iter':1/255,
                  'nb_iter':10,
                  'ord':np.inf,
                  'clip_min': 0.,
                  'clip_max': 1.
                 }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
        method = MadryEtAl(model, sess = sess)

    elif attack_method == "SPSA":
        from cleverhans.attacks import SPSA
        params = {
            'epsilon':1/255,
            'num_steps':10,
            'is_targeted':False,
            'early_stop_loss_threshold':None,
            'learning_rate':0.01,
            'delta':0.01,
            'batch_size':128,
            'spsa_iters':1,
            'is_debug':False
        }
        if target is not None:
            params["y_target"] = tf.constant(np.repeat(np.eye(10)[target:target+1], batch_size, axis = 0))
            params["is_targeted"] = True
        method = SPSA(model, sess = sess)

    else:
        raise ValueError("Can not recognize this attack method: %s" % attack_method)

    adv_x = method.generate(x, **params)
    num_batch = x_test.shape[0] // batch_size
    adv_imgs = []
    for i in range(num_batch):
        x_feed = x_test[i*batch_size:(i+1)*batch_size]
        #y_feed = y_test[i*batch_size:(i+1)*batch_size]

        adv_img = sess.run(adv_x, feed_dict={x: x_feed})
        adv_imgs.append(adv_img)

    adv_imgs = np.concatenate(adv_imgs, axis=0)
    return adv_imgs
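
# Hypothetical usage sketch: `model` is assumed to be a cleverhans-wrapped
# classifier already attached to `sess`, and `x_test` a float array in [0, 1].
#   x = tf.placeholder(tf.float32, shape=(None,) + x_test.shape[1:])
#   adv_imgs = attack_classifier(sess, x, model, x_test, attack_method="madry")
#   targeted = attack_classifier(sess, x, model, x_test, attack_method="cw", target=3)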
Example #24
def get_adv_examples(sess, wrap, attack_type, X, Y):
    """
        Generate adversarial examples.
        :param sess: target model session
        :param wrap: wrapped model
        :param attack_type: attack used to generate the adversarial examples
        :param X: examples to be attacked
        :param Y: correct labels of the examples
        :return: x_adv: adversarial examples
    """
    x = tf.placeholder(tf.float32, shape=(None, X.shape[1], X.shape[2],
                                          X.shape[3]))
    y = tf.placeholder(tf.float32, shape=(None, Y.shape[1]))
    adv_label = np.copy(Y)
    batch_size = 128

    # Define attack method parameters
    if (attack_type == 'fgsm'):
        attack_params = {
            'eps': 0.1,
            'clip_min': 0.,
            'clip_max': 1.
        }
        attack_object = FastGradientMethod(wrap, sess=sess)
    elif (attack_type == 'jsma'):
        attack_params = {
            'theta': 1., 'gamma': 0.1,
            'clip_min': 0., 'clip_max': 1.,
            'y_target': None
        }
        attack_object = SaliencyMapMethod(wrap, sess=sess)
        batch_size = 32
    elif (attack_type == 'cw'):
        attack_params = {
            'binary_search_steps': 1,
            'y': y,
            'max_iterations': 100,
            'learning_rate': .2,
            'batch_size': 128,
            'initial_const': 10
        }
        attack_object = CarliniWagnerL2(wrap, sess=sess)
    elif (attack_type == 'mim'):
        attack_object = MomentumIterativeMethod(wrap, back='tf', sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1., 'eps': 0.1}
    elif (attack_type == 'df'):
        attack_params = {
            'max_iterations': 50,
            'clip_min': 0., 'clip_max': 1.,
            'overshoot': 0.02
        }
        attack_object = DeepFool(wrap, sess=sess)
        batch_size = 64
    elif (attack_type == 'bim'):
        attack_object = BasicIterativeMethod(wrap, back='tf', sess=sess)
        attack_params = {'eps': 0.1, 'eps_iter': 0.05,
                         'nb_iter': 10, 'clip_min': 0.,
                         'clip_max': 1.
                         }
    elif (attack_type == 'vam'):
        attack_object = VirtualAdversarialMethod(wrap, back='tf', sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1., 'nb_iter': 100, 'eps': 2, 'xi': 1e-6}
    elif (attack_type == 'enm'):
        attack_object = ElasticNetMethod(wrap, back='tf', sess=sess)
        attack_params = {'y': y, 'max_iterations': 10, 'batch_size': 128}
    elif (attack_type == 'spsa'):
        attack_object = SPSA(wrap, sess=sess)
        adv_x = attack_object.generate(x=x, y=y, eps=0.1, clip_min=0., clip_max=1., nb_iter=100,
                                       early_stop_loss_threshold=-5.)
        batch_size = 1
    elif (attack_type == 'lbfgs'):
        attack_object = LBFGS(wrap, sess=sess)
        attack_params = {'clip_min': 0, 'clip_max': 1., 'batch_size': 128,
                         'max_iterations': 10, "y_target": y}
        true_label = np.argmax(Y, axis=-1)
        for i in range(len(Y)):
            ind = (true_label[i] + 1) % FLAGS.nb_classes
            adv_label[i] = np.zeros([FLAGS.nb_classes])
            adv_label[i, ind] = 1
    if (attack_type != 'spsa'):
        adv_x = attack_object.generate(x, **attack_params)

    # Get adversarial examples
    if (attack_type == 'lbfgs'):
        x_adv = get_adv(sess, x, y, adv_x, X, adv_label, batch_size=batch_size)
    else:
        x_adv = get_adv(sess, x, y, adv_x, X, Y, batch_size=batch_size)
    return x_adv
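
# get_adv is not shown in this snippet. A plausible minimal sketch consistent
# with how it is called above (hypothetical, not the project's actual helper):
def get_adv(sess, x, y, adv_x, X, Y, batch_size=128):
    # Evaluate the attack tensor batch by batch and stitch the results together.
    adv_batches = []
    for i in range(0, len(X), batch_size):
        feed = {x: X[i:i + batch_size], y: Y[i:i + batch_size]}
        adv_batches.append(sess.run(adv_x, feed_dict=feed))
    return np.concatenate(adv_batches, axis=0)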
Example #25
def main(type="Resnet", dataset="CIFAR10", attack_type="FGM"):

    size = 256
    eval_params = {'batch_size': 128}

    ############################################# Prepare the Data #####################################################

    if dataset == 'CIFAR10':
        (_, _), (x_test, y_test) = prepare_CIFAR10()
        num_classes = 10
        input_dim = 32
    elif dataset == 'CIFAR100':
        (_, _), (x_test, y_test) = prepare_CIFAR100()
        num_classes = 100
        input_dim = 32
    else:
        (_, _), (x_test, y_test) = prepare_SVHN("./Data/")
        num_classes = 10
        input_dim = 32

    x_test = x_test / 255.
    y_test = keras.utils.to_categorical(y_test, num_classes)

    ############################################# Prepare the Data #####################################################


    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:

        # prepare the placeholders
        x = tf.placeholder(tf.float32, [None, input_dim, input_dim, 3])
        y = tf.placeholder(tf.float32, [None, num_classes])

        input_output = []
        def modelBuilder(x, num_classes, dataset, type, sess, input_output):

            if len(input_output) == 0:

                reuse = False

                # Model/Graph
                if type == 'End2End':
                    _, tf_model = \
                        prepare_GBP_End2End(num_classes,
                                            inputT=x, sess=sess,
                                            checkpoint_dir='./{}_{}/'.format(dataset, type), reuse=reuse)
                else:

                    _, tf_model = \
                        prepare_Resnet(num_classes,
                                       inputT=x, sess=sess,
                                       checkpoint_dir='./{}_{}/'.format(dataset, type), reuse=reuse)

                input_output.append(x)
                input_output.append(tf_model.logits)

            else:

                reuse = True

                # Model/Graph
                if type == 'End2End':
                    _, tf_model = \
                        prepare_GBP_End2End(num_classes, inputT=x, reuse=reuse)
                else:
                    _, tf_model = \
                        prepare_Resnet(num_classes, inputT=x, reuse=reuse)

                input_output.append(x)
                input_output.append(tf_model.logits)


            return tf_model.logits

        # create an attackable model for the cleverhans
        model = CallableModelWrapper(lambda placeholder: modelBuilder(placeholder, num_classes, dataset, type, sess, input_output), 'logits')
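        # CallableModelWrapper adapts a plain `placeholder -> logits` callable
        # to the cleverhans Model interface; the 'logits' tag tells the library
        # which kind of output the callable returns.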

        # TODO: check the configurations
        if attack_type == "FGM": # pass
            attack = FastGradientMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "CWL2": # pass
            attack = CarliniWagnerL2(model, back='tf', sess=sess)
            params = {
                'confidence': 0.9,
                'batch_size': 128,
                'learning_rate': 0.005,
            }
        elif attack_type == "DF": # pass
            attack = DeepFool(model, back='tf', sess=sess)
            params = {
            }
        elif attack_type == "ENM": # configurations checked, quickly tested
            attack = ElasticNetMethod(model, back='tf', sess=sess)
            params = {
                'confidence': 0.9,
                'batch_size': 128,
                'learning_rate': 0.005,
            }
        elif attack_type == "FFA": # configuration checked
            attack = FastFeatureAdversaries(model, back='tf', sess=sess)
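            # Note: FFA's generate(x, g, ...) additionally needs a guide-image
            # tensor and a 'layer' argument naming the feature layer to match,
            # so the generic generate(x, **params) call below won't suffice.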
            params = {
                'eps': 0.06,
                'eps_iter': 0.005,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "LBFGS":
            attack = LBFGS(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "MEA":
            attack = MadryEtAl(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "MIM":
            attack = MomentumIterativeMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "SMM":
            attack = SaliencyMapMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "SPSA":
            attack = SPSA(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "VATM":
            attack = vatm(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        elif attack_type == "VAM":
            attack = VirtualAdversarialMethod(model, back='tf', sess=sess)
            params = {
                'eps': 0.06,
                'clip_min': 0.,
                'clip_max': 1.
            }
        else:
            raise ValueError("Unrecognized attack type: {}".format(attack_type))

        # tf operation
        adv_x = attack.generate(x, **params)

        # generate the adversarial examples
        adv_vals = sess.run(adv_x, feed_dict={x: x_test[:size]})

        # "adv_vals" may contain NaNs when the attack fails; a failed attack
        # can also leave an input completely unperturbed, so filter both out.
        to_delete = []
        for idx, adv in enumerate(adv_vals):
            # NaN entries
            if np.isnan(adv).any():
                to_delete.append(idx)
            # unperturbed entries
            elif np.array_equiv(adv, x_test[idx]):
                to_delete.append(idx)

        # drop the failed examples
        adv_vals_cleaned = np.delete(adv_vals, to_delete, axis=0)
        ori_cleaned = np.delete(x_test[:size], to_delete, axis=0)
        y_cleaned = np.delete(y_test[:size], to_delete, axis=0)

        if len(adv_vals_cleaned) == 0:
            print("No adversarial example is generated!")
            return

        print("{} out of {} adversarial examples are generated.".format(len(adv_vals_cleaned), size))

        print("The average L_inf distortion is {}".format(
            np.mean([np.max(np.abs(adv - ori_cleaned[idx])) for idx, adv in enumerate(adv_vals_cleaned)])))
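        # For eps-bounded attacks (e.g. FGM with eps=0.06) this average should
        # stay at or below eps; unbounded attacks such as CW-L2 can exceed it.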

        # TODO: visualize the adv_vals

        accuracy = model_eval(sess, input_output[0], y, tf.nn.softmax(input_output[1]), x_test[:size], y_test[:size],
                              args=eval_params)
        print('Test accuracy on normal examples: %0.4f' % accuracy)

        accuracy = model_eval(sess, input_output[0], y, tf.nn.softmax(input_output[1]), adv_vals_cleaned, y_cleaned,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
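
A hypothetical entry point for the script above (the keyword names simply
mirror main's signature; any CLI wiring in the original is not shown):

if __name__ == "__main__":
    # Hypothetical driver: evaluate one attack on one dataset/model pair.
    main(type="Resnet", dataset="CIFAR10", attack_type="FGM")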
Example no. 26
			if Cfg.useCleverHans:
				# Initialize CleverHans
				print("Initializing CleverHans")
				if attackMethod == "FGSM":
					print("Using FGSM attack method!")
					attack = FastGradientMethod(model=model, sess=sess)
				elif attackMethod == "LBFGS":
					print("Using LBFGS attack method!")
					attack = LBFGS(model=model, sess=sess)
				elif attackMethod == "CarliniWagnerL2":
					print("Using Carlini and Wagner attack method!")
					attack = CarliniWagnerL2(model=model, sess=sess)
				elif attackMethod == "SPSA":
					print("Using SPSA attack method!")
					attack = SPSA(model=model, sess=sess)
				elif attackMethod == "MadryEtAl":
					print("Using Madry et al. attack method!")
					attack = MadryEtAl(model=model, sess=sess)
				elif attackMethod == "ElasticNet":
					print("Using Elastic Net attack method!")
					attack = ElasticNetMethod(model=model, sess=sess)
				elif attackMethod == "DeepFool":
					print("Using Deep Fool attack method!")
					attack = DeepFool(model=model, sess=sess)
				elif attackMethod == "MomentumIterative":
					print("Using Momentum Iterative attack method!")
					attack = MomentumIterativeMethod(model=model, sess=sess)
				elif attackMethod == "BasicIterative":
					print("Using Basic Iterative attack method!")
					attack = BasicIterativeMethod(model=model, sess=sess)