Example #1
    def test_generate_np_high_confidence_targeted_examples(self):

        trivial_model = TrivialModel()

        for CONFIDENCE in [0, 2.3]:
            x_val = np.random.rand(10, 1) - .5
            x_val = np.array(x_val, dtype=np.float32)

            feed_labs = np.zeros((10, 2))
            feed_labs[np.arange(10), np.random.randint(0, 2, 10)] = 1
            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            x_adv = attack.generate_np(x_val,
                                       max_iterations=100,
                                       binary_search_steps=2,
                                       learning_rate=1e-2,
                                       initial_const=1,
                                       clip_min=-10,
                                       clip_max=10,
                                       confidence=CONFIDENCE,
                                       y_target=feed_labs,
                                       batch_size=10)

            new_labs = self.sess.run(trivial_model.get_logits(x_adv))

            good_labs = new_labs[np.arange(10), np.argmax(feed_labs, axis=1)]
            bad_labs = new_labs[np.arange(10),
                                1 - np.argmax(feed_labs, axis=1)]

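            # The attack should leave a logit margin of exactly
            # CONFIDENCE between the target class and the other class.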
            self.assertClose(CONFIDENCE,
                             np.min(good_labs - bad_labs),
                             atol=1e-1)
            self.assertTrue(
                np.mean(np.argmax(new_labs, axis=1) ==
                        np.argmax(feed_labs, axis=1)) > .9)
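
The TrivialModel used above is a fixture from the cleverhans test suite and is not shown on this page. A minimal sketch of such a fixture, assuming a fixed linear map from the 1-D input to two logits (the names and weights here are illustrative, not the library's exact code):

import tensorflow as tf
from cleverhans.model import Model


class TrivialModel(Model):
    """Sketch of a two-class linear fixture: logits = x @ [[1, -1]]."""

    def __init__(self, scope='trivial', nb_classes=2, **kwargs):
        del kwargs
        Model.__init__(self, scope, nb_classes, locals())

    def fprop(self, x, **kwargs):
        del kwargs
        # One fixed weight row maps the scalar input to two logits of
        # opposite sign, so the decision boundary sits at x == 0.
        w = tf.constant([[1., -1.]], dtype=tf.float32)
        logits = tf.matmul(x, w)
        return {self.O_LOGITS: logits,
                self.O_PROBS: tf.nn.softmax(logits)}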
Example #2
    def test_generate_np_high_confidence_untargeted_examples(self):

        trivial_model = TrivialModel()

        for CONFIDENCE in [0, 2.3]:
            x_val = np.random.rand(10, 1) - .5
            x_val = np.array(x_val, dtype=np.float32)

            orig_labs = np.argmax(
                self.sess.run(trivial_model.get_logits(x_val)), axis=1)
            attack = CarliniWagnerL2(trivial_model, sess=self.sess)
            x_adv = attack.generate_np(x_val,
                                       max_iterations=100,
                                       binary_search_steps=2,
                                       learning_rate=1e-2,
                                       initial_const=1,
                                       clip_min=-10,
                                       clip_max=10,
                                       confidence=CONFIDENCE,
                                       batch_size=10)

            new_labs = self.sess.run(trivial_model.get_logits(x_adv))

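            # "good" logits belong to the class each input should be
            # pushed to; "bad" logits belong to the original class.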
            good_labs = new_labs[np.arange(10), 1 - orig_labs]
            bad_labs = new_labs[np.arange(10), orig_labs]

            self.assertTrue(
                np.mean(np.argmax(new_labs, axis=1) == orig_labs) == 0)
            self.assertTrue(
                np.isclose(0,
                           np.min(good_labs - (bad_labs + CONFIDENCE)),
                           atol=1e-1))
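
These methods run inside a CleverHansTest-style case, which supplies self.sess and the non-standard assertClose helper. A rough sketch of that base class, assuming it mirrors cleverhans.devtools.checks.CleverHansTest:

import unittest

import numpy as np


class CleverHansTest(unittest.TestCase):
    """Sketch of the base test case the examples above extend."""

    def setUp(self):
        super(CleverHansTest, self).setUp()
        np.random.seed(1234)  # make the random inputs reproducible

    def assertClose(self, x, y, *args, **kwargs):
        # A bare assert shows both operands on failure, which
        # assertTrue(np.allclose(x, y)) would not.
        assert np.allclose(x, y, *args, **kwargs), (x, y)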
Example #3
def main(argv):

  model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

  if model_file is None:
    print('No model found')
    sys.exit()

  cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

  nb_classes = 10
  X_test = cifar.eval_data.xs
  Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
  assert Y_test.shape[1] == nb_classes

  set_log_level(logging.DEBUG)

  with tf.Session() as sess:

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    from madry_cifar10_model import make_madry_wresnet
    model = make_madry_wresnet()

    saver = tf.train.Saver()

    # Restore the checkpoint
    saver.restore(sess, model_file)

    nb_samples = FLAGS.nb_samples

    attack_params = {'batch_size': FLAGS.batch_size,
                     'clip_min': 0., 'clip_max': 255.}

    if FLAGS.attack_type == 'cwl2':
      from cleverhans_copy.attacks import CarliniWagnerL2
      attacker = CarliniWagnerL2(model, sess=sess)
      attack_params.update({'binary_search_steps': 1,
                            'max_iterations': 100,
                            'learning_rate': 0.1,
                            'initial_const': 10,
                            'batch_size': 10
                            })
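      # Note: this overrides the batch_size taken from FLAGS above.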

    else:  # eps and eps_iter in range 0-255
      attack_params.update({'eps': 8, 'ord': np.inf})
      if FLAGS.attack_type == 'fgsm':
        from cleverhans_copy.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, sess=sess)

      elif FLAGS.attack_type == 'pgd':
        attack_params.update({'eps_iter': 2, 'nb_iter': 20})
        from cleverhans_copy.attacks import MadryEtAl
        attacker = MadryEtAl(model, sess=sess)

    eval_par = {'batch_size': FLAGS.batch_size}

    if FLAGS.sweep:
      max_eps = 16
      epsilons = np.linspace(1, max_eps, max_eps)
      # Time the whole sweep, not just the final epsilon.
      t1 = time.time()
      for e in epsilons:
        attack_params.update({'eps': e})
        x_adv = attacker.generate(x, **attack_params)
        preds_adv = model.get_probs(x_adv)
        acc = model_eval(sess, x, y, preds_adv,
                         X_test[:nb_samples], Y_test[:nb_samples],
                         args=eval_par)
        print('Epsilon %.2f, accuracy on adversarial examples %0.4f\n'
              % (e, acc))
      t2 = time.time()
    else:
      t1 = time.time()
      x_adv = attacker.generate(x, **attack_params)
      preds_adv = model.get_probs(x_adv)
      acc = model_eval(sess, x, y, preds_adv,
                       X_test[:nb_samples], Y_test[:nb_samples],
                       args=eval_par)
      t2 = time.time()
      print('Test accuracy on adversarial examples %0.4f\n' % acc)
    print("Took", t2 - t1, "seconds")
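
main() reads its configuration from a FLAGS object declared elsewhere in the script. A sketch of how those flags could be defined with the TF1 flags API (the flag names come from the body above; the default values are placeholders, not the script's originals):

import tensorflow as tf

flags = tf.app.flags
FLAGS = flags.FLAGS

flags.DEFINE_string('checkpoint_dir', 'models/adv_trained',
                    'Directory containing the model checkpoint')
flags.DEFINE_string('dataset_dir', 'cifar10_data',
                    'Directory containing the CIFAR-10 dataset')
flags.DEFINE_integer('nb_samples', 1000, 'Number of test inputs to attack')
flags.DEFINE_integer('batch_size', 100, 'Batch size for evaluation')
flags.DEFINE_string('attack_type', 'fgsm',
                    "Attack to run: 'fgsm', 'pgd' or 'cwl2'")
flags.DEFINE_bool('sweep', False, 'Sweep epsilon from 1 to 16')

if __name__ == '__main__':
    tf.app.run(main)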
Example #4
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      targeted=TARGETED):
  """
  MNIST tutorial for Carlini and Wagner's attack
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param viz_enabled: (boolean) activate plots of adversarial examples
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param source_samples: number of test inputs to attack
  :param learning_rate: learning rate for training
  :param attack_iterations: number of attack iterations
  :param model_path: path to the model file
  :param targeted: should we run a targeted attack? or untargeted?
  :return: an AccuracyReport object
  """
  # Object used to keep track of (and return) key accuracies
  report = AccuracyReport()

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  # Create TF session
  sess = tf.Session()
  print("Created TensorFlow session.")

  set_log_level(logging.DEBUG)

  # Get MNIST test data
  x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                train_end=train_end,
                                                test_start=test_start,
                                                test_end=test_end)

  # Obtain Image Parameters
  img_rows, img_cols, nchannels = x_train.shape[1:4]
  nb_classes = y_train.shape[1]

  # Define input TF placeholder
  x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                        nchannels))
  y = tf.placeholder(tf.float32, shape=(None, nb_classes))
  nb_filters = 64

  # Define TF model graph
  model = ModelBasicCNN('model1', nb_classes, nb_filters)
  preds = model.get_logits(x)
  loss = CrossEntropy(model, smoothing=0.1)
  print("Defined TensorFlow model graph.")

  ###########################################################################
  # Training the model using TensorFlow
  ###########################################################################

  # Train an MNIST model
  train_params = {
      'nb_epochs': nb_epochs,
      'batch_size': batch_size,
      'learning_rate': learning_rate,
      'filename': os.path.split(model_path)[-1]
  }

  rng = np.random.RandomState([2017, 8, 30])
  # check if we've trained before, and if we have, use that pre-trained model
  if os.path.exists(model_path + ".meta"):
    tf_model_load(sess, model_path)
  else:
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)
    saver = tf.train.Saver()
    saver.save(sess, model_path)

  # Evaluate the accuracy of the MNIST model on legitimate test examples
  eval_params = {'batch_size': batch_size}
  accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
  assert x_test.shape[0] == test_end - test_start, x_test.shape
  print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
  report.clean_train_clean_eval = accuracy

  ###########################################################################
  # Craft adversarial examples using Carlini and Wagner's approach
  ###########################################################################
  nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
  print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
        ' adversarial examples')
  print("This could take some time ...")

  # Instantiate a CW attack object
  cw = CarliniWagnerL2(model, back='tf', sess=sess)

  if viz_enabled:
    assert source_samples == nb_classes
    idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)]
  if targeted:
    if viz_enabled:
      # Initialize our array for grid visualization
      grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                    nchannels)
      grid_viz_data = np.zeros(grid_shape, dtype='f')

      adv_inputs = np.array(
          [[instance] * nb_classes for instance in x_test[idxs]],
          dtype=np.float32)
    else:
      adv_inputs = np.array(
          [[instance] * nb_classes for
           instance in x_test[:source_samples]], dtype=np.float32)

    one_hot = np.zeros((nb_classes, nb_classes))
    one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

    adv_inputs = adv_inputs.reshape(
        (source_samples * nb_classes, img_rows, img_cols, nchannels))
    adv_ys = np.array([one_hot] * source_samples,
                      dtype=np.float32).reshape((source_samples *
                                                 nb_classes, nb_classes))
    yname = "y_target"
  else:
    if viz_enabled:
      # Initialize our array for grid visualization
      grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
      grid_viz_data = np.zeros(grid_shape, dtype='f')

      adv_inputs = x_test[idxs]
    else:
      adv_inputs = x_test[:source_samples]

    adv_ys = None
    yname = "y"

  if targeted:
    cw_params_batch_size = source_samples * nb_classes
  else:
    cw_params_batch_size = source_samples
  cw_params = {'binary_search_steps': 1,
               yname: adv_ys,
               'max_iterations': attack_iterations,
               'learning_rate': CW_LEARNING_RATE,
               'batch_size': cw_params_batch_size,
               'initial_const': 10}

  adv = cw.generate_np(adv_inputs, **cw_params)

  eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
  if targeted:
    adv_accuracy = model_eval(
        sess, x, y, preds, adv, adv_ys, args=eval_params)
  else:
    if viz_enabled:
      err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params)
      adv_accuracy = 1 - err
    else:
      err = model_eval(sess, x, y, preds, adv, y_test[:source_samples],
                       args=eval_params)
      adv_accuracy = 1 - err

  if viz_enabled:
    for j in range(nb_classes):
      if targeted:
        for i in range(nb_classes):
          grid_viz_data[i, j] = adv[i * nb_classes + j]
      else:
        grid_viz_data[j, 0] = adv_inputs[j]
        grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

  print('--------------------------------------')

  # Compute the number of adversarial examples that were successfully found
  print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
  report.clean_train_adv_eval = 1. - adv_accuracy

  # Compute the average L2 distortion introduced by the attack
  mean_l2_perturbation = np.mean(np.sum((adv - adv_inputs)**2,
                                        axis=(1, 2, 3))**.5)
  print('Avg. L_2 norm of perturbations {0:.4f}'.format(
      mean_l2_perturbation))

  # Close TF session
  sess.close()

  # Finally, block & display a grid of all the adversarial examples
  if viz_enabled:
    import matplotlib.pyplot as plt
    _ = grid_visual(grid_viz_data)

  return report
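
The default arguments in the signature above (VIZ_ENABLED, NB_EPOCHS, and so on) are module-level constants defined elsewhere in the tutorial script. A sketch with assumed values:

import os

VIZ_ENABLED = True          # plot a grid of adversarial examples
NB_EPOCHS = 6               # training epochs for the MNIST model
BATCH_SIZE = 128
SOURCE_SAMPLES = 10         # number of test inputs to attack
LEARNING_RATE = 0.001       # training learning rate
CW_LEARNING_RATE = 0.2      # learning rate inside the CW attack
ATTACK_ITERATIONS = 100
MODEL_PATH = os.path.join('models', 'mnist')
TARGETED = True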
Example #5
    def setUp(self):
        super(TestCarliniWagnerL2, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = CarliniWagnerL2(self.model, sess=self.sess)
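
SimpleModel, like TrivialModel in the earlier examples, is a test fixture not shown on this page. A hypothetical stand-in, assuming a small fixed-weight two-layer network over 2-D inputs:

import tensorflow as tf
from cleverhans.model import Model


class SimpleModel(Model):
    """Sketch of a tiny fixed-weight two-class network (the weights
    are arbitrary illustrative values)."""

    def __init__(self, scope='simple', nb_classes=2, **kwargs):
        del kwargs
        Model.__init__(self, scope, nb_classes, locals())

    def fprop(self, x, **kwargs):
        del kwargs
        # Fixed hidden layer followed by a fixed output layer.
        w1 = tf.constant([[1.5, 0.3], [-2.0, 0.3]], dtype=tf.float32)
        w2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
        h1 = tf.nn.sigmoid(tf.matmul(x, w1))
        logits = tf.matmul(h1, w2)
        return {self.O_LOGITS: logits,
                self.O_PROBS: tf.nn.softmax(logits)}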