Example #1
0
def main(_):
    """Run the sample attack"""
    # Images for inception classifier are normalized to be in [-1, 1] interval,
    # eps is a difference between pixels so it should be in [0, 2] interval.
    # Renormalizing epsilon from [0, 255] to [0, 2].
    eps = 2.0 * FLAGS.max_epsilon / 255.0
    batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3]
    nb_classes = 1001

    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default():
        # Prepare graph
        x_input = tf.placeholder(tf.float32, shape=batch_shape)

        model = InceptionModel(nb_classes)

        fgsm = FastGradientMethod(model)
        x_adv = fgsm.generate(x_input, eps=eps, clip_min=-1., clip_max=1.)

        # Run computation
        saver = tf.train.Saver(slim.get_model_variables())
        session_creator = tf.train.ChiefSessionCreator(
            scaffold=tf.train.Scaffold(saver=saver),
            checkpoint_filename_with_path=FLAGS.checkpoint_path,
            master=FLAGS.master)

        with tf.train.MonitoredSession(
                session_creator=session_creator) as sess:
            for filenames, images in load_images(FLAGS.input_dir, batch_shape):
                adv_images = sess.run(x_adv, feed_dict={x_input: images})
                save_images(adv_images, filenames, FLAGS.output_dir)
Example #2
0
def main(argv):
  checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

  if checkpoint is None:
    raise ValueError("Couldn't find latest checkpoint in " +
                     FLAGS.checkpoint_dir)

  train_start = 0
  train_end = 60000
  test_start = 0
  test_end = 10000
  X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                train_end=train_end,
                                                test_start=test_start,
                                                test_end=test_end)

  assert Y_train.shape[1] == 10

  # NOTE: for compatibility with Madry Lab downloadable checkpoints,
  # we cannot enclose this in a scope or do anything else that would
  # change the automatic naming of the variables.
  model = MadryMNIST()

  x_input = tf.placeholder(tf.float32, shape=[None, 784])
  x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
  y = tf.placeholder(tf.float32, shape=[None, 10])

  if FLAGS.attack_type == 'fgsm':
    fgsm = FastGradientMethod(model)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x_image, **fgsm_params)
  elif FLAGS.attack_type == 'bim':
    bim = BasicIterativeMethod(model)
    bim_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                  'nb_iter': 50,
                  'eps_iter': .01}
    adv_x = bim.generate(x_image, **bim_params)
  else:
    raise ValueError(FLAGS.attack_type)
  preds_adv = model.get_probs(adv_x)

  saver = tf.train.Saver()

  with tf.Session() as sess:
    # Restore the checkpoint
    saver.restore(sess, checkpoint)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': FLAGS.batch_size}
    t1 = time.time()
    acc = model_eval(
        sess, x_image, y, preds_adv, X_test, Y_test, args=eval_par)
    t2 = time.time()
    print("Took", t2 - t1, "seconds")
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
Example #3
0
def get_logits_over_interval(sess,
                             model,
                             x_data,
                             fgsm_params,
                             min_epsilon=-10.,
                             max_epsilon=10.,
                             num_points=21):
    """Get logits when the input is perturbed in an interval in adv direction.

  Args:
      sess: Tf session
      model: Model for which we wish to get logits.
      x_data: Numpy array corresponding to single data.
              point of shape [height, width, channels].
      fgsm_params: Parameters for generating adversarial examples.
      min_epsilon: Minimum value of epsilon over the interval.
      max_epsilon: Maximum value of epsilon over the interval.
      num_points: Number of points used to interpolate.

  Returns:
      Numpy array containing logits.

  Raises:
      ValueError if min_epsilon is larger than max_epsilon.
  """
    # Get the height, width and number of channels
    height = x_data.shape[0]
    width = x_data.shape[1]
    channels = x_data.shape[2]

    x_data = np.expand_dims(x_data, axis=0)
    import tensorflow as tf
    from src.FGSM.cleverhans.cleverhans.attacks import FastGradientMethod

    # Define the data placeholder
    x = tf.placeholder(dtype=tf.float32,
                       shape=[1, height, width, channels],
                       name='x')
    # Define adv_x
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)

    if min_epsilon > max_epsilon:
        raise ValueError('Minimum epsilon is less than maximum epsilon')

    eta = tf.nn.l2_normalize(adv_x - x, dim=0)
    epsilon = tf.reshape(
        tf.lin_space(float(min_epsilon), float(max_epsilon), num_points),
        (num_points, 1, 1, 1))
    lin_batch = x + epsilon * eta
    logits = model.get_logits(lin_batch)
    with sess.as_default():
        log_prob_adv_array = sess.run(logits, feed_dict={x: x_data})
    return log_prob_adv_array
def cifar10_tutorial(train_start=0,
                     train_end=60000,
                     test_start=0,
                     test_end=10000,
                     nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE,
                     learning_rate=LEARNING_RATE,
                     clean_train=CLEAN_TRAIN,
                     testing=False,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS,
                     num_threads=None,
                     label_smoothing=0.1):
    """
  CIFAR10 cleverhans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get CIFAR10 data
    data = CIFAR10(train_start=train_start,
                   train_end=train_end,
                   test_start=test_start,
                   test_end=test_end)
    dataset_size = data.x_train.shape[0]
    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(
        lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)
    x_train, y_train = data.get_set('train')
    x_test, y_test = data.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelAllConvolutional('model1',
                                      nb_classes,
                                      nb_filters,
                                      input_shape=[32, 32, 3])
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              None,
              None,
              dataset_train=dataset_train,
              dataset_size=dataset_size,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelAllConvolutional('model2',
                                   nb_classes,
                                   nb_filters,
                                   input_shape=[32, 32, 3])
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the atacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess,
          loss2,
          None,
          None,
          dataset_train=dataset_train,
          dataset_size=dataset_size,
          evaluate=evaluate2,
          args=train_params,
          rng=rng,
          var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
Example #5
0
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   train_dir=TRAIN_DIR,
                   filename=FILENAME,
                   load_model=LOAD_MODEL,
                   testing=False,
                   label_smoothing=0.1):
    """
  MNIST CleverHans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param train_dir: Directory storing the saved model
  :param filename: Filename to save model under
  :param load_model: True for load, False for not load
  :param testing: if true, test error is calculated
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows,
                      img_cols=img_cols,
                      channels=nchannels,
                      nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        #        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds_adv,
                         x_train,
                         y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows,
                        img_cols=img_cols,
                        channels=nchannels,
                        nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess,
          loss_2,
          x_train,
          y_train,
          evaluate=evaluate_2,
          args=train_params,
          rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   train_end=-1,
                   test_end=-1,
                   learning_rate=LEARNING_RATE):
    """
  MNIST cleverhans tutorial
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :return: an AccuracyReport object
  """
    # Train a pytorch MNIST model
    torch_model = PytorchMnistModel()
    if torch.cuda.is_available():
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        'data', train=True, download=True, transform=transforms.ToTensor()),
                                               batch_size=batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        'data', train=False, transform=transforms.ToTensor()),
                                              batch_size=batch_size)

    # Truncate the datasets so that our test run more quickly
    train_loader.dataset.train_data = train_loader.dataset.train_data[:
                                                                      train_end]
    test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end]

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)
    train_loss = []

    total = 0
    correct = 0
    step = 0
    for _epoch in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if torch.cuda.is_available():
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()  # calc gradients
            train_loss.append(loss.data.item())
            optimizer.step()  # update gradients

            preds_np = preds.cpu().detach().numpy()
            correct += (np.argmax(preds_np,
                                  axis=1) == ys.cpu().detach().numpy()).sum()
            total += train_loader.batch_size
            step += 1
            if total % 1000 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if torch.cuda.is_available():
            xs, ys = xs.cuda(), ys.cuda()

        preds = torch_model(xs)
        preds_np = preds.cpu().detach().numpy()

        correct += (np.argmax(preds_np,
                              axis=1) == ys.cpu().detach().numpy()).sum()
        total += len(xs)

    acc = float(correct) / total
    report.clean_train_clean_eval = acc
    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # We use tf for evaluation on adversarial data
    sess = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(
        None,
        1,
        28,
        28,
    ))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(torch_model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an FGSM attack
    fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Run an evaluation of our model against fgsm
    total = 0
    correct = 0
    for xs, ys in test_loader:
        adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
        correct += (np.argmax(adv_preds,
                              axis=1) == ys.cpu().detach().numpy()).sum()
        total += test_loader.batch_size

    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc
    return report
def evaluate_model(filepath,
                   train_start=0, train_end=60000, test_start=0,
                   test_end=10000, batch_size=128,
                   testing=False, num_threads=None):
  """
  Run evaluation on a saved model
  :param filepath: path to model to evaluate
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param batch_size: size of evaluation batches
  """

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  # Set logging level to see debug information
  set_log_level(logging.INFO)

  # Create TF session
  if num_threads:
    config_args = dict(intra_op_parallelism_threads=1)
  else:
    config_args = {}
  sess = tf.Session(config=tf.ConfigProto(**config_args))

  # Get MNIST test data
  mnist = MNIST(train_start=train_start, train_end=train_end,
                test_start=test_start, test_end=test_end)
  x_train, y_train = mnist.get_set('train')
  x_test, y_test = mnist.get_set('test')

  # Use Image Parameters
  img_rows, img_cols, nchannels = x_train.shape[1:4]
  nb_classes = y_train.shape[1]

  # Define input TF placeholder
  x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                        nchannels))
  y = tf.placeholder(tf.float32, shape=(None, nb_classes))

  eval_params = {'batch_size': batch_size}
  fgsm_params = {
      'eps': 0.3,
      'clip_min': 0.,
      'clip_max': 1.
  }

  def do_eval(preds, x_set, y_set, report_key, is_adv=None):
    acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
    if is_adv is None:
      report_text = None
    elif is_adv:
      report_text = 'adversarial'
    else:
      report_text = 'legitimate'
    if report_text:
      print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

  with sess.as_default():
    model = load(filepath)
  assert len(model.get_params()) > 0

  # Initialize the Fast Gradient Sign Method (FGSM) attack object and
  # graph
  fgsm = FastGradientMethod(model, sess=sess)
  adv_x = fgsm.generate(x, **fgsm_params)
  preds_adv = model.get_logits(adv_x)
  preds = model.get_logits(x)

  # Evaluate the accuracy of the MNIST model on adversarial examples
  do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False)
  do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
Example #8
0
def dknn_tutorial():
    # Get MNIST data.
    mnist = MNIST()
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters.
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    with tf.Session() as sess:
        with tf.variable_scope('dknn'):
            # Define input TF placeholder.
            x = tf.placeholder(tf.float32,
                               shape=(None, img_rows, img_cols, nchannels))
            y = tf.placeholder(tf.float32, shape=(None, nb_classes))

            # Define a model.
            model = make_basic_picklable_cnn()
            preds = model.get_logits(x)
            loss = CrossEntropy(model, smoothing=0.)

            # Define the test set accuracy evaluation.
            def evaluate():
                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds,
                                 x_test,
                                 y_test,
                                 args={'batch_size': FLAGS.batch_size})
                print('Test accuracy on test examples: %0.4f' % acc)

            # Train the model
            train_params = {
                'nb_epochs': FLAGS.nb_epochs,
                'batch_size': FLAGS.batch_size,
                'learning_rate': FLAGS.lr
            }
            train(sess,
                  loss,
                  x_train,
                  y_train,
                  evaluate=evaluate,
                  args=train_params,
                  var_list=model.get_params())

            # Define callable that returns a dictionary of all activations for a dataset
            def get_activations(data):
                data_activations = {}
                for layer in layers:
                    layer_sym = tf.layers.flatten(model.get_layer(x, layer))
                    data_activations[layer] = batch_eval(
                        sess, [x], [layer_sym], [data],
                        args={'batch_size': FLAGS.batch_size})[0]
                return data_activations

            # Use a holdout of the test set to simulate calibration data for the DkNN.
            train_data = x_train
            train_labels = np.argmax(y_train, axis=1)
            cali_data = x_test[:FLAGS.nb_cali]
            y_cali = y_test[:FLAGS.nb_cali]
            cali_labels = np.argmax(y_cali, axis=1)
            test_data = x_test[FLAGS.nb_cali:]
            y_test = y_test[FLAGS.nb_cali:]

            # Extract representations for the training and calibration data at each layer of interest to the DkNN.
            layers = ['ReLU1', 'ReLU3', 'ReLU5', 'logits']

            # Wrap the model into a DkNNModel
            dknn = DkNNModel(FLAGS.neighbors,
                             layers,
                             get_activations,
                             train_data,
                             train_labels,
                             nb_classes,
                             scope='dknn')
            dknn.calibrate(cali_data, cali_labels)

            # Generate adversarial examples
            fgsm = FastGradientMethod(model, sess=sess)
            attack_params = {'eps': .25, 'clip_min': 0., 'clip_max': 1.}
            adv = sess.run(fgsm.generate(x, **attack_params),
                           feed_dict={x: test_data})

            # Test the DkNN on clean test data and FGSM test data
            for data_in, fname in zip([test_data, adv], ['test', 'adv']):
                dknn_preds = dknn.fprop_np(data_in)
                print(dknn_preds.shape)
                print(
                    np.mean(
                        np.argmax(dknn_preds, axis=1) == np.argmax(y_test,
                                                                   axis=1)))
                plot_reliability_diagram(dknn_preds, np.argmax(y_test, axis=1),
                                         '/tmp/dknn_' + fname + '.pdf')

    return True
Example #9
0
class CommonAttackProperties(CleverHansTest):
  """
  Abstract base class shared by the tessts for many attacks that want
  to check the same properties.
  """

  def setUp(self):
    # Inheritance doesn't really work with tests.
    # nosetests always wants to run this class because it is a
    # CleverHansTest subclass, but this class is meant to just
    # be abstract.
    # Before this class was the tests for FastGradientMethod but
    # people kept inheriting from it for other attacks so it was
    # impossible to write tests specifically for FastGradientMethod.
    # pylint: disable=unidiomatic-typecheck
    if type(self) is CommonAttackProperties:
      raise SkipTest()

    super(CommonAttackProperties, self).setUp()
    self.sess = tf.Session()
    self.model = SimpleModel()

  def generate_adversarial_examples_np(self, ord, eps, **kwargs):
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)

    x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord,
                                    clip_min=-5, clip_max=5, **kwargs)
    if ord == np.inf:
      delta = np.max(np.abs(x_adv - x_val), axis=1)
    elif ord == 1:
      delta = np.sum(np.abs(x_adv - x_val), axis=1)
    elif ord == 2:
      delta = np.sum(np.square(x_adv - x_val), axis=1) ** .5

    return x_val, x_adv, delta

  def help_generate_np_gives_adversarial_example(self, ord, eps=.5,
                                                 **kwargs):
    x_val, x_adv, delta = self.generate_adversarial_examples_np(ord, eps,
                                                                **kwargs)
    self.assertLess(np.max(np.abs(delta-eps)), 1e-3)
    orig_labs = np.argmax(self.sess.run(self.model.get_logits(x_val)), axis=1)
    new_labs = np.argmax(self.sess.run(self.model.get_logits(x_adv)), axis=1)
    self.assertLess(np.max(np.mean(orig_labs == new_labs)), .5)

  def test_invalid_input(self):
    x_val = -np.ones((2, 2), dtype='float32')
    with self.assertRaises(tf.errors.InvalidArgumentError) as context:
      self.attack.generate_np(x_val, eps=1., clip_min=0., clip_max=1.)
    self.assertTrue(context.exception)

  def test_generate_np_gives_adversarial_example_linfinity(self):
    self.help_generate_np_gives_adversarial_example(np.infty)

  def test_generate_np_gives_adversarial_example_l1(self):
    self.help_generate_np_gives_adversarial_example(1)

  def test_generate_np_gives_adversarial_example_l2(self):
    self.help_generate_np_gives_adversarial_example(2)

  def test_generate_respects_dtype(self):
    self.attack = FastGradientMethod(self.model, sess=self.sess,
                                     dtypestr='float64')
    x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
    x_adv = self.attack.generate(x)
    self.assertEqual(x_adv.dtype, tf.float64)

  def test_targeted_generate_np_gives_adversarial_example(self):
    random_labs = np.random.random_integers(0, 1, 100)
    random_labs_one_hot = np.zeros((100, 2))
    random_labs_one_hot[np.arange(100), random_labs] = 1

    try:
      _, x_adv, delta = self.generate_adversarial_examples_np(
          eps=.5, ord=np.inf, y_target=random_labs_one_hot)
    except NotImplementedError:
      raise SkipTest()

    self.assertLessEqual(np.max(delta), 0.5001)

    new_labs = np.argmax(self.sess.run(self.model.get_logits(x_adv)), axis=1)
    self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

  def test_generate_np_can_be_called_with_different_eps(self):
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)

    for eps in [0.1, 0.2, 0.3, 0.4]:
      x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                      clip_min=-5.0, clip_max=5.0)

      delta = np.max(np.abs(x_adv - x_val), axis=1)
      self.assertLessEqual(np.max(delta), eps+1e-4)

  def test_generate_can_be_called_with_different_eps(self):
    # It is crtical that this test uses generate and not generate_np.
    # All the other tests use generate_np. Even though generate_np calls
    # generate, it does so in a very standardized way, e.g. with eps
    # always converted to a tensorflow placeholder, so the other tests
    # based on generate_np do not exercise the generate API very well.
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)
    x = tf.placeholder(tf.float32, x_val.shape)

    for eps in [0.1, 0.2, 0.3, 0.4]:
      x_adv = self.attack.generate(x, eps=eps, ord=np.inf,
                                   clip_min=-5.0, clip_max=5.0)
      x_adv = self.sess.run(x_adv, feed_dict={x: x_val})

      delta = np.max(np.abs(x_adv - x_val), axis=1)
      self.assertLessEqual(np.max(delta), eps + 1e-4)

  def test_generate_np_clip_works_as_expected(self):
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)

    x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                    clip_min=-0.2, clip_max=0.1,
                                    sanity_checks=False)

    self.assertClose(np.min(x_adv), -0.2)
    self.assertClose(np.max(x_adv), 0.1)
Example #10
0
    graph = tf.get_default_graph()
    phase_train_placeholder = graph.get_tensor_by_name("phase_train:0")
    feed_dict = {model.face_input: faces2,
                 phase_train_placeholder: False}
    victims_embeddings = sess.run(
        model.embedding_output, feed_dict=feed_dict)

    # Define FGSM for the model
    steps = 1
    eps = 0.01
    alpha = eps / steps
    fgsm = FastGradientMethod(model)
    fgsm_params = {'eps': alpha,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(model.face_input, **fgsm_params)

    # Run FGSM
    adv = faces1
    for i in range(steps):
      print("FGSM step " + str(i + 1))
      feed_dict = {model.face_input: adv,
                   model.victim_embedding_input: victims_embeddings,
                   phase_train_placeholder: False}
      adv = sess.run(adv_x, feed_dict=feed_dict)

    # Test accuracy of the model
    batch_size = graph.get_tensor_by_name("batch_size:0")

    feed_dict = {model.face_input: faces1,
                 model.victim_embedding_input: victims_embeddings,
Example #11
0
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS,
                   holdout=HOLDOUT,
                   data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S,
                   lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
  MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :return: a dictionary with:
           * black-box model accuracy on test set
           * substitute model accuracy on test set
           * black-box model accuracy on adversarial examples transferred
             from the substitute model
  """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate, rng,
                              nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda, aug_batch_size, rng, img_rows, img_cols,
                              nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model.get_logits(x_adv_sub),
                          x_test,
                          y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS,
                   num_threads=None,
                   label_smoothing=0.1):
    """
  MNIST cleverhans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """
    Run the evaluation and print the results.
    """
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = make_basic_picklable_cnn()
        # Tag the model so that when it is saved to disk, future scripts will
        # be able to tell what data it was trained on
        model.dataset_factory = mnist.get_factory()
        preds = model.get_logits(x)
        assert len(model.get_params()) > 0
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            """
      Run evaluation for the naively trained model on clean examples.
      """
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        with sess.as_default():
            save("clean_model.joblib", model)

            print("Now that the model has been saved, you can evaluate it in a"
                  " separate process using `evaluate_pickled_model.py`. "
                  "You should get exactly the same result for both clean and "
                  "adversarial accuracy as you get within this program.")

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = make_basic_picklable_cnn()
    # Tag the model so that when it is saved to disk, future scripts will
    # be able to tell what data it was trained on
    model2.dataset_factory = mnist.get_factory()
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        """Return an adversarial example near clean example `x`"""
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the atacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate_adv():
        """
    Evaluate the adversarially trained model.
    """
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess,
          loss2,
          x_train,
          y_train,
          evaluate=evaluate_adv,
          args=train_params,
          rng=rng,
          var_list=model2.get_params())

    with sess.as_default():
        save("adv_model.joblib", model2)
        print(
            "Now that the model has been saved, you can evaluate it in a "
            "separate process using "
            "`python evaluate_pickled_model.py adv_model.joblib`. "
            "You should get exactly the same result for both clean and "
            "adversarial accuracy as you get within this program."
            " You can also move beyond the tutorials directory and run the "
            " real `compute_accuracy.py` script (make sure cleverhans/scripts "
            "is in your PATH) to see that this FGSM-trained "
            "model is actually not very robust---it's just a model that trains "
            " quickly so the tutorial does not take a long time")

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
Example #13
0
def main(argv):

    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from src.FGSM.cleverhans.cleverhans.model_zoo.madry_lab_challenges.cifar10_model import make_wresnet
        model = make_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {
            'batch_size': FLAGS.batch_size,
            'clip_min': 0.,
            'clip_max': 255.
        }

        if FLAGS.attack_type == 'cwl2':
            from src.FGSM.cleverhans.cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'batch_size': 10
            })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from src.FGSM.cleverhans.cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)

            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from src.FGSM.cleverhans.cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            for e in epsilons:
                t1 = time.time()
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds_adv,
                                 X_test[:nb_samples],
                                 Y_test[:nb_samples],
                                 args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_test[:nb_samples],
                             Y_test[:nb_samples],
                             args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")