Example #1
    def _init_data(self):
        hparams = self.hparams
        batch_size = hparams.batch_size
        if hparams.dataset == 'mnist':
            # Get MNIST test data
            X_train, Y_train, X_test, Y_test = data_mnist(
                train_start=hparams.train_start,
                train_end=hparams.train_end,
                test_start=hparams.test_start,
                test_end=hparams.test_end)
            input_shape = (batch_size, 28, 28, 1)
            preproc_func = None
        elif hparams.dataset == 'cifar10':
            X_train, Y_train, X_test, Y_test = cifar_input.read_CIFAR10(
                os.path.join(hparams.data_path, hparams.dataset))
            input_shape = (batch_size, 32, 32, 3)
            preproc_func = cifar_input.cifar_tf_preprocess
        elif hparams.dataset == 'svhn':
            X_train, Y_train, X_test, Y_test = svhn_input.read_SVHN(
                os.path.join(hparams.data_path, hparams.dataset))
            input_shape = (batch_size, 32, 32, 3)
            preproc_func = svhn_input.svhn_tf_preprocess
        else:
            raise ValueError('Unknown dataset: {}'.format(hparams.dataset))

        # Use label smoothing
        assert Y_train.shape[1] == 10
        label_smooth = .1
        Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)
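        # With 10 classes, the clip lowers the true-class entry from 1 to
        # 0.9 and raises each of the nine zero entries to 0.1/9, so every
        # row still sums to 1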

        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.Y_test = Y_test
        self.data = (X_train, Y_train, X_test, Y_test)
        self.input_shape = input_shape
        self.preproc_func = preproc_func
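
A minimal sketch of the hparams object that _init_data reads. Only the fields used above are included; the concrete values and the Namespace container are illustrative assumptions, not part of the original code:

from argparse import Namespace

# Hypothetical hyperparameter bundle covering exactly the attributes
# _init_data accesses (dataset, batch_size, the split indices, data_path).
hparams = Namespace(dataset='mnist', batch_size=128,
                    train_start=0, train_end=60000,
                    test_start=0, test_end=10000,
                    data_path='/tmp/data')
# For 'mnist', _init_data would set input_shape to (128, 28, 28, 1)
# and leave preproc_func as None.
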
Example #2
def main(argv):
    checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if checkpoint is None:
        raise ValueError("Couldn't find latest checkpoint in " +
                         FLAGS.checkpoint_dir)

    train_start = 0
    train_end = 60000
    test_start = 0
    test_end = 10000
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    assert Y_train.shape[1] == 10

    # NOTE: for compatibility with Madry Lab downloadable checkpoints,
    # we cannot enclose this in a scope or do anything else that would
    # change the automatic naming of the variables.
    model = MadryMNIST()

    x_input = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    y = tf.placeholder(tf.float32, shape=[None, 10])

    if FLAGS.attack_type == 'fgsm':
        fgsm = FastGradientMethod(model)
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
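        # eps=0.3 is the customary L-infinity budget for MNIST pixels
        # scaled to [0, 1]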
        adv_x = fgsm.generate(x_image, **fgsm_params)
    elif FLAGS.attack_type == 'bim':
        bim = BasicIterativeMethod(model)
        bim_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                      'nb_iter': 50,
                      'eps_iter': .01}
        adv_x = bim.generate(x_image, **bim_params)
    else:
        raise ValueError(FLAGS.attack_type)
    preds_adv = model.get_probs(adv_x)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, checkpoint)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': FLAGS.batch_size}
        t1 = time.time()
        acc = model_eval(
            sess, x_image, y, preds_adv, X_test, Y_test, args=eval_par)
        t2 = time.time()
        print("Took", t2 - t1, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
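
main() assumes module-level FLAGS with checkpoint_dir, attack_type, and batch_size. A minimal sketch of those definitions, with illustrative defaults (the flag names come from the code above; everything else is an assumption):

import tensorflow as tf

# Hypothetical flag definitions matching what main() reads.
FLAGS = tf.flags.FLAGS
tf.flags.DEFINE_string('checkpoint_dir', '/tmp/madry_mnist',
                       'Directory containing the downloaded checkpoint')
tf.flags.DEFINE_string('attack_type', 'fgsm', "One of 'fgsm' or 'bim'")
tf.flags.DEFINE_integer('batch_size', 128, 'Batch size for evaluation')
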
Example #3
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=NB_FILTERS, num_threads=None,
                   attack_string=None):
  """
  MNIST cleverhans tutorial
  :param train_start: index of first training set example.
  :param train_end: index of last training set example.
  :param test_start: index of first test set example.
  :param test_end: index of last test set example.
  :param nb_epochs: number of epochs to train model.
  :param batch_size: size of training batches.
  :param learning_rate: learning rate for training.
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate.
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param nb_filters: number of filters in the CNN used for training.
  :param num_threads: number of threads used for running the process.
  :param attack_string: attack name for crafting adversarial examples and
                        adversarial training, in string format.
  :return: an AccuracyReport object
  """

  # Object used to keep track of (and return) key accuracies
  report = AccuracyReport()

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  # Set logging level to see debug information
  set_log_level(logging.DEBUG)

  # Get MNIST test data
  X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                train_end=train_end,
                                                test_start=test_start,
                                                test_end=test_end)

  # Use label smoothing
  assert Y_train.shape[1] == 10
  label_smooth = .1
  Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

  # Train an MNIST model
  model_path = "models/mnist"
  train_params = {
      'nb_epochs': nb_epochs,
      'batch_size': batch_size,
      'learning_rate': learning_rate
  }

  # Initialize the attack object
  attack_class = attack_selection(attack_string)
  attack_params = {'eps': 0.3, 'clip_min': 0.,
                   'clip_max': 1.}

  rng = np.random.RandomState([2018, 6, 18])
  if clean_train:
    model = ModelBasicCNNTFE(nb_filters=nb_filters)

    def evaluate_clean():
      """Evaluate the accuracy of the MNIST model on legitimate test
      examples
      """
      eval_params = {'batch_size': batch_size}
      acc = model_eval(model, X_test, Y_test, args=eval_params)
      report.clean_train_clean_eval = acc
      assert X_test.shape[0] == test_end - test_start, X_test.shape
      print('Test accuracy on legitimate examples: %0.4f' % acc)

    train(model, X_train, Y_train, evaluate=evaluate_clean,
          args=train_params, rng=rng, var_list=model.get_params())

    if testing:
      # Calculate training error
      eval_params = {'batch_size': batch_size}
      acc = model_eval(model, X_train, Y_train, args=eval_params)
      report.train_clean_train_clean_eval = acc

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    attack = attack_class(model)
    acc = model_eval(
        model, X_test, Y_test, args=eval_par,
        attack=attack, attack_args=attack_params)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculate training error
    if testing:
      eval_par = {'batch_size': batch_size}
      acc = model_eval(
          model, X_train, Y_train, args=eval_par,
          attack=attack, attack_args=attack_params)
      print('Train accuracy on adversarial examples: %0.4f\n' % acc)
      report.train_clean_train_adv_eval = acc

    # Drop references to the first model and attack so fresh variables are
    # created for the adversarially trained model below
    model = None
    attack = None
    print("Repeating the process, using adversarial training")

  model_adv_train = ModelBasicCNNTFE(nb_filters=nb_filters)
  attack = attack_class(model_adv_train)

  def evaluate_adv():
    # Accuracy of adversarially trained model on legitimate test inputs
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(
        model_adv_train, X_test, Y_test,
        args=eval_params)
    print('Test accuracy on legitimate examples: %0.4f' % accuracy)
    report.adv_train_clean_eval = accuracy
    # Accuracy of the adversarially trained model on adversarial examples
    accuracy = model_eval(
        model_adv_train, X_test, Y_test,
        args=eval_params, attack=attack,
        attack_args=attack_params)
    print('Test accuracy on adversarial examples: %0.4f' % accuracy)
    report.adv_train_adv_eval = accuracy

  # Perform and evaluate adversarial training
  train(model_adv_train, X_train, Y_train, evaluate=evaluate_adv,
        args=train_params, rng=rng,
        var_list=model_adv_train.get_params(),
        attack=attack, attack_args=attack_params)

  # Calculate training errors
  if testing:
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(
        model_adv_train, X_train, Y_train, args=eval_params,
        attack=None, attack_args=None)
    report.train_adv_train_clean_eval = accuracy
    accuracy = model_eval(
        model_adv_train, X_train, Y_train, args=eval_params,
        attack=attack, attack_args=attack_params)
    report.train_adv_train_adv_eval = accuracy
  return report
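
A hedged usage sketch for this TF-Eager tutorial. It assumes the surrounding module defines attack_selection and that 'FGSM' is among the attack names it accepts:

# Quick smoke run: two epochs of clean pre-training followed by
# adversarial training, evaluated with FGSM.
report = mnist_tutorial(nb_epochs=2, batch_size=128,
                        clean_train=True, attack_string='FGSM')
print('adversarially trained model, adversarial accuracy:',
      report.adv_train_adv_eval)
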
Example #4
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimension ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an MNIST model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Reuse the same model: because model_2 aliases model, the adversarial
    # training below fine-tunes the already trained network rather than a
    # fresh copy (the commented lines show the fresh-model alternative)
    # model_2 = cnn_model()
    # model_2 = regression_model()
    model_2 = model
    predictions_2 = model_2(x)

    # adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    adv_x_2 = adv_x
    predictions_2_adv = model_2(adv_x_2)

    # fgsm2 = FastGradientMethod(model_2, sess=sess)
    # predictions_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: %0.4f' % accuracy)

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy_adv)

    # Perform and evaluate adversarial training
    train_params['nb_epochs'] = 30

    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv, evaluate=evaluate_2,
                args=train_params)

    # After adversarial training, this run reported:
    # Test accuracy on legitimate test examples: 0.9483
    # Test accuracy on adversarial examples: 0.8989

    # Evaluate on the adversarial examples generated earlier, before
    # adversarial training (reported accuracy: 0.1844)
    accuracy = model_eval(sess, x, y, predictions_2, X_test_adv, Y_test,
                          args=eval_params)

    print('Test accuracy on adversarial examples: ' + str(accuracy))
Example #5
def main():
    """
    MNIST cleverhans tutorial
    :return:
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', '-b', default=1000, type=int,
                        help='Size of training batches')
    parser.add_argument('--train_dir', '-d', default='/tmp',
                        help='Directory storing the saved model.')
    parser.add_argument('--filename', '-f', default='mnist.ckpt',
                        help='Filename to save model under.')
    parser.add_argument('--nb_epochs', '-e', default=6, type=int,
                        help='Number of epochs to train model')
    parser.add_argument('--nb_iters', '-i', default=10000, type=int,
                        help='Number of iterations for crafting '
                             'adversarial examples')
    parser.add_argument('--learning_rate', '-lr', default=0.1, type=float,
                        help='Learning rate for training')
    parser.add_argument('--eps', default=0.01, type=float,
                        help='Epsilon for Carlini L2 Attack')
    parser.add_argument('--kappa', default=0.01, type=float,
                        help='Kappa (confidence margin) for Carlini L2 Attack')
    parser.add_argument('--c', default=20, type=float,
                        help='Trade-off constant c for Carlini L2 Attack')
    parser.add_argument('--load', default=None, type=str,
                        help='Model path to load')
    parser.add_argument('--dump', default=None, type=str,
                        help='Model path to dump')
    args = parser.parse_args()

    np.random.seed(126)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input Theano placeholder
    x_shape = (None, 1, 28, 28)
    y_shape = (None, 10)
    x = T.tensor4('x')
    y = T.matrix('y')
    
    if args.load:
        model = pickle.load(open(args.load, "rb"))
        predictions = model(x)
    else:
        # Define Theano model graph
        model = model_mnist()
        model.build(x_shape)
        predictions = model(x)
        print("Defined Theano model graph.")

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate
            # test examples
            accuracy = th_model_eval(x, y, predictions, X_test, Y_test,
                                     args=args)
            assert X_test.shape[0] == 10000, X_test.shape
            print('Test accuracy on legitimate test examples: ' +
                  str(accuracy))

        # Train an MNIST model
        th_model_train(x, y, predictions, model.trainable_weights,
                       X_train, Y_train, evaluate=evaluate, args=args)

        if args.dump:
            pickle.dump(model, open(args.dump, "wb"))

    # Craft adversarial examples using the Carlini & Wagner L2 attack
    carlini_L2(x, predictions, X_test, Y_test, eps=args.eps, kappa=args.kappa,
               c=args.c, nb_iters=args.nb_iters, batch_size=args.batch_size)
Example #6
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.001,
                   nb_epochs=10,
                   holdout=150,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)
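    # The substitute training code expects integer class labels rather than
    # one-hot rows, hence the argmax over the holdout labels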

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    wrap = KerasModelWrapper(model_sub)
    fgsm = FastGradientMethod(wrap, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model(x_adv_sub),
                          X_test,
                          Y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
Example #7
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS,
                      batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      targeted=TARGETED):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of iterations the attack runs for
    :param model_path: path to the model file
    :param targeted: (boolean) run a targeted attack if True, untargeted
                     otherwise
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
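        # adv_inputs now repeats each source image nb_classes times, and
        # adv_ys pairs every copy with a distinct one-hot target class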
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': CW_LEARNING_RATE,
        'batch_size': (source_samples * nb_classes
                       if targeted else source_samples),
        'initial_const': 10
    }

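    # binary_search_steps=1 with a fixed initial_const trades attack strength
    # for speed; more search steps would tune the trade-off constant per input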
    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[idxs], args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Report the rate at which adversarial examples were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average L2 distortion introduced by the attack
    # (despite its name, percent_perturbed holds a mean L2 norm)
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
Example #8
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimension ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculate training error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds_adv,
                         X_train,
                         Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                save=False)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
Example #9
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params,
          rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
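    # theta=1. perturbs each selected feature fully toward clip_max, and
    # gamma=0.1 caps the attack at modifying 10% of the input features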

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the rate of successfully found adversarial examples
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations[results == 1])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Example #10
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """
    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'th':
        keras.backend.set_image_dim_ordering('th')
        print "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to 'tf', temporarily setting to 'th'"

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print "Created TensorFlow session and set Keras backend."

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print "Loaded MNIST test data."

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 1, 28, 28))
    y = tf.placeholder(tf.float32, shape=(None, FLAGS.nb_classes))

    # Define TF model graph
    model = model_mnist()
    predictions = model(x)
    print "Defined TensorFlow model graph."

    # Train an MNIST model
    tf_model_train(sess, x, y, predictions, X_train, Y_train)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test, Y_test)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test_adv, Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = model_mnist()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    # Perform adversarial training
    tf_model_train(sess,
                   x,
                   y,
                   predictions_2,
                   X_train,
                   Y_train,
                   predictions_adv=predictions_2_adv)

    # Evaluate the accuracy of the adversarially trained MNIST model on
    # legitimate test examples
    accuracy = tf_model_eval(sess, x, y, predictions_2, X_test, Y_test)
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM) on
    # the new model, which was trained using adversarial training
    X_test_adv_2, = batch_eval(sess, [x], [adv_x_2], [X_test])
    assert X_test_adv_2.shape[0] == 10000, X_test_adv_2.shape

    # Evaluate the accuracy of the adversarially trained MNIST model on
    # adversarial examples
    accuracy_adv = tf_model_eval(sess, x, y, predictions_2, X_test_adv_2,
                                 Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy_adv))
Example #11
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.001,
                   nb_epochs=10,
                   holdout=150,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1,
                   aug_batch_size=512):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Initialize substitute training set reserved for adversary
    X_sub = x_test[:holdout]
    Y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate, rng,
                              nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda, aug_batch_size, rng, img_rows, img_cols,
                              nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model.get_logits(x_adv_sub),
                          x_test,
                          y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
Example #12
from cleverhans.utils_mnist import data_mnist
from array_to_image import array_to_image
import numpy as np

X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                              train_end=60000,
                                              test_start=0,
                                              test_end=10000)
# visualization for MNIST
array_to_image(X_train[0:5], "mnist")

# visualization for MNIST's adversarial examples
adv_image = np.load("adv_image_FGM.npy")
array_to_image(adv_image[0:5], "adv_image_FGM")

# visualization for Cifar-10's adversarial examples
adv_image = np.load("adv_image_FGM_cifar10.npy")
array_to_image(adv_image[0:5], "adv_image_cifar10", channels=3, size=32)
Example #13
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=0.1, method='FGSM'):
    """
    MNIST tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :param method: attack used to craft adversarial examples: 'FGSM', 'BIM',
                   or 'DeepFool'
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    print('y_train: ', y_train.shape)

    x_train = np.pad(x_train, ((0, 0), (2, 2), (2, 2), (0, 0)), mode='constant')
    x_test = np.pad(x_test, ((0, 0), (2, 2), (2, 2), (0, 0)), mode='constant')
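    # Zero-pad the 28x28 digits by two pixels on each side, giving the
    # 32x32 inputs these models expect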

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    print('img_rows: {}, img_cols: {}, nchannels: {}'.format(
        img_rows, img_cols, nchannels))
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    if train_dir == 'mnist_ff_model':
        model = mnist_ff_model()
    elif train_dir == 'mnist_BP_model':
        model = mnist_model(img_rows=img_rows, img_cols=img_cols,
                            channels=nchannels, nb_filters=64,
                            nb_classes=nb_classes)
    else:
        raise ValueError('Unknown train_dir: {}'.format(train_dir))
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
#        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)
        print('Training done!')

    # Calculate training error
    print('testing param:', testing)
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the chosen attack object and build its graph
    if method == 'FGSM':
        clw = FastGradientMethod(wrap, sess=sess)
    elif method == 'BIM':
        clw = BasicIterativeMethod(wrap, sess=sess)
    elif method == 'DeepFool':
        clw = DeepFool(wrap, sess=sess)
    else:
        raise NotImplementedError
    print('method chosen: ', method)
    clw_params = {}
    adv_x = clw.generate(x, **clw_params)
    with sess.as_default():
        feed_dict = {x: x_test, y: y_test}
        store_data = adv_x.eval(feed_dict=feed_dict)
        print('store_data: {}'.format(store_data.shape))
        save_name = '{}/mnist_{}_data.pkl'.format(train_dir, method)
        with open(save_name, 'wb') as fw:
            pickle.dump(store_data, fw, protocol=2)
            print('data stored as {}'.format(save_name))


    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)

    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc


    return report
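
Note: the example above pickles the crafted adversarial inputs with protocol 2
(readable from both Python 2 and 3). A minimal sketch of loading the file back,
assuming the hypothetical train_dir/method values below match the ones used
when it was written:

import pickle

train_dir = 'mnist_BP_model'  # hypothetical; must match the run above
method = 'FGSM'

with open('{}/mnist_{}_data.pkl'.format(train_dir, method), 'rb') as fr:
    store_data = pickle.load(fr)

# One adversarial image per test example; inspect shape and value range.
print(store_data.shape)
print(store_data.min(), store_data.max())
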
Example #14
def minist_fgsm_saliency(
    train_start=0,
    train_end=10,
    test_start=0,
    test_end=5,
    nb_epochs=2,
    batch_size=128,
    learning_rate=0.001,
    clean_train=True,
    testing=False,
    backprop_through_attack=False,
    nb_filters=64,
    nb_classes=10,
    source_samples=10,
):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    # this way, all the 9 zeroes -> 0.1/9 because
    # the one-bit becomes 0.9
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # placeholder for y_target --> for saliency tensor
    y_target = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    ###########################################################################
    # Training the CNN model using TensorFlow: model --> base model
    ###########################################################################
    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model.get_probs(x)

    if clean_train:
        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        ###########################################################################
        # Train the base model, using train_params
        ###########################################################################
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_train,
                             Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        ###########################################################################
        # Generate FGSM Adversarial based on model, and
        # Compute Base Model Accuracy
        ###########################################################################

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)

        # todo: follow the paper and run Cleverhans Output?
        fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}

        #adv_x = fgsm.generate(x, **fgsm_params)
        adv_x = fgsm.generate(x, **fgsm_params_y)
        preds_adv = model.get_probs(adv_x)
        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_train,
                             Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        ###########################################################################
        # Generate Saliency Map Adversarial Example and
        # Compute base model accuracy (only 10)
        ###########################################################################
        print("Saliency Map Attack On The Base Model")
        print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
              ' adversarial examples')

        # Instantiate a SaliencyMapMethod attack object --> modify y_target for each test_data again
        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        jsma_params = {
            'theta': 1.,
            'gamma': 0.1,
            'clip_min': 0.,
            'clip_max': 1.,
            'y_target': None
        }

        # Keep track of success (adversarial example classified in target)
        # Need this info to compute the success rate
        results = np.zeros((nb_classes, source_samples), dtype='i')

        # each sample will get 9 adversarial samples

        # adv_x_set: accumulator for all the adversarial x variations
        # adv_y_target: the one-hot targets used to craft them

        adv_x_set = None
        adv_y_target = None

        # matching stacks of the original samples and their true labels
        x_train_saliency = None
        y_train_saliency = None

        for sample_ind in xrange(0, source_samples):
            print('--------------------------------------')
            print('Saliency Attacking input %i/%i' %
                  (sample_ind + 1, source_samples))
            sample = X_train[sample_ind:(sample_ind + 1)]
            y_sample = Y_train[sample_ind:(sample_ind + 1)]

            current_class = int(np.argmax(Y_train[sample_ind]))
            target_classes = other_classes(nb_classes, current_class)

            # Loop over all target classes
            for target in target_classes:
                print('Generating adv. example for target class %i' % target)

                # Create x_train_saliency, corresponding to y_train_saliency
                if x_train_saliency is not None:
                    x_train_saliency = np.concatenate(
                        (x_train_saliency, sample), axis=0)
                    y_train_saliency = np.concatenate(
                        (y_train_saliency, y_sample), axis=0)
                else:
                    x_train_saliency = sample
                    y_train_saliency = y_sample
                    print("sample shape: ", x_train_saliency.shape)
                    print("y_sample shape: ", y_train_saliency.shape)

                # This call runs the Jacobian-based saliency map approach
                one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
                one_hot_target[0, target] = 1
                jsma_params['y_target'] = one_hot_target

                adv_x_np = jsma.generate_np(sample, **jsma_params)

                # Add to adv_x_set, correct_y_set
                if adv_x_set is not None:
                    adv_y_target = np.concatenate(
                        (adv_y_target, one_hot_target), axis=0)
                    adv_x_set = np.concatenate((adv_x_np, adv_x_set), axis=0)
                else:
                    adv_y_target = one_hot_target
                    adv_x_set = adv_x_np
                    print("adv_y_target shape(one-hot-encoding): ",
                          adv_y_target.shape)
                    print("adv_x_set(np) shape: ", adv_x_np.shape)

                # Check if success was achieved
                res = int(model_argmax(sess, x, preds, adv_x_np) == target)

                # Update the arrays for later analysis
                results[target, sample_ind] = res

        print('--------------------------------------')
        # Compute the number of adversarial examples that were successfully found
        nb_targets_tried = ((nb_classes - 1) * source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print('Avg. rate of successful Saliency adv. examples {0:.4f}'.format(
            succ_rate))
        report.clean_train_adv_eval = 1. - succ_rate

        # here we have successfully stacked up x_adversarial_set, y_correct_set
        # these can be used to provide training to our model now
        print("\n\n\n*****************************")
        print("Checking x_adv_set shape: ", adv_x_set.shape)
        print("Checking correct_y_set shape: ", adv_y_target.shape)

        print("x_training_saliency shape:", x_train_saliency.shape)
        print("y_training_saliency shape:", y_train_saliency.shape)

        # now construct model 3 and define its input -> output tensor
        model_3 = make_basic_cnn(nb_filters=nb_filters)
        # preds_3 is the output tensor for the placeholder input x
        preds_3 = model_3(x)

        # jsma3 = SaliencyMapMethod(model_3, sess=sess)
        #
        # jsma_params = {'theta': 1., 'gamma': 0.1,
        #                'clip_min': 0., 'clip_max': 1.,
        #                'y_target': y_target}
        #
        # # create adv_saliency set tensor, using x_train data and jsma_params containing adv_y_target
        # adv_jsma = jsma3.generate(x, jsma_params)
        # # create adv preds tensor
        # preds_jsma_adv = model_3(adv_jsma)

        # define saliency training model accuracy
        def evaluate_saliency():
            # Accuracy of adversarially trained model on legitimate test inputs
            eval_params = {'batch_size': batch_size}
            accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds_3,
                                  x_train_saliency,
                                  y_train_saliency,
                                  args=eval_params)
            print('Test accuracy on legitimate examples: %0.4f' % accuracy)
            report.adv_train_clean_eval = accuracy

        ###########################################################################
        # MODEL Train for Saliency Map
        ###########################################################################
        # Train model_3 on the clean samples stacked up during the JSMA loop.
        # Note: model_train expects the predictions tensor (preds_3), and
        # evaluate must be passed as a callback, not called.
        model_train(sess,
                    x,
                    y,
                    preds_3,
                    x_train_saliency,
                    y_train_saliency,
                    evaluate=evaluate_saliency,
                    args=train_params,
                    rng=rng)

        #todo: use jsma to create adversarial testing??? or training???

    # Redefine TF model graph for FGSM adversarial training
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)

    # parameter for FGSM
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}
    adv_x_2 = fgsm2.generate(x, **fgsm_params_y)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    ###########################################################################
    # MODEL Train for FGSM
    ###########################################################################
    # Perform and evaluate adversarial training with the FGSM model
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
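
Note: the success-rate bookkeeping in the JSMA loop above reduces to a small
numpy computation over the (nb_classes, source_samples) results matrix. A
self-contained sketch with made-up values:

import numpy as np

nb_classes, source_samples = 10, 10
results = np.zeros((nb_classes, source_samples), dtype='i')
results[3, 0] = 1  # pretend sample 0 was pushed into target class 3

nb_targets_tried = (nb_classes - 1) * source_samples  # 9 targets per sample
succ_rate = float(np.sum(results)) / nb_targets_tried
print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
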
Example #15
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, attack="fgsm", targeted=False):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    X_test = X_test[:FLAGS.n_attack]
    Y_test = Y_test[:FLAGS.n_attack]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    time_start = time.time()
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda)
    model_sub, preds_sub = train_sub_out
    time_end = time.time()
    print("Substitue model training time:", time_end - time_start)

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    print('substitute model accuracy:', acc)

    # Find the correctly predicted labels
    original_predict = batch_eval(sess, [x], [bbox_preds], [X_test],
                                  args=eval_params)[0]
    original_class = np.argmax(original_predict, axis=1)
    true_class = np.argmax(Y_test, axis=1)
    mask = true_class == original_class
    print(np.sum(mask), "out of", mask.size, "are correctly labeled")

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    wrap = KerasModelWrapper(model_sub)


    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}

    if attack == "fgsm":
        attacker_params = {'eps': 0.4, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
        fgsm = FastGradientMethod(wrap, sess=sess)
        x_adv_sub = fgsm.generate(x, **attacker_params)
        attacker = fgsm
        adv_inputs = X_test
        ori_labels = Y_test
        print("Running FGSM attack...")
    else:
        print("Running Carlini and Wagner\'s L2 attack...")
        yname = "y"
        adv_ys = None
        # wrap = KerasModelWrapper(model)
        cwl2 = CarliniWagnerL2(wrap, back='tf', sess=sess)
        attacker_params = {'binary_search_steps': 9,
                           'max_iterations': 2000,
                           'abort_early': True,
                           'learning_rate': 0.01,
                           'batch_size': 1,
                           'initial_const': 0.01,
                           'confidence': 20}
        # generate targeted labels, 9 for each test example
        if targeted:
            adv_ys = []
            targeted_class = []
            for i in range(0, X_test.shape[0]):
                for j in range(0, 10):
                    # skip the original image label
                    if j == np.argmax(Y_test[i]):
                        continue
                    adv_ys.append(np.eye(10)[j])
                    targeted_class.append(j)
            attacker_params['y_target'] = np.array(adv_ys, dtype=np.float32)
            # duplicate the inputs by 9 times
            adv_inputs = np.array([[instance] * 9 for instance in X_test],
                                  dtype=np.float32)
            adv_inputs = adv_inputs.reshape((X_test.shape[0] * 9, 28, 28, 1))
            # also update the mask
            mask = np.repeat(mask, 9)
            ori_labels = np.repeat(Y_test, 9, axis=0)
        else:
            adv_inputs = X_test
            ori_labels = Y_test
        attacker = cwl2

    if attack == "fgsm":
        # Evaluate the accuracy of the "black-box" model on adversarial examples
        accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs, ori_labels,
                              args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex'] = accuracy

    time_start = time.time()
    # Evaluate the accuracy of the "black-box" model on adversarial examples
    x_adv_sub_np = attacker.generate_np(adv_inputs, **attacker_params)
    accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_np, ori_labels,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (NP): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy
    time_end = time.time()
    print('Attack time:', time_end - time_start)

    # Evaluate untargeted (and, if requested, targeted) attack success rates
    bbox_adv_predict = batch_eval(sess, [x], [bbox_preds], [x_adv_sub_np],
                                  args=eval_params)[0]
    bbox_adv_class = np.argmax(bbox_adv_predict, axis=1)
    true_class = np.argmax(ori_labels, axis=1)
    untargeted_success = np.mean(bbox_adv_class != true_class)
    print('Untargeted attack success rate:', untargeted_success)
    accuracies['untargeted_success'] = untargeted_success
    if targeted:
        targeted_success = np.mean(bbox_adv_class == targeted_class)
        print('Targeted attack success rate:', targeted_success)
        accuracies['targeted_success'] = targeted_success

    if attack == "cwl2":
        # Compute the L2 pertubations of generated adversarial examples
        percent_perturbed = np.sum((x_adv_sub_np - adv_inputs)**2, axis=(1, 2, 3))**.5
        # print(percent_perturbed)
        # print('Avg. L_2 norm of perturbations {0:.4f}'.format(np.mean(percent_perturbed)))
        # when computing the mean, removing the failure attacks first
        print('Avg. L_2 norm of perturbations {0:.4f}'.format(np.mean(percent_perturbed[percent_perturbed > 1e-8])))

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, bbox_preds, adv_inputs[mask], ori_labels[mask],
                          args=eval_params)
    print('Test accuracy of excluding originally incorrect labels (should be 1.0): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc_ori'] = accuracy

    if attack == "fgsm":
        # Evaluate the accuracy of the "black-box" model on adversarial examples (excluding correct)
        accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs[mask], ori_labels[mask],
                              args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute (excluding originally incorrect labels): ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    # Evaluate the accuracy of the "black-box" model on adversarial examples (excluding correct)
    x_adv_sub_mask_np = x_adv_sub_np[mask]
    accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_mask_np, ori_labels[mask],
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (excluding originally incorrect labels, NP): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    return accuracies
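
Note: the cwl2 branch above filters near-zero rows out of the perturbation
norms before averaging, since failed attacks (which leave the input unchanged)
would otherwise drag the mean L2 norm down. A standalone numpy sketch of that
computation with stand-in arrays:

import numpy as np

adv_inputs = np.random.rand(5, 28, 28, 1).astype(np.float32)
x_adv_sub_np = adv_inputs.copy()
x_adv_sub_np[0] += 0.1  # pretend only sample 0 was actually perturbed

l2 = np.sum((x_adv_sub_np - adv_inputs) ** 2, axis=(1, 2, 3)) ** .5
print(np.mean(l2[l2 > 1e-8]))  # mean over nonzero (successful) rows
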
Example #16
def main():
    """
    MNIST CleverHans tutorial
    :return:
    """

    if not hasattr(backend, "theano"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the Theano backend.")

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'th':
        keras.backend.set_image_dim_ordering('th')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'tf', temporarily setting to 'th'")

    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size',
                        '-b',
                        default=128,
                        type=int,
                        help='Size of training batches')
    parser.add_argument('--train_dir',
                        '-d',
                        default='/tmp',
                        help='Directory storing the saved model.')
    parser.add_argument('--filename',
                        '-f',
                        default='mnist.ckpt',
                        help='Filename to save model under.')
    parser.add_argument('--nb_epochs',
                        '-e',
                        default=6,
                        type=int,
                        help='Number of epochs to train model')
    parser.add_argument('--learning_rate',
                        '-lr',
                        default=0.5,
                        type=float,
                        help='Learning rate for training')
    args = parser.parse_args()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input Theano placeholder
    x_shape = (None, 1, 28, 28)
    x = T.tensor4('x')
    y = T.matrix('y')

    # Define Theano model graph
    model = cnn_model()
    model.build(x_shape)
    predictions = model(x)
    print("Defined Theano model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        accuracy = th_model_eval(x, y, predictions, X_test, Y_test, args=args)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))
        pass

    # Train an MNIST model
    th_model_train(x,
                   y,
                   predictions,
                   model.trainable_weights,
                   X_train,
                   Y_train,
                   evaluate=evaluate,
                   args=args)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(model, back='th')
    adv_x = fgsm.generate(x, eps=0.3)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = th_model_eval(x, y, model(adv_x), X_test, Y_test, args=args)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine Theano model graph
    model_2 = cnn_model()
    model_2.build(x_shape)
    preds_2 = model_2(x)
    fgsm = FastGradientMethod(model_2, back='th')
    preds_2_adv = model_2(fgsm.generate(x, eps=0.3))

    def evaluate_2():
        # Evaluate the accuracy of the adversarialy trained MNIST model on
        # legitimate test examples
        accuracy = th_model_eval(x, y, preds_2, X_test, Y_test, args=args)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained MNIST model on
        # adversarial examples
        acc_adv = th_model_eval(x, y, preds_2_adv, X_test, Y_test, args=args)
        print('Test accuracy on adversarial examples: ' + str(acc_adv))

    # Perform adversarial training
    th_model_train(x,
                   y,
                   preds_2,
                   model_2.trainable_weights,
                   X_train,
                   Y_train,
                   predictions_adv=preds_2_adv,
                   evaluate=evaluate_2,
                   args=args)
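
Note: every FastGradientMethod instance in these examples performs the same
underlying update with eps=0.3: perturb each input by eps in the direction of
the sign of the loss gradient, then clip to the valid pixel range. A minimal
numpy sketch, with a random array standing in for the gradient:

import numpy as np

eps = 0.3
x = np.random.rand(1, 28, 28, 1).astype(np.float32)      # input batch
grad = np.random.randn(1, 28, 28, 1).astype(np.float32)  # stand-in dL/dx

x_adv = np.clip(x + eps * np.sign(grad), 0., 1.)
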
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: whether to run a targeted attack (True) or an
                     untargeted one (False)
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[idxs]],
                dtype=np.float32)
        else:
            adv_inputs = np.array(
                [[instance] * nb_classes for
                 instance in x_test[:source_samples]], dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape((source_samples *
                                                     nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': source_samples * nb_classes if
                 targeted else source_samples,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs,
                         **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[idxs], args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
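
Note: in the targeted branch above, each source image is replicated nb_classes
times and each copy is paired with a different one-hot target. A toy-sized
numpy sketch of that replication:

import numpy as np

nb_classes, source_samples = 3, 2  # toy sizes
x_src = np.random.rand(source_samples, 28, 28, 1).astype(np.float32)

# Replicate each source image once per class ...
adv_inputs = np.array([[img] * nb_classes for img in x_src], dtype=np.float32)
adv_inputs = adv_inputs.reshape((source_samples * nb_classes, 28, 28, 1))

# ... and pair each copy with a different one-hot target.
adv_ys = np.tile(np.eye(nb_classes, dtype=np.float32), (source_samples, 1))
print(adv_inputs.shape, adv_ys.shape)  # (6, 28, 28, 1) (6, 3)
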
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """
    keras.layers.core.K.set_learning_phase(0)

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # sess = tf.Session()
    keras.backend.set_session(sess)

    if args.ae:
        autoencoder.restore(sess, args.ae) # Restore model weights from previously saved model

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist()

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:FLAGS.holdout]
    Y_sub = np.argmax(Y_test[:FLAGS.holdout], axis=1)

    # Shrink training data.
    # X_train = X_train[:10000]
    # Y_train = Y_train[:10000]
    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[FLAGS.holdout:]
    Y_test = Y_test[FLAGS.holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    print("Preparing the black-box model.")
    if args.alg == "cnn":
        model, bbox = prep_cnn_bbox(sess, x, y, X_train, Y_train, X_test, Y_test)
    elif is_not_nn():
        bbox = prep_boost_bbox(X_train, Y_train, X_test, Y_test)

    print("Training the substitute model.")
    model_sub, preds_sub = train_substitute(sess, x, y, bbox, X_sub, Y_sub)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    print("Crafting the adversarial examples.")
    eval_params = {'batch_size': FLAGS.batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)
    X_test_adv, = batch_eval(sess, [x], [x_adv_sub], [X_test], args=eval_params)

    # Dump adversarial examples.
    example_file = "example/{}.data".format(model_name)
    with open(example_file, "wb") as f:
        pickle.dump(X_test_adv, f)

    if args.ae:
        print("Denoising...")
        num_data = X_test_adv.shape[0]
        autoencoder.visualize(sess, X_test_adv.reshape(num_data, -1), "adv")
        filtered_data = autoencoder.run(sess, X_test_adv.reshape(num_data, -1))
        X_test_adv = filtered_data.reshape(num_data, 28, 28, 1)
    # Evaluate the accuracy of the "black-box" model on adversarial examples
    if args.alg == "cnn":
        accuracy = model_eval(sess, x, y, bbox, X_test_adv, Y_test,
                              args=eval_params)
    elif is_not_nn():
        x_test_adv = X_test_adv.reshape(X_test_adv.shape[0], -1)
        y_test = np.argmax(Y_test, axis=1)
        accuracy = bbox.score(x_test_adv, y_test)

    print("Test adversarial accuracy = {}".format(accuracy))

    log_file = "log/{}.log".format(model_name)
    with open(log_file, "a") as f:
        if args.ae:
            f.write("{}. Test adversarial accuracy = {}\n".format(args.ae, accuracy))
        else:
            f.write("Test adversarial accuracy = {}\n".format(accuracy))
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=True):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: if true, apply label smoothing to the training
                            labels
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes-1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
#        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = LossCrossEntropy(wrap, smoothing=0.1)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
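
Note: passing attack=attack into LossCrossEntropy above is what turns the
second run into adversarial training: the training loss combines cross-entropy
on the clean batch with cross-entropy on its adversarial counterpart (an equal
0.5/0.5 weighting is assumed here). A schematic numpy sketch of the combined
objective:

import numpy as np

def cross_entropy(probs, labels, eps=1e-12):
    return -np.mean(np.sum(labels * np.log(probs + eps), axis=1))

probs_clean = np.full((4, 10), 0.1)  # stand-in model outputs, clean batch
probs_adv = np.full((4, 10), 0.1)    # stand-in outputs on attack(x)
labels = np.eye(10)[[0, 1, 2, 3]]

loss = 0.5 * cross_entropy(probs_clean, labels) \
     + 0.5 * cross_entropy(probs_adv, labels)
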
def main():
    """
    MNIST cleverhans tutorial
    :return:
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', '-b', default=128, type=int, help='Size of training batches')
    parser.add_argument('--train_dir', '-d', default='/tmp', help='Directory storing the saved model.')
    parser.add_argument('--filename', '-f', default='mnist.ckpt', help='Filename to save model under.')
    parser.add_argument('--nb_epochs', '-e', default=6, type=int, help='Number of epochs to train model')
    parser.add_argument('--learning_rate', '-lr', default=0.5, type=float, help='Learning rate for training')
    args = parser.parse_args()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input Theano placeholder
    x_shape = (None, 1, 28, 28)
    y_shape = (None, 10)
    x = T.tensor4('x')
    y = T.matrix('y')

    # Define Theano model graph
    model = model_mnist()
    model.build(x_shape)
    predictions = model(x)
    print("Defined Theano model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        accuracy = th_model_eval(x, y, predictions, X_test, Y_test, args=args)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))
        pass

    # Train an MNIST model
    th_model_train(x, y, predictions, model.trainable_weights, X_train, Y_train, evaluate=evaluate, args=args)


    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval([x], [adv_x], [X_test], args=args)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = th_model_eval(x, y, predictions, X_test_adv, Y_test, args=args)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine Theano model graph
    model_2 = model_mnist()
    model_2.build(x_shape)
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)


    def evaluate_2():
        # Evaluate the accuracy of the adversarialy trained MNIST model on
        # legitimate test examples
        accuracy = th_model_eval(x, y, predictions_2, X_test, Y_test, args=args)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained MNIST model on
        # adversarial examples
        accuracy_adv = th_model_eval(x, y, predictions_2_adv, X_test, Y_test, args=args)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    th_model_train(x, y, predictions_2, model_2.trainable_weights, X_train, Y_train, predictions_adv=predictions_2_adv,
            evaluate=evaluate_2, args=args)
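
Note: the label smoothing used above (and in several other examples here)
turns a one-hot label into a soft target: the true class keeps
1 - label_smooth and the remaining mass is split evenly over the other nine
classes, which is exactly what the clip call does to one-hot inputs. A short
numpy check:

import numpy as np

label_smooth = .1
Y = np.eye(10, dtype=np.float32)[[3]]  # one-hot label for class 3
Y_smooth = Y.clip(label_smooth / 9., 1. - label_smooth)
print(Y_smooth[0, 3])  # 0.9
print(Y_smooth[0, 0])  # 0.0111... = 0.1 / 9
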
Example #21
def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=True,
                        nb_epochs=6,
                        batch_size=128,
                        source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    # (index with results == 1 so failed attempts do not dilute the mean)
    percent_perturb_succ = np.mean(perturbations[results == 1])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.001,
                   nb_epochs=10,
                   holdout=150,
                   nb_filters=64,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1,
                   use_rec_err=True,
                   model_arch=None,
                   model_arch_sub=None,
                   attack_name=None,
                   use_cross_err=None,
                   dataset_name=None,
                   blocking_option=None,
                   opt_type='adam'):
    """
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    get_rec_err = rec_err_fct(use_rec_err, blocking_option)

    merged = None  # XXX: switched off for the moment

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this with a remote labeling API, for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess,
                              x,
                              y,
                              X_train,
                              Y_train,
                              X_test,
                              Y_test,
                              nb_filters,
                              nb_epochs,
                              batch_size,
                              learning_rate,
                              rng=rng,
                              use_rec_err=use_rec_err,
                              model_arch=model_arch,
                              attack_name=attack_name,
                              use_cross_err=use_cross_err,
                              dataset_name=dataset_name,
                              blocking_option=blocking_option,
                              opt_type=opt_type,
                              merged=merged)
    #model, bbox_preds, accuracies['bbox'] = prep_bbox_out
    model, _, _ = prep_bbox_out
    bbox_preds = model(x)

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    model_sub, preds_sub = train_sub(
        sess,
        x,
        y,
        bbox_preds,
        X_sub,
        Y_sub,
        nb_classes,
        nb_epochs_s,
        batch_size,
        learning_rate,
        data_aug,
        lmbda,
        rng=rng,
        model_arch_sub=model_arch_sub,
        merged=merged,
        opt_type=opt_type,
        blocking_option=blocking_option,
    )
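    # For reference, train_sub follows the substitute-training loop from the
    # paper above, alternating substitute training with Jacobian-based
    # dataset augmentation. A minimal sketch of one augmentation step
    # (helper names are illustrative, not the exact cleverhans API):
    #
    #     grads = jacobian(sess, x, preds_sub, Y_sub, X_sub)  # d(logit)/dx
    #     X_new = X_sub + lmbda * np.sign(grads)              # step along sign
    #     Y_new = oracle_labels(X_new)                        # query black box
    #     X_sub = np.vstack([X_sub, X_new])
    #     Y_sub = np.concatenate([Y_sub, Y_new])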

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc_sub = model_eval(sess,
                         x,
                         y,
                         preds_sub,
                         X_test,
                         Y_test,
                         args=eval_params)
    accuracies['sub'] = acc_sub

    # XXX: evaluate the oracle on clean samples after training the substitute
    preds = model(x)
    pre_ae_states, post_ae_states = model.get_ae_states()
    rec_err = get_rec_err(pre_ae_states, post_ae_states)
    accuracy, rec_loss_eval = model_eval(sess,
                                         x,
                                         y,
                                         preds,
                                         X_test,
                                         Y_test,
                                         args=eval_params,
                                         aux_loss_lst=[rec_err])
    print('Test accuracy of oracle on clean examples: ' + str(accuracy))
    print('reconstr. err. of oracle on clean examples: ' + str(rec_loss_eval))
    # XXX: the result should match the pre-substitute evaluation above

    # Initialize the attack
    attack = attacks[attack_name](model_sub, sess=sess)

    x_adv_sub = attack.generate(x, **attack_par[dataset_name][attack_name])
    preds_adv = model(x_adv_sub)
    pre_ae_states_adv, post_ae_states_adv = model.get_ae_states()
    rec_err_adv = get_rec_err(pre_ae_states_adv, post_ae_states_adv)
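    # Note: the adversarial examples are crafted against the substitute
    # (model_sub) but evaluated on the oracle below, i.e. this measures
    # transferability of the attack.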

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy, rec_loss_eval = model_eval(sess,
                                         x,
                                         y,
                                         preds_adv,
                                         X_test,
                                         Y_test,
                                         args=eval_params,
                                         aux_loss_lst=[rec_err_adv])
    print('Test accuracy of oracle on adversarial examples: ' + str(accuracy))
    print('reconstr. err. of oracle on adversarial examples: ' +
          str(rec_loss_eval))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    print('Accuracies', accuracies)

    return accuracies
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :return: an AccuracyReport object
    """
    nb_classes = 10
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4264)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Get MNISTnotMNIST data
    # with np.load("MNISTnotMNIST.npz") as data:
    #     X_train, Y_train, X_test, Y_test = data['X_train'], data['Y_train'], data['X_test'], data['Y_test']

    # Use label smoothing
    # assert Y_train.shape[1] == 10
    # label_smooth = .1
    # Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "./"
    model_name = "clean_trained_mnist_model"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': model_path,
        'filename': model_name
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
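    # FGSM computes x_adv = clip(x + eps * sign(grad_x loss(x, y)), 0, 1);
    # eps=0.3 is a fairly large step for [0, 1]-scaled MNIST pixels.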
    rng = np.random.RandomState([443, 224, 39])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters, nb_classes=nb_classes)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    save=True,
                    args=train_params,
                    rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_train,
                             Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_train,
                             Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    # model_2 = make_basic_cnn(nb_filters=nb_filters)
    # preds_2 = model_2(x)
    # fgsm2 = FastGradientMethod(model_2, sess=sess)
    # adv_x_2 = fgsm2.generate(x, **fgsm_params)
    # if not backprop_through_attack:
    #     # For the fgsm attack used in this tutorial, the attack has zero
    #     # gradient so enabling this flag does not change the gradient.
    #     # For some other attacks, enabling this flag increases the cost of
    #     # training, but gives the defender the ability to anticipate how
    #     # the attacker will change their strategy in response to updates to
    #     # the defender's parameters.
    #     adv_x_2 = tf.stop_gradient(adv_x_2)
    # preds_2_adv = model_2(adv_x_2)
    #
    # def evaluate_2():
    #     # Accuracy of adversarially trained model on legitimate test inputs
    #     eval_params = {'batch_size': batch_size}
    #     accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
    #                           args=eval_params)
    #     print('Test accuracy on legitimate examples: %0.4f' % accuracy)
    #     report.adv_train_clean_eval = accuracy
    #
    #     # Accuracy of the adversarially trained model on adversarial examples
    #     accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
    #                           Y_test, args=eval_params)
    #     print('Test accuracy on adversarial examples: %0.4f' % accuracy)
    #     report.adv_train_adv_eval = accuracy
    #
    # # Perform and evaluate adversarial training
    # model_train(sess, x, y, preds_2, X_train, Y_train,
    #             predictions_adv=preds_2_adv, evaluate=evaluate_2,
    #             args=train_params, rng=rng)
    #
    # # Calculate training errors
    # if testing:
    #     eval_params = {'batch_size': batch_size}
    #     accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
    #                           args=eval_params)
    #     report.train_adv_train_clean_eval = accuracy
    #     accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
    #                           Y_train, args=eval_params)
    #     report.train_adv_train_adv_eval = accuracy

    return report
Example #24
0
def cnn_model(sess=None, debug=False, w=0, rel=2):
    """
    Defines a CNN model using the Keras functional API
    :param sess: TF session to use as the Keras backend session; defaults to
                 the current default session
    :param debug: if True, return the untrained model exposing the
                  pre-softmax logits
    :param w: number of perceptive neurons (only used by commented-out code)
    :param rel: number of relu layers (only used by commented-out code)
    :return: a trained keras.Model
    """
    # Define the layers successively (convolution layers are version dependent)
    # if keras.backend.image_dim_ordering() == 'th':
    #     input_shape = (channels, img_rows, img_cols)
    # else:
    #     input_shape = (img_rows, img_cols, channels)
    #
    # layers = [Reshape((784,),input_shape=(28,28,1)),
    #           Dense(10),
    #           Activation('relu'),
    #           Dense(50),
    #           Activation('relu'),
    #           Dense(nb_classes)]
    inpt = Input(batch_shape=(None, 28, 28, 1))
    k = Reshape((784,))(inpt)
    # k = Activation(activation=binary_filter_tf)(k)  # optional feature squeezing
    
    # if w == 1:
    #     print("No perception ensemble, using only one neuron")
    #     x2 = Activation(micromodel)(k)
    #     x2 = Dropout(0.05)(k)
    # else:
    #
    #     for i in range(w):
    #         percepfilter = Lambda(lambda x: micromodel(x))(k)
    #         percepfilter = Dropout(0.05)(percepfilter)
    #         perceptionlayer.append(percepfilter)
    #     x2 = Concatenate()(perceptionlayer)
    perceptionlayer = []
    g = tf.get_default_graph()
    with g.gradient_override_map({"Sign": "FakeHS"}):
        # for i in range(2):
        #     x = TrainableScaler(activation=Paranograd)(k)
        #     x = Dropout(0.15)(x)
        #     x = TrainableScaler(use_bias=False)(x)
        #     x = Dropout(0.15)(x)
        #     perceptionlayer.append(x)
        #
        # for relusnumber in range(0):
        #     relux = TrainableScaler(activation="relu")(k)
        #     relux = Dropout(0.20)(relux)
        #     relux = TrainableScaler(use_bias=False)(relux)
        #     relux = Dropout(0.20)(relux)
        #     perceptionlayer.append(relux)
        #
        # print(perceptionlayer)
        #
        # x2 = Add()(perceptionlayer)
        # x3 = Multiply()(perceptionlayer)
        # x3 = Lambda(lambda x: -2 * x, output_shape=(784,))(x3)
        # print(x2)
        # print(x3)
        #  x2 = Add()([x2, x3])
        # x2 = TrainableOffset()(x2)
        x = TrainableOffset(bias_constraint=IsNegProb(),
                            bias_initializer=keras.initializers.Constant(value=-0.5),
                            bias_regularizer=antientropy,
                            activation=Heaviside)(k)
        x2 = Reshape((28, 28, 1))(x)
        x2 = Conv2D(64, (6, 6))(x2)
        print(x2)
        x2 = Reshape((-1,))(x2)
        x2 = TrainableOffset(
            bias_initializer=keras.initializers.Constant(value=0.0),
            activation='relu')(x2)
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
        # x2 = Activation('relu')(x2)
        x2 = Reshape((23, 23, 64))(x2)
        x2 = Conv2D(64, (4, 4))(x2)
        x2 = Flatten()(x2)
        x2 = Dense(40, activation="relu")(x2)
        # x2 = Dense(400, activation=Paranograd)(x2)
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
        x2 = Dense(10)(x2)
    predictions2 = Activation(activation="softmax")(x2)
    
    model = keras.Model(inputs=inpt, outputs=predictions2)
    
    if debug:
        # Expose the pre-softmax logits for inspection
        return keras.Model(inputs=inpt, outputs=x2)
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    if sess is None:
        sess = tf.get_default_session()
    K.set_session(sess)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(np.array(X_train), np.array(Y_train), batch_size=128, epochs=3,
              verbose=1)
    print(model.evaluate(np.array(X_test), np.array(Y_test)))
    print(model.get_weights())
    return model
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64,
                   num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=num_threads)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = make_basic_picklable_cnn()
        preds = model.get_logits(x)
        assert len(model.get_params()) > 0
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        with sess.as_default():
            save("clean_model.joblib", model)
            # Now that the model has been saved, you can evaluate it in a
            # separate process using `evaluate_pickled_model.py`.
            # You should get exactly the same result for both clean and
            # adversarial accuracy as you get within this program.

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = make_basic_picklable_cnn()
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
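    # Passing `attack` to CrossEntropy makes the loss a mix of clean and
    # adversarial cross-entropy (weighted equally by default), which is what
    # trains model2 to resist the attack it is paired with.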
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess,
          loss2,
          x,
          y,
          x_train,
          y_train,
          evaluate=evaluate2,
          args=train_params,
          rng=rng,
          var_list=model2.get_params())

    with sess.as_default():
        save("adv_model.joblib", model2)
        # Now that the model has been saved, you can evaluate it in a
        # separate process using `evaluate_pickled_model.py`.
        # You should get exactly the same result for both clean and
        # adversarial accuracy as you get within this program.

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
Example #26
0
def unprotected_network(sess=None, debug=False, train=False, save=False):
    inpt = Input(batch_shape=(None, 28, 28, 1))
    x2 = Conv2D(32, (3, 3), activation='relu')(inpt)
    x2 = Conv2D(32, (3, 3), activation='relu')(x2)
    x2 = MaxPooling2D(pool_size=(2, 2))(x2)
    x2 = Conv2D(32, (3, 3), activation='relu')(x2)
    x2 = Conv2D(32, (3, 3), activation='relu')(x2)
    x2 = MaxPooling2D(pool_size=(2, 2))(x2)
    
    # x2 = Flatten()(x2)
    # layer = []
    # for i in range(w):
    #     layer.append(TrainableOffset(bias_constraint=IsNegProb(),
    #                                  bias_initializer=keras.initializers.random_normal(-0.1),
    #                                  bias_regularizer=antientropy,
    #                                  activation=Heaviside)(x2))
    
    # x2 = Add()(layer)
    # x3 = Multiply()(layer)
    # x3 = Lambda(lambda x:2*x)(x3)
    # x2 = Subtract()([x2,x3])
    # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
    # x2 = Reshape((23, 23, 64))(x2)
    # x2 = Conv2D(64, (3, 3))(x2)
    # x2 = MaxPool2D()(x2)
    # x2 = Conv2D(128, (3, 3))(x2)
    # x2 = MaxPool2D()(x2)
    
    x2 = Flatten()(x2)
    x2 = Dropout(0.2)(x2)
    layer = []
    # for i in range(w):
    #     layer.append(TrainableOffset(bias_constraint=IsNegProb(),
    #                                  bias_initializer=keras.initializers.random_normal(-0.1),
    #                                  bias_regularizer=antientropy,
    #                                  activation=Heaviside)(x2))
    
    # x2 = XOR()(layer)
    x2 = Dense(50, activation="relu")(x2)
    
    x2 = Dense(50, activation='relu')(x2)
    # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
    x2 = Dense(10)(x2)


    predictions2 = Activation(activation="softmax")(x2)
    
    model = keras.Model(inputs=inpt, outputs=predictions2)

    if debug:
        model.summary()
    if not train:
        return model
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    if sess is None:
        sess = tf.get_default_session()
    K.set_session(sess)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(np.array(X_train), np.array(Y_train), batch_size=128, epochs=6,
              verbose=1)
    print(model.evaluate(np.array(X_test), np.array(Y_test)))
    # print(model.get_weights())
    if not save:
        return model
    save_path = os.path.join("/tmp", "mnist.ckpt")
    saver = tf.train.Saver()
    saver.save(sess, save_path)
    print("Completed model training and saved at: " +
                 str(save_path))
    return model
Example #27
0
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      nb_classes=10,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of CW attack iterations
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.dirname(model_path),
        'filename': os.path.basename(model_path)
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    args=train_params,
                    save=os.path.exists("models"),
                    rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)
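    # Below, pick the first test example of each digit class as attack input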

    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)]
    if targeted:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        one_hot = np.zeros((10, 10))
        one_hot[np.arange(10), np.arange(10)] = 1

        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': 100 if targeted else 10,
        'initial_const': 10
    }
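    # With binary_search_steps=1 the attack keeps initial_const fixed as the
    # constant c in the CW objective ||delta||_2^2 + c * f(x + delta); more
    # binary-search steps would tune c per example at extra cost.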

    adv = cw.generate_np(adv_inputs, **cw_params)

    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args={'batch_size': 10})
    else:
        adv_accuracy = 1 - model_eval(
            sess, x, y, preds, adv, Y_test[idxs], args={'batch_size': 10})

    for j in range(10):
        if targeted:
            for i in range(10):
                grid_viz_data[i, j] = adv[i * 10 + j]
        else:
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average L2 distortion introduced by the algorithm
    mean_l2_perturb = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2_perturb))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
Example #28
0
def spartan_network(sess=None, debug=False, w=4, train=False,
                    create_surrogate=False):
    # we want to integrate an attention mechanism with a feature squeezer
    inpt = Input(batch_shape=(None, 28, 28, 1))
    g = tf.get_default_graph()
    speclayer = []
    with g.gradient_override_map({"Sign": "FakeHS"}):
        x2 = Conv2D(4, (1, 1), activation=Paranograd,
                    bias_initializer=keras.initializers.random_uniform(
                        minval=0.05, maxval=0.5),
                    kernel_initializer=keras.initializers.random_uniform(
                        minval=1, maxval=1.5))(inpt)
        # x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        # x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation=Paranograd)(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        # x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        
        # x2 = Flatten()(x2)
        # layer = []
        # for i in range(w):
        #     layer.append(TrainableOffset(bias_constraint=IsNegProb(),
        #                                  bias_initializer=keras.initializers.random_normal(-0.1),
        #                                  bias_regularizer=antientropy,
        #                                  activation=Heaviside)(x2))

        # x2 = Add()(layer)
        # x3 = Multiply()(layer)
        # x3 = Lambda(lambda x:2*x)(x3)
        # x2 = Subtract()([x2,x3])
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
        # x2 = Reshape((23, 23, 64))(x2)
        # x2 = Conv2D(64, (3, 3))(x2)
        # x2 = MaxPool2D()(x2)
        # x2 = Conv2D(128, (3, 3))(x2)
        # x2 = MaxPool2D()(x2)
        
        x2 = Flatten()(x2)
        # x2 = Dropout(0.1)(x2)
        layer = []
        # for i in range(w):
        #     layer.append(TrainableOffset(bias_constraint=IsNegProb(),
        #                                  bias_initializer=keras.initializers.random_normal(-0.1),
        #                                  bias_regularizer=antientropy,
        #                                  activation=Heaviside)(x2))

        # x2 = XOR()(layer)
        x2 = Dense(100, activation="relu")(x2)
        
        # x2 = Dense(50, activation='relu')(x2)
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
        x2 = Dense(10)(x2)
    predictions2 = Activation(activation="softmax")(x2)
    
    model = keras.Model(inputs=inpt, outputs=predictions2)
    model.summary()
    if not train:
        return model


    hist = LossHistory()
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    if sess is None:
        sess = tf.get_default_session()
    K.set_session(sess)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(np.array(X_train), np.array(Y_train), batch_size=128, epochs=8,
              verbose=1, callbacks=[hist])
    print(model.evaluate(np.array(X_test), np.array(Y_test)))
    print(model.get_weights())
    crntm = time.localtime()
    stnct = "%d_%d_%d_%d_%d" % (crntm.tm_mday, crntm.tm_mon, crntm.tm_year,
                                crntm.tm_hour, crntm.tm_min)
    with open('Experiments/' + stnct, "ab") as f:
        import pickle
        pickle.dump([model.get_weights(), hist.losses,
                     model.evaluate(np.array(X_test), np.array(Y_test))], f)
        
    if create_surrogate:
        inpt = Input(batch_shape=(None, 28, 28, 1))
        x2 = Conv2D(4, (1, 1), activation=soft_heaviside,
                    kernel_initializer=keras.initializers.random_uniform(minval=0, maxval=30))(inpt)
        x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation=soft_heaviside)(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        x2 = Conv2D(32, (3, 3), activation='relu')(x2)
        x2 = SpatialDropout2D(0.1)(x2)
        x2 = Conv2D(32, (3, 3), activation=soft_heaviside)(x2)
        x2 = MaxPooling2D(pool_size=(2, 2))(x2)
        x2 = Flatten()(x2)
        x2 = Dropout(0.2)(x2)
        x2 = Dense(50, activation="relu")(x2)
        x2 = Dense(50, activation='relu')(x2)
        # x2 = TrainableScaler(use_bias=False, kernel_regularizer='l1', activation=Heaviside)(x2)
        x2 = Dense(10)(x2)
        predictions_surr = Activation(activation="softmax")(x2)
        
        model_surrogate = keras.Model(inputs=inpt, outputs=predictions_surr)
        model_surrogate.compile(optimizer='adam',
                                loss='categorical_crossentropy',
                                metrics=['accuracy'])
        model_surrogate.set_weights(model.get_weights())
        print(model.get_weights()[2], model_surrogate.get_weights()[2])
        return [model_surrogate, model]
        
    return model
def main(measures, src_model_names):
    np.random.seed(0)
    tf.set_random_seed(0)

    flags.DEFINE_integer('BATCH_SIZE', 32, 'Size of batches')
    set_mnist_flags()

    x = K.placeholder(
        (None, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS, FLAGS.NUM_CHANNELS))

    y = K.placeholder((None, FLAGS.NUM_CLASSES))

    X_train, Y_train, X_test, Y_test = data_mnist()

    # Source models used for crafting adversarial examples
    src_models = [None] * len(src_model_names)
    for i in range(len(src_model_names)):
        src_models[i] = load_model(src_model_names[i])

    if measures == "Q":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        N = len(X_test)
        k = len(src_model_names)
        Qij = [([None] * k) for p in range(k)]
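        # Pairwise Q statistic: Q_ij = (a*d - b*c) / (a*d + b*c), computed
        # from the agreement counts a, b, c, d defined in the loop below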
        for i in range(k - 1):
            for j in range(i + 1, k):
                # a: both correct, b: only model i correct,
                # c: only model j correct, d: both incorrect
                a = b = c = d = 0.0
                for n in range(N):
                    src_model_i = src_models[i]
                    src_model_j = src_models[j]
                    Ci = tf_compute_C(src_model_i, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    Cj = tf_compute_C(src_model_j, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    if Ci[0] == 1 and Cj[0] == 1:
                        a += 1
                    elif Ci[0] == 0 and Cj[0] == 0:
                        d += 1
                    elif Ci[0] == 0 and Cj[0] == 1:
                        c += 1
                    elif Ci[0] == 1 and Cj[0] == 0:
                        b += 1
                print(a, b, c, d)
                Qij[i][j] = (a * d - b * c) / (a * d + b * c)

        Qij_SUM = 0.0
        for i in range(k - 1):
            for j in range(i + 1, k):
                Qij_SUM += Qij[i][j]
        QAV = (2.0 / (k * (k - 1))) * Qij_SUM
        print('The value of the Q statistic: %.4f' % (QAV))
        return

    if measures == "p":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        N = len(X_test)
        k = len(src_model_names)
        Pij = [([None] * k) for p in range(k)]
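        # Pairwise correlation coefficient:
        # rho_ij = (a*d - b*c) / sqrt((a+b)(a+c)(b+d)(c+d))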
        for i in range(k - 1):
            for j in range(i + 1, k):
                # a: both correct, b: only model i correct,
                # c: only model j correct, d: both incorrect
                a = b = c = d = 0.0
                for n in range(N):
                    src_model_i = src_models[i]
                    src_model_j = src_models[j]
                    Ci = tf_compute_C(src_model_i, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    Cj = tf_compute_C(src_model_j, x, y, X_test[n:n + 1],
                                      Y_test[n:n + 1])
                    if Ci[0] == 1 and Cj[0] == 1:
                        a += 1
                    elif Ci[0] == 0 and Cj[0] == 0:
                        d += 1
                    elif Ci[0] == 0 and Cj[0] == 1:
                        c += 1
                    elif Ci[0] == 1 and Cj[0] == 0:
                        b += 1
                print(a, b, c, d)
                Pij[i][j] = (a * d - b * c) / math.sqrt(
                    (a + b) * (a + c) * (b + d) * (d + c))

        Pij_SUM = 0.0
        for i in range(k - 1):
            for j in range(i + 1, k):
                Pij_SUM += Pij[i][j]
        PAV = (2.0 / (k * (k - 1))) * Pij_SUM
        print('The value of the correlation coefficient: %.4f' % (PAV))
        return
    if measures == "Ent":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        k = len(src_model_names)
        N = len(X_test)
        num = 0
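        # Entropy diversity measure:
        # Ent = (1/N) * sum_x min(l(x), k - l(x)) / (k - ceil(k/2)),
        # where l(x) is the number of models classifying x correctly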
        for i in range(N):
            lxt = 0
            print(i)
            for (name, src_model) in zip(src_model_names, src_models):
                C = tf_compute_C(src_model, x, y, X_test[i:i + 1],
                                 Y_test[i:i + 1])
                # lxt denotes the number of substitutes that correctly
                # recognize sample x; it ranges over 0..k.
                lxt += C[0]
            m = min(lxt, k - lxt)
            num += ((1.0 / (k - math.ceil(k / 2.0))) * m)
        Ent = (1.0 / N) * num
        print('The value of the entropy measure: %.4f' % (Ent))
        return

    if measures == "KW":
        X_test = X_test[0:100]
        Y_test = Y_test[0:100]
        k = len(src_model_names)
        N = len(X_test)
        num = 0
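        # Kohavi-Wolpert variance: KW = (1/(N * k^2)) * sum_x l(x) * (k - l(x))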
        for i in range(N):
            lxt = 0
            print(i)
            for (name, src_model) in zip(src_model_names, src_models):
                C = tf_compute_C(src_model, x, y, X_test[i:i + 1],
                                 Y_test[i:i + 1])
                # lxt denotes the number of substitutes that correctly
                # recognize sample x; it ranges over 0..k.
                lxt += C[0]
            num += (lxt * (k - lxt))
        KW = (1.0 / (N * math.pow(k, 2))) * num
        print('The value of the Kohavi-Wolpert variance: %.4f' % (KW))
        return

    if measures == "test":
        X_test = X_test[0:5]
        Y_test = Y_test[0:5]
        # display_leg_sample(X_test)
        for j in range(1):

            for (name, src_model) in zip(src_model_names, src_models):
                # the number of substitutes from D that correctly recognize X_test[j]
                num = tf_test_acc_num(src_model, x, y, X_test, Y_test)
                # output 1, 1, 1, 1, 1, 1
                print(num)

        return
Example #30
0
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=100, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=True, w=2, rel=0):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: if True, load a saved model instead of training
    :param testing: if true, test error is calculated
    :param w : number of perceptive neurons
    :return: an AccuracyReport object
    """
    
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()
    
    # Set TF random seed to improve reproducibility
    #tf.set_random_seed(1234)
    
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")
    
    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    keras.layers.core.K.set_learning_phase(1)
    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    
    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)
    
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    
    # Define TF model graph
    models = spartan_network(sess=sess, w=w, train=True, create_surrogate=True)
    model_to_attack = models[0]
    spartan_model = models[1]
    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(ckpt)
    trainvalue = ckpt is None
    # model_to_attack = unprotected_network(sess=sess, train=trainvalue,
    #                                       save=True)
    preds = model_to_attack(x)
    print("Defined TensorFlow model graph.")
    
    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)
    
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    rng = None
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        keras.layers.core.K.set_learning_phase(1)
        # model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
        #             args=train_params, save=True, rng=rng)
    keras.layers.core.K.set_learning_phase(0)
    
    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        print("With no Dropout : %s" % acc)
        report.train_clean_train_clean_eval = acc
    
    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model_to_attack)
    global mdl
    mdl = model_to_attack
    fgsm = FastGradientMethod(wrap, sess=sess)
    # Sweep epsilon from 0.01 to 0.79 and evaluate the attack at each step
    for epstep in range(40):
        fgsm_params = {'eps': 0.01 + 0.02 * epstep,
                       'clip_min': 0.,
                       'clip_max': 1.}
        # cw_params = {'confidence': 0.5,
        #              'batch_size': 4,
        #              'learning_rate': 2e-2,
        #              'max_iterations': 400,
        #              'clip_min': 0.,
        #              'clip_max': 1.}
        adv_x = fgsm.generate(x, **fgsm_params)
        # Consider the attack to be constant
        adv_x = tf.stop_gradient(adv_x)
        # cwattack = atk.CarliniWagnerL2(model_to_attack, sess=sess)
        # adv_x = cwattack.generate(x, **cw_params)
        # adv_x = tf.stop_gradient(adv_x)
        # adv_x_np = cwattack.generate_np(X_test[500:704], **cw_params)
        # from matplotlib import pyplot as plt
        # plt.rc('figure', figsize=(12.0, 12.0))
        # for j in range(40):
        #     plt.imshow(adv_x_np[j].reshape((28, 28)), cmap="gray")
        #     plt.pause(0.15)
        # return

        preds_adv = spartan_model(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on epsilon-%0.4f-adversarial examples: %0.4f\n'
              % (fgsm_params["eps"], acc))
    return report
    report.clean_train_adv_eval = acc
    
    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train,
                         Y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc
    
    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(w=w, rel=rel)
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))
    
    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy
        
        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy
    
    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, save=False, rng=rng)
    
    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64, num_threads=None,
                   label_smoothing=True):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: number of convolutional filters in the model
    :param num_threads: number of threads to use for the TF session
    :param label_smoothing: if true, apply label smoothing to the labels
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=num_threads)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth /
                               (nb_classes-1), 1. - label_smooth)
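        # (Worked example, for illustration: with nb_classes=10 and
        #  label_smooth=0.1, a one-hot row [0, ..., 0, 1] is clipped to
        #  [0.0111..., ..., 0.0111..., 0.9], which still sums to 1.)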

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
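    # A minimal sketch (not part of this tutorial) of the update that
    # FastGradientMethod computes with these parameters, assuming some
    # hypothetical grad_fn(x, y) that returns the loss gradient w.r.t. x:
    #
    #     x_adv = np.clip(x + fgsm_params['eps'] * np.sign(grad_fn(x, y)),
    #                     fgsm_params['clip_min'], fgsm_params['clip_max'])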
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=0.1)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = LossCrossEntropy(model2, smoothing=0.1, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
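        # (Minimal illustration of tf.stop_gradient, outside this tutorial:
        #  with w = tf.Variable(2.) and a = tf.stop_gradient(3. * w), the
        #  gradient of a * w w.r.t. w is 6. (a is treated as a constant),
        #  whereas without stop_gradient it would be 12. (= 2 * 3 * w).)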
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
Example #32
0
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    ###########################################################################
    # Define the dataset and model
    ###########################################################################

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename))
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes - 1) + ' adversarial examples')

    # This array indicates whether an adversarial example was found for each
    # test set sample and target class
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # This array contains the fraction of perturbed features for each test set
    # sample and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Define the TF graph for the model's Jacobian
    grads = jacobian_graph(predictions, x, FLAGS.nb_classes)
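    # (jacobian_graph returns one gradient tensor per class, i.e. a list of
    #  d predictions[:, c] / d x for c in range(nb_classes); the jsma call
    #  below uses these to build its saliency map over input features.)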

    # Initialize our array for grid visualization
    grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                  FLAGS.img_cols, FLAGS.nb_channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    np.save("xorg_jsma.npy", X_test)
    np.save("ytest_jsma.npy", Y_test)
    np.save("xtrain_jsma.npy", X_train)
    np.save("ytrain_jsma.npy", Y_train)

    adv_examples = []
    org_examples = []
    ytest_examples = []
    ytarget_examples = []

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, FLAGS.source_samples):
        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            X_test[sample_ind:(sample_ind + 1)],
            (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

        # Loop over all target classes
        for target in target_classes:
            print('--------------------------------------')
            print('Creating adv. example for target class ' + str(target))

            # This call runs the Jacobian-based saliency map approach
            adv_x, res, percent_perturb = jsma(sess,
                                               x,
                                               predictions,
                                               grads,
                                               X_test[sample_ind:(sample_ind +
                                                                  1)],
                                               target,
                                               theta=1,
                                               gamma=0.1,
                                               increase=True,
                                               back='tf',
                                               clip_min=0,
                                               clip_max=1)
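            # (Parameter note: theta=1 moves each selected feature to its
            #  maximum in a single step, gamma=0.1 caps the attack at
            #  perturbing 10% of input features, and increase=True only
            #  ever raises pixel values.)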
            adv_examples.append(adv_x)
            org_examples.append(X_test[sample_ind:(sample_ind + 1)])
            ytest_examples.append(current_class)
            ytarget_examples.append(target)

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                if 'figure' not in vars():
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind + 1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)))
                else:
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind + 1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)),
                        figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))

    adv_examples = np.array(adv_examples)
    org_examples = np.array(org_examples)
    ytest_examples = np.array(ytest_examples)
    ytarget_examples = np.array(ytarget_examples)

    np.save("xorg2_jsma.npy", org_examples)
    np.save("ytest2_jsma.npy", ytest_examples)
    np.save("xadv_jsma.npy", adv_examples)
    np.save("ytarget_jsma.npy", ytarget_examples)

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.2f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations[results == 1])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.2f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)

def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, aug_batch_size=512):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Initialize substitute training set reserved for adversary
    X_sub = x_test[:holdout]
    Y_sub = np.argmax(y_test[:holdout], axis=1)
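    # (Only the oracle's output labels are assumed visible to the adversary,
    #  so the substitute's training labels are hard class indices rather
    #  than probability vectors.)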

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    model_sub, preds_sub = train_sub_out
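    # A rough sketch (an assumption about train_sub's internals, not code
    # from this tutorial) of one round of the paper's Jacobian-based dataset
    # augmentation, using cleverhans-style helpers; oracle_label_fn below is
    # hypothetical:
    #
    #     grads = jacobian_graph(preds_sub, x, nb_classes)
    #     X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, lmbda)
    #     Y_sub = np.argmax(oracle_label_fn(X_sub), axis=1)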

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
Example #34
0
def main(argv=None):
    """
    MNIST cleverhans tutorial
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = model_mnist()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an MNIST model
    model_train(sess, x, y, predictions, X_train, Y_train, evaluate=evaluate)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = model_mnist()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Evaluate the accuracy of the adversarially trained MNIST model on
        # legitimate test examples
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained MNIST model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    model_train(sess,
                x,
                y,
                predictions_2,
                X_train,
                Y_train,
                predictions_adv=predictions_2_adv,
                evaluate=evaluate_2)

def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param testing: if true, complete an AccuracyReport for unit tests
      to verify that performance is adequate
    :param clean_train: if true, train on clean examples
    :param backprop_through_attack: if true, backprop through the adversarial
      example construction process during adversarial training
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': FLAGS.eps}
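    # (Note: unlike other examples in this file, no clip_min/clip_max are
    #  set here, so adversarial inputs may leave the [0, 1] pixel range.)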
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn()
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            logger.info('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_train,
                             Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)

        adv_x = fgsm.generate(x, **fgsm_params)
        if not backprop_through_attack:
            # For the fgsm attack used in this tutorial, the attack has zero
            # gradient so enabling this flag does not change the gradient.
            # For some other attacks, enabling this flag increases the cost of
            # training, but gives the defender the ability to anticipate how
            # the attacker will change their strategy in response to updates
            # the defender's parameters.
            adv_x = tf.stop_gradient(adv_x)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        logger.info('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_train,
                             Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        logger.info("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn()
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        logger.info('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        logger.info('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
Example #36
0
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="train_dir",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False,
                   label_smoothing=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)
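    # (Learning phase 0 puts Keras layers in test mode, so train-only
    #  behaviour such as dropout stays disabled in the graphs built below.)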

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows,
                      img_cols=img_cols,
                      channels=nchannels,
                      nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        # assert x_test.shape[0] == test_end - test_start, x_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              save=True,
              rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds_adv,
                         x_train,
                         y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows,
                        img_cols=img_cols,
                        channels=nchannels,
                        nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess,
          loss_2,
          x,
          y,
          x_train,
          y_train,
          evaluate=evaluate_2,
          args=train_params,
          save=False,
          rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report

def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
              args=train_params, save=True)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train,
                         Y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, x, y, preds_2, X_train, Y_train,
          predictions_adv=preds_2_adv, evaluate=evaluate_2,
          args=train_params, save=False)

    # Get a random slice of the data for linear extrapolation plots
    random_idx = np.random.randint(0, X_train.shape[0])
    X_slice = X_train[random_idx]
    Y_slice = Y_train[random_idx]

    # Plot the linear extrapolation plot for clean model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice,
                              'lep_clean.png')

    # Plot the linear extrapolation plot for adv model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap_2, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice,
                              'lep_adv.png')
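    # (A linear extrapolation plot sweeps the model's logits along the FGSM
    #  perturbation direction, roughly logits(x + t * sign(grad)) over an
    #  interval of t; comparing lep_clean.png with lep_adv.png shows how
    #  adversarial training changes the logits along that direction.)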

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
Example #38
0
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      nb_classes=10,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    print(X_test)