Example #1
    def eval_multi(self, inc_epoch=True):
        """
        Run the evaluation on multiple attacks.
        """
        sess = self.sess
        preds = self.preds
        x = self.x_pre
        y = self.y
        X_train = self.X_train
        Y_train = self.Y_train
        X_test = self.X_test
        Y_test = self.Y_test
        writer = self.writer

        self.summary = tf.Summary()
        report = {}

        # Evaluate on train set
        subsample_factor = 100
        X_train_subsampled = X_train[::subsample_factor]
        Y_train_subsampled = Y_train[::subsample_factor]
        acc_train = model_eval(sess, x, y, preds, X_train_subsampled,
                               Y_train_subsampled, args=self.eval_params)
        self.log_value('train_accuracy_subsampled', acc_train,
                       'Clean accuracy, subsampled train')
        report['train'] = acc_train

        # Evaluate on the test set
        acc = model_eval(sess, x, y, preds, X_test, Y_test,
                         args=self.eval_params)
        self.log_value('test_accuracy_natural', acc,
                       'Clean accuracy, natural test')
        report['test'] = acc

        # Evaluate against adversarial attacks
        if self.epoch % self.hparams.eval_iters == 0:
            for att_type in self.attack_type_test:
                adv_x, preds_adv = self.attacks[att_type]
                acc = self.eval_advs(x, y, preds_adv, X_test, Y_test, att_type)
                report[att_type] = acc

        if self.writer:
            writer.add_summary(self.summary, self.epoch)

        # Add summaries of example adversarial images every 20 epochs
        if self.writer and self.epoch % 20 == 0 and self.sum_op is not None:
            sm_val = self.sess.run(self.sum_op,
                                   feed_dict={x: X_test[:self.batch_size],
                                              y: Y_test[:self.batch_size]})
            writer.add_summary(sm_val)

        if inc_epoch:
            self.epoch += 1

        return report
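Every snippet in this collection ultimately funnels into cleverhans' model_eval, which batches a dataset through a predictions tensor and returns the mean accuracy. A minimal sketch of that core call, assuming the TF1-era cleverhans API; model, X_test, and Y_test are assumed to come from the caller, and the placeholder shapes are illustrative:

    import tensorflow as tf
    from cleverhans.utils_tf import model_eval

    # Illustrative MNIST-shaped placeholders; adapt shapes to the data
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    preds = model(x)  # model: a trained cleverhans/Keras model (assumed)

    # sess is assumed to already hold the trained weights
    acc = model_eval(sess, x, y, preds, X_test, Y_test,
                     args={'batch_size': 128})
    print('Clean test accuracy: %0.4f' % acc)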
Example #2
    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy
Example #3
    def evaluate_2():
        # Evaluate the accuracy of the adversarially trained CIFAR10 model on
        # legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained CIFAR10 model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))
Example #4
 def evaluate():
     # Evaluate the accuracy of the MNIST model on legitimate test examples
     eval_params = {'batch_size': batch_size}
     acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
     report.clean_train_clean_eval = acc
     assert X_test.shape[0] == test_end - test_start, X_test.shape
     print('Test accuracy on legitimate examples: %0.4f' % acc)
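These small evaluate() closures are written to be passed as the evaluate callback of cleverhans' legacy model_train (newer versions use train with a loss object), which invokes the callback after each epoch. A usage sketch, with the session, placeholders, data, and hyperparameter values assumed from the surrounding examples:

    # model_train calls evaluate() at the end of every epoch
    train_params = {'nb_epochs': 6, 'batch_size': 128, 'learning_rate': 0.001}
    model_train(sess, x, y, preds, X_train, Y_train,
                evaluate=evaluate, args=train_params)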
Example #5
 def evaluate():
     # Evaluate the accuracy of the CIFAR10 model on legitimate test
     # examples
     eval_params = {'batch_size': FLAGS.batch_size}
     accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                           args=eval_params)
     assert X_test.shape[0] == 10000, X_test.shape
     print('Test accuracy on legitimate test examples: ' + str(accuracy))
Example #6
def main(argv):
    checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if checkpoint is None:
        raise ValueError("Couldn't find latest checkpoint in " +
                         FLAGS.checkpoint_dir)

    train_start = 0
    train_end = 60000
    test_start = 0
    test_end = 10000
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    assert Y_train.shape[1] == 10

    # NOTE: for compatibility with Madry Lab downloadable checkpoints,
    # we cannot enclose this in a scope or do anything else that would
    # change the automatic naming of the variables.
    model = MadryMNIST()

    x_input = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    y = tf.placeholder(tf.float32, shape=[None, 10])

    if FLAGS.attack_type == 'fgsm':
        fgsm = FastGradientMethod(model)
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
        adv_x = fgsm.generate(x_image, **fgsm_params)
    elif FLAGS.attack_type == 'bim':
        bim = BasicIterativeMethod(model)
        bim_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                      'nb_iter': 50,
                      'eps_iter': .01}
        adv_x = bim.generate(x_image, **bim_params)
    else:
        raise ValueError(FLAGS.attack_type)
    preds_adv = model.get_probs(adv_x)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, checkpoint)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': FLAGS.batch_size}
        t1 = time.time()
        acc = model_eval(
            sess, x_image, y, preds_adv, X_test, Y_test, args=eval_par)
        t2 = time.time()
        print("Took", t2 - t1, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
Example #7
 def do_eval(preds, x_set, y_set, report_key, is_adv=None):
     acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
     setattr(report, report_key, acc)
     if is_adv is None:
         report_text = None
     elif is_adv:
         report_text = 'adversarial'
     else:
         report_text = 'legitimate'
     if report_text:
         print('Test accuracy on %s examples: %0.4f' % (report_text, acc))
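The do_eval helper follows the reporting pattern of the cleverhans tutorials: one call per (predictions, dataset) pair, with the accuracy stored on the report object under report_key. A usage sketch; the tensors, data, and report come from the enclosing tutorial scope:

    # Clean accuracy of the model
    do_eval(preds, x_test, y_test, 'clean_train_clean_eval', is_adv=False)

    # Accuracy of the same model on adversarial inputs
    preds_adv = model.get_logits(adv_x)
    do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', is_adv=True)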
Example #8
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
              nb_epochs, batch_size, learning_rate,
              rng, nb_classes=10, img_rows=28, img_cols=28, nchannels=1):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.
    :param sess: the TF session
    :param x: the input placeholder for MNIST
    :param y: the output placeholder for MNIST
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :return: tuple of (model, predictions tensor, accuracy on clean test set)
    """

    # Define Keras-based TF model graph (for the black-box model)
    nb_filters = 64
    model = cnn_model(nb_filters=nb_filters, nb_classes=nb_classes)

    # Wrap the model in KerasModelWrapper
    model = KerasModelWrapper(model, nb_classes)
    loss = LossCrossEntropy(model, smoothing=0.1)
    predictions = model.get_logits(x)
    print("Defined TensorFlow model graph.")

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    train(sess, loss, x, y, X_train, Y_train, args=train_params, rng=rng)

    # Print out the accuracy on legitimate data
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    print('Test accuracy of black-box on legitimate test '
          'examples: ' + str(accuracy))

    return model, predictions, accuracy
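prep_bbox returns the wrapped model, its prediction tensor, and the clean-test accuracy; the black-box tutorials later in this collection unpack it as below. A usage sketch, with the session, placeholders, data, and hyperparameters assumed from the calling code:

    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate, rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out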
Example #9
    def eval_advs(self, x, y, preds_adv, X_test, Y_test, att_type):
        """
        Evaluate the accuracy of the model on adversarial examples

        :param x: symbolic input to model.
        :param y: symbolic variable for the label.
        :param preds_adv: symbolic variable for the prediction on an
                          adversarial example.
        :param X_test: NumPy array of test set inputs.
        :param Y_test: NumPy array of test set labels.
        :param att_type: name of the attack.
        """
        end = (len(X_test) // self.batch_size) * self.batch_size

        if self.hparams.fast_tests:
            end = 10*self.batch_size

        acc = model_eval(self.sess, x, y, preds_adv, X_test[:end],
                         Y_test[:end], args=self.eval_params)
        self.log_value('test_accuracy_%s' % att_type, acc,
                       'Test accuracy on adversarial examples')
        return acc
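Truncating the test set to a whole number of batches (the end computation above) keeps every evaluated batch at full size, and makes fast_tests mode a simple prefix of the data. The same arithmetic in isolation, assuming only NumPy and toy shapes:

    import numpy as np

    X_test = np.zeros((10000, 28, 28, 1), dtype=np.float32)  # toy stand-in
    batch_size = 128

    # Largest prefix of the test set that divides evenly into batches
    end = (len(X_test) // batch_size) * batch_size
    print(end)  # 9984: the last 16 examples are dropped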
Example #10
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS,
                   holdout=HOLDOUT,
                   data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S,
                   lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
  MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :return: a dictionary with:
           * black-box model accuracy on test set
           * substitute model accuracy on test set
           * black-box model accuracy on adversarial examples transferred
             from the substitute model
  """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate, rng,
                              nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda, aug_batch_size, rng, img_rows, img_cols,
                              nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model.get_logits(x_adv_sub),
                          x_test,
                          y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
Example #11
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, attack="fgsm", targeted=False):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    X_test = X_test[:FLAGS.n_attack]
    Y_test = Y_test[:FLAGS.n_attack]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    time_start = time.time()
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda)
    model_sub, preds_sub = train_sub_out
    time_end = time.time()
    print("Substitue model training time:", time_end - time_start)

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    print('Substitute model accuracy:', acc)

    # Find the correctly predicted labels
    original_predict = batch_eval(sess, [x], [bbox_preds], [X_test],
                                  args=eval_params)[0]
    original_class = np.argmax(original_predict, axis=1)
    true_class = np.argmax(Y_test, axis=1)
    mask = true_class == original_class
    print(np.sum(mask), "out of", mask.size, "are correctly labeled")

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    wrap = KerasModelWrapper(model_sub)


    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}

    if attack == "fgsm":
        attacker_params = {'eps': 0.4, 'ord': np.inf,
                           'clip_min': 0., 'clip_max': 1.}
        fgsm = FastGradientMethod(wrap, sess=sess)
        x_adv_sub = fgsm.generate(x, **attacker_params)
        attacker = fgsm
        adv_inputs = X_test
        ori_labels = Y_test
        print("Running FGSM attack...")
    else:
        print("Running Carlini and Wagner\'s L2 attack...")
        yname = "y"
        adv_ys = None
        # wrap = KerasModelWrapper(model)
        cwl2 = CarliniWagnerL2(wrap, back='tf', sess=sess)
        attacker_params = {'binary_search_steps': 9,
                     'max_iterations': 2000,
                     'abort_early': True,
                     'learning_rate': 0.01,
                     'batch_size': 1,
                     'initial_const': 0.01,
                     'confidence': 20}
        # generate targeted labels, 9 for each test example
        if targeted:
            adv_ys = []
            targeted_class = []
            for i in range(0, X_test.shape[0]):
                for j in range(0,10):
                    # skip the original image label
                    if j == np.argmax(Y_test[i]):
                        continue
                    adv_ys.append(np.eye(10)[j])
                    targeted_class.append(j)
            attacker_params['y_target'] = np.array(adv_ys, dtype=np.float32)
            # duplicate the inputs by 9 times
            adv_inputs = np.array([[instance] * 9 for instance in X_test],
                                  dtype=np.float32)
            adv_inputs = adv_inputs.reshape((X_test.shape[0] * 9, 28, 28, 1))
            # also update the mask
            mask = np.repeat(mask, 9)
            ori_labels = np.repeat(Y_test, 9, axis=0)
        else:
            adv_inputs = X_test
            ori_labels = Y_test
        attacker = cwl2

    if attack == "fgsm":
        # Evaluate the accuracy of the "black-box" model on adversarial examples
        accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs, ori_labels,
                              args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex'] = accuracy

    time_start = time.time()
    # Evaluate the accuracy of the "black-box" model on adversarial examples
    x_adv_sub_np = attacker.generate_np(adv_inputs, **attacker_params)
    accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_np, ori_labels,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (NP): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy
    time_end = time.time()
    print('Attack time:', time_end - time_start)

    # Evaluate attack success rates on the black-box model
    bbox_adv_predict = batch_eval(sess, [x], [bbox_preds], [x_adv_sub_np],
                                  args=eval_params)[0]
    bbox_adv_class = np.argmax(bbox_adv_predict, axis=1)
    true_class = np.argmax(ori_labels, axis=1)
    untargeted_success = np.mean(bbox_adv_class != true_class)
    print('Untargeted attack success rate:', untargeted_success)
    accuracies['untargeted_success'] = untargeted_success
    if targeted:
        targeted_success = np.mean(bbox_adv_class == targeted_class)
        print('Targeted attack success rate:', targeted_success)
        accuracies['targeted_success'] = targeted_success

    if attack == "cwl2":
        # Compute the L2 norms of the generated perturbations
        l2_perturbations = np.sum((x_adv_sub_np - adv_inputs)**2,
                                  axis=(1, 2, 3))**.5
        # When computing the mean, exclude failed attacks (zero perturbation)
        print('Avg. L_2 norm of perturbations {0:.4f}'.format(
            np.mean(l2_perturbations[l2_perturbations > 1e-8])))

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, bbox_preds, adv_inputs[mask], ori_labels[mask],
                          args=eval_params)
    print('Test accuracy of excluding originally incorrect labels (should be 1.0): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc_ori'] = accuracy

    if attack == "fgsm":
        # Evaluate the accuracy of the "black-box" model on adversarial examples (excluding correct)
        accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs[mask], ori_labels[mask],
                              args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute (excluding originally incorrect labels): ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    # Evaluate the accuracy of the "black-box" model on adversarial examples (excluding correct)
    x_adv_sub_mask_np = x_adv_sub_np[mask]
    accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_mask_np, ori_labels[mask],
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (excluding originally incorrect labels, NP): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    return accuracies
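The targeted branch above builds nine one-hot targets per test image (every class except the true one) and tiles the inputs to match. The same construction, isolated and checkable with only NumPy; the toy shapes stand in for the held-out MNIST test set used by the real code:

    import numpy as np

    nb_classes = 10
    X_test = np.zeros((5, 28, 28, 1), dtype=np.float32)       # toy inputs
    Y_test = np.eye(nb_classes)[np.random.randint(0, nb_classes, 5)]

    adv_ys = []
    for i in range(X_test.shape[0]):
        for j in range(nb_classes):
            if j == np.argmax(Y_test[i]):
                continue  # skip the original image label
            adv_ys.append(np.eye(nb_classes)[j])
    adv_ys = np.array(adv_ys, dtype=np.float32)               # (45, 10)

    # Each input repeated once per target class, in the same order
    adv_inputs = np.repeat(X_test, nb_classes - 1, axis=0)    # (45, 28, 28, 1)
    assert adv_inputs.shape[0] == adv_ys.shape[0]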
Example #12
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.001,
                   nb_epochs=10,
                   holdout=150,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    logger.info("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess,
                              x,
                              y,
                              X_train,
                              Y_train,
                              X_test,
                              Y_test,
                              nb_epochs,
                              batch_size,
                              learning_rate,
                              rng=rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    logger.info("Training the substitute model.")
    train_sub_out = train_sub(sess,
                              x,
                              y,
                              bbox_preds,
                              X_sub,
                              Y_sub,
                              nb_classes,
                              nb_epochs_s,
                              batch_size,
                              learning_rate,
                              data_aug,
                              lmbda,
                              rng=rng)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {
        'eps': FLAGS.eps,
        'ord': np.inf,
        'clip_min': 0.,
        'clip_max': 1.
    }
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model(x_adv_sub),
                          X_test,
                          Y_test,
                          args=eval_params)
    logger.info('Test accuracy of oracle on adversarial examples generated '
                'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy
    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model_sub(x_adv_sub),
                          X_test,
                          Y_test,
                          args=eval_params)
    logger.info(
        'Test accuracy of substitute on adversarial examples generated '
        'using the substitute: ' + str(accuracy))
    accuracies['sub_on_sub_adv_ex'] = accuracy

    return accuracies
Example #13
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   train_dir=TRAIN_DIR,
                   filename=FILENAME,
                   load_model=LOAD_MODEL,
                   testing=True,
                   label_smoothing=0.1):
    """
  MNIST CleverHans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param train_dir: Directory storing the saved model
  :param filename: Filename to save model under
  :param load_model: True for load, False for not load
  :param testing: if true, test error is calculated
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows,
                      img_cols=img_cols,
                      channels=nchannels,
                      nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Basic Iterative Method (BIM) attack object and graph
    bim = BasicIterativeMethod(wrap, sess=sess)
    bim_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.,
        'nb_iter': 50,
        'eps_iter': .01
    }
    adv_x = bim.generate(x, **bim_params)

    batch = 1000
    x_adv_test = None
    x_adv_train = None

    for i in tqdm(range(len(x_test) // batch)):
        tmp = sess.run(adv_x, feed_dict={x: x_test[i * batch:(i + 1) * batch]})
        if x_adv_test is None:
            x_adv_test = tmp
        else:
            x_adv_test = np.concatenate((x_adv_test, tmp))

    for i in tqdm(range(len(x_train) // batch)):
        tmp = sess.run(adv_x,
                       feed_dict={x: x_train[i * batch:(i + 1) * batch]})
        if x_adv_train is None:
            x_adv_train = tmp
        else:
            x_adv_train = np.concatenate((x_adv_train, tmp))

    def evaluate_adv():
        # Evaluate the accuracy of the MNIST model on adversarial test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds,
                         x_adv_test,
                         y_test,
                         args=eval_params)
        report.clean_train_adv_eval = acc
        print('Test accuracy on adversarial examples: %0.4f' % acc)

    evaluate_adv()

    x_adv_train = (np.repeat(x_adv_train, 3, 3) * 255).astype('uint8')
    x_train = (np.repeat(x_train, 3, 3) * 255).astype('uint8')
    x_adv_test = (np.repeat(x_adv_test, 3, 3) * 255).astype('uint8')
    x_test = (np.repeat(x_test, 3, 3) * 255).astype('uint8')

    save_list = [x_adv_train, x_adv_test]
    print(x_adv_train[0])
    pickle.dump(save_list, open("./bim.pkl", 'wb'))
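The two generation loops above repeat the same batching logic and silently drop any remainder smaller than one batch. A generic helper, sketched under the assumption that sess, the placeholder x, and the adv_x graph come from the surrounding code; unlike the loops above, it also keeps the final partial batch:

    import numpy as np

    def generate_in_batches(sess, adv_x, x, data, batch=1000):
        # Run the adversarial-example graph over data in fixed-size batches
        chunks = []
        for i in range(0, len(data), batch):
            chunks.append(sess.run(adv_x, feed_dict={x: data[i:i + batch]}))
        return np.concatenate(chunks)

    # x_adv_test = generate_in_batches(sess, adv_x, x, x_test)
    # x_adv_train = generate_in_batches(sess, adv_x, x, x_train)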
Example #14
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1,
                   save_model=SAVE_MODEL, attack_method=ATTACK_METHOD,
                   model_type=MODEL_TYPE):
  """
  MNIST CleverHans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param train_dir: Directory storing the saved model
  :param filename: Filename to save model under
  :param load_model: True for load, False for not load
  :param testing: if true, test error is calculated
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """
  keras.layers.core.K.set_learning_phase(0)

  # Object used to keep track of (and return) key accuracies
  report = AccuracyReport()

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  if not hasattr(backend, "tf"):
    raise RuntimeError("This tutorial requires keras to be configured"
                       " to use the TensorFlow backend.")

  if keras.backend.image_dim_ordering() != 'tf':
    keras.backend.set_image_dim_ordering('tf')
    print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
          "'th', temporarily setting to 'tf'")

  # Create TF session and set as Keras backend session
  os.environ["CUDA_VISIBLE_DEVICES"] = '1'  # only use No.0 GPU
  config = tf.ConfigProto()
  config.allow_soft_placement=True
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)
  keras.backend.set_session(sess)

  # Get MNIST test data
  mnist = MNIST(train_start=train_start, train_end=train_end,
                test_start=test_start, test_end=test_end)
  x_train, y_train = mnist.get_set('train')
  x_test, y_test = mnist.get_set('test')
  # Append pre-computed MI-FGSM adversarial examples to the training set
  my_adv = np.load('mifgsm_c_train_adv.npy').reshape(60000, 28, 28, 1)
  x_train = np.concatenate([x_train, my_adv])
  y_train = np.concatenate([y_train, y_train])

  # Obtain Image Parameters
  img_rows, img_cols, nchannels = x_train.shape[1:4]
  nb_classes = y_train.shape[1]

  # Define input TF placeholder
  x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                        nchannels))
  y = tf.placeholder(tf.float32, shape=(None, nb_classes))

  # Define TF model graph
  the_model = modelA
  if model_type == 'a':
      the_model = modelA
  elif model_type == 'b':
      the_model = modelB
  elif model_type == 'c':
      the_model = modelC
  else:
      exit("the model type must be 'a', 'b', or 'c'.")
  model = the_model(img_rows=img_rows, img_cols=img_cols,
                    channels=nchannels, nb_filters=64,
                    nb_classes=nb_classes)
  preds = model(x)
  print("Defined TensorFlow model graph.")

  def evaluate():
    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    report.clean_train_clean_eval = acc
#        assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate examples: %0.4f' % acc)

  # Train an MNIST model
  train_dir = train_dir + '/' + model_type + '/' + 'mifgsm_c'
  train_params = {
      'nb_epochs': nb_epochs,
      'batch_size': batch_size,
      'learning_rate': learning_rate,
      'train_dir': train_dir,
      'filename': filename
  }

  rng = np.random.RandomState([2017, 8, 30])
  if not os.path.exists(train_dir):
    os.mkdir(train_dir)

  ckpt = tf.train.get_checkpoint_state(train_dir)
  print(train_dir, ckpt)
  ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
  wrap = KerasModelWrapper(model)

  if load_model and ckpt_path:
    saver = tf.train.Saver()
    print(ckpt_path)
    saver.restore(sess, ckpt_path)
    print("Model loaded from: {}".format(ckpt_path))
    evaluate()
  else:
    print("Model was not loaded, training from scratch.")
    loss = CrossEntropy(wrap, smoothing=label_smoothing)
    train(sess, loss, x_train, y_train, evaluate=evaluate,
          args=train_params, rng=rng)
    if save_model:
        saver = tf.train.Saver(max_to_keep=1)
        saver.save(sess, '{}/mnist.ckpt'.format(train_dir), global_step=NB_EPOCHS)
        print("model has been saved")


  # Calculate training error
  if testing:
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
    report.train_clean_train_clean_eval = acc

  # Initialize the requested attack object and graph (FGSM, BIM, or MI-FGSM)
  if attack_method == 'fgsm':
    att_method = FastGradientMethod(wrap, sess=sess)
    att_method_params = {'eps': 0.2,
                         'clip_min': 0.,
                         'clip_max': 1.}
  elif attack_method == 'bim':
    att_method = BasicIterativeMethod(wrap, sess=sess)
    att_method_params = {'eps': 0.2,
                         'eps_iter': 0.06,
                         'nb_iter': 10,
                         'clip_min': 0.,
                         'clip_max': 1.}
  elif attack_method == 'mifgsm':
    att_method = MomentumIterativeMethod(wrap, sess=sess)
    att_method_params = {'eps': 0.2,
                         'eps_iter': 0.08,
                         'nb_iter': 10,
                         'decay_factor': 0.4,
                         'clip_min': 0.,
                         'clip_max': 1.}
  else:
    exit("the attack method must be 'fgsm', 'bim', or 'mifgsm'.")


  print(att_method_params)
  adv_x = att_method.generate(x, **att_method_params)
  # Consider the attack to be constant
  adv_x = tf.stop_gradient(adv_x)
  preds_adv = model(adv_x)

  # Evaluate the accuracy of the MNIST model on adversarial examples
  eval_par = {'batch_size': batch_size}
  start_time = time.time()
  acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)

  print('Test accuracy on adversarial examples: %0.4f' % acc)
  end_time = time.time()
  print("{} attack time is {}\n".format(attack_method,end_time - start_time))
  report.clean_train_adv_eval = acc

  #save_acc = np.array(save_acc)
  #record = pd.DataFrame(save_acc,columns=["decay","acc"])
  #record.to_csv("result/mnist_fc_decay__change.csv",index=False)

  gc.collect()
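The if/elif dispatch over attack names above can also be expressed as a lookup table, which keeps each attack's hyperparameters in one place. A sketch, assuming the cleverhans attack classes imported by the example and the same parameter values:

    from cleverhans.attacks import (FastGradientMethod, BasicIterativeMethod,
                                    MomentumIterativeMethod)

    ATTACKS = {
        'fgsm': (FastGradientMethod,
                 {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}),
        'bim': (BasicIterativeMethod,
                {'eps': 0.2, 'eps_iter': 0.06, 'nb_iter': 10,
                 'clip_min': 0., 'clip_max': 1.}),
        'mifgsm': (MomentumIterativeMethod,
                   {'eps': 0.2, 'eps_iter': 0.08, 'nb_iter': 10,
                    'decay_factor': 0.4, 'clip_min': 0., 'clip_max': 1.}),
    }

    def build_attack(name, wrap, sess):
        # Look up the attack class and its parameters by name
        if name not in ATTACKS:
            raise ValueError("attack method must be one of %s" % list(ATTACKS))
        cls, params = ATTACKS[name]
        return cls(wrap, sess=sess), params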
Example #15
def adv_net_exp(data_dir,
                adv_dir,
                target_model_dir='./tmp/cifar10_train_adv_encoder',
                clip_norm=1.5):

    # Create TF session
    sess = tf.Session()

    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    model = make_vgg16_clipRelu_model(name='vgg16_clipRelu_eval_mode',
                                      eval_mode=True)

    eval_feed = mode_feed(sess, False)
    # Get predict tensor
    pred = model(x)
    if not checkpoint_load(sess, target_model_dir):
        return False

    # eval model accuracy
    accuracy = model_eval(sess,
                          x,
                          y,
                          pred,
                          X,
                          one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))

    dis_loss, output_images = adv_train_net(x, clip_norm)

    logits = model(output_images)

    # restore adv variables
    ckpt = tf.train.get_checkpoint_state(adv_dir)
    # define adv variables
    adv_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      "adv_encoder")
    saver = tf.train.Saver(adv_variables)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # eval adv accuracy
    accuracy = model_eval(sess,
                          x,
                          y,
                          logits,
                          X,
                          one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('transfer rate: {0}'.format(accuracy))

    # universal adversarial examples
    adv_imgs = adv_generate(sess, output_images, x, X, None, 128)
    mean_dif = adv_imgs[1] - X[1]
    print('mean dif\'s size: {0}'.format(mean_dif.shape))
    universal_adv_X = X + mean_dif
    # eval universal adv accuracy
    accuracy = model_eval(sess,
                          x,
                          y,
                          pred,
                          universal_adv_X,
                          one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('universal adv transfer rate: {0}'.format(accuracy))
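The "universal" perturbation above is simply the difference between one adversarial image and its clean counterpart, applied to every input. When reproducing this, clipping back to the valid pixel range is worth adding, which the example omits; a NumPy sketch, with adv_imgs and X as in the example and the [0, 1] range an assumption about the preprocessing:

    import numpy as np

    mean_dif = adv_imgs[1] - X[1]                    # single-image perturbation
    universal_adv_X = np.clip(X + mean_dif, 0., 1.)  # assumes inputs in [0, 1]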
Example #16
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.001,
                   nb_epochs=10,
                   holdout=150,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1,
                   epsilon=0.3):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)
    pyp = False  # set True to visualize adversarial examples with matplotlib
    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess,
                              x,
                              y,
                              X_train,
                              Y_train,
                              X_test,
                              Y_test,
                              nb_epochs,
                              batch_size,
                              learning_rate,
                              rng=rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess,
                              x,
                              y,
                              bbox_preds,
                              X_sub,
                              Y_sub,
                              nb_classes,
                              nb_epochs_s,
                              batch_size,
                              learning_rate,
                              data_aug,
                              lmbda,
                              rng=rng)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    for epstep in [epsilon * i for i in range(20)]:
        # Initialize the Fast Gradient Sign Method (FGSM) attack object.
        fgsm_par = {
            'eps': epstep,
            'ord': np.inf,
            'clip_min': 0.,
            'clip_max': 1.
        }
        fgsm = FastGradientMethod(model_sub, sess=sess)

        # Craft adversarial examples using the substitute
        eval_params = {'batch_size': batch_size}
        x_adv_sub = fgsm.generate(x, **fgsm_par)

        def find_error(glb, mdl):
            # Collect examples where the model's prediction differs from
            # the true label
            temparray = []
            for i in range(len(glb)):
                prd = np.argmax(mdl.predict(np.array([glb[i]])))
                if prd != np.argmax(Y_test[i]):
                    temparray.append([glb[i], Y_test[i], X_test[i], prd, i])
            return temparray

    # Evaluate the accuracy of the "black-box" model on adversarial examples

        accuracy = model_eval(sess,
                              x,
                              y,
                              model(x_adv_sub),
                              X_test,
                              Y_test,
                              args=eval_params)
        print(
            'Test accuracy of oracle on BB Adversarial Samples with epsilon = %s : '
            % epstep + str(accuracy))
        if pyp:
            x_adv_np = fgsm.generate_np(X_test[0:200], **fgsm_par)
            y_adv_np = find_error(x_adv_np, keras_global_model)
            from matplotlib import pyplot as plt
            plt.rc('figure', figsize=(12.0, 12.0))
            for j in range(len(y_adv_np) - 1):
                print(
                    str(y_adv_np[j][3]) + " predicted, but the true label was: "
                    + str(np.argmax(y_adv_np[j][1])))
                plt.imshow(y_adv_np[j][0].reshape((28, 28)),
                           cmap="gray",
                           label=str(np.argmax(y_adv_np[j][3])))
                plt.pause(1)
                print('---')
        accuracies['bbox_on_sub_adv_ex' + str(epstep)] = accuracy

    return accuracies
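A caveat on the sweep above: fgsm.generate is called inside the loop, so each epsilon adds a fresh attack graph to the TF graph. Using generate_np instead lets cleverhans reuse one graph, since eps is a feedable parameter. A sketch, with model_sub, bbox_preds, sess, and the data assumed from the example:

    import numpy as np
    from cleverhans.attacks import FastGradientMethod

    fgsm = FastGradientMethod(model_sub, sess=sess)
    results = {}
    for epstep in [epsilon * i for i in range(20)]:
        # eps is feedable, so the attack graph is built only once
        x_adv_np = fgsm.generate_np(X_test, eps=epstep, ord=np.inf,
                                    clip_min=0., clip_max=1.)
        acc = model_eval(sess, x, y, bbox_preds, x_adv_np, Y_test,
                         args={'batch_size': batch_size})
        results[epstep] = acc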
Example #17
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    model_train(sess,
                x,
                y,
                preds,
                X_train,
                Y_train,
                evaluate=evaluate,
                args=train_params)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(model, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params)

    return report
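Passing predictions_adv to model_train is what turns the second run into adversarial training: the legacy cleverhans trainer averages the clean and adversarial cross-entropy losses. The same objective written out directly, as a sketch in raw TF1 on softmax outputs; the epsilon guard against log(0) is an addition for numerical safety:

    import tensorflow as tf

    def adv_training_loss(y, preds_clean, preds_adv, eps=1e-12):
        # Cross-entropy on softmax outputs, averaged over clean and
        # adversarial predictions (0.5/0.5 weighting, as in model_train)
        def xent(p):
            return -tf.reduce_mean(tf.reduce_sum(y * tf.log(p + eps), axis=1))
        return 0.5 * xent(preds_clean) + 0.5 * xent(preds_adv)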
Example #18
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      nb_classes=10,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    K.set_session(sess)

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    K.set_learning_phase(1)
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    models = {}
    preds = {}
    for model_name in ['mlp', 'cnn', 'hrnn']:
        try:
            print('[DEBUG] Loading model.')
            # model_type is assumed to be defined at module scope
            # (e.g. a prefix naming the saved model files)
            models[model_name] = load_model('{}{}'.format(
                model_type, model_name))
        except Exception:
            print('[ERROR] Adversarially trained models not found! Train '
                  'and save the strengthened models first, then re-run '
                  'this script.')
            exit(1)

        preds[model_name] = models[model_name](x)

    rng = np.random.RandomState([2017, 8, 30])

    # Evaluate the accuracy of the adversarially trained MNIST models on
    # legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy_test = ''
    attacks = {}

    # Build the computation graphs for the attacks
    for model_name in models.keys():
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds[model_name],
                              X_test,
                              Y_test,
                              args=eval_params)
        accuracy_test += '{} {}\n'.format(model_name, accuracy)

        # Instantiate a PGD attack object for this model
        wrap = KerasModelWrapper(models[model_name])
        attacks['$PGD_{}$'.format(model_name[0])] = ProjectedGradientDescent(
            wrap, sess=sess)

    # Build the target labels passed in the attack parameters
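    # idxs picks the first test example of each digit class 0-9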
    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)]
    if targeted:
        one_hot = np.zeros((10, 10))
        one_hot[np.arange(10), np.arange(10)] = 1

        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    attack_params = {'eps': 0.3, yname: adv_ys, 'eps_iter': 0.05}
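    # Note: eps and eps_iter above are in the [0, 1] pixel scale used by
    # data_mnist; in the targeted case the one-hot targets go in via y_target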

    table_header = '{}model '.format(model_type)
    accuracy_attack = ''

    for model_name in models.keys():

        accuracy_attack += '{} '.format(model_name)

        # For each model, apply all attacks
        for attack_name in attacks.keys():
            print('[DEBUG] Attacking {} using {}.'.format(
                model_name, attack_name))

            # Branch entered only once per attack, to add its name to the
            # table header
            if attack_name not in table_header:
                table_header += '{} '.format(attack_name)

            adv = attacks[attack_name].generate_np(adv_inputs, **attack_params)
            if targeted:
                adv_accuracy = model_eval(sess,
                                          x,
                                          y,
                                          preds[model_name],
                                          adv,
                                          adv_ys,
                                          args={'batch_size': 10})
            else:
                adv_accuracy = model_eval(sess,
                                          x,
                                          y,
                                          preds[model_name],
                                          adv,
                                          Y_test[idxs],
                                          args={'batch_size': 10})

            accuracy_attack += '{} '.format(adv_accuracy * 100)

        # Move on to attack the next model
        accuracy_attack += '\n'

    print(table_header)
    print(accuracy_attack)
    print(accuracy_test)

    # Close TF session
    sess.close()

    return report
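
# A minimal driver sketch for the tutorial above; the __main__ guard and the
# argument values are illustrative assumptions, not part of the original
# snippet.
if __name__ == '__main__':
    mnist_tutorial_cw(viz_enabled=False, targeted=True)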
Example #19
0
def main(argv):

    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {
            'batch_size': FLAGS.batch_size,
            'clip_min': 0.,
            'clip_max': 255.
        }
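        # clip_min/clip_max above follow the Madry CIFAR10 model, which
        # consumes raw pixel values in [0, 255]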

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'batch_size': 10
            })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)

            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            t1 = time.time()
            for e in epsilons:
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds_adv,
                                 X_test[:nb_samples],
                                 Y_test[:nb_samples],
                                 args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_test[:nb_samples],
                             Y_test[:nb_samples],
                             args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
Example #20
0
def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=True,
                        nb_epochs=6,
                        batch_size=128,
                        source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
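    # theta=1. moves each selected feature to its maximum value; gamma=0.1
    # allows at most 10% of the input features to be perturbed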

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    succ_mask = (results == 1)
    percent_perturb_succ = (np.sum(perturbations * succ_mask) /
                            max(1, np.sum(succ_mask)))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Example #21
0
def adv_net_exp(data_dir, checkpoint_dir, train_mode,
                train_dir='./tmp/cifar10_train_adv_encoder', batch_size=128,
                data_aug=False, clip_norm=1.5, target=0, lr=0.0001):

    # create a TF session
    sess = tf.Session()

    model = make_vgg16_model(name='vgg16_eval_mode', eval_mode=True)

    # create mode feed
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    # train model
    if train_mode:
        # set input and get logits
        data_norm = False
        images, labels = mdt_cifar10_input.inputs(False, data_dir, batch_size,
                                                  data_aug, data_norm)

        labels = tf.constant(target, dtype=tf.int64, shape=(batch_size,))

        # dis_loss, output_images = adv_net(images)
        dis_loss, output_images = adv_target_net(images, clip_norm)

        logits = model(output_images)

        # attack settings
        # c = 0.005
        c = 1
        confidence = 0
        # renamed from `target` so the flag does not shadow the integer
        # target-class argument of this function
        targeted = True

        # define model loss
        loss = adv_loss(dis_loss, logits, labels, targeted, confidence, c)
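        # assumption: adv_loss combines the distortion term dis_loss with a
        # targeted misclassification loss on logits, weighted by the constant c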

        global_step = tf.train.get_or_create_global_step()

        # train setting
        nb_epochs = 100
        # note: this overrides the lr function argument with the same default
        lr = 0.0001
        # decay_rate = 0.99
        # decay_epochs = 1
        # decay_steps = decay_epochs*NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN//batch_size
        # lr = tf.train.exponential_decay(initial_lr,
        #                                 global_step,
        #                                 decay_steps,
        #                                 decay_rate,
        #                                 staircase=True)
        tf.summary.scalar('learning_rate', lr)
        opt = tf.train.AdamOptimizer(lr)

        # define train variables
        adv_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          "adv_encoder")
        train_op = create_train_op(loss, global_step, adv_variables, opt)

        # initialize all variables
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        # restore the pretrained variables
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        var_info = tf.train.list_variables(ckpt.model_checkpoint_path)
        # print(var_info)
        var_name = [v[0] for v in var_info]

        restore_map = {variable.op.name: variable
                       for variable in tf.global_variables()
                       if variable.op.name in var_name}
        # print(restore_map)
        saver = tf.train.Saver(restore_map)
        saver.restore(sess, ckpt.model_checkpoint_path)
        
        # initialize the global step
        sess.run(global_step.initializer)

        # print(adv_variables)
        train_adv_encoder(sess, logits, loss, labels, train_op, train_dir, batch_size,
                          eval_feed, nb_epochs)

        sess.close()

    else:
        # define dataset format
        img_rows = 32
        img_cols = 32
        channels = 3
        nb_classes = 10

        # fetch data
        cifar10_data.maybe_download_and_return_python(data_dir)
        X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

        Y = np.zeros_like(Y)
        Y[:] = target
        # Define input TF placeholder
        x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        # dis_loss, output_images = adv_net(images)
        dis_loss, output_images = adv_target_net(x, clip_norm)

        logits = model(output_images)

        # restore trained model 
        if not checkpoint_load(sess, train_dir):
            return False
        # saver = tf.train.Saver()
        # ckpt = tf.train.get_checkpoint_state(train_dir)
        # saver.restore(sess, ckpt.model_checkpoint_path)

        # create one-hot Y
        one_hot_Y = to_categorical(Y, nb_classes)

        # eval model accuracy
        accuracy = model_eval(sess, x, y, logits, X, one_hot_Y,
                              feed=eval_feed,
                              args={'batch_size': batch_size})
        print('model accuracy: {0}'.format(accuracy))

        sta_time = time.time()
        adv_imgs = adv_generate(sess, output_images, x, X, eval_feed, batch_size)
        end_time = time.time()
        duration = end_time - sta_time
        print('adv crafting time: {0}'.format(duration))

        # evaluate the adversarial examples' mean L2 distance
        l2_dis = calculate_l2_dis(X/255, adv_imgs/255)
        print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))
        adv_imgs = np.around(adv_imgs).astype(int)
        # compare_show(X[9], adv_imgs[9])
        compare_show(X[16], adv_imgs[16])
        import matplotlib
        matplotlib.image.imsave(
            'i_{0}_target_{1}.png'.format(FLAGS.i, FLAGS.target),
            adv_imgs[16])
Example #22
0
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=True):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: if true, apply label smoothing of 0.1 to y_train
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes-1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
#        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        # note: smoothing here is applied on top of any label_smoothing
        # already applied to y_train above
        loss = LossCrossEntropy(wrap, smoothing=0.1)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
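    # eps=0.3 above is measured in the [0, 1] MNIST pixel scale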
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack)
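    # Passing attack=attack makes the loss generate adversarial examples on
    # the fly during train(), i.e. adversarial training of model_2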

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
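
# Illustrative invocation of the tutorial above; the guard and argument
# values are assumptions, not part of the original snippet.
if __name__ == '__main__':
    mnist_tutorial(nb_epochs=6, batch_size=128, load_model=False)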
Example #23
0
check_cnn = True
if check_cnn:
    y_test_cat = np.argmax(y_test, axis=1)
    print(eval_simple_cnn(detect_3, [1], X_test, y_test_cat))
    print(eval_simple_cnn(detect_0, [0, 1, 2, 3, 4, 5], X_test, y_test_cat))
    print(eval_simple_cnn(detect_S, [14], X_test, y_test_cat))
    print(eval_simple_cnn(detect_T, [14], X_test, y_test_cat))
    print(eval_simple_cnn(detect_O, [14], X_test, y_test_cat))
    print(eval_simple_cnn(detect_P, [14], X_test, y_test_cat))

wrap_clf = KerasModelWrapper(clf)
preds = clf(x)

eval_par = {'batch_size': 128}
acc = model_eval(sess, x, y, preds, X_test, y_test, args=eval_par)
print('Test accuracy on legitimate test examples: {0}'.format(acc))
report.clean_train_clean_eval = acc

# fgsm = FastGradientMethod(wrap_clf, sess=sess)
# fgsm_params = {'eps': 0.1,
#                'clip_min': 0.,
#                'clip_max': 1.}
# adv_x = fgsm.generate(x, **fgsm_params)
# # Consider the attack to be constant
# adv_x = tf.stop_gradient(adv_x)
# preds_adv = clf(adv_x)

# # Evaluate the accuracy of the MNIST model on adversarial examples
# acc = model_eval(sess, x, y, preds_adv, X_test, y_test, args=eval_par)
# print('Test accuracy on adversarial examples: %0.4f\n' % acc)
Example #24
0
def evaluate():
    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    report.clean_train_clean_eval = acc
    print('Test accuracy on legitimate examples: %0.4f' % acc)
Example #25
0
def tutorial(train_start=0, train_end=60000, test_start=0,
             test_end=10000, nb_epochs=6, batch_size=128,
             learning_rate=0.001, train_dir="/tmp",
             filename="mnist.ckpt", load_model=False,
             testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 train and test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)
    
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model_cifar10(img_rows=32, img_cols=32, channels=3)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.1,
                   'clip_min': 0.,
                   'clip_max': 1.}

    with sess.as_default():
        adv_x3 = fgsm.generate(x[:100], **fgsm_params)
        adv_image = adv_x3.eval(feed_dict={x: X_train[:100], y: Y_train[:100]})
    print("adv_image:", adv_image.shape)
    np.save("adv_image_FGM_cifar10", adv_image[:100])

    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train,
                         Y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model_cifar10(img_rows=32, img_cols=32, channels=3)
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
Example #26
0
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    fgsm = FastGradientMethod(model)
    adv_x = fgsm.generate(x, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions_2 = model_2(x)
    fgsm_2 = FastGradientMethod(model_2)
    adv_x_2 = fgsm_2.generate(x, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Evaluate the accuracy of the adversarialy trained CIFAR10 model on
        # legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained CIFAR10 model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv, evaluate=evaluate_2,
                args=train_params)

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions_2_adv, X_test, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))
Example #27
0
def whitebox(gan,
             rec_data_path=None,
             batch_size=128,
             learning_rate=0.001,
             nb_epochs=10,
             eps=0.3,
             online_training=False,
             test_on_dev=True,
             attack_type='fgsm',
             defense_type='gan',
             num_tests=-1,
             num_train=-1):
    """Based on MNIST tutorial from cleverhans.
    
    Args:
         gan: A `GAN` model.
         rec_data_path: A string to the directory.
         batch_size: The size of the batch.
         learning_rate: The learning rate for training the target models.
         nb_epochs: Number of epochs for training the target model.
         eps: The epsilon of FGSM.
         online_training: Training Defense-GAN with online reconstruction. The
            faster but less accurate way is to reconstruct the dataset once and use
            it to train the target models with:
            `python train.py --cfg <path-to-model> --save_recs`
         attack_type: Type of the white-box attack. It can be `fgsm`,
            `rand+fgsm`, or `cw`.
         defense_type: String representing the type of defense. Can be `none`,
            `defense_gan`, or `adv_tr`.
         num_tests: Number of test samples to use (-1 uses all of them).
         num_train: Number of training samples to use (-1 uses all of them).
    """

    FLAGS = tf.flags.FLAGS

    # Set logging level.
    set_log_level(logging.WARNING)

    if 'defense_gan' in FLAGS.defense_type:
        assert gan is not None

    # Create TF session.
    if 'defense_gan' in FLAGS.defense_type:
        sess = gan.sess
        if FLAGS.train_on_recs:
            assert rec_data_path is not None or online_training
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    train_images, train_labels, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev)

    rec_test_images = test_images
    rec_test_labels = test_labels

    _, _, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev, orig_data_flag=True)

    x_shape = [None] + list(train_images.shape[1:])
    images_pl = tf.placeholder(tf.float32,
                               shape=[None] + list(train_images.shape[1:]))
    alters_pl = tf.placeholder(tf.float32,
                               shape=[None] + list(train_images.shape[1:]))
    labels_pl = tf.placeholder(tf.float32,
                               shape=[None] + [train_labels.shape[1]])

    if num_tests > 0:
        test_images = test_images[:num_tests]
        rec_test_images = rec_test_images[:num_tests]
        test_labels = test_labels[:num_tests]

    if num_train > 0:
        train_images = train_images[:num_train]
        train_labels = train_labels[:num_train]

    # GAN defense flag.
    models = {
        'A': model_a,
        'B': model_b,
        'C': model_c,
        'D': model_d,
        'E': model_e,
        'F': model_f
    }
    model = models[FLAGS.model](input_shape=x_shape,
                                nb_classes=train_labels.shape[1])

    preds = gan.model.get_probs(images_pl)
    report = AccuracyReport()

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test
        # examples.
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         images_pl,
                         labels_pl,
                         preds,
                         rec_test_images,
                         rec_test_labels,
                         args=eval_params,
                         feed={K.learning_phase(): 0})
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
    }

    rng = np.random.RandomState([11, 24, 1990])
    tf.set_random_seed(11241990)

    preds_adv = None
    if FLAGS.defense_type == 'adv_tr':
        attack_params = {
            'eps': FLAGS.fgsm_eps_tr,
            'clip_min': 0.,
            'clip_max': 1.
        }
        if gan:
            if gan.dataset_name == 'celeba':
                attack_params['clip_min'] = -1.0

        attack_obj = FastGradientMethod(gan.model, sess=sess)
        adv_x_tr = attack_obj.generate(images_pl, **attack_params)
        adv_x_tr = tf.stop_gradient(adv_x_tr)
        preds_adv = gan.model(adv_x_tr)
    """classifier_folder = os.path.join(FLAGS.model_folder, FLAGS.model+'/')
    saver = tf.train.Saver()
    if os.path.isfile(os.path.join(classifier_folder,'classifier.ckpt.index')):
        #load model
        saver.restore(sess, os.path.join(classifier_folder, 'classifier.ckpt'))
    else:
        os.mkdir(classifier_folder)
        model_train(sess, images_pl, labels_pl, preds, train_images, train_labels,
                args=train_params, rng=rng, predictions_adv=preds_adv,
                init_all=False, feed={K.learning_phase(): 1},
                evaluate=evaluate)
        #save model
        saver.save(sess, os.path.join(classifier_folder, 'classifier.ckpt'))"""

    # Evaluate accuracy on the clean test examples.
    eval_params = {'batch_size': batch_size}
    acc = model_eval(
        sess,
        images_pl,
        labels_pl,
        preds,
        test_images,
        test_labels,
        args=eval_params,
        feed={K.learning_phase(): 0},
    )
    print('[#] Accuracy on clean examples {}'.format(acc))
    """with open(os.path.join(classifier_folder, 'accuracy.txt'), 'w') as f:
        f.write('Test accuracy = {}'.format(acc))"""
    if attack_type is None:
        return acc, 0, None

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and
    # graph.

    if 'defense_gan' in FLAGS.defense_type:
        z_init_val = None

        if FLAGS.same_init:
            z_init_val = tf.constant(
                np.random.randn(batch_size * gan.rec_rr,
                                gan.latent_dim).astype(np.float32))

        if 'bpda' in FLAGS.attack_type:
            recon_layer = ReconstructionLayer(gan, z_init_val, x_shape,
                                              batch_size)
        else:
            gan.model.add_rec_model(gan, z_init_val, batch_size)
        irecon_adv_x = tf.zeros([1])

    min_val = 0.0
    if gan:
        if gan.dataset_name == 'celeba':
            min_val = -1.0

    if 'rand' in FLAGS.attack_type:
        # args is assumed to be a parsed argument namespace defined elsewhere
        # in this script; alpha is the random-step size of rand+fgsm
        test_images = np.clip(
            test_images +
            args.alpha * np.sign(np.random.randn(*test_images.shape)), min_val,
            1.0)
        eps -= args.alpha

    if 'bpda' in FLAGS.attack_type:

        if '1' in FLAGS.attack_type:
            attack_obj = MadryEtAl(gan.model, sess=sess)
        elif '2' in FLAGS.attack_type:
            attack_obj = FastGradientMethod(gan.model, sess=sess)
        elif '3' in FLAGS.attack_type:
            attack_obj = MomentumIterativeMethod(gan.model, sess=sess)

        if 'defense_gan' in FLAGS.defense_type:  # 2
            recon_images_pl = recon_layer.fprop(images_pl)
        else:
            recon_images_pl = images_pl

        attack_params = {
            'eps': eps,
            'ord': np.inf,
            'clip_min': min_val,
            'clip_max': 1.
        }
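        # BPDA straight-through trick below: gradients flow through the GAN
        # reconstruction recon_images_pl, while the resulting perturbation is
        # re-applied to the original images_pl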
        adv_x = attack_obj.generate(
            recon_images_pl, **attack_params) - recon_images_pl + images_pl

        # adv_x = recon_layer.fprop(irecon_adv_x)

    else:
        if 'fgsm' in FLAGS.attack_type:
            attack_params = {
                'eps': eps,
                'ord': np.inf,
                'clip_min': min_val,
                'clip_max': 1.
            }
            attack_obj = FastGradientMethod(gan.model, sess=sess)
        elif FLAGS.attack_type == 'cw':
            attack_obj = CarliniWagnerL2(gan.model, back='tf', sess=sess)
            attack_iterations = 10
            attack_params = {
                'binary_search_steps': 1,
                'max_iterations': attack_iterations,
                'learning_rate': 10.0,
                'batch_size': batch_size,
                'initial_const': 100
            }
        try:
            adv_x = attack_obj.generate(images_pl, **attack_params)
        except Exception:
            print('[WARN] attack graph construction failed; '
                  'falling back to clean inputs')
            adv_x = images_pl

    eval_par = {'batch_size': batch_size}
    if 'defense_gan' in FLAGS.defense_type:
        preds_adv = gan.model.get_probs(adv_x)
        if 'bpda' in FLAGS.attack_type:
            irecon_adv_x = recon_layer.fprop(alters_pl)
            preds_adv = gan.model.get_probs(irecon_adv_x)

        num_dims = len(images_pl.get_shape())
        avg_inds = list(range(1, num_dims))
        diff_op = tf.reduce_mean(tf.square(adv_x - images_pl), axis=avg_inds)
        start = time.time()
        acc_adv, roc_info = model_eval_gan(sess,
                                           images_pl,
                                           labels_pl,
                                           preds_adv,
                                           None,
                                           test_images=test_images,
                                           test_labels=test_labels,
                                           args=eval_par,
                                           feed={K.learning_phase(): 0},
                                           diff_op=diff_op,
                                           attack=('bpda'
                                                   in FLAGS.attack_type),
                                           alter=alters_pl,
                                           adv_samples=adv_x)
        print(time.time() - start)
        # if 'bpda' in FLAGS.attack_type:
        #     sess.run(tf.local_variables_initializer())
        #     listimg = sess.run([images_pl,recon_images_pl,adv_x], \
        #         feed_dict={images_pl: test_images[:batch_size],labels_pl:test_labels[:batch_size]})
        #     sess.run(tf.local_variables_initializer())
        #     listimg += sess.run([irecon_adv_x], feed_dict={alters_pl: listimg[2]})
        #     for j in range(len(listimg)):
        #         samples = listimg[j]
        #         tflib.save_images.save_images(
        #             samples.reshape((len(samples), 28, 28)),
        #             os.path.join('images_saved_36', 'samples_{}_{}_{}.png'.format(FLAGS.model, FLAGS.fgsm_eps, j))
        #         )
        # elif FLAGS.attack_type == 'cw':
        #     idx = np.random.permutation(len(test_images))[:batch_size]
        #     listimg = sess.run([images_pl,adv_x], \
        #         feed_dict={images_pl: test_images[idx],labels_pl:test_labels[idx]})
        #     for j in range(len(listimg)):
        #         samples = listimg[j]
        #         tflib.save_images.save_images(
        #             samples.reshape((len(samples), 28, 28)),
        #             os.path.join('images_cw', 'samples_{}_{}.png'.format(FLAGS.model, j))
        #         )
        print('Test accuracy on adversarial examples: %0.4f\n' % acc_adv)
        return acc_adv, 0, roc_info
    else:
        preds_adv = gan.model(adv_x)
        acc_adv = model_eval(sess,
                             images_pl,
                             labels_pl,
                             preds_adv,
                             test_images,
                             test_labels,
                             args=eval_par,
                             feed={K.learning_phase(): 0})
        print('Test accuracy on adversarial examples: %0.4f\n' % acc_adv)

        return acc_adv, 0, None
Example #28
0
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      targeted=TARGETED):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    config_args = dict(intra_op_parallelism_threads=1)
    config_args["gpu_options"] = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(**config_args))
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(DATA_DIR, train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                  input_shape=[28, 28, 1])
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[idxs]],
                dtype=np.float32)
        else:
            adv_inputs = np.array(
                [[instance] * nb_classes for
                 instance in x_test[:source_samples]], dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape((source_samples *
                                                     nb_classes, nb_classes))
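        # sample-major layout: the copy of sample i targeted at class j sits
        # at row i * nb_classes + j, matching the grid fill below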
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    if targeted:
        cw_params_batch_size = source_samples * nb_classes
    else:
        cw_params_batch_size = source_samples
    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': CW_LEARNING_RATE,
                 'batch_size': cw_params_batch_size,
                 'initial_const': 10}
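    # binary_search_steps=1 with a fixed initial_const skips the usual binary
    # search over the CW trade-off constant, trading attack strength for speed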

    adv = cw.generate_np(adv_inputs,
                         **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        if viz_enabled:
            err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params)
            adv_accuracy = 1 - err
        else:
            err = model_eval(sess, x, y, preds, adv, y_test[:source_samples],
                             args=eval_params)
            adv_accuracy = 1 - err

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs) ** 2,
                                       axis=(1, 2, 3)) ** .5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
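The "Avg. L_2 norm" above is the per-example Euclidean norm of the perturbation, averaged over the batch. A standalone sketch of the same computation on hypothetical toy arrays (adv_demo and adv_inputs_demo are placeholders, not values from the example):

import numpy as np

# Hypothetical stand-ins for adv and adv_inputs (a batch of 4 MNIST-shaped images).
adv_inputs_demo = np.random.rand(4, 28, 28, 1).astype(np.float32)
adv_demo = adv_inputs_demo + 0.1 * np.random.randn(4, 28, 28, 1).astype(np.float32)

# Per-example L2 norm: square root of the summed squared pixel differences.
per_example_l2 = np.sum((adv_demo - adv_inputs_demo) ** 2, axis=(1, 2, 3)) ** .5
print('Avg. L_2 norm of perturbations {0:.4f}'.format(per_example_l2.mean()))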
Example #29
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   train_dir=TRAIN_DIR,
                   filename=FILENAME,
                   load_model=LOAD_MODEL,
                   testing=False,
                   label_smoothing=0.1):
    """
  MNIST CleverHans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param train_dir: Directory storing the saved model
  :param filename: Filename to save model under
  :param load_model: True for load, False for not load
  :param testing: if true, test error is calculated
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows,
                      img_cols=img_cols,
                      channels=nchannels,
                      nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        # assert x_test.shape[0] == test_end - test_start, x_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds_adv,
                         x_train,
                         y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows,
                        img_cols=img_cols,
                        channels=nchannels,
                        nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess,
          loss_2,
          x_train,
          y_train,
          evaluate=evaluate_2,
          args=train_params,
          rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
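FastGradientMethod above computes x_adv = x + eps * sign(grad of loss w.r.t. x), clipped to the valid pixel range. A minimal NumPy sketch of that single step, with a hypothetical grad_demo array standing in for the true loss gradient:

import numpy as np

eps = 0.3  # same eps as fgsm_params above
x_demo = np.random.rand(1, 28, 28, 1).astype(np.float32)      # hypothetical input
grad_demo = np.random.randn(1, 28, 28, 1).astype(np.float32)  # hypothetical d(loss)/dx

# One FGSM step: move every pixel by eps in the direction that increases the loss,
# then clip back to the valid pixel range.
x_adv_demo = np.clip(x_demo + eps * np.sign(grad_demo), 0.0, 1.0)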
Example #30
def blackbox(gan,
             rec_data_path=None,
             batch_size=128,
             learning_rate=0.001,
             nb_epochs=10,
             holdout=150,
             data_aug=6,
             nb_epochs_s=10,
             lmbda=0.1,
             online_training=False,
             train_on_recs=False,
             test_on_dev=False,
             defense_type='none'):
    """MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    
    Args:
        train_start: index of first training set example
        train_end: index of last training set example
        test_start: index of first test set example
        test_end: index of last test set example
        defense_type: Type of defense against blackbox attacks
    
    Returns:
        a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    FLAGS = flags.FLAGS

    # Set logging level (WARNING suppresses debug output).
    set_log_level(logging.WARNING)

    # Dictionary used to keep track and return key accuracies.
    accuracies = {}

    # Create TF session.
    adv_training = False
    if defense_type:
        if defense_type == 'defense_gan' and gan:
            sess = gan.sess
            gan_defense_flag = True
        else:
            gan_defense_flag = False
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
        if 'adv_tr' in defense_type:
            adv_training = True
    else:
        gan_defense_flag = False
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    train_images, train_labels, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev, orig_data_flag=True)

    x_shape, classes = list(train_images.shape[1:]), train_labels.shape[1]
    nb_classes = classes

    type_to_models = {
        'A': model_a,
        'B': model_b,
        'C': model_c,
        'D': model_d,
        'E': model_e,
        'F': model_f,
        'Q': model_q,
        'Y': model_y,
        'Z': model_z
    }

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        bb_model = type_to_models[FLAGS.bb_model](
            input_shape=[None] + x_shape,
            nb_classes=train_labels.shape[1],
        )
    with tf.variable_scope("Substitute", reuse=tf.AUTO_REUSE):
        sub_model = type_to_models[FLAGS.sub_model](
            input_shape=[None] + x_shape,
            nb_classes=train_labels.shape[1],
        )

    if FLAGS.debug:
        train_images = train_images[:20 * batch_size]
        train_labels = train_labels[:20 * batch_size]
        debug_dir = os.path.join('debug', 'blackbox', FLAGS.debug_dir)
        ensure_dir(debug_dir)
        x_debug_test = test_images[:batch_size]

    # Initialize substitute training set reserved for adversary
    images_sub = test_images[:holdout]
    labels_sub = np.argmax(test_labels[:holdout], axis=1)

    print(labels_sub)

    # Redefine test set as remaining samples unavailable to adversaries
    if FLAGS.num_tests > 0:
        test_images = test_images[:FLAGS.num_tests]
        test_labels = test_labels[:FLAGS.num_tests]

    test_images = test_images[holdout:]
    test_labels = test_labels[holdout:]

    # Define input and output TF placeholders

    if FLAGS.image_dim[0] == 3:
        FLAGS.image_dim = [
            FLAGS.image_dim[1], FLAGS.image_dim[2], FLAGS.image_dim[0]
        ]

    images_tensor = tf.placeholder(tf.float32, shape=[None] + x_shape)
    labels_tensor = tf.placeholder(tf.float32, shape=(None, classes))

    rng = np.random.RandomState([11, 24, 1990])

    train_images_bb, train_labels_bb = train_images, train_labels
    test_images_bb, test_labels_bb = test_images, test_labels

    cur_gan = gan
    if FLAGS.debug:
        train_images_bb = train_images_bb[:20 * batch_size]
        train_labels_bb = train_labels_bb[:20 * batch_size]

    # Prepare the black_box model.
    prep_bbox_out = prep_bbox(sess,
                              images_tensor,
                              labels_tensor,
                              train_images_bb,
                              train_labels_bb,
                              test_images_bb,
                              test_labels_bb,
                              nb_epochs,
                              batch_size,
                              learning_rate,
                              rng=rng,
                              gan=cur_gan,
                              adv_training=adv_training,
                              cnn_arch=bb_model)

    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    reconstructor = get_reconstructor(gan)
    recon_tensors, _ = reconstructor.reconstruct(images_tensor,
                                                 batch_size=batch_size,
                                                 reconstructor_id=2)

    model_sub, preds_sub = train_sub(sess,
                                     images_tensor,
                                     labels_tensor,
                                     model.get_logits(recon_tensors),
                                     images_sub,
                                     labels_sub,
                                     nb_classes,
                                     nb_epochs_s,
                                     batch_size,
                                     learning_rate,
                                     data_aug,
                                     lmbda,
                                     rng=rng,
                                     substitute_model=sub_model,
                                     dataset_name=gan.dataset_name)

    accuracies['sub'] = 0

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    eps = attack_config_dict[gan.dataset_name]['eps']
    min_val = attack_config_dict[gan.dataset_name]['clip_min']

    fgsm_par = {'eps': eps, 'ord': np.inf, 'clip_min': min_val, 'clip_max': 1.}

    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute.
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(images_tensor, **fgsm_par)

    if FLAGS.debug and gan is not None:  # To see some qualitative results.
        recon_tensors, _ = reconstructor.reconstruct(x_adv_sub,
                                                     batch_size=batch_size,
                                                     reconstructor_id=2)
        x_rec_orig, _ = reconstructor.reconstruct(images_tensor,
                                                  batch_size=batch_size,
                                                  reconstructor_id=3)

        x_adv_sub_val = sess.run(x_adv_sub,
                                 feed_dict={images_tensor: x_debug_test})
        x_rec_debug_val = sess.run(recon_tensors,
                                   feed_dict={images_tensor: x_debug_test})
        x_rec_orig_val = sess.run(x_rec_orig,
                                  feed_dict={images_tensor: x_debug_test})

        save_images_files(x_adv_sub_val, output_dir=debug_dir, postfix='adv')

        postfix = 'gen_rec'
        save_images_files(x_rec_debug_val,
                          output_dir=debug_dir,
                          postfix=postfix)
        save_images_files(x_debug_test, output_dir=debug_dir, postfix='orig')
        save_images_files(x_rec_orig_val,
                          output_dir=debug_dir,
                          postfix='orig_rec')

    if gan_defense_flag:
        num_dims = len(images_tensor.get_shape())
        avg_inds = list(range(1, num_dims))

        recons_adv, zs = reconstructor.reconstruct(x_adv_sub,
                                                   batch_size=batch_size)

        diff_op = tf.reduce_mean(tf.square(x_adv_sub - recons_adv),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        acc_adv, diffs_mean, roc_info_adv = model_eval_gan(
            sess,
            images_tensor,
            labels_tensor,
            predictions=model.get_logits(recons_adv),
            test_images=test_images,
            test_labels=test_labels,
            args=eval_params,
            diff_op=diff_op,
            z_norm=z_norm,
            recons_adv=recons_adv,
            adv_x=x_adv_sub,
            debug=False)

        # reconstruction on clean images
        recons_clean, zs = reconstructor.reconstruct(images_tensor,
                                                     batch_size=batch_size)

        diff_op = tf.reduce_mean(tf.square(images_tensor - recons_clean),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        acc_rec, diffs_mean_rec, roc_info_rec = model_eval_gan(
            sess,
            images_tensor,
            labels_tensor,
            model.get_logits(recons_clean),
            None,
            test_images=test_images,
            test_labels=test_labels,
            args=eval_params,
            diff_op=diff_op,
            z_norm=z_norm,
            recons_adv=recons_clean,
            adv_x=images_tensor,
            debug=False)

        print('Test accuracy of oracle on reconstructed clean images: {}'.format(acc_rec))
        print('Test accuracy of oracle on reconstructed adversarial examples: {}'.format(acc_adv))

        return {
            'acc_adv': acc_adv,
            'acc_rec': acc_rec,
            'roc_info_adv': roc_info_adv,
            'roc_info_rec': roc_info_rec
        }

    else:
        acc_adv = model_eval(sess,
                             images_tensor,
                             labels_tensor,
                             model.get_logits(x_adv_sub),
                             test_images,
                             test_labels,
                             args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(acc_adv))
        return {
            'acc_adv': acc_adv,
            'acc_rec': 0,
            'roc_info_adv': None,
            'roc_info_rec': None
        }
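train_sub above follows the substitute training of arxiv.org/abs/1602.02697, which grows the substitute's dataset by Jacobian-based augmentation each round. A rough NumPy sketch of one round, with jac_sign as a hypothetical placeholder for the sign of the substitute's Jacobian:

import numpy as np

lmbda = 0.1  # augmentation step size (the lmbda argument above)
images_sub_demo = np.random.rand(8, 28, 28, 1).astype(np.float32)     # hypothetical seed set
jac_sign = np.sign(np.random.randn(8, 28, 28, 1)).astype(np.float32)  # placeholder Jacobian sign

# Each round doubles the substitute set: keep the old points and add points
# perturbed toward the substitute's estimate of the decision boundary.
images_sub_aug = np.concatenate(
    [images_sub_demo, images_sub_demo + lmbda * jac_sign], axis=0)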
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      nb_classes=10,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    args=train_params,
                    save=os.path.exists("models"))

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    wrap = KerasModelWrapper(model)
    cw = CarliniWagnerL2(wrap, back='tf', sess=sess)

    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)]
    if targeted:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        one_hot = np.zeros((10, 10))
        one_hot[np.arange(10), np.arange(10)] = 1

        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': 100 if targeted else 10,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args={'batch_size': 10})
    else:
        adv_accuracy = 1 - model_eval(
            sess, x, y, preds, adv, Y_test[idxs], args={'batch_size': 10})

    for j in range(10):
        if targeted:
            for i in range(10):
                grid_viz_data[i, j] = adv[i * 10 + j]
        else:
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
Example #32
def prep_bbox(sess,
              images,
              labels,
              images_train,
              labels_train,
              images_test,
              labels_test,
              nb_epochs,
              batch_size,
              learning_rate,
              rng,
              gan=None,
              adv_training=False,
              cnn_arch=None):
    """Defines and trains a model that simulates the "remote"
    black-box oracle described in https://arxiv.org/abs/1602.02697.
    
    Args:
        sess: the TF session
        images: the input placeholder
        labels: the output placeholder
        images_train: the training data for the oracle
        labels_train: the training labels for the oracle
        images_test: the testing data for the oracle
        labels_test: the testing labels for the oracle
        nb_epochs: number of epochs to train model
        batch_size: size of training batches
        learning_rate: learning rate for training
        rng: numpy.random.RandomState
        gan: the GAN providing dataset metadata (optional)
        adv_training: whether to train the oracle adversarially
        cnn_arch: the model architecture for the black-box oracle
    
    Returns:
        model: The blackbox model function.
        predictions: The predictions tensor.
        accuracy: Accuracy of the model.
    """

    # Define TF model graph (for the black-box model).
    model = cnn_arch
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': 'classifiers/model/{}'.format(gan.dataset_name),
        'filename': 'model_{}'.format(FLAGS.bb_model)
    }
    eval_params = {'batch_size': batch_size}

    if gan.dataset_name in ['mnist', 'f-mnist']:
        used_vars = model.get_params()
        pred_train = model.get_logits(images, dropout=True)
        pred_eval = model.get_logits(images)

    elif gan.dataset_name == 'cifar-10':
        pre_model = Model('classifiers/model/cifar-10',
                          tiny=False,
                          mode='eval',
                          sess=sess)
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            model = DefenseWrapper(pre_model, 'logits')
        used_vars = [
            x for x in tf.global_variables() if x.name.startswith('model')
        ]
        pred_eval = model.get_logits(images)

    elif gan.dataset_name == 'celeba':
        images_pl_transformed = tf.cast(images, tf.float32) / 255. * 2. - 1.
        used_vars = model.get_params()
        pred_train = model.get_logits(images_pl_transformed, dropout=True)
        pred_eval = model.get_logits(images_pl_transformed)

    classifier_load_success = False
    if FLAGS.load_bb_model:
        try:
            path = tf.train.latest_checkpoint('classifiers/model/{}'.format(
                gan.dataset_name))
            saver = tf.train.Saver(var_list=used_vars)
            saver.restore(sess, path)
            print('[+] BB model loaded successfully ...')
            classifier_load_success = True
        except Exception:
            print('[-] Failed to load BB model ...')
            classifier_load_success = False

    if not classifier_load_success:
        print('[+] Training classifier model ...')
        model_train(sess,
                    images,
                    labels,
                    pred_train,
                    images_train,
                    labels_train,
                    args=train_params,
                    rng=rng,
                    predictions_adv=None,
                    init_all=False,
                    save=False)
    # Print out the accuracy on legitimate test data.
    accuracy = model_eval(
        sess,
        images,
        labels,
        pred_eval,
        images_test,
        labels_test,
        args=eval_params,
    )

    print('Test accuracy of black-box on legitimate test examples: ' +
          str(accuracy))

    return model, pred_eval, accuracy
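model_eval, called throughout these examples, reduces to batched argmax accuracy. A simplified NumPy sketch of the equivalent computation on in-memory arrays (probs_demo and labels_demo are hypothetical):

import numpy as np

def accuracy_from_arrays(probs, labels):
    # Fraction of rows where the argmax prediction matches the one-hot label.
    return np.mean(np.argmax(probs, axis=1) == np.argmax(labels, axis=1))

probs_demo = np.random.rand(4, 10)      # hypothetical model outputs
labels_demo = np.eye(10)[[1, 3, 3, 7]]  # hypothetical one-hot labels
print(accuracy_from_arrays(probs_demo, labels_demo))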
Example #33
def evaluate():
    eval_params = {'batch_size': 128}
    acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate examples: %0.4f' % acc)
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64,
                   num_threads=None):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param clean_train: if true, train on clean examples
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    rng=rng,
                    var_list=model.get_params())

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_train,
                             Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_train,
                             Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                rng=rng,
                var_list=model_2.get_params())

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
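The Y_train.clip(label_smooth / 9., 1. - label_smooth) call near the top of this example implements label smoothing for 10 classes: the true class keeps probability 0.9 and the other nine classes share the remaining 0.1. A small sketch of the effect on a hypothetical one-hot label:

import numpy as np

label_smooth = .1
y_onehot = np.eye(10)[[3]]  # hypothetical one-hot label for class 3
y_smooth = y_onehot.clip(label_smooth / 9., 1. - label_smooth)
# y_smooth is 0.9 at index 3 and 0.1 / 9 everywhere else; the row still sums to 1.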
def mdt(model, data_dir, checkpoint_dir,
        train_dir='./tmp/cifar10_train',
        adversarial_dir='./tmp/cifar10_adv', batch_size=128,
        data_aug=False, data_norm=True):

    # train model
    if not tf.gfile.Exists(train_dir):
        # set input and get logits
        images, labels = mdt_cifar10_input.inputs(False, data_dir, batch_size,
                                                  data_aug, data_norm)

        labels = tf.cast(labels, tf.int64)
        logits = model(images)
        loss = stand_loss(logits, labels)
        train_process(model, loss, images, labels, train_dir, batch_size)
    
    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Get predict tensor
    pred = model(x)

    sess = tf.Session()

    if not checkpoint_load(sess, checkpoint_dir):
        return False

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # create mode feed
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    fgsm_params = {'eps': 1,
                   'clip_min': 0.,
                   'clip_max': 255.}
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model.get_probs(adv_x)
 
    # eval model accuracy
    class_accuracy, accuracy = model_eval_each_class(
        sess, x, y, pred, 10, X, one_hot_Y,
        feed=eval_feed, args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))

    for i in range(10):
        print('class {0} accuracy: {1}'.format(i, class_accuracy[i]))

    # eval model's accuracy on FGSM adversarial examples
    fgsm_accuracy = model_eval(sess, x, y, preds_adv, X, one_hot_Y,
                               feed=eval_feed, args={'batch_size': 128})
    print('model fgsm_accuracy: {0}'.format(fgsm_accuracy))


    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}  # defined but unused in this example

    X = X[:128]
    Y = one_hot_Y[:128]
    adv_feed = {x: X, y: Y}
    adv_feed.update(eval_feed)
    sta = time.time()
    adv_X_ = sess.run(adv_x,feed_dict=adv_feed)
    end = time.time()
    duration = end - sta
    print('finished in {0} seconds'.format(duration))

    l2_dis = calculate_l2_dis(X/255, adv_X_/255)
    print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))
def main(argv=None):
    tf.set_random_seed(1234)
    sess = tf.Session()
    keras.backend.set_session(sess)

    X_train, Y_train, X_test, Y_test = data_cifar10()
    Y_train = Y_train.clip(.1 / 9., 1. - .1)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)

    def evaluate():
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    train_params = {
        'nb_epochs': FLAGS.nb_epochs, 
        'batch_size': FLAGS.batch_size, 
        'learning_rate': FLAGS.learning_rate, 
        'train_dir': FLAGS.train_dir, 
        'filename': FLAGS.filename
    }

    model_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, predictions, X_train, Y_train, evaluate=evaluate, args=train_params, save=True)

    wrap = KerasModelWrapper(model)

    nb_classes = 10
    targeted = False
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'

    cw = CarliniWagnerL2(wrap, back='tf', sess=sess)
    n_adv = 1000
    adv_inputs = X_test[:n_adv]
    adv_ys = None
    yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': 100,
        'learning_rate': 0.1,
        'batch_size': 10,
        'initial_const': 10,
    }
    
    adv = cw.generate_np(adv_inputs, **cw_params)

    sigma = 16.0/255
    gamma = 0.00061 * 255 * 255
    alpha = 0.00061 * 255 * 255

    n_clusters = 10
    n_samples = 50

    noise = np.random.normal(0.0, sigma, adv.shape)
    adv_gauss = adv + noise

    i1 = np.repeat(np.arange(0,n_adv), n_samples)
    i2 = np.random.randint(32, size = n_adv * n_samples)
    i3 = np.random.randint(32, size = n_adv * n_samples)

    sample = adv[i1, i2, i3]
    noise = np.random.normal(0.0, sigma, sample.shape)
    noisy_samples = sample + noise
    noisy_samples = np.reshape(noisy_samples, (n_adv, n_samples, 3))

    noise = np.random.normal(0.0, sigma, adv.shape)

    adv_rdesc = np.zeros(adv.shape)
    adv_rmix = np.zeros(adv.shape)

    for img_no, img_samples in enumerate(noisy_samples):

        clusters = np.zeros((n_clusters, 3))
        clusters[0] = img_samples[0]
        
        for c_j in range(1, n_clusters):

            prob_cj = np.zeros(n_samples)

            for pix_no, pix in enumerate(img_samples):
            
                l2_min = 100000
                for c_l in range(0, c_j):
                    l2_norm_sq = np.inner(pix - clusters[c_l], pix - clusters[c_l])
                    if l2_norm_sq < l2_min:
                        l2_min = l2_norm_sq
                
                prob_cj[pix_no] = math.exp(gamma * l2_min)

            prob_cj /= prob_cj.sum()
            clusters[c_j] = img_samples[np.random.choice(n_samples, 1, p=prob_cj)]

        for pix_i in range(0, 32):
            for pix_j in range(0,32):
                c_dist_min = 100000
                c_min = np.zeros(3)
                c_sum = np.zeros(3)
                weight_sum = 0
                for c_j in clusters:
                    c_dist = np.linalg.norm(adv_gauss[img_no][pix_i][pix_j] - c_j)
                    weight_j = math.exp(-1 * alpha * c_dist * c_dist)
                    weight_sum = weight_sum + weight_j
                    c_sum = c_sum + weight_j * c_j
                    if c_dist < c_dist_min:
                        c_dist_min = c_dist
                        c_min = c_j

                adv_rdesc[img_no][pix_i][pix_j] = c_min
                adv_rmix[img_no][pix_i][pix_j] = c_sum / weight_sum

    eval_params = {'batch_size': np.minimum(nb_classes, 10)}
    orig_accuracy = model_eval(sess, x, y, predictions, adv_inputs, Y_test[:n_adv], args=eval_params)

    print('Original accuracy {0:.4f}'.format(orig_accuracy))

    adv_accuracy = model_eval(sess, x, y, predictions, adv, Y_test[:n_adv], args=eval_params)

    print('Adversarial without noise {0:.4f}'.format(adv_accuracy))

    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations without noise {0:.4f}'.format(percent_perturbed))

    adv_accuracy = model_eval(sess, x, y, predictions, adv_gauss, Y_test[:n_adv], args=eval_params)

    print('Avg. rate of successful adv. examples with Gaussian noise {0:.4f}'.format(adv_accuracy))

    percent_perturbed = np.mean(np.sum((adv_gauss - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations with Gaussian noise {0:.4f}'.format(percent_perturbed))

    adv_accuracy = model_eval(sess, x, y, predictions, adv_rdesc, Y_test[:n_adv], args=eval_params)

    print('Avg. rate of successful adv. examples with random descent {0:.4f}'.format(adv_accuracy))

    percent_perturbed = np.mean(np.sum((adv_rdesc - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations with random descent {0:.4f}'.format(percent_perturbed))
    
    adv_accuracy = model_eval(sess, x, y, predictions, adv_rmix, Y_test[:n_adv], args=eval_params)

    print('Avg. rate of successful adv. examples with random mixture {0:.4f}'.format(adv_accuracy))

    percent_perturbed = np.mean(np.sum((adv_rmix - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations with random mixture {0:.4f}'.format(percent_perturbed))

    sess.close()
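The pixel loop above replaces each pixel of the noisy adversarial image either with its nearest sampled cluster center (random descent) or with an exp(-alpha * d^2)-weighted mixture of all centers (random mixture). A vectorized NumPy sketch of both rules for a single pixel, using hypothetical cluster centers in [0, 1]:

import numpy as np

alpha = 0.00061 * 255 * 255            # same alpha as in main() above
clusters_demo = np.random.rand(10, 3)  # hypothetical cluster centers in [0, 1]
pixel = np.random.rand(3)              # hypothetical noisy pixel

d2 = np.sum((clusters_demo - pixel) ** 2, axis=1)  # squared distance to each center
w = np.exp(-alpha * d2)                            # Gaussian-style weights
pixel_rdesc = clusters_demo[np.argmin(d2)]         # random-descent: nearest center
pixel_rmix = (w[:, None] * clusters_demo).sum(axis=0) / w.sum()  # weighted mixture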
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, aug_batch_size=512):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Initialize substitute training set reserved for adversary
    X_sub = x_test[:holdout]
    Y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
def evaluate():
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params)
    print('Test accuracy on legitimate test examples: ' + str(accuracy))
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[idxs]],
                dtype=np.float32)
        else:
            adv_inputs = np.array(
                [[instance] * nb_classes for
                 instance in x_test[:source_samples]], dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape((source_samples *
                                                     nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': source_samples * nb_classes if
                 targeted else source_samples,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[idxs],
                                          args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
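For reference, the 'Avg. L_2 norm' figure printed above is the per-example L2 norm of the perturbation, averaged over the batch. A minimal standalone numpy sketch of that metric (the helper name is ours, not part of the tutorial):

import numpy as np

def mean_l2_perturbation(adv, clean):
    # Per-example L2 norm of the perturbation, averaged over the batch
    diffs = (adv - clean).reshape(len(adv), -1)
    return np.mean(np.sqrt(np.sum(diffs ** 2, axis=1)))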
def generate_images():

    print('==> Preparing data..')
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    print "==> Beginning Session"

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Save the vgg labels
    np.save("vgg_adv_y_10000", Y_test)

    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Load model
    args_load = 'cifar10vgg.h5'
    args_pool = 0.05
    args_attack = 'jsma'

    print "==> loading vgg model"
    model = vggbn(top=True, pool=args_pool)
    model.load_weights(args_load)
    predictions = model(x)

    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args=eval_params)
    print('==> Accuracy : {}'.format(accuracy))

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              predictions,
                              X_test,
                              Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train a CIFAR10 model
    # train_params = {
    #     'nb_epochs': FLAGS.nb_epochs,
    #     'batch_size': FLAGS.batch_size,
    #     'learning_rate': FLAGS.learning_rate
    # }

    im_base = '/im_'
    if args_attack == 'fgsm' or args_attack == 'FGSM':

        result_dir = os.getcwd() + '/images/fgsm/'
        print "==> creating fgsm adversarial wrapper"

        epsilons = [0.01, 0.03, 0.07, 0.1, 0.2, 0.3]

        for eps in epsilons:

            model_name = "vgg_fgsm_" + str(eps)

            adv_x = fgsm_old(x, predictions, eps=eps)

            print "==> sending to batch evaluator to finalize adversarial images"
            eval_params = {'batch_size': FLAGS.batch_size}
            X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test],
                                     args=eval_params)

            if not os.path.exists(result_dir + model_name):
                os.makedirs(result_dir + model_name)
            print("==> saving images to {}".format(result_dir + model_name))
            for i, ad in enumerate(X_test_adv):
                scipy.misc.imsave(
                    result_dir + model_name + im_base + str(i) + '.png', ad)

        sess.close()
    """ JSMA """
    if args_attack == 'jsma' or args_attack == 'JSMA':

        np.save("JSMA_vgg_adv_y_" + str(FLAGS.source_samples),
                Y_test[0:FLAGS.source_samples])

        result_dir = os.getcwd() + '/images/jsma/trial_single_adv_'
        print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
              str(FLAGS.nb_classes - 1) + ' adversarial examples')

        results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

        # This array contains the fraction of perturbed features for each test set
        perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                                 dtype='f')

        # Define the TF graph for the model's Jacobian
        grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

        # Initialize our array for grid visualization
        grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                      FLAGS.img_cols, FLAGS.nb_channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')
        i_saved = 0
        n_image = 0

        gammas = [0.01, 0.05, 0.1, 0.2, 0.3]

        for gamma in gammas:

            model_name = "vgg_jsma_" + str(gamma)

            # Loop over the samples we want to perturb into adversarial examples
            print "==> saving images to {}".format(result_dir + model_name)
            for sample_ind in xrange(0, FLAGS.source_samples):

                # We want to find an adversarial example for each possible target class
                current_class = int(np.argmax(Y_test[sample_ind]))
                target_classes = other_classes(FLAGS.nb_classes, current_class)

                # For the grid visualization, keep original images along the diagonal
                grid_viz_data[current_class,
                              current_class, :, :, :] = np.reshape(
                                  X_test[sample_ind:(sample_ind + 1)],
                                  (FLAGS.img_rows, FLAGS.img_cols,
                                   FLAGS.nb_channels))

                # Loop over all target classes

                for target in np.random.permutation(target_classes):

                    print "image {}".format(sample_ind)

                    # Since we don't want 500k images, we keep at most
                    # one successful adversarial example per input
                    # instead of saving every target's result

                    print('--------------------------------------')
                    print('Creating adv. example for target class ' +
                          str(target))

                    # This call runs the Jacobian-based saliency map approach
                    adv_x, res, percent_perturb = jsma_old(
                        sess,
                        x,
                        predictions,
                        grads,
                        X_test[sample_ind:(sample_ind + 1)],
                        target,
                        num_classes=FLAGS.nb_classes,
                        theta=1,
                        gamma=gamma,
                        increase=True,
                        clip_min=0,
                        clip_max=1)
                    # Display the original and adversarial images side-by-side
                    adversarial = np.reshape(
                        adv_x,
                        (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))
                    original = np.reshape(
                        X_test[sample_ind:(sample_ind + 1)],
                        (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                    if FLAGS.viz_enabled:

                        if 'figure' not in vars():
                            figure = pair_visual(original, adversarial)
                        else:
                            figure = pair_visual(original, adversarial, figure)

                    if not os.path.exists(result_dir + model_name):
                        os.makedirs(result_dir + model_name)

                    if res == 1:

                        scipy.misc.imsave(
                            result_dir + model_name + im_base +
                            str(sample_ind) + '.png', adversarial)

                        i_saved += 1
                        print "==> images saved: {}".format(i_saved)

                        # Add our adversarial example to our grid data
                        grid_viz_data[target,
                                      current_class, :, :, :] = np.reshape(
                                          adv_x,
                                          (FLAGS.img_rows, FLAGS.img_cols,
                                           FLAGS.nb_channels))

                        # Update the arrays for later analysis
                        results[target, sample_ind] = res
                        perturbations[target, sample_ind] = percent_perturb

                        break

                n_image += 1

            # Compute the number of adversarial examples that were
            # successfully found
            nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
            succ_rate = float(np.sum(results)) / nb_targets_tried
            print('Avg. rate of successful adv. examples {0:.2f}'.format(
                succ_rate))

            # Compute the average distortion introduced by the algorithm
            percent_perturbed = np.mean(perturbations)
            print('Avg. rate of perturbed features {0:.2f}'.format(
                percent_perturbed))

            # Compute the average distortion introduced for successful samples only
            percent_perturb_succ = np.mean(perturbations * (results == 1))
            print(
                'Avg. rate of perturbed features for successful '
                'adversarial examples {0:.2f}'.format(percent_perturb_succ))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if FLAGS.viz_enabled:
            grid_visual(grid_viz_data)
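For readers without the cleverhans source at hand, other_classes(nb_classes, current_class) used above simply enumerates every label other than the sample's true class; a minimal sketch of equivalent behavior (the _sketch name is ours):

def other_classes_sketch(nb_classes, current_class):
    # All candidate target labels except the sample's true class
    return [c for c in range(nb_classes) if c != current_class]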
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params,
          rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
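The per-sample perturbation statistic above counts how many input features the attack changed. A minimal standalone numpy sketch of the same computation (the helper name is ours):

import numpy as np

def perturbation_fraction(adv_x, clean_x):
    # Fraction of coordinates where the adversarial input differs
    changed = np.sum(adv_x.reshape(-1) != clean_x.reshape(-1))
    return float(changed) / adv_x.size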
Example #42
0
def cifar10_blackbox(nb_classes=10,
                     batch_size=128,
                     nb_samples=10,
                     l2_weight=0.0001,
                     momentum=0.9,
                     initial_lr=0.1,
                     lr_step_epoch=100.0,
                     lr_decay=0.1,
                     num_residual_units=2,
                     num_train_instance=50000,
                     num_test_instance=10000,
                     k=1,
                     eps=0.3,
                     learning_rate=0.001,
                     nb_epochs=10,
                     holdout=150,
                     data_aug=6,
                     nb_epochs_s=10,
                     lmbda=0.1,
                     binary=False,
                     scale=False,
                     model_path=None,
                     targeted=False,
                     data_dir=None,
                     adv=False,
                     delay=0):
    """
    CIFAR10 tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10_std()

    # Y_train_onehot = np_utils.to_categorical(Y_train, nb_classes)
    Y_test_onehot = np_utils.to_categorical(Y_test, nb_classes)

    # Y_test is for evaluating oracle
    Y_test_bbox = np.argmax(Y_test, axis=1)
    Y_test_bbox = Y_test_bbox.reshape(Y_test_bbox.shape[0], )
    Y_test_bbox = Y_test_bbox.astype('int32')

    #Y_test = Y_test.reshape(Y_test.shape[0],)
    #Y_test = Y_test.astype('int32')
    #Y_train = Y_train.astype('int32')

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test_onehot[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3

    rng = np.random.RandomState([2017, 8, 30])

    # with tf.Graph().as_default():

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.int32, shape=(None,))

    phase = tf.placeholder(tf.bool, name='phase')
    y_s = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the WideResNet black-box model.")
    '''
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              img_rows, img_cols, channels, nb_epochs, batch_size, learning_rate,
                              rng=rng, phase=phase, binary=binary, scale=scale,
                              nb_filters=nb_filters, model_path=model_path,
                              adv=adv, delay=delay, eps=eps)

    model, bbox_preds, accuracies['bbox'], model_path = prep_bbox_out
    '''
    decay_step = lr_step_epoch * num_train_instance / batch_size
    hp = resnet.HParams(batch_size=batch_size,
                        num_classes=nb_classes,
                        num_residual_units=num_residual_units,
                        k=k,
                        weight_decay=l2_weight,
                        initial_lr=initial_lr,
                        decay_step=decay_step,
                        lr_decay=lr_decay,
                        momentum=momentum)

    binary = bool(binary)
    network = resnet.ResNet(binary, hp, x, y, None)
    network.build_model()

    # bbox_preds = network.preds
    bbox_preds = network.probs

    init = tf.global_variables_initializer()
    sess.run(init)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=10000)

    if 'model' in model_path.split('/')[-1]:
        saver.restore(sess, model_path)
        print('restored %s' % model_path)
    else:
        saver.restore(sess, tf.train.latest_checkpoint(model_path))
        print('restored %s' % model_path)
    '''
    if os.path.isdir(model_path):
        ckpt = tf.train.get_checkpoint_state(model_path)
        # Restores from checkpoint
        if ckpt and ckpt.model_checkpoint_path:
            print('\tRestore from %s' % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found in the dir [%s]' % model_path)
            sys.exit(1)
    elif os.path.isfile(model_path):
        print('\tRestore from %s' % model_path)
        saver.restore(sess, model_path)
    else:
        print('No checkpoint file found in the path [%s]' % model_path)
        sys.exit(1)
    '''

    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess,
                     x,
                     y,
                     bbox_preds,
                     X_test,
                     Y_test,
                     phase=phase,
                     args=eval_params)
    print('Test accuracy of black-box on legitimate test examples: %.4f' % acc)
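The labels above are converted between integer and one-hot form several times. For reference, a minimal numpy sketch of the one-hot encoding that np_utils.to_categorical performs (the helper name is ours):

import numpy as np

def to_one_hot(labels, nb_classes):
    # Row i is all zeros except a 1 at column labels[i]
    one_hot = np.zeros((len(labels), nb_classes), dtype=np.float32)
    one_hot[np.arange(len(labels)), labels] = 1.
    return one_hot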
Example #43
0
def main(argv):

    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {'batch_size': FLAGS.batch_size,
                         'clip_min': 0., 'clip_max': 255.}

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({'binary_search_steps': 1,
                                  'max_iterations': 100,
                                  'learning_rate': 0.1,
                                  'initial_const': 10,
                                  'batch_size': 10
                                  })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)

            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            # Start timing before the sweep so the elapsed time below
            # covers all epsilons, not just the last iteration
            t1 = time.time()
            for e in epsilons:
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess, x, y, preds_adv,
                                 X_test[:nb_samples], Y_test[:nb_samples],
                                 args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess, x, y, preds_adv,
                             X_test[:nb_samples], Y_test[:nb_samples],
                             args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
Example #44
0
		test_in_reshape = X_test_scaled[sample_ind].reshape(-1)
		nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
		percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

		X_adv[sample_ind] = adv_x
		results[target, sample_ind] = res
		perturbations[target, sample_ind] = percent_perturb

print()
print(X_adv.shape)

print("=========================== Evaluation of MLP Performance ==============================")
print()

eval_params = {'batch_size': FLAGS.batch_size}
accuracy = model_eval(sess, x, y, predictions, X_test_scaled, y_test, args=eval_params)
print("Test accuracy on normal examples: {}".format(accuracy))

accuracy_adv = model_eval(sess, x, y, predictions, X_adv, y_test, args=eval_params)
print("Test accuracy on adversarial examples: {}".format(accuracy_adv))
print()

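# Imports assumed by the scikit-learn baseline below (this fragment
# does not show them; these are the standard scikit-learn locations):
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
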
print("=============================== Decision tree CLassifier ==============================")
dt = OneVsRestClassifier(DecisionTreeClassifier(random_state=42))
dt.fit(X_train_scaled, y_train)
y_pred = dt.predict(X_test_scaled)

# Calculate FPR for normal class only
fpr_dt, tpr_dt, _ = roc_curve(y_test[:, 0], y_pred[:, 0])

roc_auc_dt = auc(fpr_dt, tpr_dt)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
              args=train_params, save=True)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train,
                         Y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, x, y, preds_2, X_train, Y_train,
          predictions_adv=preds_2_adv, evaluate=evaluate_2,
          args=train_params, save=False)

    # Get a random slice of the data for linear extrapolation plots
    random_idx = np.random.randint(0, X_train.shape[0])
    X_slice = X_train[random_idx]
    Y_slice = Y_train[random_idx]

    # Plot the linear extrapolation plot for clean model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice,
                              'lep_clean.png')

    # Plot the linear extrapolation plot for adv model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap_2, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice,
                              'lep_adv.png')

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
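For reference, FastGradientMethod computes x_adv = x + eps * sign(grad_x loss). A minimal TF1-style sketch of that single step, assuming logits and one-hot labels (the helper and its arguments are illustrative, not part of the tutorial, which feeds softmax outputs through the cleverhans wrapper instead):

import tensorflow as tf

def fgsm_sketch(x, logits, y, eps=0.3, clip_min=0., clip_max=1.):
    # One FGSM step: perturb x along the sign of the loss gradient
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,
                                                      logits=logits)
    grad, = tf.gradients(loss, x)
    return tf.clip_by_value(x + eps * tf.sign(grad), clip_min, clip_max)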
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test, nb_epochs,
              batch_size, learning_rate, rng):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.
    :param sess: the TF session
    :param x: the input placeholder for cifar
    :param y: the output placeholder for cifar
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :return:
    """

    # Define TF model graph (for the black-box model)
    model = cnn_cifar10_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)
    fgsm_params = {
        'eps': FLAGS.training_eps,
        'ord': np.inf,
        'clip_min': 0.,
        'clip_max': 1.
    }
    fgsm = FastGradientMethod(model, sess=sess)
    predictions_adv = model(fgsm.generate(x, **fgsm_params))
    logger.info("Defined TensorFlow model graph.")

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    model_train(sess,
                x,
                y,
                predictions,
                X_train,
                Y_train,
                verbose=False,
                args=train_params,
                rng=rng,
                predictions_adv=predictions_adv)

    # Log the accuracy on legitimate data
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args=eval_params)
    logger.info(
        'Test accuracy of adversarially trained black-box on legitimate test '
        'examples: ' + str(accuracy))

    return model, predictions, accuracy
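Passing predictions_adv to model_train above turns on adversarial training. As far as we can tell, older cleverhans averaged the clean and adversarial losses with equal weight; a minimal sketch of that objective (the names and the tf.losses call are illustrative and assume logits):

import tensorflow as tf

def adv_training_loss(y, logits_clean, logits_adv):
    # Equal-weight mix of clean and adversarial cross-entropy
    loss_clean = tf.losses.softmax_cross_entropy(y, logits_clean)
    loss_adv = tf.losses.softmax_cross_entropy(y, logits_adv)
    return 0.5 * (loss_clean + loss_adv)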
def adv_net_exp(data_dir, adv_dir,
                target_model_dir='./tmp/cifar10_train_adv_encoder',
                clip_norm=1.5):
    
    # Create TF session
    sess = tf.Session()


    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)
    
    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    model = make_vgg16_clipRelu_model(name='vgg16_clipRelu_eval_mode',
                                      eval_mode=True)
    
    eval_feed = mode_feed(sess, False)
    # Get predict tensor
    pred = model(x)
    if not checkpoint_load(sess, target_model_dir):
        return False

    # eval model accuracy
    accuracy = model_eval(sess, x, y, pred, X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))

    dis_loss, output_images = adv_train_net(x, clip_norm)

    logits = model(output_images)

    # restore adv variables
    ckpt = tf.train.get_checkpoint_state(adv_dir)
    # define adv variables
    adv_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      "adv_encoder")
    saver = tf.train.Saver(adv_variables)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # eval adv accuracy
    accuracy = model_eval(sess, x, y, logits, X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('transfer rate: {0}'.format(accuracy))


    # universal adversarial examples
    adv_imgs = adv_generate(sess, output_images, x, X, None, 128)
    mean_dif = adv_imgs[1] - X[1]
    print('mean dif\'s size: {0}'.format(mean_dif.shape))
    universal_adv_X = X + mean_dif
    # eval universal adv accuracy
    accuracy = model_eval(sess, x, y, pred, universal_adv_X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('universal adv transfer rate: {0}'.format(accuracy))
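Note that the 'universal' perturbation above reuses the perturbation computed for a single example (index 1). A common variant averages the perturbation over many examples instead; a minimal sketch (the helper name is ours, and it assumes the two arrays are aligned):

import numpy as np

def universal_perturbation(adv_imgs, clean_imgs):
    # Batch-averaged perturbation; adding it to any input gives a
    # crude universal adversarial candidate
    return np.mean(adv_imgs - clean_imgs, axis=0)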
def mdt(model, data_dir, checkpoint_dir,
        train_dir='./tmp/cifar10_train',
        adversarial_dir='./tmp/cifar10_adv', batch_size=128,
        data_aug=False, data_norm=True):

    # train model
    if not tf.gfile.Exists(train_dir):
        # set input and get logits
        images, labels = mdt_cifar10_input.inputs(False, data_dir, batch_size,
                                                  data_aug, data_norm)

        labels = tf.cast(labels, tf.int64)
        # target = False
        # adv_output_layer = 'adv_bounddecoder6'
        # loss = adv_net_loss(images, model, labels, target, adv_output_layer, 0, 10)
        logits = model(images)
        loss = stand_loss(logits, labels)
        train_process(model, loss, images, labels, train_dir, batch_size)
    
    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Get predict tensor
    pred = model(x)

    sess = tf.Session()
    if not checkpoint_load(sess, checkpoint_dir):
        return False

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # print(sess.run(bn_moving_vars))


    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # create mode feed
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    # craft cw adversarial examples
    if not os.path.exists(adversarial_dir):
        os.makedirs(adversarial_dir)
    cw_file = adversarial_dir+'/cw_adv'
    if os.path.isfile(cw_file):
        fr = open(cw_file, 'rb')
        cw_dict = pickle.load(fr)
        cw_adv = cw_dict['data']
        adv_ys = cw_dict['labels']
        assert cw_adv.shape[0] == adv_ys.shape[0]
        cw_setting = cw_dict['setting']
        print('settings of cw adversarial examples that have been loaded')
        print(cw_setting)
    else:
        print('crafting cw adversarial examples....')
        start_time = time.time()

        cw = CarliniWagnerL2(model, back='tf', sess=sess)
        num_for_test = 100
        adv_inputs = X[:num_for_test]
        yname = 'y'
        adv_ys = one_hot_Y[:num_for_test]

        cw_params = {'binary_search_steps': 5,
                     'confidence': 0,
                     'max_iterations': 10000,
                     'learning_rate': 0.1,
                     'batch_size': 100,
                     'initial_const': 10,
                     'clip_min': 0,
                     'clip_max': 255}

        cw_setting = cw_params.copy()

        cw_params['feed'] = eval_feed
        cw_params[yname] = adv_ys

        cw_adv = cw.generate_np(adv_inputs, **cw_params)
        cw_setting['model'] = model.name
        cw_dict = {'data': cw_adv, 'labels': adv_ys, 'setting': cw_setting}
        fw = open(cw_file, 'wb')
        pickle.dump(cw_dict, fw)

        end_time = time.time()
        duration = end_time - start_time
        print('finished in {0} seconds'.format(duration))

    # eval model accuracy
    class_accuracy, accuracy = model_eval_each_class(
        sess, x, y, pred, 10, X, one_hot_Y,
        feed=eval_feed, args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))

    for i in range(10):
        print('class {0} accuracy: {1}'.format(i, class_accuracy[i]))

    # eval model's accuracy on cw adversarial examples
    cw_accuracy = model_eval(sess, x, y, pred, cw_adv, adv_ys,
                             feed=eval_feed,
                             args={'batch_size': 128})
    print('model cw_accuracy: {0}'.format(cw_accuracy))

    part_X = X[:cw_adv.shape[0]]
    # eval the adversarial examples' mean l2 distance
    l2_dis = calculate_l2_dis(part_X/255, cw_adv/255)
    print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))

    # show and save img
    import numpy as np
    adv_imgs = np.around(cw_adv).astype(int)
    print(np.max(adv_imgs))
    compare_show(X[16], adv_imgs[16])
    import matplotlib
    matplotlib.image.imsave('cw.png', adv_imgs[16])

    # eval model's uncertainty
    dropout_num = 30
    uncert = evaluate_uncertainty(sess, model, x, part_X, dropout_num,
                                  batch_size, nb_classes, train_feed)

    # eval model's cw_uncertainty
    cw_uncert = evaluate_uncertainty(sess, model, x, cw_adv, dropout_num,
                                     batch_size, nb_classes, train_feed)

    # plot uncertainty histogram
    plt.figure("uncertainty_X")
    n, bins, patches = plt.hist(uncert, bins=25, edgecolor='None',
                                facecolor='blue')
    plt.show()

    plt.figure('uncertainty_CW')
    cw_n, cw_bins, cw_patches = plt.hist(cw_uncert, bins=25,
                                         edgecolor='None', facecolor='red')
    plt.show()

    plt.figure('uncertainty_collections')
    plt.hist(uncert, bins=25, edgecolor='None', facecolor='blue')
    plt.hist(cw_uncert, bins=25, edgecolor='None', facecolor='red')
    plt.show()
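evaluate_uncertainty is not shown here, but the pattern above (repeated stochastic forward passes with dropout left enabled via train_feed) matches the usual Monte-Carlo dropout recipe. A minimal sketch of that idea (the callable and helper name are hypothetical):

import numpy as np

def mc_dropout_uncertainty(predict_stochastic, X, T=30):
    # Run T stochastic forward passes and use the spread of the
    # predicted probabilities as a per-example uncertainty score.
    # predict_stochastic: callable mapping X to (N, nb_classes)
    # probabilities with dropout enabled.
    probs = np.stack([predict_stochastic(X) for _ in range(T)])  # (T, N, C)
    return probs.var(axis=0).mean(axis=1)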