Example #1
    def fgm(self, x, labels, targeted=False):
        """
        TensorFlow Eager implementation of the Fast Gradient Method.
        :param x: the input variable
        :param labels: the correct labels (or the target labels, if targeted)
        :param targeted: Is the attack targeted or untargeted? Untargeted, the
                         default, will try to make the label incorrect.
                         Targeted will instead try to move in the direction
                         of being more like y.
        :return: a tensor for the adversarial example
        """
        # Compute loss
        with tf.GradientTape() as tape:
            # the input must be watched because it may be a
            # combination of trainable and non-trainable variables
            tape.watch(x)
            loss_obj = LossCrossEntropy(self.model, smoothing=0.0)
            loss = loss_obj.fprop(x=x, y=labels)
            if targeted:
                loss = -loss

        # Define gradient of loss wrt input
        grad = tape.gradient(loss, x)
        optimal_perturbation = attacks.optimize_linear(grad, self.eps,
                                                       self.ord)

        # Add perturbation to original example to obtain adversarial example
        adv_x = x + optimal_perturbation

        # If clipping is needed
        # reset all values outside of [clip_min, clip_max]
        if (self.clip_min is not None) and (self.clip_max is not None):
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
        return adv_x
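The call to attacks.optimize_linear above is what turns the gradient into an eps-sized perturbation. Its implementation is not shown on this page; the sketch below is a hedged reconstruction of the same per-norm logic that Example #4 writes out inline (sign of the gradient for L-inf, gradient scaled to unit L1 or L2 norm otherwise), and it may differ in details from the actual cleverhans function.

import numpy as np
import tensorflow as tf

def optimize_linear_sketch(grad, eps, ord=np.inf):
    # Reduce over every axis except the batch axis.
    red_ind = list(range(1, len(grad.get_shape())))
    if ord == np.inf:
        # L-inf: move every coordinate by eps in the direction of its gradient.
        perturbation = tf.sign(grad)
    elif ord == 1:
        # L1: scale the gradient to unit L1 norm per example.
        perturbation = grad / tf.reduce_sum(tf.abs(grad),
                                            axis=red_ind, keepdims=True)
    elif ord == 2:
        # L2: scale the gradient to unit L2 norm per example.
        perturbation = grad / tf.sqrt(tf.reduce_sum(tf.square(grad),
                                                    axis=red_ind, keepdims=True))
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are supported.")
    return eps * perturbation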
Example #2
 def test_xe(self):
     loss = LossCrossEntropy(self.model, smoothing=0.)
     l = loss.fprop(self.x, self.y)
     with tf.Session() as sess:
         vl1 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy})
         vl2 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy})
     self.assertClose(vl1, [2.210599660, 1.53666997], atol=1e-6)
     self.assertClose(vl2, [2.210599660, 1.53666997], atol=1e-6)
Example #3
 def test_xe_smoothing(self):
     loss = LossCrossEntropy(self.model, smoothing=0.1)
     l = loss.fprop(self.x, self.y)
     with tf.Session() as sess:
         vl1 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy})
         vl2 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy})
     self.assertClose(vl1, [2.10587597, 1.47194624], atol=1e-6)
     self.assertClose(vl2, [2.10587597, 1.47194624], atol=1e-6)
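The only difference between test_xe and test_xe_smoothing is the smoothing argument, which softens the one-hot targets before the cross-entropy is computed and therefore shifts the expected loss values. Below is a minimal NumPy sketch of the usual label-smoothing rule, assumed here for illustration; the library's exact formulation may differ.

import numpy as np

def smooth_labels(y, smoothing):
    # y_smooth = (1 - smoothing) * y + smoothing / nb_classes
    nb_classes = y.shape[-1]
    return y * (1. - smoothing) + smoothing / float(nb_classes)

y = np.array([[0., 0., 1.]])      # one-hot label for class 2
print(smooth_labels(y, 0.1))      # -> [[0.0333 0.0333 0.9333]] (approximately)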
Example #4
    def fgm(self, x, labels, targeted=False):
        """
        TensorFlow Eager implementation of the Fast Gradient Method.
        :param x: the input variable
        :param labels: the correct labels (or the target labels, if targeted)
        :param targeted: Is the attack targeted or untargeted? Untargeted, the
                         default, will try to make the label incorrect.
                         Targeted will instead try to move in the direction
                         of being more like y.
        :return: a tensor for the adversarial example
        """
        # Compute loss
        with tf.GradientTape() as tape:
            loss_obj = LossCrossEntropy(self.model, smoothing=0.)
            loss = loss_obj.fprop(x=x, y=labels)
            if targeted:
                loss = -loss

        # Define gradient of loss wrt input
        grad = tape.gradient(loss, x)
        if self.ord == np.inf:
            # Take sign of gradient
            normalized_grad = tf.sign(grad)
            # The following line should not change the numerical results.
            # It applies only because `normalized_grad` is the output of
            # a `sign` op, which has zero derivative anyway.
            # It should not be applied for the other norms, where the
            # perturbation has a non-zero derivative.
            normalized_grad = tf.stop_gradient(normalized_grad)
        elif self.ord == 1:
            red_ind = list(xrange(1, len(x.get_shape())))
            normalized_grad = grad / tf.reduce_sum(
                tf.abs(grad), reduction_indices=red_ind, keep_dims=True)
        elif self.ord == 2:
            red_ind = list(xrange(1, len(x.get_shape())))
            square = tf.reduce_sum(tf.square(grad),
                                   reduction_indices=red_ind,
                                   keep_dims=True)
            normalized_grad = grad / tf.sqrt(square)
        else:
            raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                      "currently implemented.")

        # Multiply by constant epsilon
        scaled_grad = self.eps * normalized_grad

        # Add perturbation to original example to obtain adversarial example
        adv_x = x + scaled_grad

        # If clipping is needed
        # reset all values outside of [clip_min, clip_max]
        if (self.clip_min is not None) and (self.clip_max is not None):
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
        return adv_x
Example #5
def prep_bbox(sess,
              x,
              y,
              X_train,
              Y_train,
              X_test,
              Y_test,
              nb_epochs,
              batch_size,
              learning_rate,
              rng,
              nb_classes=10,
              img_rows=28,
              img_cols=28,
              nchannels=1):
    """
    Define and train a model that simulates the "remote"
    black-box oracle described in the original paper.
    :param sess: the TF session
    :param x: the input placeholder for MNIST
    :param y: the output placeholder for MNIST
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :return: the trained model, its logits tensor, and its test accuracy
    """

    # Define TF model graph (for the black-box model)
    nb_filters = 64
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    loss = LossCrossEntropy(model, smoothing=0.1)
    predictions = model.get_logits(x)
    print("Defined TensorFlow model graph.")

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    train(sess, loss, x, y, X_train, Y_train, args=train_params, rng=rng)

    # Print out the accuracy on legitimate data
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args=eval_params)
    print('Test accuracy of black-box on legitimate test '
          'examples: ' + str(accuracy))

    return model, predictions, accuracy
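A hedged usage sketch for prep_bbox, assuming a session, the MNIST placeholders, and the data arrays have already been created as in the other examples on this page; this is hypothetical driver code, not taken from the original tutorial.

# Hypothetical driver code; argument names follow the other examples here.
rng = np.random.RandomState([2017, 8, 30])
model, bbox_preds, acc = prep_bbox(sess, x, y,
                                   x_train, y_train, x_test, y_test,
                                   nb_epochs=6, batch_size=128,
                                   learning_rate=0.001, rng=rng)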
Example #6
def train_mnist_cnn(datadir, train_start, train_end, test_start, test_end,
                    num_epochs, batch_size, learning_rate):

    X_train, Y_train, X_test, Y_test = data_mnist(datadir=datadir,
                                                  train_start=train_start, train_end=train_end,
                                                  test_start=test_start, test_end=test_end)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    sess.run(tf.global_variables_initializer())

    # Training and evaluation params.
    train_params = {"nb_epochs": num_epochs, "batch_size": batch_size, "learning_rate": learning_rate}
    eval_params = {"batch_size": batch_size}

    # Define the model.
    model = mnist_cnn_model(input_shape=(None,) + X_train.shape[1:])
    loss = LossCrossEntropy(model, smoothing=0.1)
    saver = tf.train.Saver(max_to_keep=1)

    x = tf.placeholder(tf.float32, shape=(None,) + X_train.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None,) + Y_train.shape[1:])
    preds_x = model.get_probs(x)


    train(sess, loss, x, y, X_train, Y_train, args=train_params)
    saver.save(sess, "./runs/ckpt/mnist_cnn_attacked.ckpt")

    test_accuracy = model_eval(sess, x, y, preds_x, X_test, Y_test, args=eval_params)
    print("Test accuracy: %0.4f" % test_accuracy)

    sess.close()
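The checkpoint saved above can be restored later into a new session. A minimal sketch, assuming the same graph-building code (placeholders, model, preds_x) has been run first so the variables exist:

# Sketch: rebuild the graph, then restore the trained weights from the checkpoint.
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, "./runs/ckpt/mnist_cnn_attacked.ckpt")
    acc = model_eval(sess, x, y, preds_x, X_test, Y_test,
                     args={"batch_size": 128})
    print("Restored test accuracy: %0.4f" % acc)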
Example #7
 def reset(self):
     with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
         self.sess = tf.get_default_session()
         # Define input TF placeholder
         self.x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
         self.y = tf.placeholder(tf.float32, shape=(None, 10))
         self.model = ModelBasicCNN('model1', 10, 64)
         self.preds = self.model.get_logits(self.x)
         self.loss = LossCrossEntropy(self.model, smoothing=0.1)
Example #8
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64, num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = make_basic_picklable_cnn()
        preds = model.get_logits(x)
        assert len(model.get_params()) > 0
        loss = LossCrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        with sess.as_default():
            save("clean_model.joblib", model)
            # Now that the model has been saved, you can evaluate it in a
            # separate process using `evaluate_pickled_model.py`.
            # You should get exactly the same result for both clean and
            # adversarial accuracy as you get within this program.

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = make_basic_picklable_cnn()
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = LossCrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    with sess.as_default():
        save("adv_model.joblib", model2)
        # Now that the model has been saved, you can evaluate it in a
        # separate process using `evaluate_pickled_model.py`.
        # You should get exactly the same result for both clean and
        # adversarial accuracy as you get within this program.

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
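The adversarial training above hinges on passing attack=attack to LossCrossEntropy: in the cleverhans versions these examples target, the loss is then computed on both the clean batch and the attacked batch. The sketch below illustrates that idea under those assumptions; it averages the two terms and is not the library's actual implementation, which may weight or combine them differently.

# Sketch of an adversarial-training loss, assuming `model`, `attack`, inputs `x`
# and one-hot labels `y` as defined above.
def adv_training_loss(model, x, y, attack, smoothing=0.1):
    losses = []
    for inputs in (x, tf.stop_gradient(attack(x))):
        logits = model.get_logits(inputs)
        losses.append(tf.losses.softmax_cross_entropy(
            onehot_labels=y, logits=logits, label_smoothing=smoothing))
    # Average the clean and adversarial cross-entropy terms.
    return tf.add_n(losses) / 2.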
Example #9
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: number of iterations run by the attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              args=train_params,
              save=os.path.exists("models"),
              rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': source_samples * nb_classes if targeted else source_samples,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[idxs], args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
Example #10
def train_cifar10_classifier(model_name, nb_epochs, data_augmentation=False):
    rng = np.random.RandomState([2018, 8, 7])

    if data_augmentation:
        datagen, (x_train, y_train), (x_test, y_test) = load_cifar10(augmented=True)
        x_t = x_train.copy()
        y_t = y_train.copy()
        datagen.fit(x_t)
        dataflow = datagen.flow(x_t, y_t, batch_size=50000)
        x_train, y_train = dataflow.next()
    else:
        (x_train, y_train), (x_test, y_test) = load_cifar10()

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    keep_prob = tf.placeholder(tf.float32, ())
    is_training = tf.placeholder(tf.bool, ())

    if model_name == 'simple':
        model = make_simple_cnn(keep_prob=keep_prob)
        train_params = {
            'nb_epochs': nb_epochs,
            'batch_size': 128,
            'learning_rate': 1e-3}
        eval_params = {'batch_size': 128}
    elif model_name == 'resnet':
        model = make_resnet(is_training=is_training, depth=32)
        train_params = {
            'nb_epochs': nb_epochs,
            'batch_size': 32,
            'learning_rate': 3e-4}
        eval_params = {'batch_size': 32}
    #assert len(model.get_params()) == len(tf.trainable_variables())

    preds = model.get_probs(x)
    loss = LossCrossEntropy(model, 0)

    def evaluate():
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params,
                         feed={keep_prob: 1.0, is_training: False})
        print('Test accuracy on legitimate examples: %0.4f' % acc)

        if data_augmentation:
            x_aug, y_aug = dataflow.next()
            x_train[...] = x_aug
            y_train[...] = y_aug

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
          args=train_params, feed={keep_prob: 0.5, is_training: True}, rng=rng,
          var_list=model.get_params())

    savedir = '../tfmodels'
    if not os.path.isdir(savedir):
        os.makedirs(savedir)
    saver = tf.train.Saver(var_list=tf.global_variables())
    model_savename = 'cifar10_%s_model_epoch%d' % (model_name, nb_epochs)
    if data_augmentation: model_savename += '_aug'
    saver.save(sess, os.path.join(savedir, model_savename))
Example #11
def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=True,
                        nb_epochs=6,
                        batch_size=128,
                        nb_classes=10,
                        source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = ModelBasicCNN('model1', 10, 64)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Example #12
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
#        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = LossCrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
Example #13
def train_sub(sess,
              x,
              y,
              bbox_preds,
              X_sub,
              Y_sub,
              nb_classes,
              nb_epochs_s,
              batch_size,
              learning_rate,
              data_aug,
              lmbda,
              aug_batch_size,
              rng,
              img_rows=28,
              img_cols=28,
              nchannels=1):
    """
    This function creates the substitute by alternatively
    augmenting the training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param aug_batch_size: batch size used during the Jacobian-based
                           dataset augmentation
    :param rng: numpy.random.RandomState instance
    :return: the substitute model and the tensor of its logits
    """
    # Define TF model graph (for the black-box model)
    model_sub = ModelSubstitute('model_s', nb_classes)
    preds_sub = model_sub.get_logits(x)
    loss_sub = LossCrossEntropy(model_sub, smoothing=0)

    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # Train the substitute and augment dataset alternatively
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        with TemporaryLogLevel(logging.WARNING, "cleverhans.utils.tf"):
            train(sess,
                  loss_sub,
                  x,
                  y,
                  X_sub,
                  to_categorical(Y_sub, nb_classes),
                  init_all=False,
                  args=train_params,
                  rng=rng,
                  var_list=model_sub.get_params())

        # If we are not at last substitute training iteration, augment dataset
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            # Use -lmbda for the first three augmentation rounds (rho < 3)
            # and +lmbda afterwards
            lmbda_coef = 2 * int(int(rho / 3) != 0) - 1
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads,
                                          lmbda_coef * lmbda, aug_batch_size)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box
            Y_sub = np.hstack([Y_sub, Y_sub])
            X_sub_prev = X_sub[int(len(X_sub) / 2):]
            eval_params = {'batch_size': batch_size}
            bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                  args=eval_params)[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            Y_sub[int(len(X_sub) / 2):] = np.argmax(bbox_val, axis=1)

    return model_sub, preds_sub
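The jacobian_augmentation call above grows the substitute training set by stepping each point in the direction of the sign of the Jacobian of the substitute's output for that point's current label (Papernot et al.). Below is a minimal NumPy sketch of that idea, assuming the per-example gradients have already been evaluated; it is not the cleverhans implementation.

import numpy as np

def jacobian_augmentation_sketch(x_sub, grads_for_label, lmbda):
    # New point: x + lmbda * sign(d logits[label] / d x), one per existing point.
    new_points = x_sub + lmbda * np.sign(grads_for_label)
    # The augmented set is the old points followed by the new ones, which is
    # why Y_sub is duplicated with np.hstack in the code above.
    return np.concatenate([x_sub, new_points], axis=0)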
Example #14
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64,
                   num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    #sess = tf.Session(config=tf.ConfigProto(**config_args))
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 1}))

    # Get MNIST test data (`file` is presumably a module-level path defined
    # elsewhere in the original script)
    x_train, y_train, x_test, y_test = data_mnist(file,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    rng = np.random.RandomState([2017, 8, 30])

    ################ color training initialization ####################

    color_training_epochs = 5000
    color_learning_rate = 0.1
    colorCategory = [
        [0.0, 0.4],  # Black
        [0.3, 0.7],  # Grey
        [0.6, 1.0]  # White
    ]

    numOfPRModel = 20
    minColorEpoch = 300
    maxColorEpoch = 3000

    numColorInput = 1
    #numColorOutput = len(colorCategory)

    color_x = tf.placeholder(
        tf.float32,
        [None, numColorInput])  # one pixel intensity value per example
    color_y = tf.placeholder(
        tf.float32,
        [None, numColorOutput])  # one-hot color-category label

    # Set multiple models' weights and biases
    color_W = {}
    color_b = {}
    color_pred_out = {}
    color_cost = {}
    color_optimizer = {}
    color_argmax = {}
    color_correct_prediction = {}
    color_accuracy = {}
    for i in range(numOfPRModel):
        color_W["w" + str(i)] = tf.Variable(
            tf.random_normal([numColorInput, numColorOutput]))
        color_b["b" + str(i)] = tf.Variable(tf.random_normal([numColorOutput]))
        color_pred_out["out" + str(i)] = tf.matmul(
            color_x, color_W["w" + str(i)]) + color_b["b" + str(i)]  # logits
        color_cost["cost" + str(i)] = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=color_pred_out["out" + str(i)], labels=color_y))
        # Gradient Descent
        color_optimizer["opt" + str(i)] = tf.train.GradientDescentOptimizer(
            color_learning_rate).minimize(color_cost["cost" + str(i)])

        # Test model
        color_argmax["argmax" + str(i)] = tf.argmax(
            color_pred_out["out" + str(i)], 1)
        color_correct_prediction["pred" + str(i)] = tf.equal(
            tf.argmax(color_pred_out["out" + str(i)], 1),
            tf.argmax(color_y, 1))
        # Calculate accuracy
        color_accuracy["acc" + str(i)] = tf.reduce_mean(
            tf.cast(color_correct_prediction["pred" + str(i)], tf.float32))

    # Graph for re-generating the original image into a new image by using trained color model
    pr_model_x = tf.placeholder(
        tf.float32,
        [None, n_input, numColorInput])  # flattened image pixels, one color input each
    pr_model_W = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput])  # per-model weights
    pr_model_b = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput])  # per-model biases
    pr_model_output = tf.one_hot(
        tf.argmax((tf.matmul(pr_model_x, pr_model_W) + pr_model_b), 2),
        numColorOutput)

    # Merge the random generated output for new image based on the colorCategory
    randomColorCategory = []
    for i in range(len(colorCategory)):
        tmp = []
        tmpRandomColorCategory = my_tf_round(
            tf.random_uniform(tf.shape(pr_model_x),
                              colorCategory[i][0],
                              colorCategory[i][1],
                              dtype=tf.float32), 2)
        tmp.append(tmpRandomColorCategory)
        randomColorCategory.append(tf.concat(tmp, 1))
    random_merge = tf.reshape(tf.concat(randomColorCategory, -1),
                              [-1, n_input, numColorOutput])
    random_color_set = tf.reduce_sum(
        tf.multiply(pr_model_output, random_merge), 2)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    # x = tf.reshape(random_color_set, shape=(-1, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    print(random_color_set)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': save_dir,
        'filename': filename,
        'numColorOutput': numColorOutput
    }
    eval_params = {'batch_size': batch_size, 'numColorOutput': numColorOutput}
    fgsm_params = {'eps': 8 / 256, 'clip_min': 0., 'clip_max': 1.}

    #sess = tf.Session()

    def do_eval(preds,
                x_set,
                y_set,
                report_key,
                is_adv=None,
                pred2=None,
                c_w=None,
                c_b=None,
                pr_model_x=None,
                random_color_set=None,
                pr_model_W=None,
                pr_model_b=None,
                pr_model_output=None,
                ae=None):
        acc = model_eval(sess,
                         x,
                         y,
                         preds,
                         x_set,
                         y_set,
                         args=eval_params,
                         pred2=pred2,
                         c_w=c_w,
                         c_b=c_b,
                         pr_model_x=pr_model_x,
                         random_color_set=random_color_set,
                         pr_model_W=pr_model_W,
                         pr_model_b=pr_model_b,
                         pr_model_output=pr_model_output,
                         is_adv=is_adv,
                         ae=ae)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    with sess.as_default():
        if hasattr(tf, "global_variables_initializer"):
            tf.global_variables_initializer().run()
        else:
            warnings.warn("Update your copy of tensorflow; future versions of "
                          "CleverHans may drop support for this version.")
            sess.run(tf.initialize_all_variables())

        ################# color training ####################
        print("Trying to load pr model from: " + model_path2)
        if os.path.exists(model_path2 + ".meta"):
            tf_model_load(sess, model_path2)
            c_w, c_b = sess.run([color_W, color_b])
            print("Load color trained model in training")
        else:
            # Training the PR model
            c_w = {}
            c_b = {}
            for modelcount in range(numOfPRModel):
                color_training_epochs = np.random.randint(
                    minColorEpoch, maxColorEpoch)
                for epoch in range(color_training_epochs):
                    outputColorY = []
                    p1 = np.random.random(100)
                    for i in range(len(p1)):
                        outputOverlapColorY = []
                        for j in range(len(colorCategory)):
                            if colorCategory[j][0] <= p1[i] <= colorCategory[j][1]:
                                colorIndexSeq = []
                                for k in range(len(colorCategory)):
                                    if j == k:
                                        colorIndexSeq.append(1)
                                    else:
                                        colorIndexSeq.append(0)
                                outputOverlapColorY.append(colorIndexSeq)
                                # break

                        # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item
                        outputColorY.append(
                            outputOverlapColorY[np.random.randint(
                                0, len(outputOverlapColorY))])

                    inputColorX = p1.reshape(100, 1)
                    _, c, _c_w, _c_b = sess.run([
                        color_optimizer["opt" + str(modelcount)],
                        color_cost["cost" + str(modelcount)],
                        color_W["w" + str(modelcount)],
                        color_b["b" + str(modelcount)]
                    ],
                                                feed_dict={
                                                    color_x: inputColorX,
                                                    color_y: outputColorY
                                                })
                    avg_cost = c

                    # Evaluating color model
                    outputColorY = []
                    p1 = np.random.random(100)
                    # Generate output for random color inputs (test case)
                    for i in range(len(p1)):
                        for j in range(len(colorCategory)):
                            outputOverlapColorY = []
                            if colorCategory[j][0] <= p1[i] <= colorCategory[j][1]:
                                colorIndexSeq = []
                                for k in range(len(colorCategory)):
                                    if j == k:
                                        colorIndexSeq.append(1)
                                    else:
                                        colorIndexSeq.append(0)
                                outputOverlapColorY.append(colorIndexSeq)
                                break

                        # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item
                        outputColorY.append(
                            outputOverlapColorY[np.random.randint(
                                0, len(outputOverlapColorY))])

                    inputColorX = p1.reshape(100, 1)
                    # print(random_xs)
                    acc, argmax = sess.run([
                        color_accuracy["acc" + str(modelcount)],
                        color_argmax["argmax" + str(modelcount)]
                    ],
                                           feed_dict={
                                               color_x: inputColorX,
                                               color_y: outputColorY
                                           })
                print(str(modelcount + 1) + ") Epoch:",
                      '%04d' % (epoch + 1) + "/" + str(color_training_epochs) +
                      ", Cost= " + "{:.9f}".format(avg_cost) +
                      ", Training Accuracy= " + "{:.5f}".format(acc) + " ")

                c_w["w" + str(modelcount)] = _c_w
                c_b["b" + str(modelcount)] = _c_b

                # print(c_w)

            save_path = os.path.join(save_dir2, filename2)
            saver = tf.train.Saver()
            saver.save(sess, save_path)
        ##################### end of color training ------------------------------

    ################# model training ####################
    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=label_smoothing)

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        saveFileNum = 50
        # saveFileNum = 500
        # saveFileNum = 1000
        model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum))
        fgsm = FastGradientMethod(model)
        # fgsm = BasicIterativeMethod(model)
        # fgsm = MomentumIterativeMethod(model)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        def evaluate():
            do_eval(preds,
                    x_test,
                    y_test,
                    'clean_train_clean_eval',
                    False,
                    pred2=preds,
                    c_w=c_w,
                    c_b=c_b,
                    pr_model_x=pr_model_x,
                    random_color_set=random_color_set,
                    pr_model_W=pr_model_W,
                    pr_model_b=pr_model_b)
            #do_eval(preds, x_test, y_test, 'clean_train_adv_eval', True,
            #pred2=preds, c_w=c_w, c_b=c_b, ae=adv_x,
            #pr_model_x=pr_model_x, random_color_set=random_color_set,
            #pr_model_W=pr_model_W, pr_model_b=pr_model_b, pr_model_output=pr_model_output
            #)

        print("Trying to load trained model from: " + model_path)
        if os.path.exists(model_path + ".meta"):
            tf_model_load(sess, model_path)
            print("Load trained model")
        else:
            train(sess,
                  loss,
                  x,
                  y,
                  x_train,
                  y_train,
                  evaluate=evaluate,
                  args=train_params,
                  rng=rng,
                  var_list=model.get_params(),
                  save=True,
                  c_w=c_w,
                  c_b=c_b,
                  pr_model_x=pr_model_x,
                  random_color_set=random_color_set,
                  pr_model_W=pr_model_W,
                  pr_model_b=pr_model_b)

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds,
                x_test,
                y_test,
                'clean_train_adv_eval',
                True,
                pred2=preds,
                c_w=c_w,
                c_b=c_b,
                ae=adv_x,
                pr_model_x=pr_model_x,
                random_color_set=random_color_set,
                pr_model_W=pr_model_W,
                pr_model_b=pr_model_b)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')
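
# Illustrative sketch (not part of the examples above or below): the colour
# labelling that both color-training loops build inline, written as a helper.
# Each random intensity in [0, 1] is matched against the (possibly overlapping)
# colorCategory ranges and one matching category becomes its one-hot target.
# The helper name and signature are ours, purely for illustration.
import numpy as np


def make_color_targets(intensities, color_category, rng=None):
    """Return one one-hot target per intensity, chosen at random among matches."""
    rng = rng or np.random
    targets = []
    for p in intensities:
        matches = [j for j, (lo, hi) in enumerate(color_category) if lo <= p <= hi]
        j = matches[rng.randint(0, len(matches))]
        targets.append([1 if k == j else 0 for k in range(len(color_category))])
    return np.array(targets)


# Example usage with the same Black / Grey / White ranges used above:
# make_color_targets(np.random.random(100), [[0.0, 0.4], [0.3, 0.7], [0.6, 1.0]])
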
def mnist_tutorial(train_start=0,
                   train_end=1000,
                   test_start=0,
                   test_end=1666,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64,
                   num_threads=None):

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
            # added by hhkim
            # print('cur:', y_set)
            # feed_dict = {x: x_set}
            # probabilities = sess.run(preds, feed_dict)
            # print(probabilities)
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', 10, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=0.1)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)
        print('adv_x shape:', adv_x.shape)

        # Get the model's logits for the first adversarial test example
        # updated by hak hyun kim
        feed_dict = {x: x_test[:1]}
        adv_logits = sess.run(preds_adv, feed_dict)
        print(adv_logits)
        print('original answer :', y_test[:1])

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test[:1], y_test[:1], 'clean_train_adv_eval',
                True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')
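
# Rough sketch (ours, not from the tutorial): the untargeted step that
# FastGradientMethod.generate performs with the fgsm_params used above, written
# with NumPy for a gradient array that is assumed to be precomputed.
import numpy as np


def fgsm_step(x, grad, eps=0.3, clip_min=0.0, clip_max=1.0):
    """One Fast Gradient Sign Method step under an L-infinity budget eps."""
    adv_x = x + eps * np.sign(grad)              # move along the gradient sign
    return np.clip(adv_x, clip_min, clip_max)    # keep pixels in the valid range
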
def train(model,
          X_train=None,
          Y_train=None,
          save=False,
          predictions_adv=None,
          evaluate=None,
          args=None,
          rng=None,
          var_list=None,
          attack=None,
          attack_args=None):
    """
  Train a TF Eager model
  :param model: instance of cleverhans model, takes in input batch,
                  gives out probs(softmax layer).
  :param X_train: numpy array with training inputs
  :param Y_train: numpy array with training outputs
  :param save: boolean controlling the save operation
  :param predictions_adv: if set with the adversarial example tensor,
                          will run adversarial training
  :param evaluate: function that is run after each training iteration
                   (typically to display the test/validation accuracy).
  :param args: dict or argparse `Namespace` object.
               Should contain `nb_epochs`, `learning_rate`,
               `batch_size`
               If save is True, should also contain 'train_dir'
               and 'filename'
  :param rng: Instance of numpy.random.RandomState
  :param var_list: List of variables to train.
  :param attack: Instance of the class cleverhans.attacks.attacks_eager
  :param attack_args: Parameters required for the attack.
  :return: True if model trained
  """
    args = _ArgsWrapper(args or {})
    if ((attack is None) != (attack_args is None)):
        raise ValueError("attack and attack_args must be " "passed together.")
    if X_train is None or Y_train is None:
        raise ValueError("X_train argument and Y_train argument "
                         "must be supplied.")
    # Check that necessary arguments were given (see doc above)
    assert args.nb_epochs, "Number of epochs was not given in args dict"
    assert args.learning_rate, "Learning rate was not given in args dict"
    assert args.batch_size, "Batch size was not given in args dict"

    if save:
        assert args.train_dir, "Directory for save was not given in args dict"
        assert args.filename, "Filename for save was not given in args dict"

    if rng is None:
        rng = np.random.RandomState()

    # Optimizer
    tfe = tf.contrib.eager
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
    batch_x = tfe.Variable(X_train[0:args.batch_size], dtype=tf.float32)
    batch_y = tfe.Variable(Y_train[0:args.batch_size], dtype=tf.float32)

    # One epoch of training.
    for epoch in xrange(args.nb_epochs):
        # Compute number of batches
        nb_batches = int(math.ceil(float(len(X_train)) / args.batch_size))
        assert nb_batches * args.batch_size >= len(X_train)

        # Indices to shuffle training set
        index_shuf = list(range(len(X_train)))
        rng.shuffle(index_shuf)

        prev = time.time()
        for batch in range(nb_batches):

            # Compute batch start and end indices
            start, end = batch_indices(batch, len(X_train), args.batch_size)

            # Perform one training step
            tf.assign(batch_x, X_train[index_shuf[start:end]])
            tf.assign(batch_y, Y_train[index_shuf[start:end]])
            # Compute grads
            with tf.GradientTape() as tape:
                # Define loss
                loss_clean_obj = LossCrossEntropy(model, smoothing=0.)
                loss_clean = loss_clean_obj.fprop(x=batch_x, y=batch_y)
                loss = loss_clean
                # Adversarial training
                if attack is not None:
                    batch_adv_x = attack.generate(batch_x, **attack_args)
                    loss_adv_obj = LossCrossEntropy(model, smoothing=0.)
                    loss_adv = loss_adv_obj.fprop(x=batch_adv_x, y=batch_y)
                    loss = (loss_clean + loss_adv) / 2.0
            # Apply grads
            model_variables = model.get_params()
            grads = tape.gradient(loss, model_variables)
            optimizer.apply_gradients(zip(grads, model_variables))

        assert end >= len(X_train)  # Check that all examples were used
        cur = time.time()
        _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) +
                     " seconds")
        if evaluate is not None:
            evaluate()

    if save:
        save_path = os.path.join(args.train_dir, args.filename)
        # tf.train.Saver needs a session; in eager mode use the eager Saver
        saver = tfe.Saver(model_variables)
        saver.save(save_path)
        _logger.info("Completed model training and saved at: " +
                     str(save_path))
    else:
        _logger.info("Completed model training.")

    return True
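
# Self-contained sketch (ours) of the adversarial-training step inside train()
# above: the clean and adversarial cross-entropy losses are averaged before the
# gradient update. A tiny Keras model and synthetic data stand in for the
# cleverhans model and MNIST batches; the eps value of 0.3 is arbitrary.
import numpy as np
import tensorflow as tf

tf.enable_eager_execution()  # TF 1.x eager, as assumed by the code above

toy_model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
    tf.keras.layers.Dense(10)
])
toy_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

batch_x = tf.constant(np.random.rand(8, 28, 28, 1), dtype=tf.float32)
batch_y = tf.one_hot(np.random.randint(0, 10, size=8), 10)

# Craft an FGSM batch for the adversarial loss term.
with tf.GradientTape() as tape:
    tape.watch(batch_x)
    attack_loss = tf.losses.softmax_cross_entropy(batch_y, toy_model(batch_x))
adv_x = tf.clip_by_value(
    batch_x + 0.3 * tf.sign(tape.gradient(attack_loss, batch_x)), 0., 1.)

# Average the clean and adversarial losses, as in the GradientTape block above.
with tf.GradientTape() as tape:
    loss_clean = tf.losses.softmax_cross_entropy(batch_y, toy_model(batch_x))
    loss_adv = tf.losses.softmax_cross_entropy(batch_y, toy_model(adv_x))
    loss = (loss_clean + loss_adv) / 2.0
grads = tape.gradient(loss, toy_model.trainable_variables)
toy_optimizer.apply_gradients(zip(grads, toy_model.trainable_variables))
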
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(file,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    nb_filters = 64

    ################ color training initialization ####################

    color_training_epochs = 5000
    color_learning_rate = 0.1
    colorCategory = [
        [0.0, 0.4],  # Black
        [0.3, 0.7],  # Grey
        [0.6, 1.0]  # White
    ]
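    # Note: these ranges overlap on purpose (e.g. 0.35 falls in both Black and
    # Grey), and the training loop below picks one matching category at random.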

    numColorInput = 1

    color_x = tf.placeholder(
        tf.float32,
        [None, numColorInput])  # pixel-intensity input, one value per sample
    color_y = tf.placeholder(
        tf.float32,
        [None, numColorOutput])  # one-hot color category (black/grey/white)

    # Set model weights
    color_W = tf.Variable(tf.zeros([numColorInput, numColorOutput]))
    color_b = tf.Variable(tf.zeros([numColorOutput]))
    color_pred_out = tf.matmul(color_x, color_W) + color_b  # logits (softmax is applied in the loss)

    color_cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=color_pred_out,
                                                labels=color_y))
    # Gradient Descent
    color_optimizer = tf.train.GradientDescentOptimizer(
        color_learning_rate).minimize(color_cost)

    # Test model
    color_argmax = tf.argmax(color_pred_out, 1)
    color_correct_prediction = tf.equal(tf.argmax(color_pred_out, 1),
                                        tf.argmax(color_y, 1))
    # Calculate accuracy
    color_accuracy = tf.reduce_mean(
        tf.cast(color_correct_prediction, tf.float32))

    # Graph for re-generating the original image into a new image by using trained color model
    pr_model_x = tf.placeholder(
        tf.float32,
        [None, n_input, numColorInput])  # per-pixel intensities of each image
    pr_model_W = tf.placeholder(
        tf.float32,
        [None, numColorInput, numColorOutput])  # trained color-model weights
    pr_model_b = tf.placeholder(
        tf.float32,
        [None, numColorInput, numColorOutput])  # trained color-model biases
    pr_model_output = tf.one_hot(
        tf.argmax((tf.matmul(pr_model_x, pr_model_W) + pr_model_b), 2),
        numColorOutput)

    # Merge the random generated output for new image based on the colorCategory
    randomColorCategory = []
    for i in range(len(colorCategory)):
        tmp = []
        tmpRandomColorCategory = my_tf_round(
            tf.random_uniform(tf.shape(pr_model_x),
                              colorCategory[i][0],
                              colorCategory[i][1],
                              dtype=tf.float32), 2)
        tmp.append(tmpRandomColorCategory)
        randomColorCategory.append(tf.concat(tmp, 1))

    random_merge = tf.reshape(tf.concat(randomColorCategory, -1),
                              [-1, n_input, numColorOutput])
    random_color_set = tf.reduce_sum(
        tf.multiply(pr_model_output, random_merge), 2)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
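    # Note: x is rebound on the next line to the color-regenerated tensor, so
    # this placeholder is not fed directly; image data enters through pr_model_x.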
    x = tf.reshape(random_color_set, shape=(-1, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        #'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        #'filename': os.path.split(model_path)[-1],
        'train_dir': save_dir,
        'filename': filename,
        'numColorOutput': numColorOutput
    }
    with sess.as_default():
        if hasattr(tf, "global_variables_initializer"):
            tf.global_variables_initializer().run()
        else:
            warnings.warn("Update your copy of tensorflow; future versions of "
                          "CleverHans may drop support for this version.")
            sess.run(tf.initialize_all_variables())

        ################# color training ####################
        print("Trying to load pr model from: " + model_path2)
        if os.path.exists(model_path2 + ".meta"):
            tf_model_load(sess, model_path2)
            c_w, c_b = sess.run([color_W, color_b])
            print("Load color trained model in training")
        else:
            # Training the color
            for epoch in range(color_training_epochs):
                outputColorY = []
                p1 = np.random.random(100)
                for i in range(len(p1)):
                    outputOverlapColorY = []
                    for j in range(len(colorCategory)):
                        if colorCategory[j][0] <= p1[i] <= colorCategory[j][1]:
                            colorIndexSeq = []
                            for k in range(len(colorCategory)):
                                if j == k:
                                    colorIndexSeq.append(1)
                                else:
                                    colorIndexSeq.append(0)
                            outputOverlapColorY.append(colorIndexSeq)
                            # no break here: overlapping categories are all
                            # kept and one is chosen at random below

                    # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item
                    outputColorY.append(outputOverlapColorY[np.random.randint(
                        0, len(outputOverlapColorY))])

                inputColorX = p1.reshape(100, 1)
                _, c, c_w, c_b = sess.run(
                    [color_optimizer, color_cost, color_W, color_b],
                    feed_dict={
                        color_x: inputColorX,
                        color_y: outputColorY
                    })
                avg_cost = c

                # Evaluating color model
                outputColorY = []
                p1 = np.random.random(100)
                # Generate output for random color inputs (test case)
                for i in range(len(p1)):
                    outputOverlapColorY = []
                    for j in range(len(colorCategory)):
                        if colorCategory[j][0] <= p1[i] <= colorCategory[j][1]:
                            colorIndexSeq = []
                            for k in range(len(colorCategory)):
                                if j == k:
                                    colorIndexSeq.append(1)
                                else:
                                    colorIndexSeq.append(0)
                            outputOverlapColorY.append(colorIndexSeq)
                            break

                    # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item
                    outputColorY.append(outputOverlapColorY[np.random.randint(
                        0, len(outputOverlapColorY))])

                inputColorX = p1.reshape(100, 1)
                # print(random_xs)
                acc, argmax = sess.run([color_accuracy, color_argmax],
                                       feed_dict={
                                           color_x: inputColorX,
                                           color_y: outputColorY
                                       })
                print("Epoch:", '%04d' % (epoch + 1) + "/" + str(color_training_epochs) + ", Cost= " + \
                      "{:.9f}".format(avg_cost) + ", Training Accuracy= " + \
                      "{:.5f}".format(acc) + " ")
                # print(c_w)

            with tf.device('/CPU:0'):
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
                # Since training PR model is fast, we do not have to save multiple sessions for this
                save_path = os.path.join(save_dir2, filename2)
                saver.save(sess, save_path)
        ##################### end of color training ------------------------------

    ################# model training ####################
    rng = np.random.RandomState([2017, 8, 30])
    # saveFileNum = 50
    saveFileNum = 500
    # saveFileNum = 1000
    model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum))
    # check if we've trained before, and if we have, use that pre-trained model
    print("Trying to load trained model from: " + model_path)
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
        print("Load trained model")
    else:
        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              args=train_params,
              rng=rng,
              save=True,
              c_w=c_w,
              c_b=c_b,
              pr_model_x=pr_model_x,
              random_color_set=random_color_set,
              pr_model_W=pr_model_W,
              pr_model_b=pr_model_b)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size, 'numColorOutput': numColorOutput}
    #accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params,
    #                   pred2=preds, c_w=c_w, c_b=c_b, pr_model_x=pr_model_x, random_color_set=random_color_set,
    #                   pr_model_W=pr_model_W, pr_model_b=pr_model_b)
    #assert x_test.shape[0] == test_end - test_start, x_test.shape
    #print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    #report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
            adv_inputs = x_test
        else:
            adv_inputs = x_test[:source_samples]
            adv_inputs = x_test

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': source_samples * nb_classes if targeted else source_samples,
        'initial_const': 10
    }

    adv2 = cw.generate(x, **cw_params)
    cw_params[yname] = adv_ys
    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {
        'batch_size': np.minimum(nb_classes, source_samples),
        'numColorOutput': numColorOutput
    }
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = model_eval(sess,
                                      x,
                                      y,
                                      preds,
                                      adv,
                                      y_test[idxs],
                                      args=eval_params)
        else:
            #adv_accuracy = model_eval(sess, x, y, preds, adv, y_test[
            #               :source_samples], args=eval_params)
            adv_accuracy = model_eval(sess,
                                      x,
                                      y,
                                      preds,
                                      adv,
                                      y_test,
                                      args=eval_params,
                                      pred2=preds,
                                      c_w=c_w,
                                      c_b=c_b,
                                      pr_model_x=pr_model_x,
                                      random_color_set=random_color_set,
                                      pr_model_W=pr_model_W,
                                      pr_model_b=pr_model_b,
                                      is_adv=True,
                                      ae=adv2)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')
    print("load save file: ", saveFileNum)
    # Report accuracy on the adversarial examples (the success rate is 1 - accuracy)
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average L2 distortion introduced by the attack
    mean_l2_perturbation = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2_perturbation))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
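
# Standalone sketch (ours) of the two summary numbers reported above, using
# synthetic arrays in place of adv, adv_inputs and adv_accuracy.
import numpy as np


def cw_attack_summary(adv, adv_inputs, adv_accuracy):
    """Return (success rate, mean L2 norm of the perturbations)."""
    success_rate = 1.0 - adv_accuracy  # what the report stores
    l2_norms = np.sqrt(np.sum((adv - adv_inputs) ** 2, axis=(1, 2, 3)))
    return success_rate, l2_norms.mean()


# Example with MNIST-shaped random data:
# fake_inputs = np.random.rand(10, 28, 28, 1)
# fake_adv = np.clip(fake_inputs + 0.05 * np.random.randn(10, 28, 28, 1), 0., 1.)
# print(cw_attack_summary(fake_adv, fake_inputs, adv_accuracy=0.1))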