예제 #1
0
  def plot_results( adv_inputs, adv, recon_orig, recon_adv):
    nb_classes = 10
    img_rows = img_cols = 28
    nchannels = 1

    grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                    nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')
    grid_viz_data_1 = np.zeros(grid_shape, dtype='f')
    curr_class = 0
    for j in range(nb_classes):
        for i in range(nb_classes):
          #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i]
          if(i==j):
            grid_viz_data[i,j] = recon_orig[curr_class*9]
            grid_viz_data_1[i,j] = adv_inputs[curr_class*9]
            curr_class = curr_class+1
          else:
            if(j>i):
              grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j-1]
              grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j-1]
            else:
              grid_viz_data[i,j] = recon_adv[i*(nb_classes-1) + j]
              grid_viz_data_1[i,j] = adv[i*(nb_classes-1)+j]

    _ = grid_visual(grid_viz_data)
    _ = grid_visual(grid_viz_data_1)
예제 #2
0
        def do_jsma():
            print('Crafting ' + str(source_samples) + ' * ' +
                  str(nb_classes - 1) + ' adversarial examples')

            # Keep track of success (adversarial example classified in target)
            results = np.zeros((nb_classes, source_samples), dtype='i')

            # Rate of perturbed features for each test set example and target class
            perturbations = np.zeros((nb_classes, source_samples), dtype='f')

            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            # Instantiate a SaliencyMapMethod attack object
            jsma = SaliencyMapMethod(model, back='tf', sess=sess)
            jsma_params = {
                'theta': 1.,
                'gamma': 0.1,
                'clip_min': 0.,
                'clip_max': 1.,
                'y_target': None
            }

            figure = None
            # Loop over the samples we want to perturb into adversarial examples
            for sample_ind in xrange(0, source_samples):
                print('--------------------------------------')
                print('Attacking input %i/%i' %
                      (sample_ind + 1, source_samples))
                sample = X_test[sample_ind:(sample_ind + 1)]

                # We want to find an adversarial example for each possible target class
                # (i.e. all classes that differ from the label given in the dataset)
                current_class = int(np.argmax(Y_test[sample_ind]))
                target_classes = other_classes(nb_classes, current_class)

                # For the grid visualization, keep original images along the diagonal
                grid_viz_data[current_class,
                              current_class, :, :, :] = np.reshape(
                                  sample, (img_rows, img_cols, channels))

                # Loop over all target classes
                for target in target_classes:
                    print('Generating adv. example for target class %i' %
                          target)

                    # This call runs the Jacobian-based saliency map approach
                    one_hot_target = np.zeros((1, nb_classes),
                                              dtype=np.float32)
                    one_hot_target[0, target] = 1
                    jsma_params['y_target'] = one_hot_target
                    adv_x = jsma.generate_np(sample, **jsma_params)

                    # Check if success was achieved
                    res = int(model_argmax(sess, x, preds, adv_x) == target)

                    # Computer number of modified features
                    adv_x_reshape = adv_x.reshape(-1)
                    test_in_reshape = X_test[sample_ind].reshape(-1)
                    nb_changed = np.where(
                        adv_x_reshape != test_in_reshape)[0].shape[0]
                    percent_perturb = float(nb_changed) / adv_x.reshape(
                        -1).shape[0]

                    # Display the original and adversarial images side-by-side
                    if FLAGS.viz_enabled:
                        figure = pair_visual(
                            np.reshape(sample, (img_rows, img_cols)),
                            np.reshape(adv_x, (img_rows, img_cols)), figure)

                    # Add our adversarial example to our grid data
                    grid_viz_data[target, current_class, :, :, :] = np.reshape(
                        adv_x, (img_rows, img_cols, channels))

                    # Update the arrays for later analysis
                    results[target, sample_ind] = res
                    perturbations[target, sample_ind] = percent_perturb

            print('--------------------------------------')

            # Compute the number of adversarial examples that were successfully found
            nb_targets_tried = ((nb_classes - 1) * source_samples)
            succ_rate = float(np.sum(results)) / nb_targets_tried
            print('Avg. rate of successful adv. examples {0:.4f}'.format(
                succ_rate))
            report.clean_train_adv_eval = 1. - succ_rate

            # Compute the average distortion introduced by the algorithm
            percent_perturbed = np.mean(perturbations)
            print('Avg. rate of perturbed features {0:.4f}'.format(
                percent_perturbed))

            # Compute the average distortion introduced for successful samples only
            percent_perturb_succ = np.mean(perturbations * (results == 1))
            print('Avg. rate of perturbed features for successful '
                  'adversarial examples {0:.4f}'.format(percent_perturb_succ))
            if FLAGS.viz_enabled:
                import matplotlib.pyplot as plt
                plt.close(figure)
                _ = grid_visual(grid_viz_data)

            return report
예제 #3
0
def generate_images():

    print('==> Preparing data..')
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print(
            "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
            "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    print "==> Beginning Session"

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Load model
    print "==> loading vgg model"
    args = load_args()

    if args.model == 'vgg6': model = vggbn(top=True, pool=args.pool)
    if args.model == 'vgg15': model = vgg15(top=True, pool=args.pool)
    if args.model == 'generic': model = generic(top=True, pool=args.pool)
    if args.model == 'resnet18': model = resnet.build_resnet_18(args.pool)

    predictions = model(x)

    model.load_weights(args.load)

    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args=eval_params)
    print '==> Accuracy : {}'.format(accuracy)

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              predictions,
                              X_test,
                              Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an CIFAR10 model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }

    im_base = '/im_'
    model_name = args.model + '_p' + str(args.pool)
    if args.attack == 'fgsm' or args.attack == 'FGSM':

        result_dir = os.getcwd() + '/images/fgsm/'
        print "==> creating fgsm adversarial wrapper"
        adv_x = fgsm(x, predictions, eps=0.3)

        print "==> sending to batch evaluator to finalize adversarial images"
        eval_params = {'batch_size': FLAGS.batch_size}
        X_train_adv, = batch_eval(sess, [x], [adv_x], [X_train],
                                  args=eval_params)

        i = 0
        if not os.path.exists(result_dir + model_name):
            os.makedirs(result_dir + model_name)
        print "==> saving images to {}".format(result_dir + model_name)
        for ad in X_train_adv:
            scipy.misc.imsave(
                result_dir + model_name + im_base + str(i) + '.png', ad)
            i += 1

        sess.close()
    """ JSMA """
    if args.attack == 'jsma' or args.attack == 'JSMA':

        result_dir = os.getcwd() + '/images/jsma/trial_single_adv'
        print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
              str(FLAGS.nb_classes - 1) + ' adversarial examples')

        results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

        # This array contains the fraction of perturbed features for each test set
        perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                                 dtype='f')

        # Define the TF graph for the model's Jacobian
        grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

        # Initialize our array for grid visualization
        grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                      FLAGS.img_cols, FLAGS.nb_channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')
        i_saved = 0
        n_image = 0
        # Loop over the samples we want to perturb into adversarial examples
        print "==> saving images to {}".format(result_dir + model_name)
        for sample_ind in xrange(7166, FLAGS.source_samples):
            # We want to find an adversarial example for each possible target class
            current_class = int(np.argmax(Y_train[sample_ind]))
            target_classes = other_classes(FLAGS.nb_classes, current_class)
            # For the grid visualization, keep original images along the diagonal
            grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
                X_train[sample_ind:(sample_ind + 1)],
                (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Loop over all target classes
            adversarials = []
            for idx, target in enumerate(target_classes):
                print "image {}".format(sample_ind)

                # here we hold all successful adversarials for this iteration
                # since we dont want 500k images, we will uniformly sample an image to save after each target

                print('--------------------------------------')
                print('Creating adv. example for target class ' + str(target))

                # This call runs the Jacobian-based saliency map approach
                adv_x, res, percent_perturb = jsma(
                    sess,
                    x,
                    predictions,
                    grads,
                    X_train[sample_ind:(sample_ind + 1)],
                    target,
                    theta=1,
                    gamma=0.1,
                    increase=True,
                    back='tf',
                    clip_min=0,
                    clip_max=1)
                # Display the original and adversarial images side-by-side
                adversarial = np.reshape(
                    adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))
                original = np.reshape(
                    X_train[sample_ind:(sample_ind + 1)],
                    (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                if FLAGS.viz_enabled:

                    if 'figure' not in vars():
                        figure = pair_visual(original, adversarial)
                    else:
                        figure = pair_visual(original, adversarial, figure)

                if not os.path.exists(result_dir + model_name):
                    os.makedirs(result_dir + model_name)

                if res == 1:
                    adversarials.append(adversarial)

                if idx == FLAGS.nb_classes - 2:

                    try:
                        if len(adversarials) == 1:
                            idx_uniform = 0
                        else:
                            idx_uniform = np.random.randint(
                                0,
                                len(adversarials) - 1)
                        print idx_uniform
                        scipy.misc.imsave(
                            result_dir + model_name + im_base +
                            str(sample_ind) + '.png',
                            adversarials[idx_uniform])
                        i_saved += 1
                        print "==> images saved: {}".format(i_saved)

                    except:

                        print "No adversarials generated"

# Add our adversarial example to our grid data
                grid_viz_data[target, current_class, :, :, :] = np.reshape(
                    adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                # Update the arrays for later analysis
                results[target, sample_ind] = res
                perturbations[target, sample_ind] = percent_perturb

            n_image += 1

# Compute the number of adversarial examples that were successfuly found
        nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print(
            'Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(perturbations)
        print('Avg. rate of perturbed features {0:.2f}'.format(
            percent_perturbed))

        # Compute the average distortion introduced for successful samples only
        percent_perturb_succ = np.mean(perturbations * (results == 1))
        print(
            'Avg. rate of perturbed features for successful '
            'adversarial examples {0:.2f}'.format(percent_perturb_succ))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if FLAGS.viz_enabled:
            _ = grid_visual(grid_viz_data)
예제 #4
0
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[idxs]],
                dtype=np.float32)
        else:
            adv_inputs = np.array(
                [[instance] * nb_classes for
                 instance in x_test[:source_samples]], dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape((source_samples *
                                                     nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': source_samples * nb_classes if
                 targeted else source_samples,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs,
                         **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(
            sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - \
                model_eval(sess, x, y, preds, adv, y_test[
                           idxs], args=eval_params)
        else:
            adv_accuracy = 1 - \
                model_eval(sess, x, y, preds, adv, y_test[
                           :source_samples], args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
예제 #5
0
format = '%i,%i,%i'
np.savetxt("./DataAnalysis/attack_fgsm_eps10_ord2_target.csv",
           result_log,
           fmt=format,
           delimiter=",")

print('--------------------------------------')

# Compute the number of adversarial examples that were successfully found
nb_targets_tried = source_samples
succ_rate = float(np.sum(results)) / nb_targets_tried
print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
report.clean_train_adv_eval = 1. - succ_rate

# Compute the average distortion introduced by the algorithm
percent_perturbed = np.mean(perturbations)
print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

# Compute the average distortion introduced for successful samples only
percent_perturb_succ = np.mean(perturbations * (results == 1))
print('Avg. rate of perturbed features for successful '
      'adversarial examples {0:.4f}'.format(percent_perturb_succ))

# Close TF session
#sess.close()

# Finally, block & display a grid of all the adversarial examples
if viz_enabled:
    plt.close(figure)
    _ = grid_visual(grid_viz_data)
예제 #6
0
def mnist_tutorial_jsma(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {"batch_size": batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print("Crafting " + str(source_samples) + " * " + str(nb_classes - 1) +
          " adversarial examples")

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype="i")

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype="f")

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype="f")

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        "theta": 1.0,
        "gamma": 0.1,
        "clip_min": 0.0,
        "clip_max": 1.0,
        "y_target": None,
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print("--------------------------------------")
        print("Attacking input %i/%i" % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print("Generating adv. example for target class %i" % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params["y_target"] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure,
                )

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = (nb_classes - 1) * source_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print("Avg. rate of successful adv. examples {0:.4f}".format(succ_rate))
    report.clean_train_adv_eval = 1.0 - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations[np.where(perturbations != 0)])
    print("Avg. rate of perturbed features {0:.4f}".format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(
        perturbations[np.where(perturbations != 0)] *
        (results[np.where(perturbations != 0)] == 1))
    print("Avg. rate of perturbed features for successful "
          "adversarial examples {0:.4f}".format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt

        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
예제 #7
0
def zoo(viz_enabled=VIZ_ENABLED,
        nb_epochs=NB_EPOCHS,
        batch_size=BATCH_SIZE,
        source_samples=SOURCE_SAMPLES,
        learning_rate=LEARNING_RATE,
        attack_iterations=ATTACK_ITERATIONS,
        model_path=MODEL_PATH,
        targeted=TARGETED):
    """
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    if DATASET == 'MNIST':
        train_start = 0
        train_end = 60000
        test_start = 0
        test_end = 10000
        ds = dataset.MNIST(train_start=train_start,
                           train_end=train_end,
                           test_start=test_start,
                           test_end=test_end,
                           center=False)
    elif DATASET == 'SVHN':
        train_start = 0
        train_end = 73257
        test_start = 0
        test_end = 26032
        ds = dataset.SVHN(train_start=train_start,
                          train_end=train_end,
                          test_start=test_start,
                          test_end=test_end)
    elif DATASET == 'CIFAR10':
        train_start = 0
        train_end = 60000
        test_start = 0
        test_end = 10000
        ds = dataset.CIFAR10(train_start=train_start,
                             train_end=train_end,
                             test_start=test_start,
                             test_end=test_end,
                             center=False)

    x_train, y_train, x_test, y_test = ds.get_set('train') + ds.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN(DATASET, nb_classes, nb_filters,
                          (None, img_rows, img_cols, nchannels))
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2018, 10, 22])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a Zoo attack object
    zoo = Zoo(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    zoo_params = {
        'binary_search_steps': BINARY_SEARCH_STEPS,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': ZOO_LEARNING_RATE,
        'batch_size':
        source_samples * nb_classes if targeted else source_samples,
        'initial_const': INIT_CONST,
        'solver': SOLVER,
        'image_shape': [img_rows, img_cols, nchannels],
        'nb_classes': nb_classes
    }

    adv = zoo.generate_np(adv_inputs, **zoo_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - model_eval(
                sess, x, y, preds, adv, y_test[idxs], args=eval_params)
        else:
            adv_accuracy = 1 - model_eval(sess,
                                          x,
                                          y,
                                          preds,
                                          adv,
                                          y_test[:source_samples],
                                          args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
예제 #8
0
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS,
                      batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      model_path_cls=MODEL_PATH,
                      targeted=TARGETED):

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)
    rng = np.random.RandomState()
    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)
    nb_latent_size = 100
    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    x_t = tf.placeholder(tf.float32,
                         shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    y_t = tf.placeholder(tf.float32, shape=(None, nb_classes))
    z = tf.placeholder(tf.float32, shape=(None, nb_latent_size))
    z_t = tf.placeholder(tf.float32, shape=(None, nb_latent_size))

    #nb_filters = 64
    nb_layers = 500

    # Define TF model graph
    model = ModelBasicAE('model', nb_layers, nb_latent_size)
    cl_model = ModelCls('cl_model')
    #preds = model.get_logits(x)
    recons = model.get_layer(x, 'RECON')
    loss = SquaredError(model)
    print("Defined TensorFlow model graph.")

    loss_cls = CrossEntropy(cl_model)
    y_logits = cl_model.get_layer(z, 'LOGITS')
    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    train_params_cls = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path_cls)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    #if os.path.exists(model_path + ".meta"):
    # tf_model_load(sess, model_path)
    #else:
    eval_params_cls = {'batch_size': batch_size}

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerAE(model, cl_model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')
            grid_viz_data_1 = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * (nb_classes - 1)
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)

            #adv_input_y = np.array([[instance]*(nb_classes-1) for instance in y_test[idxs]])

            adv_input_y = []
            for curr_num in range(nb_classes):
                targ = []
                for id in range(nb_classes - 1):
                    targ.append(y_test[idxs[curr_num]])
                adv_input_y.append(targ)

            adv_input_y = np.array(adv_input_y)

            adv_target_y = []
            for curr_num in range(nb_classes):
                targ = []
                for id in range(nb_classes):
                    if (id != curr_num):
                        targ.append(y_test[idxs[id]])
                adv_target_y.append(targ)

            adv_target_y = np.array(adv_target_y)

            #print("adv_input_y: \n", adv_input_y)
            #print("adv_target_y: \n", adv_target_y)

            adv_input_targets = []
            for curr_num in range(nb_classes):
                targ = []
                for id in range(nb_classes):
                    if (id != curr_num):
                        targ.append(x_test[idxs[id]])
                adv_input_targets.append(targ)
            adv_input_targets = np.array(adv_input_targets)

            adv_inputs = adv_inputs.reshape((source_samples * (nb_classes - 1),
                                             img_rows, img_cols, nchannels))
            adv_input_targets = adv_input_targets.reshape(
                (source_samples * (nb_classes - 1), img_rows, img_cols,
                 nchannels))

            adv_input_y = adv_input_y.reshape(
                source_samples * (nb_classes - 1), 10)
            adv_target_y = adv_target_y.reshape(
                source_samples * (nb_classes - 1), 10)

            #print("adv_input_y: \n", adv_input_y)
            #print("adv_target_y: \n", adv_target_y)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

    train_ae(sess,
             loss,
             x_train,
             x_train,
             args=train_params,
             rng=rng,
             var_list=model.get_params())
    saver = tf.train.Saver()
    saver.save(sess, model_path)

    x_train_lat = model.get_layer(x_train, 'LATENT')
    x_test_lat = model.get_layer(x_test, 'LATENT')
    x_train_lat = sess.run(x_train_lat)
    x_test_lat = sess.run(x_test_lat)

    def do_eval_cls(preds, x_set, y_set, x_tar_set, report_key, is_adv=None):
        acc = model_eval(sess,
                         z,
                         y,
                         preds,
                         z_t,
                         x_set,
                         y_set,
                         x_tar_set,
                         args=eval_params_cls)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    def eval_cls():
        do_eval_cls(y_logits, x_test_lat, y_test, x_test_lat,
                    'clean_train_clean_eval', False)

    #train_cls(sess, loss_cls, x_train, y_train, evaluate = eval_cls, args = train_params_cls, rng = rng, var_list = cl_model.get_params())
    train_cls_lat(sess,
                  loss_cls,
                  x_train_lat,
                  y_train,
                  evaluate=eval_cls,
                  args=train_params_cls,
                  rng=rng,
                  var_list=cl_model.get_params())
    saver.save(sess, model_path_cls)

    #adv_input_y = cl_model.get_layer(adv_inputs, 'LOGITS')
    #adv_target_y = cl_model.get_layer(adv_input_targets, 'LOGITS')

    adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape(
        (source_samples * nb_classes, nb_classes))
    yname = "y_target"

    cw_params_batch_size = source_samples * (nb_classes - 1)

    cw_params = {
        'binary_search_steps': 10,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': CW_LEARNING_RATE,
        'batch_size': cw_params_batch_size,
        'initial_const': 1
    }

    adv = cw.generate_np(adv_inputs, adv_input_targets, **cw_params)

    #print("shaep of adv: ", np.shape(adv))
    recon_orig = model.get_layer(adv_inputs, 'RECON')
    lat_adv = model.get_layer(adv, 'LATENT')
    recon_adv = model.get_layer(adv, 'RECON')
    lat_orig = model.get_layer(x, 'LATENT')
    lat_orig_recon = model.get_layer(recons, 'LATENT')
    #pred_adv_recon = cl_model.get_layer(recon_adv, 'LOGITS')
    pred_adv_recon = cl_model.get_layer(lat_adv, 'LOGITS')

    #eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    eval_params = {'batch_size': 90}
    if targeted:
        noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(
            sess,
            x,
            x_t,
            recons,
            adv_inputs,
            adv_input_targets,
            adv,
            recon_adv,
            lat_orig,
            lat_orig_recon,
            args=eval_params)
        acc = model_eval(sess,
                         x,
                         y,
                         pred_adv_recon,
                         x_t,
                         adv_inputs,
                         adv_target_y,
                         adv_input_targets,
                         args=eval_params_cls)
        print("noise: ", noise)
        print("classifier acc: ", acc)

    recon_adv = sess.run(recon_adv)
    recon_orig = sess.run(recon_orig)
    #print("recon_adv[0]\n", recon_adv[0,:,:,0])
    curr_class = 0
    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i]
                    if (i == j):
                        grid_viz_data[i, j] = recon_orig[curr_class * 9]
                        grid_viz_data_1[i, j] = adv_inputs[curr_class * 9]
                        curr_class = curr_class + 1
                    else:
                        if (j > i):
                            grid_viz_data[i,
                                          j] = recon_adv[i * (nb_classes - 1) +
                                                         j - 1]
                            grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) +
                                                        j - 1]
                        else:
                            grid_viz_data[i,
                                          j] = recon_adv[i * (nb_classes - 1) +
                                                         j]
                            grid_viz_data_1[i,
                                            j] = adv[i * (nb_classes - 1) + j]

        #rint(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    #sess.close()

    # Finally, block & display a grid of all the adversarial examples

    if viz_enabled:
        _ = grid_visual(grid_viz_data)
        _ = grid_visual(grid_viz_data_1)

    #return report

    #adversarial training
    if (adv_train == True):

        print("starting adversarial training")
        #sess1 = tf.Session()
        adv_input_set = []
        adv_input_target_set = []

        for i in range(20):

            indices = np.arange(np.shape(x_train)[0])
            np.random.shuffle(indices)
            print("indices: ", indices[1:10])
            x_train = x_train[indices]
            y_train = y_train[indices]

            idxs = [
                np.where(np.argmax(y_train, axis=1) == i)[0][0]
                for i in range(nb_classes)
            ]
            adv_inputs_2 = np.array([[instance] * (nb_classes - 1)
                                     for instance in x_train[idxs]],
                                    dtype=np.float32)
            adv_input_targets_2 = []
            for curr_num in range(nb_classes):
                targ = []
                for id in range(nb_classes):
                    if (id != curr_num):
                        targ.append(x_train[idxs[id]])
                adv_input_targets_2.append(targ)
            adv_input_targets_2 = np.array(adv_input_targets_2)

            adv_inputs_2 = adv_inputs_2.reshape(
                (source_samples * (nb_classes - 1), img_rows, img_cols,
                 nchannels))
            adv_input_targets_2 = adv_input_targets_2.reshape(
                (source_samples * (nb_classes - 1), img_rows, img_cols,
                 nchannels))

            adv_input_set.append(adv_inputs_2)
            adv_input_target_set.append(adv_input_targets_2)

        adv_input_set = np.array(adv_input_set),
        adv_input_target_set = np.array(adv_input_target_set)
        print("shape of adv_input_set: ", np.shape(adv_input_set))
        print("shape of adv_input_target_set: ",
              np.shape(adv_input_target_set))
        adv_input_set = np.reshape(
            adv_input_set,
            (np.shape(adv_input_set)[0] * np.shape(adv_input_set)[1] *
             np.shape(adv_input_set)[2], np.shape(adv_input_set)[3],
             np.shape(adv_input_set)[4], np.shape(adv_input_set)[5]))
        adv_input_target_set = np.reshape(adv_input_target_set,
                                          (np.shape(adv_input_target_set)[0] *
                                           np.shape(adv_input_target_set)[1],
                                           np.shape(adv_input_target_set)[2],
                                           np.shape(adv_input_target_set)[3],
                                           np.shape(adv_input_target_set)[4]))

        print("generated adversarial training set")

        adv_set = cw.generate_np(adv_input_set, adv_input_target_set,
                                 **cw_params)

        x_train_aim = np.append(x_train, adv_input_set, axis=0)
        x_train_app = np.append(x_train, adv_set, axis=0)

        model_adv_trained = ModelBasicAE('model_adv_trained', nb_layers,
                                         nb_latent_size)
        recons_2 = model_adv_trained.get_layer(x, 'RECON')
        loss_2 = SquaredError(model_adv_trained)
        train_ae(sess,
                 loss_2,
                 x_train_app,
                 x_train_aim,
                 args=train_params,
                 rng=rng,
                 var_list=model_adv_trained.get_params())
        saver = tf.train.Saver()
        saver.save(sess, model_path)

        cw2 = CarliniWagnerAE(model_adv_trained, cl_model, sess=sess)

        adv_2 = cw2.generate_np(adv_inputs, adv_input_targets, **cw_params)

        #print("shaep of adv: ", np.shape(adv))
        recon_orig = model_adv_trained.get_layer(adv_inputs, 'RECON')
        recon_adv = model_adv_trained.get_layer(adv_2, 'RECON')
        lat_orig = model_adv_trained.get_layer(x, 'LATENT')
        lat_orig_recon = model_adv_trained.get_layer(recons, 'LATENT')
        pred_adv_recon = cl_model.get_layer(recon_adv, 'LOGITS')

        #eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
        eval_params = {'batch_size': 90}
        if targeted:
            noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(
                sess,
                x,
                x_t,
                recons,
                adv_inputs,
                adv_input_targets,
                adv_2,
                recon_adv,
                lat_orig,
                lat_orig_recon,
                args=eval_params)
            acc = model_eval(sess,
                             x,
                             y,
                             pred_adv_recon,
                             x_t,
                             adv_inputs,
                             adv_target_y,
                             adv_input_targets,
                             args=eval_params_cls)
            print("noise: ", noise)
            #print("d1: ", d1)
            #print("d2: ", d2)
            #print("d1-d2: ", dist_diff)
            #print("Avg_dist_lat: ", avg_dist_lat)
            print("classifier acc: ", acc)

        recon_adv = sess.run(recon_adv)
        recon_orig = sess.run(recon_orig)
        #print("recon_adv[0]\n", recon_adv[0,:,:,0])
        curr_class = 0
        if viz_enabled:
            for j in range(nb_classes):
                for i in range(nb_classes):
                    #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i]
                    if (i == j):
                        grid_viz_data[i, j] = recon_orig[curr_class * 9]
                        grid_viz_data_1[i, j] = adv_inputs[curr_class * 9]
                        curr_class = curr_class + 1
                    else:
                        if (j > i):
                            grid_viz_data[i,
                                          j] = recon_adv[i * (nb_classes - 1) +
                                                         j - 1]
                            grid_viz_data_1[i,
                                            j] = adv_2[i * (nb_classes - 1) +
                                                       j - 1]
                        else:
                            grid_viz_data[i,
                                          j] = recon_adv[i * (nb_classes - 1) +
                                                         j]
                            grid_viz_data_1[i, j] = adv_2[i *
                                                          (nb_classes - 1) + j]

            #rint(grid_viz_data.shape)

        print('--------------------------------------')

        # Compute the number of adversarial examples that were successfully found

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(
            np.sum((adv_2 - adv_inputs)**2, axis=(1, 2, 3))**.5)
        print(
            'Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if viz_enabled:
            _ = grid_visual(grid_viz_data)
            _ = grid_visual(grid_viz_data_1)

        return report


#binarization defense
    if (binarization_defense == True or mean_filtering == True):

        #adv = sess.run(adv)
        # print(adv[0])
        if (binarization_defense == True):
            adv[adv > 0.5] = 1.0
            adv[adv <= 0.5] = 0.0
        else:
            #radius = 2
            #adv_list = [mean(adv[i,:,:,0], disk(radius)) for i in range(0, np.shape(adv)[0])]
            #adv = np.array(adv_list)
            #adv = np.expand_dims(adv, axis = 3)
            adv = uniform_filter(adv, 2)
            #adv = median_filter(adv, 2)
        #print("after bin ")
        #print(adv[0])

        recon_orig = model.get_layer(adv_inputs, 'RECON')
        recon_adv = model.get_layer(adv, 'RECON')
        lat_adv = model.get_layer(adv, 'LATENT')
        lat_orig = model.get_layer(x, 'LATENT')
        lat_orig_recon = model.get_layer(recons, 'LATENT')
        pred_adv_recon = cl_model.get_layer(lat_adv, 'LOGITS')

        #eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
        eval_params = {'batch_size': 90}
        if targeted:
            noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(
                sess,
                x,
                x_t,
                recons,
                adv_inputs,
                adv_input_targets,
                adv,
                recon_adv,
                lat_orig,
                lat_orig_recon,
                args=eval_params)
            acc1 = model_eval(sess,
                              x,
                              y,
                              pred_adv_recon,
                              x_t,
                              adv_inputs,
                              adv_target_y,
                              adv_input_targets,
                              args=eval_params_cls)
            acc2 = model_eval(sess,
                              x,
                              y,
                              pred_adv_recon,
                              x_t,
                              adv_inputs,
                              adv_input_y,
                              adv_input_targets,
                              args=eval_params_cls)
            print("noise: ", noise)
            print("classifier acc for target class: ", acc1)
            print("classifier acc for true class: ", acc2)

        recon_adv = sess.run(recon_adv)
        recon_orig = sess.run(recon_orig)
        #print("recon_adv[0]\n", recon_adv[0,:,:,0])
        curr_class = 0
        if viz_enabled:
            for j in range(nb_classes):
                for i in range(nb_classes):
                    #grid_viz_data[i, j] = adv[j * (nb_classes-1) + i]
                    if (i == j):
                        grid_viz_data[i, j] = recon_orig[curr_class * 9]
                        grid_viz_data_1[i, j] = adv_inputs[curr_class * 9]
                        curr_class = curr_class + 1
                    else:
                        if (j > i):
                            grid_viz_data[i,
                                          j] = recon_adv[i * (nb_classes - 1) +
                                                         j - 1]
                            grid_viz_data_1[i, j] = adv[i * (nb_classes - 1) +
                                                        j - 1]
                        else:
                            grid_viz_data[i,
                                          j] = recon_adv[i * (nb_classes - 1) +
                                                         j]
                            grid_viz_data_1[i,
                                            j] = adv[i * (nb_classes - 1) + j]
            sess.close()

            _ = grid_visual(grid_viz_data)
            _ = grid_visual(grid_viz_data_1)
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :return:
    """
    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename))
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, preds, X_train, Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes-1) + ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (FLAGS.nb_classes,
                  FLAGS.nb_classes,
                  FLAGS.img_rows,
                  FLAGS.img_cols,
                  FLAGS.nb_channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Define the SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, FLAGS.source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, FLAGS.source_samples))

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            X_test[sample_ind:(sample_ind+1)],
            (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params = {'theta': 1., 'gamma': 0.1,
                           'nb_classes': FLAGS.nb_classes, 'clip_min': 0.,
                           'clip_max': 1., 'targets': y,
                           'y_val': one_hot_target}
            adv_x = jsma.generate_np(X_test[sample_ind:(sample_ind+1)],
                                     **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Computer number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                if 'figure' not in vars():
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind+1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x,
                                   (FLAGS.img_rows, FLAGS.img_cols)))
                else:
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind+1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x, (FLAGS.img_rows,
                                   FLAGS.img_cols)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
예제 #10
0
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Label smoothing
    assert Y_train.shape[1] == 10.

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = FLAGS.model_path
    nb_samples = FLAGS.nb_samples

    from cnn_models import make_basic_cnn
    model = make_basic_cnn('fp_',
                           input_shape=(None, img_rows, img_cols, channels),
                           nb_filters=FLAGS.nb_filters)

    preds = model(x)
    print("Defined TensorFlow model graph with %d parameters" % model.n_params)

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate(eval_params):
        # Evaluate the model on legitimate test examples
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        return acc

    model_load(sess, model_path)
    print('Restored model from %s' % model_path)
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = evaluate(eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, nb_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, nb_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    from cleverhans.attacks import SaliencyMapMethod
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'gamma': FLAGS.gamma,
        'theta': 1.,
        'symbolic_impl': True,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, nb_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, nb_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Computer number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * nb_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)
예제 #11
0
def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS,
                        batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
    """
  MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param viz_enabled: (boolean) activate plots of adversarial examples
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param nb_classes: number of output classes
  :param source_samples: number of test inputs to attack
  :param learning_rate: learning rate for training
  :return: an AccuracyReport object
  """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    #replace
    num_threads = None
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))
    #with sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64

    # Define TF model graph
    model = make_basic_picklable_cnn()

    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.reshape(x_train, [60000, 28, 28]), y_train))
    dataset = dataset.batch(32)
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.reshape(x_test, [10000, 28, 28]), y_test))
    val_dataset = val_dataset.batch(32)

    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    if TRAIN_NEW == 1:
        with sess.as_default():
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            save("test.joblib", model)
    else:
        with sess.as_default():
            model = load("test.joblib")  #changed
        assert len(model.get_params()) > 0
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    seed(SEED)
    for sample_ind in xrange(0, source_samples):
        img = randint(0, 10000)
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[img:(img +
                             1)]  #sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(
            y_test[img]))  #current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))
        tn = 0
        totc = 0
        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]
            diff = np.array(adv_x - sample)
            #print(np.sum(diff))
            diff = np.reshape(diff, (28, 28))
            diff = diff * 255
            cv2.imwrite("test.png", diff)
            diff = cv2.imread("test.png")
            diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
            nieghbors = 0
            tc = 0
            for i in range(0, 28, 1):
                for j in range(0, 28, 1):
                    if diff[i, j] > 0:
                        tc = tc + 1
                        totc = totc + 1
                        if i > 0 and i < 27 and j > 0 and j < 27:  #main grid not edges or corners
                            if diff[i - 1, j - 1] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i - 1, j] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i - 1, j + 1] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i, j - 1] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i, j + 1] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i + 1, j - 1] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i + 1, j] > 0:
                                nieghbors = nieghbors + 1
                            if diff[i + 1, j + 1] > 0:
                                nieghbors = nieghbors + 1
                        else:
                            #corners
                            if i == 0 and j == 0:
                                if diff[i, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j] > 0:
                                    nieghbors = nieghbors + 1
                            if i == 27 and j == 0:
                                if diff[i, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j] > 0:
                                    nieghbors = nieghbors + 1
                            if i == 0 and j == 27:
                                if diff[i, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j] > 0:
                                    nieghbors = nieghbors + 1
                            if i == 27 and j == 27:
                                if diff[i, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j] > 0:
                                    nieghbors = nieghbors + 1
                            #edges
                            if i == 0 and j > 0 and j < 27:  #left side
                                if diff[i, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                            if i == 27 and j > 0 and j < 27:  #right side
                                if diff[i, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                            if j == 0 and i > 0 and i < 27:  #top side
                                if diff[i - 1, j] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j + 1] > 0:
                                    nieghbors = nieghbors + 1
                            if j == 27 and i > 0 and i < 27:  #bot side
                                if diff[i - 1, j] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i - 1, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i, j - 1] > 0:
                                    nieghbors = nieghbors + 1
                                if diff[i + 1, j - 1] > 0:
                                    nieghbors = nieghbors + 1

            # print(tc)
            # print(nieghbors)
            tn = tn + nieghbors
            # if tc > 0:
            # print(nieghbors/tc)
            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)
            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb
            #print(perturbations[target, sample_ind])

    print('--------------------------------------')

    print("average neighbors per modified pixel ", tn / totc)
    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.8f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)

    s = perturbations.shape
    myPert = np.empty(0)
    myResults = np.empty(0)
    for i in range(s[0]):
        for j in range(s[1]):
            if perturbations[i][j] > 0:
                myPert = np.append(myPert, perturbations[i][j])
                myResults = np.append(myResults, results[i][j])
    min_perturbed = np.min(myPert)
    max_perturbed = np.max(myPert)

    s2 = myResults.shape
    final = np.empty(0)
    for i in range(s2[0]):
        if myResults[i] > 0:
            final = np.append(final, myPert[i])

    print('Avg. rate of perturbed features {0:.8f}'.format(percent_perturbed))
    print('MIN of perturbed features {0:.8f}'.format(min_perturbed))
    print('MAX of perturbed features {0:.8f}'.format(max_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    min_perturb_succ = np.min(final)
    max_perturb_succ = np.max(final)
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(percent_perturb_succ))
    print('Min of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(min_perturb_succ))
    print('Max of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(max_perturb_succ))

    #Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_attack(train_start=0,
                 train_end=60000,
                 test_start=0,
                 test_end=10000,
                 viz_enabled=True,
                 nb_epochs=6,
                 batch_size=128,
                 nb_filters=64,
                 nb_samples=10,
                 learning_rate=0.001,
                 eps=0.3,
                 attack=0,
                 attack_iterations=100,
                 model_path=None,
                 targeted=False,
                 binary=False,
                 scale=False,
                 rand=False,
                 debug=None,
                 test=False,
                 data_dir=None,
                 delay=0,
                 adv=0,
                 nb_iter=40):
    """
    MNIST tutorial for generic attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param nb_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1237)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    if debug:
        set_log_level(logging.DEBUG)
    else:
        set_log_level(logging.WARNING)  # for running on sharcnet

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(datadir=data_dir,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    phase = tf.placeholder(tf.bool, name='phase')

    # for attempting to break unscaled network.
    logits_scalar = tf.placeholder_with_default(INIT_T,
                                                shape=(),
                                                name="logits_temperature")

    save = False
    train_from_scratch = False
    if model_path is not None:
        if os.path.exists(model_path):
            # check for existing model in immediate subfolder
            if any(f.endswith('.meta') for f in os.listdir(model_path)):
                binary, scale, nb_filters, batch_size, learning_rate, nb_epochs, adv = parse_model_settings(
                    model_path)
                train_from_scratch = False
            else:
                model_path = build_model_save_path(model_path, binary,
                                                   batch_size, nb_filters,
                                                   learning_rate, nb_epochs,
                                                   adv, delay, scale)
                print(model_path)
                save = True
                train_from_scratch = True
    else:
        train_from_scratch = True  # train from scratch, but don't save since no path given

    # Define TF model graph
    if binary:
        print('binary=True')
        if scale:
            print('scale=True')
            if rand:
                print('rand=True')
                from cleverhans_tutorials.tutorial_models import make_scaled_binary_rand_cnn
                model = make_scaled_binary_rand_cnn(
                    phase,
                    logits_scalar,
                    'binsc_',
                    input_shape=(None, img_rows, img_cols, channels),
                    nb_filters=nb_filters)
            else:
                from cleverhans_tutorials.tutorial_models import make_scaled_binary_cnn
                model = make_scaled_binary_cnn(phase,
                                               logits_scalar,
                                               'binsc_',
                                               input_shape=(None, img_rows,
                                                            img_cols,
                                                            channels),
                                               nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_binary_cnn
            model = make_basic_binary_cnn(phase,
                                          logits_scalar,
                                          'bin_',
                                          nb_filters=nb_filters)
    else:
        if rand:
            print('rand=True')
            from cleverhans_tutorials.tutorial_models import make_scaled_rand_cnn
            model = make_scaled_rand_cnn(phase,
                                         logits_scalar,
                                         'fp_rand',
                                         nb_filters=nb_filters)
        else:
            from cleverhans_tutorials.tutorial_models import make_basic_cnn
            model = make_basic_cnn(phase,
                                   logits_scalar,
                                   'fp_',
                                   nb_filters=nb_filters)

    preds = model(x, reuse=False)  # * logits_scalar
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################
    rng = np.random.RandomState([2017, 8, 30])

    # Train an MNIST model
    train_params = {
        'binary': binary,
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'loss_name': 'train loss',
        'filename': 'model',
        'reuse_global_step': False,
        'train_scope': 'train',
        'is_training': True
    }

    if adv != 0:
        if adv == ADVERSARIAL_TRAINING_MADRYETAL:
            from cleverhans.attacks import MadryEtAl
            train_attack_params = {
                'eps': MAX_EPS,
                'eps_iter': 0.01,
                'nb_iter': nb_iter
            }
            train_attacker = MadryEtAl(model, sess=sess)

        elif adv == ADVERSARIAL_TRAINING_FGSM:
            from cleverhans.attacks import FastGradientMethod
            stddev = int(np.ceil((MAX_EPS * 255) // 2))
            train_attack_params = {
                'eps':
                tf.abs(
                    tf.truncated_normal(shape=(batch_size, 1, 1, 1),
                                        mean=0,
                                        stddev=stddev))
            }
            train_attacker = FastGradientMethod(model, back='tf', sess=sess)
        # create the adversarial trainer
        train_attack_params.update({'clip_min': 0., 'clip_max': 1.})
        adv_x_train = train_attacker.generate(x, phase, **train_attack_params)
        preds_adv_train = model.get_probs(adv_x_train)

        eval_attack_params = {'eps': MAX_EPS, 'clip_min': 0., 'clip_max': 1.}
        adv_x_eval = train_attacker.generate(x, phase, **eval_attack_params)
        preds_adv_eval = model.get_probs(adv_x_eval)  # * logits_scalar

    def evaluate():
        # Evaluate the accuracy of the MNIST model on clean test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds,
                         X_test,
                         Y_test,
                         phase=phase,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

        if adv != 0:
            # Accuracy of the adversarially trained model on adversarial
            # examples
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv_eval,
                             X_test,
                             Y_test,
                             phase=phase,
                             args=eval_params)
            print('Test accuracy on adversarial examples: %0.4f' % acc)

            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv_eval,
                             X_test,
                             Y_test,
                             phase=phase,
                             args=eval_params,
                             feed={logits_scalar: ATTACK_T})
            print('Test accuracy on adversarial examples (scaled): %0.4f' %
                  acc)

    if train_from_scratch:
        if save:
            train_params.update({'log_dir': model_path})
            if adv and delay > 0:
                train_params.update({'nb_epochs': delay})

        # do clean training for 'nb_epochs' or 'delay' epochs
        if test:
            model_train(sess,
                        x,
                        y,
                        preds,
                        X_train,
                        Y_train,
                        phase=phase,
                        evaluate=evaluate,
                        args=train_params,
                        save=save,
                        rng=rng)
        else:
            model_train(sess,
                        x,
                        y,
                        preds,
                        X_train,
                        Y_train,
                        phase=phase,
                        args=train_params,
                        save=save,
                        rng=rng)

        # optionally do additional adversarial training
        if adv:
            print("Adversarial training for %d epochs" % (nb_epochs - delay))
            train_params.update({'nb_epochs': nb_epochs - delay})
            train_params.update({'reuse_global_step': True})
            if test:
                model_train(sess,
                            x,
                            y,
                            preds,
                            X_train,
                            Y_train,
                            phase=phase,
                            predictions_adv=preds_adv_train,
                            evaluate=evaluate,
                            args=train_params,
                            save=save,
                            rng=rng)
            else:
                model_train(sess,
                            x,
                            y,
                            preds,
                            X_train,
                            Y_train,
                            phase=phase,
                            predictions_adv=preds_adv_train,
                            args=train_params,
                            save=save,
                            rng=rng)
    else:
        tf_model_load(sess, model_path)
        print('Restored model from %s' % model_path)
        evaluate()

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          preds,
                          X_test,
                          Y_test,
                          phase=phase,
                          feed={phase: False},
                          args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Build dataset
    ###########################################################################
    if viz_enabled:
        assert nb_samples == nb_classes
        idxs = [
            np.where(np.argmax(Y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
        viz_rows = nb_classes if targeted else 2
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, viz_rows, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

    if targeted:
        from cleverhans.utils import build_targeted_dataset
        if viz_enabled:
            from cleverhans.utils import grid_visual
            adv_inputs, true_labels, adv_ys = build_targeted_dataset(
                X_test, Y_test, idxs, nb_classes, img_rows, img_cols, channels)
        else:
            adv_inputs, true_labels, adv_ys = build_targeted_dataset(
                X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows,
                img_cols, channels)
    else:
        if viz_enabled:
            from cleverhans.utils import pair_visual
            adv_inputs = X_test[idxs]
        else:
            adv_inputs = X_test[:nb_samples]

    ###########################################################################
    # Craft adversarial examples using generic approach
    ###########################################################################
    if targeted:
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE,
                                 a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"

    else:
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    if attack == ATTACK_CARLINI_WAGNER_L2:
        print('Attack: CarliniWagnerL2')
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, back='tf', sess=sess)
        attack_params = {
            'binary_search_steps': 1,
            'max_iterations': attack_iterations,
            'learning_rate': 0.1,
            'batch_size': att_batch_size,
            'initial_const': 10,
        }
    elif attack == ATTACK_JSMA:
        print('Attack: SaliencyMapMethod')
        from cleverhans.attacks import SaliencyMapMethod
        attacker = SaliencyMapMethod(model, back='tf', sess=sess)
        attack_params = {'theta': 1., 'gamma': 0.1}
    elif attack == ATTACK_FGSM:
        print('Attack: FastGradientMethod')
        from cleverhans.attacks import FastGradientMethod
        attacker = FastGradientMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps}
    elif attack == ATTACK_MADRYETAL:
        print('Attack: MadryEtAl')
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, back='tf', sess=sess)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    elif attack == ATTACK_BASICITER:
        print('Attack: BasicIterativeMethod')
        from cleverhans.attacks import BasicIterativeMethod
        attacker = BasicIterativeMethod(model, back='tf', sess=sess)
        attack_params = {'eps': eps, 'eps_iter': 0.01, 'nb_iter': nb_iter}
    else:
        print("Attack undefined")
        sys.exit(1)

    attack_params.update({yname: adv_ys, 'clip_min': 0., 'clip_max': 1.})
    adv_np = attacker.generate_np(adv_inputs, phase, **attack_params)
    '''
    name = 'm_fgsm_eps%s_n%s.npy' % (eps, nb_samples)
    fpath = os.path.join(
        '/scratch/gallowaa/mnist/adversarial_examples/cleverhans/', name)
    np.savez(fpath, x=adv_np, y=Y_test[:nb_samples])
    '''
    '''
    adv_x = attacker.generate(x, phase, **attack_params)
    adv_np, = batch_eval(sess, [x], [adv_x], [adv_inputs], feed={
                         phase: False}, args=eval_params)
    '''
    eval_params = {'batch_size': att_batch_size}
    if targeted:
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv_np,
                                  true_labels,
                                  phase=phase,
                                  args=eval_params)

    else:
        print("Evaluating untargeted results")
        if viz_enabled:
            adv_accuracy = model_eval(sess,
                                      x,
                                      y,
                                      preds,
                                      adv_np,
                                      Y_test[idxs],
                                      phase=phase,
                                      args=eval_params)
        else:
            adv_accuracy = model_eval(sess,
                                      x,
                                      y,
                                      preds,
                                      adv_np,
                                      Y_test[:nb_samples],
                                      phase=phase,
                                      args=eval_params)

    if viz_enabled:
        n = nb_classes - 1
        for i in range(nb_classes):
            if targeted:
                for j in range(nb_classes):
                    if i != j:
                        if j != 0 and i != n:
                            grid_viz_data[i, j] = adv_np[j * n + i]
                        if j == 0 and i > 0 or i == n and j > 0:
                            grid_viz_data[i, j] = adv_np[j * n + i - 1]
                    else:
                        grid_viz_data[i, j] = adv_inputs[j * n]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv_np[j]
        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Test accuracy on adversarial examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv_np - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Compute number of modified features (L_0 norm)
    nb_changed = np.where(adv_np != adv_inputs)[0].shape[0]
    percent_perturb = np.mean(float(nb_changed) / adv_np.reshape(-1).shape[0])

    # Compute the average distortion introduced by the algorithm
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturb))

    # Friendly output for pasting into spreadsheet
    print('{0:.4f}'.format(accuracy))
    print('{0:.4f}'.format(adv_accuracy))
    print('{0:.4f}'.format(percent_perturbed))
    print('{0:.4f}'.format(percent_perturb))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
예제 #13
0
def mnist_tutorial_cw(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
    attack_iterations=ATTACK_ITERATIONS,
    model_path=MODEL_PATH,
    targeted=TARGETED,
):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "filename": os.path.split(model_path)[-1],
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {"batch_size": batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else "1"
    print(
        "Crafting "
        + str(source_samples)
        + " * "
        + nb_adv_per_sample
        + " adversarial examples"
    )
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0] for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype="f")

            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[idxs]], dtype=np.float32
            )
        else:
            adv_inputs = np.array(
                [[instance] * nb_classes for instance in x_test[:source_samples]],
                dtype=np.float32,
            )

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels)
        )
        adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape(
            (source_samples * nb_classes, nb_classes)
        )
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype="f")

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    if targeted:
        cw_params_batch_size = source_samples * nb_classes
    else:
        cw_params_batch_size = source_samples
    cw_params = {
        "binary_search_steps": 1,
        yname: adv_ys,
        "max_iterations": attack_iterations,
        "learning_rate": CW_LEARNING_RATE,
        "batch_size": cw_params_batch_size,
        "initial_const": 10,
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {"batch_size": np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys, args=eval_params)
    else:
        if viz_enabled:
            err = model_eval(sess, x, y, preds, adv, y_test[idxs], args=eval_params)
            adv_accuracy = 1 - err
        else:
            err = model_eval(
                sess, x, y, preds, adv, y_test[:source_samples], args=eval_params
            )
            adv_accuracy = 1 - err

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    print("Avg. rate of successful adv. examples {0:.4f}".format(adv_accuracy))
    report.clean_train_adv_eval = 1.0 - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs) ** 2, axis=(1, 2, 3)) ** 0.5)
    print("Avg. L_2 norm of perturbations {0:.4f}".format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
예제 #14
0
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(file,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    nb_filters = 64

    ################ color training initialization ####################

    color_training_epochs = 5000
    color_learning_rate = 0.1
    colorCategory = [
        [0.0, 0.4],  # Black
        [0.3, 0.7],  # Grey
        [0.6, 1.0]  # White
    ]

    numColorInput = 1

    color_x = tf.placeholder(
        tf.float32,
        [None, numColorInput])  # mnist data image of shape 28*28=784
    color_y = tf.placeholder(
        tf.float32,
        [None, numColorOutput])  # 0-9 digits recognition => 10 classes

    # Set model weights
    color_W = tf.Variable(tf.zeros([numColorInput, numColorOutput]))
    color_b = tf.Variable(tf.zeros([numColorOutput]))
    color_pred_out = tf.matmul(color_x, color_W) + color_b  # Softmax

    color_cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=color_pred_out,
                                                labels=color_y))
    # Gradient Descent
    color_optimizer = tf.train.GradientDescentOptimizer(
        color_learning_rate).minimize(color_cost)

    # Test model
    color_argmax = tf.argmax(color_pred_out, 1)
    color_correct_prediction = tf.equal(tf.argmax(color_pred_out, 1),
                                        tf.argmax(color_y, 1))
    # Calculate accuracy
    color_accuracy = tf.reduce_mean(
        tf.cast(color_correct_prediction, tf.float32))

    # Graph for re-generating the original image into a new image by using trained color model
    pr_model_x = tf.placeholder(
        tf.float32,
        [None, n_input, numColorInput])  # mnist data image of shape 28*28=784
    pr_model_W = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput
                                 ])  # mnist data image of shape 28*28=784
    pr_model_b = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput
                                 ])  # mnist data image of shape 28*28=784
    pr_model_output = tf.one_hot(
        tf.argmax((tf.matmul(pr_model_x, pr_model_W) + pr_model_b), 2),
        numColorOutput)

    # Merge the random generated output for new image based on the colorCategory
    randomColorCategory = []
    for i in range(len(colorCategory)):
        tmp = []
        tmpRandomColorCategory = my_tf_round(
            tf.random_uniform(tf.shape(pr_model_x),
                              colorCategory[i][0],
                              colorCategory[i][1],
                              dtype=tf.float32), 2)
        tmp.append(tmpRandomColorCategory)
        randomColorCategory.append(tf.concat(tmp, 1))

    random_merge = tf.reshape(tf.concat(randomColorCategory, -1),
                              [-1, n_input, numColorOutput])
    random_color_set = tf.reduce_sum(
        tf.multiply(pr_model_output, random_merge), 2)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    x = tf.reshape(random_color_set, shape=(-1, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        #'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        #'filename': os.path.split(model_path)[-1],
        'train_dir': save_dir,
        'filename': filename,
        'numColorOutput': numColorOutput
    }
    with sess.as_default():
        if hasattr(tf, "global_variables_initializer"):
            tf.global_variables_initializer().run()
        else:
            warnings.warn("Update your copy of tensorflow; future versions of "
                          "CleverHans may drop support for this version.")
            sess.run(tf.initialize_all_variables())

        ################# color training ####################
        print("Trying to load pr model from: " + model_path2)
        if os.path.exists(model_path2 + ".meta"):
            tf_model_load(sess, model_path2)
            c_w, c_b = sess.run([color_W, color_b])
            print("Load color trained model in training")
        else:
            # Training the color
            for epoch in range(color_training_epochs):
                outputColorY = []
                p1 = np.random.random(100)
                for i in range(len(p1)):
                    outputOverlapColorY = []
                    for j in range(len(colorCategory)):
                        if p1[i] >= colorCategory[j][0] and p1[
                                i] <= colorCategory[j][1]:
                            colorIndexSeq = []
                            for k in range(len(colorCategory)):
                                if j == k:
                                    colorIndexSeq.append(1)
                                else:
                                    colorIndexSeq.append(0)
                            outputOverlapColorY.append(colorIndexSeq)
                            #break

                    # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item
                    outputColorY.append(outputOverlapColorY[np.random.randint(
                        0, len(outputOverlapColorY))])

                inputColorX = p1.reshape(100, 1)
                _, c, c_w, c_b = sess.run(
                    [color_optimizer, color_cost, color_W, color_b],
                    feed_dict={
                        color_x: inputColorX,
                        color_y: outputColorY
                    })
                avg_cost = c

                # Evaluating color model
                outputColorY = []
                p1 = np.random.random(100)
                # Generate output for random color inputs (test case)
                for i in range(len(p1)):
                    for j in range(len(colorCategory)):
                        outputOverlapColorY = []
                        if p1[i] >= colorCategory[j][0] and p1[
                                i] <= colorCategory[j][1]:
                            colorIndexSeq = []
                            for k in range(len(colorCategory)):
                                if j == k:
                                    colorIndexSeq.append(1)
                                else:
                                    colorIndexSeq.append(0)
                            outputOverlapColorY.append(colorIndexSeq)
                            break

                    # Randomly choose the output for color Y if the outputOverlapColorY has more than 1 item
                    outputColorY.append(outputOverlapColorY[np.random.randint(
                        0, len(outputOverlapColorY))])

                inputColorX = p1.reshape(100, 1)
                # print(random_xs)
                acc, argmax = sess.run([color_accuracy, color_argmax],
                                       feed_dict={
                                           color_x: inputColorX,
                                           color_y: outputColorY
                                       })
                print("Epoch:", '%04d' % (epoch + 1) + "/" + str(color_training_epochs) + ", Cost= " + \
                      "{:.9f}".format(avg_cost) + ", Training Accuracy= " + \
                      "{:.5f}".format(acc) + " ")
                # print(c_w)

            with tf.device('/CPU:0'):
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
                # Since training PR model is fast, we do not have to save multiple sessions for this
                save_path = os.path.join(save_dir2, filename2)
                saver.save(sess, save_path)
        ##################### end of color training ------------------------------

    ################# model training ####################
    rng = np.random.RandomState([2017, 8, 30])
    saveFileNum = 50
    saveFileNum = 500
    # saveFileNum = 1000
    model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum))
    # check if we've trained before, and if we have, use that pre-trained model
    print("Trying to load trained model from: " + model_path)
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
        print("Load trained model")
    else:
        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              args=train_params,
              rng=rng,
              save=True,
              c_w=c_w,
              c_b=c_b,
              pr_model_x=pr_model_x,
              random_color_set=random_color_set,
              pr_model_W=pr_model_W,
              pr_model_b=pr_model_b)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size, 'numColorOutput': numColorOutput}
    #accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params,
    #                   pred2=preds, c_w=c_w, c_b=c_b, pr_model_x=pr_model_x, random_color_set=random_color_set,
    #                   pr_model_W=pr_model_W, pr_model_b=pr_model_b)
    #assert x_test.shape[0] == test_end - test_start, x_test.shape
    #print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    #report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
            adv_inputs = x_test
        else:
            adv_inputs = x_test[:source_samples]
            adv_inputs = x_test

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size':
        source_samples * nb_classes if targeted else source_samples,
        'initial_const': 10
    }

    adv2 = cw.generate(x, **cw_params)
    cw_params[yname] = adv_ys
    adv = None
    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {
        'batch_size': np.minimum(nb_classes, source_samples),
        'numColorOutput': numColorOutput
    }
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = model_eval(sess,
                                      x,
                                      y,
                                      preds,
                                      adv,
                                      y_test[idxs],
                                      args=eval_params)
        else:
            #adv_accuracy = model_eval(sess, x, y, preds, adv, y_test[
            #               :source_samples], args=eval_params)
            adv_accuracy = model_eval(sess,
                                      x,
                                      y,
                                      preds,
                                      adv,
                                      y_test,
                                      args=eval_params,
                                      pred2=preds,
                                      c_w=c_w,
                                      c_b=c_b,
                                      pr_model_x=pr_model_x,
                                      random_color_set=random_color_set,
                                      pr_model_W=pr_model_W,
                                      pr_model_b=pr_model_b,
                                      is_adv=True,
                                      ae=adv2)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')
    print("load save file: ", saveFileNum)
    # Compute the number of adversarial examples that were successfully found
    print('Test with adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
예제 #15
0
        def do_cw():
            nb_adv_per_sample = str(nb_classes - 1) if FLAGS.targeted else '1'
            print('Crafting ' + str(source_samples) + ' * ' +
                  nb_adv_per_sample + ' adversarial examples')
            print("This could take some time ...")

            # Instantiate a CW attack object
            cw = CarliniWagnerL2(model, back='tf', sess=sess)

            if FLAGS.viz_enabled:
                assert source_samples == nb_classes
                idxs = [
                    np.where(np.argmax(Y_test, axis=1) == i)[0][0]
                    for i in range(nb_classes)
                ]
            if FLAGS.targeted:
                if FLAGS.viz_enabled:
                    # Initialize our array for grid visualization
                    grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                                  channels)
                    grid_viz_data = np.zeros(grid_shape, dtype='f')

                    adv_inputs = np.array([[instance] * nb_classes
                                           for instance in X_test[idxs]],
                                          dtype=np.float32)
                else:
                    adv_inputs = np.array(
                        [[instance] * nb_classes
                         for instance in X_test[:source_samples]],
                        dtype=np.float32)

                one_hot = np.zeros((nb_classes, nb_classes))
                one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

                adv_inputs = adv_inputs.reshape((source_samples * nb_classes,
                                                 img_rows, img_cols, channels))
                adv_ys = np.array([one_hot] * source_samples,
                                  dtype=np.float32).reshape(
                                      (source_samples * nb_classes,
                                       nb_classes))
                yname = "y_target"
            else:
                if FLAGS.viz_enabled:
                    # Initialize our array for grid visualization
                    grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
                    grid_viz_data = np.zeros(grid_shape, dtype='f')

                    adv_inputs = X_test[idxs]
                else:
                    adv_inputs = X_test[:source_samples]

                adv_ys = None
                yname = "y"

            cw_params = {
                'binary_search_steps': 1,
                yname: adv_ys,
                'max_iterations': FLAGS.attack_iterations,
                'learning_rate': 0.1,
                'batch_size': source_samples *
                nb_classes if FLAGS.targeted else source_samples,
                'initial_const': 10
            }

            adv = cw.generate_np(adv_inputs, **cw_params)

            if FLAGS.targeted:
                adv_accuracy = model_eval(sess,
                                          x,
                                          y,
                                          preds,
                                          adv,
                                          adv_ys,
                                          args=eval_par)
            else:
                if FLAGS.viz_enabled:
                    adv_accuracy = 1 - \
                                   model_eval(sess, x, y, preds, adv, Y_test[
                                       idxs], args=eval_par)
                else:
                    adv_accuracy = 1 - \
                                   model_eval(sess, x, y, preds, adv, Y_test[
                                       :source_samples], args=eval_par)

            if FLAGS.viz_enabled:
                for j in range(nb_classes):
                    if FLAGS.targeted:
                        for i in range(nb_classes):
                            grid_viz_data[i, j] = adv[i * nb_classes + j]
                    else:
                        grid_viz_data[j, 0] = adv_inputs[j]
                        grid_viz_data[j, 1] = adv[j]

                print(grid_viz_data.shape)

            print('--------------------------------------')

            # Compute the number of adversarial examples that were successfully found
            print('Avg. rate of successful adv. examples {0:.4f}'.format(
                adv_accuracy))
            report.clean_train_adv_eval = 1. - adv_accuracy

            # Compute the average distortion introduced by the algorithm
            percent_perturbed = np.mean(
                np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
            print('Avg. L_2 norm of perturbations {0:.4f}'.format(
                percent_perturbed))
            # Close TF session
            #            sess.close()

            # Finally, block & display a grid of all the adversarial examples
            if FLAGS.viz_enabled:
                import matplotlib.pyplot as plt
                _ = grid_visual(grid_viz_data)

            return report
예제 #16
0
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params,
          rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Computer number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
예제 #17
0
def mnist_tutorial_deepfool(train_start=0, train_end=60000, #读60000训练
                            test_start=0,test_end=10000, #读10000测试
                            viz_enabled=True, nb_epochs=6,
                            batch_size=128, nb_classes=2, source_samples=10,
                            learning_rate=0.001, attack_iterations=100,
                            model_path=os.path.join("models", "mnist")):
    """
    MNIST tutorial for Deepfool's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples激活对抗例子
    :param nb_epochs: number of epochs to train model(一个epoch指代所有的数据送入网络中完成一次前向计算及反向传播的过程。)
    :param batch_size: size of training batches
    :param nb_classes: number of output classes(输出几类)
    :param source_samples: number of test inputs to attack(测试输入用于攻击的数量)
    :param learning_rate: learning rate for training(学习率)
    :param model_path: path to the model file(文件路径)
    :param attack_iterations: 攻击迭代次数
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies精确度报告
    report = AccuracyReport()

    # MNIST-specific dimensions图像尺寸28*28*1
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_picklable_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow(构建训练模型)
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2018, 8, 9])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path+".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"), rng=rng)
        print("save success")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a DeepFool attack object
    deepfool = DeepFool(model, back='tf', sess=sess)


    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][1] for i in range(10)]
    print("idxs:",idxs)

    # construct adv_inputs
    grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')
    print("grid_viz_data",grid_viz_data.shape)
    adv_inputs = X_test[idxs].reshape([-1,28,28,1])

    deepfool_params = {'nb_candidate': 10,
                       'overshoot': 0.02,
                       'max_iter': attack_iterations,
                       'nb_classes': 10,
                       'clip_min': 0.,
                       'clip_max': 1.}

    adv = deepfool.generate_np(adv_inputs, **deepfool_params)

    print("adv success")

    adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                args={'batch_size': 10})

    for j in range(10):
        grid_viz_data[j, 0] = adv_inputs[j]
        grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1.-adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
예제 #18
0
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=True,
                      nb_epochs=6,
                      batch_size=128,
                      source_samples=10,
                      learning_rate=0.001,
                      attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess,
              loss,
              x,
              y,
              x_train,
              y_train,
              args=train_params,
              save=os.path.exists("models"),
              rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size':
        source_samples * nb_classes if targeted else source_samples,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            adv_accuracy = 1 - \
                model_eval(sess, x, y, preds, adv, y_test[
                           idxs], args=eval_params)
        else:
            adv_accuracy = 1 - \
                model_eval(sess, x, y, preds, adv, y_test[
                           :source_samples], args=eval_params)

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
예제 #19
0
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    ###########################################################################
    # Define the dataset and model
    ###########################################################################

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename))
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes - 1) + ' adversarial examples')

    # This array indicates whether an adversarial example was found for each
    # test set sample and target class
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # This array contains the fraction of perturbed features for each test set
    # sample and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Define the TF graph for the model's Jacobian
    grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

    # Initialize our array for grid visualization
    grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                  FLAGS.img_cols, FLAGS.nb_channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, FLAGS.source_samples):
        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            X_test[sample_ind:(sample_ind + 1)],
            (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

        # Loop over all target classes
        for target in target_classes:
            print('--------------------------------------')
            print('Creating adv. example for target class ' + str(target))

            # This call runs the Jacobian-based saliency map approach
            adv_x, res, percent_perturb = jsma(sess,
                                               x,
                                               predictions,
                                               grads,
                                               X_test[sample_ind:(sample_ind +
                                                                  1)],
                                               target,
                                               theta=1,
                                               gamma=0.1,
                                               increase=True,
                                               back='tf',
                                               clip_min=0,
                                               clip_max=1)

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                if 'figure' not in vars():
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind + 1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)))
                else:
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind + 1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)),
                        figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    # Compute the number of adversarial examples that were successfuly found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.2f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.2f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Computer number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
예제 #21
0
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, nb_classes=10, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': os.path.join(*os.path.split(model_path)[:-1]),
        'filename': os.path.split(model_path)[-1]
    }

    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path+".meta"):
        tf_model_load(sess, model_path)
    else:
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                    save=os.path.exists("models"))

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    wrap = KerasModelWrapper(model)
    cw = CarliniWagnerL2(wrap, back='tf', sess=sess)

    idxs = [np.where(np.argmax(Y_test, axis=1) == i)[0][0] for i in range(10)]
    if targeted:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        one_hot = np.zeros((10, 10))
        one_hot[np.arange(10), np.arange(10)] = 1

        adv_inputs = np.array([[instance] * 10 for instance in X_test[idxs]],
                              dtype=np.float32)
        adv_inputs = adv_inputs.reshape((100, 28, 28, 1))
        adv_ys = np.array([one_hot] * 10, dtype=np.float32).reshape((100, 10))
        yname = "y_target"
    else:
        # Initialize our array for grid visualization
        grid_shape = (nb_classes, 2, img_rows, img_cols, channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        adv_inputs = X_test[idxs]
        adv_ys = None
        yname = "y"

    cw_params = {'binary_search_steps': 1,
                 yname: adv_ys,
                 'max_iterations': attack_iterations,
                 'learning_rate': 0.1,
                 'batch_size': 100 if targeted else 10,
                 'initial_const': 10}

    adv = cw.generate_np(adv_inputs,
                         **cw_params)

    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args={'batch_size': 10})
    else:
        adv_accuracy = 1-model_eval(sess, x, y, preds, adv, Y_test[idxs],
                                    args={'batch_size': 10})

    for j in range(10):
        if targeted:
            for i in range(10):
                grid_viz_data[i, j] = adv[i * 10 + j]
        else:
            grid_viz_data[j, 0] = adv_inputs[j]
            grid_viz_data[j, 1] = adv[j]

    print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1.-adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report