Ejemplo n.º 1
0
def gtsrb_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
    GTSRB tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: unused here (the dataset reader takes no range);
                        kept for interface compatibility
    :param train_end: unused here; kept for interface compatibility
    :param test_start: unused here; kept for interface compatibility
    :param test_end: unused here; kept for interface compatibility
    :param nb_classes: overwritten with the label width of the loaded
                       training set (y_train.shape[1])
    :param batch_size: batch size for training and evaluation
    :param learning_rate: learning rate passed to oracle/substitute training
    :param nb_epochs: number of epochs passed to the oracle preparation
    :param holdout: number of leading test samples set aside as x_sub/y_sub
                    and removed from the test set
    :param data_aug: passed through to train_sub
    :param nb_epochs_s: passed through to train_sub
    :param lmbda: passed through to train_sub
    :param aug_batch_size: passed through to train_sub
    :return: a dictionary with:
             * 'bbox': black-box model accuracy as returned by prep_bbox
             * 'oracle on noise': black-box accuracy on noised test samples
             * 'sub': substitute model accuracy on test set
             * 'bbox_on_sub_adv_ex': black-box model accuracy on adversarial
               examples transferred from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. The call is made outside the assert statement
    # so that it still runs when Python is started with -O.
    setup_ok = setup_tutorial()
    assert setup_ok

    # Create TF session
    sess = tf.Session()

    t1 = time.time()
    x_train, y_train, x_VAL, y_VAL, x_test, y_test = read_gtsrb_dataset()
    print('Data reading time :', time.time()-t1, 'seconds')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    savefigfromarray(x_sub[0], filename='my2.ppm')
    y_sub = y_test[:holdout]

    print(x_sub.shape)
    print(y_sub.shape)
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters (the data is laid out channels-first)
    nchannels, img_rows, img_cols = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, nchannels, img_rows, img_cols))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Every evaluation below uses the same parameters
    eval_params = {'batch_size': batch_size}

    # Simulate the black-box model locally
    print("Loading the black-box model.")
    t1 = time.time()
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out
    print(bbox_preds.shape)
    print('Oracle loading time :', time.time()-t1, 'seconds')

    # Evaluate oracle on random noised test samples. A cached array is used
    # when it can be loaded; otherwise the noise is regenerated. Only
    # load-related errors trigger the fallback, so that e.g. Ctrl-C is not
    # swallowed.
    rand_y_test = y_test
    try:
        rand_x_test = np.load('rand_x_test.npy')
    except (IOError, OSError, ValueError, EOFError):
        rand_x_test = np.array([add_gaussian_noise(clean, std=0.1)
                                for clean in x_test])
    acc = model_eval(sess, x, y, bbox_preds, rand_x_test, rand_y_test,
                     args=eval_params)
    accuracies['oracle on noise'] = acc

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    t1 = time.time()

    train_sub_out = train_sub(sess, x, y, bbox_preds, x_train, y_train,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)

    print('Substitute training time :', time.time()-t1, 'seconds')

    model_sub, preds_sub = train_sub_out
    print(preds_sub.shape)

    # Evaluate the substitute model on clean test examples (the held-out
    # test split, as stated by the docstring and the message printed below)
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc
    print('sub on clean test {0}'.format(acc))

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    t1 = time.time()
    x_adv_sub = fgsm.generate(x, **fgsm_par)
    print('Adversarial example crafting time :', time.time()-t1, 'seconds')

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    # Visualize one example: a length-1 slice keeps the batch axis without
    # hard-coding the image dimensions.
    x_adv_sub_0 = x_adv_sub.eval(session=sess, feed_dict={x: x_test[:1]})
    print('ONE EXMAPLE: shape = {0}'.format(x_adv_sub_0.shape))
    print('symbolic x_adv_sub: shape = {0}'.format(x_adv_sub.shape))
    np.save('x_adv_sub_0', x_adv_sub_0)

    ###########################################################################
    # Visualize adversarial examples as a grid of pictures.
    ###########################################################################
    source_samples = 10
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization (channels-last images)
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Both stay None when no adversarial example is produced, so the final
    # save step can be skipped instead of failing on an unbound name.
    figure = None
    last_adv_image = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # The data is channels-first while the visualisation arrays are
        # channels-last: the axes must be permuted, a plain reshape would
        # scramble the pixels.
        sample_image = np.transpose(sample[0], (1, 2, 0))

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = sample_image

        # Loop over the first three target classes only
        for target in target_classes[:3]:
            print('Generating adv. example for target class %i' % target)

            # NOTE(review): fgsm_par carries no target, so the crafted example
            # does not depend on `target`; `res` only records whether the
            # substitute happens to predict `target` for it.
            adv_x = fgsm.generate_np(sample, **fgsm_par)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds_sub, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            adv_image = np.transpose(adv_x[0], (1, 2, 0))

            # Display the original and adversarial images side-by-side,
            # reusing one figure across iterations instead of opening a new
            # one for every pair.
            figure = pair_visual(sample_image, adv_image, figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = adv_image
            last_adv_image = adv_image

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    # Persist the last comparison figure and the last adversarial image.
    # np.save appends '.npy' to a name that does not already end with it.
    if figure is not None:
        figure.savefig('fig1.png')
    if last_adv_image is not None:
        np.save('fig2.png', last_adv_image)

    print('--------------------------------------')

    # Close TF session
    sess.close()

    return accuracies
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with clean_train_clean_eval (accuracy
             on legitimate test examples) and clean_train_adv_eval
             (1 - attack success rate) filled in
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm. The sum is
    # divided by the number of attacks actually attempted: the cells of
    # `perturbations` belonging to each sample's own class are never written
    # and would otherwise dilute the average.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # average over the successful cells themselves rather than over the whole
    # array with failures zeroed out.
    success_mask = (results == 1)
    if success_mask.any():
        percent_perturb_succ = float(np.mean(perturbations[success_mask]))
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Ejemplo n.º 3
0
        def do_jsma():
            """
            Run the JSMA attack on the first `source_samples` test inputs.

            Relies on names from the enclosing scope (model, sess, x, preds,
            X_test, Y_test, report, image dimensions, FLAGS, ...). For every
            attacked sample, one adversarial example is crafted per target
            class; success and perturbation rates are printed and
            `report.clean_train_adv_eval` is set to 1 - success rate.
            :return: the enclosing scope's `report` object
            """
            print('Crafting ' + str(source_samples) + ' * ' +
                  str(nb_classes - 1) + ' adversarial examples')

            # Keep track of success (adversarial example classified in target)
            results = np.zeros((nb_classes, source_samples), dtype='i')

            # Rate of perturbed features for each test set example and target class
            perturbations = np.zeros((nb_classes, source_samples), dtype='f')

            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            # Instantiate a SaliencyMapMethod attack object
            jsma = SaliencyMapMethod(model, back='tf', sess=sess)
            jsma_params = {
                'theta': 1.,
                'gamma': 0.1,
                'clip_min': 0.,
                'clip_max': 1.,
                'y_target': None
            }

            figure = None
            # Loop over the samples we want to perturb into adversarial examples
            for sample_ind in range(source_samples):
                print('--------------------------------------')
                print('Attacking input %i/%i' %
                      (sample_ind + 1, source_samples))
                sample = X_test[sample_ind:(sample_ind + 1)]

                # We want to find an adversarial example for each possible target class
                # (i.e. all classes that differ from the label given in the dataset)
                current_class = int(np.argmax(Y_test[sample_ind]))
                target_classes = other_classes(nb_classes, current_class)

                # For the grid visualization, keep original images along the diagonal
                grid_viz_data[current_class,
                              current_class, :, :, :] = np.reshape(
                                  sample, (img_rows, img_cols, channels))

                # Loop over all target classes
                for target in target_classes:
                    print('Generating adv. example for target class %i' %
                          target)

                    # This call runs the Jacobian-based saliency map approach
                    one_hot_target = np.zeros((1, nb_classes),
                                              dtype=np.float32)
                    one_hot_target[0, target] = 1
                    jsma_params['y_target'] = one_hot_target
                    adv_x = jsma.generate_np(sample, **jsma_params)

                    # Check if success was achieved
                    res = int(model_argmax(sess, x, preds, adv_x) == target)

                    # Compute number of modified features
                    adv_x_reshape = adv_x.reshape(-1)
                    test_in_reshape = X_test[sample_ind].reshape(-1)
                    nb_changed = np.where(
                        adv_x_reshape != test_in_reshape)[0].shape[0]
                    percent_perturb = (float(nb_changed) /
                                       adv_x_reshape.shape[0])

                    # Display the original and adversarial images side-by-side
                    if FLAGS.viz_enabled:
                        figure = pair_visual(
                            np.reshape(sample, (img_rows, img_cols)),
                            np.reshape(adv_x, (img_rows, img_cols)), figure)

                    # Add our adversarial example to our grid data
                    grid_viz_data[target, current_class, :, :, :] = np.reshape(
                        adv_x, (img_rows, img_cols, channels))

                    # Update the arrays for later analysis
                    results[target, sample_ind] = res
                    perturbations[target, sample_ind] = percent_perturb

            print('--------------------------------------')

            # Compute the number of adversarial examples that were successfully found
            nb_targets_tried = ((nb_classes - 1) * source_samples)
            succ_rate = float(np.sum(results)) / nb_targets_tried
            print('Avg. rate of successful adv. examples {0:.4f}'.format(
                succ_rate))
            report.clean_train_adv_eval = 1. - succ_rate

            # Compute the average distortion introduced by the algorithm.
            # Divide by the number of attacks actually attempted: the cells
            # of `perturbations` for each sample's own class are never
            # written and would otherwise dilute the average.
            percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
            print('Avg. rate of perturbed features {0:.4f}'.format(
                percent_perturbed))

            # Compute the average distortion introduced for successful samples
            # only: average over the successful cells themselves rather than
            # over the whole array with failures zeroed out.
            success_mask = (results == 1)
            if success_mask.any():
                percent_perturb_succ = float(
                    np.mean(perturbations[success_mask]))
            else:
                percent_perturb_succ = 0.
            print('Avg. rate of perturbed features for successful '
                  'adversarial examples {0:.4f}'.format(percent_perturb_succ))
            if FLAGS.viz_enabled:
                import matplotlib.pyplot as plt
                plt.close(figure)
                _ = grid_visual(grid_viz_data)

            return report
Ejemplo n.º 4
0
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    The image dimensions and the number of classes are not parameters: they
    are read from the shapes of the loaded training data and labels.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with clean_train_clean_eval (accuracy
             on legitimate test examples) and clean_train_adv_eval
             (1 - attack success rate) filled in
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params,
          rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm. The sum is
    # divided by the number of attacks actually attempted: the cells of
    # `perturbations` belonging to each sample's own class are never written
    # and would otherwise dilute the average.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # average over the successful cells themselves rather than over the whole
    # array with failures zeroed out.
    success_mask = (results == 1)
    if success_mask.any():
        percent_perturb_succ = float(np.mean(perturbations[success_mask]))
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Ejemplo n.º 5
0
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Restores the model built by cnn_model() from
    FLAGS.train_dir/FLAGS.filename (or trains and saves it when that file
    does not exist), evaluates it on the MNIST test set, then runs the
    targeted JSMA attack on the first FLAGS.source_samples test inputs, once
    for every class other than the labelled one. Success rate and rates of
    perturbed features are printed.

    :param argv: unused; accepted so the function can serve as an entry point
    :return: None
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    ###########################################################################
    # Define the dataset and model
    ###########################################################################

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, save_path)
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, predictions, X_train, Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes - 1) + ' adversarial examples')

    # This array indicates whether an adversarial example was found for each
    # test set sample and target class
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # This array contains the fraction of perturbed features for each test set
    # sample and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Define the TF graph for the model's Jacobian
    grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

    # Shapes used when storing images in the grid and when plotting pairs
    img_shape = (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)
    plot_shape = (FLAGS.img_rows, FLAGS.img_cols)

    # Initialize our array for grid visualization
    grid_viz_data = np.zeros((FLAGS.nb_classes, FLAGS.nb_classes) + img_shape,
                             dtype='f')

    # Handle of the pair-visualization figure; created on the first plot and
    # reused afterwards
    figure = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(FLAGS.source_samples):
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, img_shape)

        # Loop over all target classes
        for target in target_classes:
            print('--------------------------------------')
            print('Creating adv. example for target class ' + str(target))

            # This call runs the Jacobian-based saliency map approach
            adv_x, res, percent_perturb = jsma(sess, x, predictions, grads,
                                               sample, target,
                                               theta=1, gamma=0.1,
                                               increase=True, back='tf',
                                               clip_min=0, clip_max=1)

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                original_img = np.reshape(sample, plot_shape)
                adversarial_img = np.reshape(adv_x, plot_shape)
                if figure is None:
                    figure = pair_visual(original_img, adversarial_img)
                else:
                    figure = pair_visual(original_img, adversarial_img,
                                         figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, img_shape)

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))

    # Compute the average distortion introduced by the algorithm. The row of
    # the true class is never attacked and stays at zero, so divide by the
    # number of attempts rather than by the size of the array.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.2f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # select the successful entries instead of zeroing the failed ones, which
    # would still count in the denominator of the mean.
    succ_mask = (results == 1)
    if int(np.sum(succ_mask)):
        percent_perturb_succ = np.mean(perturbations[succ_mask])
    else:
        # No successful attack: the average is undefined
        percent_perturb_succ = float('nan')
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.2f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
Ejemplo n.º 6
0
def generate_images(start_index=7166):
    """
    Craft adversarial CIFAR10 training images with FGSM or JSMA and save them
    as PNG files under ./images/.

    The model architecture, pooling option, weights file and attack name are
    read from load_args(); batch size, number of samples, number of classes
    and image dimensions are read from FLAGS.

    :param start_index: index of the first training example attacked in JSMA
                        mode (samples start_index .. FLAGS.source_samples - 1
                        are attacked). Expected to be non-negative. Defaults
                        to 7166, the value that used to be hard-coded.
    :return: None
    :raises RuntimeError: if keras is not configured with the TensorFlow
                          backend
    :raises ValueError: if args.model is not one of the known model names
    """

    print('==> Preparing data..')
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print(
            "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
            "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    print("==> Beginning Session")

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Label smoothing on the training labels
    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Load model
    print("==> loading vgg model")
    args = load_args()

    if args.model == 'vgg6':
        model = vggbn(top=True, pool=args.pool)
    elif args.model == 'vgg15':
        model = vgg15(top=True, pool=args.pool)
    elif args.model == 'generic':
        model = generic(top=True, pool=args.pool)
    elif args.model == 'resnet18':
        model = resnet.build_resnet_18(args.pool)
    else:
        # Fail with a clear message instead of a NameError on `model` below
        raise ValueError('Unknown model: {}'.format(args.model))

    predictions = model(x)

    model.load_weights(args.load)

    # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    print('==> Accuracy : {}'.format(accuracy))

    im_base = '/im_'
    model_name = args.model + '_p' + str(args.pool)

    # FGSM
    if args.attack == 'fgsm' or args.attack == 'FGSM':

        out_dir = os.getcwd() + '/images/fgsm/' + model_name
        print("==> creating fgsm adversarial wrapper")
        adv_x = fgsm(x, predictions, eps=0.3)

        print("==> sending to batch evaluator to finalize adversarial images")
        X_train_adv, = batch_eval(sess, [x], [adv_x], [X_train],
                                  args=eval_params)

        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        print("==> saving images to {}".format(out_dir))
        for i, ad in enumerate(X_train_adv):
            scipy.misc.imsave(out_dir + im_base + str(i) + '.png', ad)

        sess.close()

    # JSMA
    if args.attack == 'jsma' or args.attack == 'JSMA':

        # NOTE(review): unlike the FGSM directory there is no path separator
        # between 'trial_single_adv' and the model name, so images land in
        # e.g. '.../trial_single_adv<model>_p<pool>/' - confirm this is
        # intended before changing it.
        out_dir = os.getcwd() + '/images/jsma/trial_single_adv' + model_name
        print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
              str(FLAGS.nb_classes - 1) + ' adversarial examples')

        # This array indicates whether an adversarial example was found for
        # each sample and target class
        results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

        # This array contains the fraction of perturbed features for each
        # sample and target class
        perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                                 dtype='f')

        # Define the TF graph for the model's Jacobian
        grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

        # Initialize our array for grid visualization
        img_shape = (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)
        grid_viz_data = np.zeros(
            (FLAGS.nb_classes, FLAGS.nb_classes) + img_shape, dtype='f')

        # Number of samples actually attacked (0 when the range is empty)
        nb_attacked = max(FLAGS.source_samples - start_index, 0)

        i_saved = 0
        figure = None
        print("==> saving images to {}".format(out_dir))
        if nb_attacked and not os.path.exists(out_dir):
            os.makedirs(out_dir)

        # Loop over the samples we want to perturb into adversarial examples
        for sample_ind in range(start_index, FLAGS.source_samples):
            sample = X_train[sample_ind:(sample_ind + 1)]
            original = np.reshape(sample, img_shape)

            # We want to find an adversarial example for each possible target
            # class
            current_class = int(np.argmax(Y_train[sample_ind]))
            target_classes = other_classes(FLAGS.nb_classes, current_class)

            # For the grid visualization, keep original images along the
            # diagonal
            grid_viz_data[current_class, current_class, :, :, :] = original

            # Successful adversarials for this sample. Since we do not want
            # one image per (sample, target) pair, a single one of them is
            # sampled uniformly and saved once all targets have been tried.
            adversarials = []

            # Loop over all target classes
            for target in target_classes:
                print("image {}".format(sample_ind))
                print('--------------------------------------')
                print('Creating adv. example for target class ' + str(target))

                # This call runs the Jacobian-based saliency map approach
                adv_x, res, percent_perturb = jsma(
                    sess, x, predictions, grads, sample, target,
                    theta=1, gamma=0.1, increase=True, back='tf',
                    clip_min=0, clip_max=1)
                adversarial = np.reshape(adv_x, img_shape)

                # Display the original and adversarial images side-by-side
                if FLAGS.viz_enabled:
                    if figure is None:
                        figure = pair_visual(original, adversarial)
                    else:
                        figure = pair_visual(original, adversarial, figure)

                if res == 1:
                    adversarials.append(adversarial)

                # Add our adversarial example to our grid data
                grid_viz_data[target, current_class, :, :, :] = adversarial

                # Update the arrays for later analysis
                results[target, sample_ind] = res
                perturbations[target, sample_ind] = percent_perturb

            if adversarials:
                # The upper bound of randint is exclusive, so passing
                # len(adversarials) makes every candidate selectable.
                idx_uniform = np.random.randint(0, len(adversarials))
                print(idx_uniform)
                scipy.misc.imsave(
                    out_dir + im_base + str(sample_ind) + '.png',
                    adversarials[idx_uniform])
                i_saved += 1
                print("==> images saved: {}".format(i_saved))
            else:
                print("No adversarials generated")

        # Compute the number of adversarial examples that were successfully
        # found, relative to the (sample, target) pairs actually attacked
        nb_targets_tried = (FLAGS.nb_classes - 1) * nb_attacked
        if nb_targets_tried:
            succ_rate = float(np.sum(results)) / nb_targets_tried
            # Compute the average distortion introduced by the algorithm
            percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
        else:
            succ_rate = 0.
            percent_perturbed = 0.
        print(
            'Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))
        print('Avg. rate of perturbed features {0:.2f}'.format(
            percent_perturbed))

        # Compute the average distortion introduced for successful samples
        # only: select the successful entries instead of zeroing the others
        succ_mask = (results == 1)
        if int(np.sum(succ_mask)):
            percent_perturb_succ = np.mean(perturbations[succ_mask])
        else:
            # No successful attack: the average is undefined
            percent_perturb_succ = float('nan')
        print(
            'Avg. rate of perturbed features for successful '
            'adversarial examples {0:.2f}'.format(percent_perturb_succ))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if FLAGS.viz_enabled:
            _ = grid_visual(grid_viz_data)
Ejemplo n.º 7
0
                # Check if success was achieved
                res = int(model_argmax(sess, x, preds, adv_x) == target)

                # Computer number of modified features
                adv_x_reshape = adv_x.reshape(-1)
                test_in_reshape = X_test[sample_ind].reshape(-1)
                nb_changed = np.where(
                    adv_x_reshape != test_in_reshape)[0].shape[0]
                percent_perturb = float(nb_changed) / \
                    adv_x.reshape(-1).shape[0]

                # Display the original and adversarial images side-by-side
                if FLAGS.viz_enabled:
                    figure = pair_visual(
                        np.reshape(sample, (img_rows, img_cols, channels)),
                        np.reshape(adv_x, (img_rows, img_cols, channels)),
                        figure)

                # Add our adversarial example to our grid data
                grid_viz_data[target, current_class, :, :, :] = np.reshape(
                    adv_x, (img_rows, img_cols, channels))

                # Update the arrays for later analysis
                results[target, sample_ind] = res
                perturbations[target, sample_ind] = percent_perturb

        print('--------------------------------------')

        # Compute the number of adversarial examples that were successfully
        # found
        nb_targets_tried = ((nb_classes - 1) * nb_samples)
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Restores the model built by cnn_model() from
    FLAGS.train_dir/FLAGS.filename (or trains and saves it when that file
    does not exist), evaluates it on the MNIST test set, then uses a
    SaliencyMapMethod attack object to craft one adversarial example per
    (test input, target class) pair for the first FLAGS.source_samples test
    inputs. Success rate and rates of perturbed features are printed.

    :param argv: unused; accepted so the function can serve as an entry point
    :return: None
    """
    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, save_path)
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, preds, X_train, Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes-1) + ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Shapes used when storing images in the grid and when plotting pairs
    img_shape = (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)
    plot_shape = (FLAGS.img_rows, FLAGS.img_cols)

    # Initialize our array for grid visualization
    grid_viz_data = np.zeros((FLAGS.nb_classes, FLAGS.nb_classes) + img_shape,
                             dtype='f')

    # Define the SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)

    # Attack parameters shared by every call; only 'y_val' changes per target
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'nb_classes': FLAGS.nb_classes, 'clip_min': 0.,
                   'clip_max': 1., 'targets': y,
                   'y_val': None}

    # Handle of the pair-visualization figure; created on the first plot and
    # reused afterwards
    figure = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(FLAGS.source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, FLAGS.source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, img_shape)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_val'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the fraction of modified features
            changed = adv_x.reshape(-1) != X_test[sample_ind].reshape(-1)
            percent_perturb = float(np.sum(changed)) / adv_x.size

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                original_img = np.reshape(sample, plot_shape)
                adversarial_img = np.reshape(adv_x, plot_shape)
                if figure is None:
                    figure = pair_visual(original_img, adversarial_img)
                else:
                    figure = pair_visual(original_img, adversarial_img,
                                         figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, img_shape)

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))

    # Compute the average distortion introduced by the algorithm. The row of
    # the true class is never attacked and stays at zero, so divide by the
    # number of attempts rather than by the size of the array.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # select the successful entries instead of zeroing the failed ones, which
    # would still count in the denominator of the mean.
    succ_mask = (results == 1)
    if int(np.sum(succ_mask)):
        percent_perturb_succ = np.mean(perturbations[succ_mask])
    else:
        # No successful attack: the average is undefined
        percent_perturb_succ = float('nan')
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
Ejemplo n.º 9
0
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial for the Jacobian-based saliency map approach

    Loads a model built by make_basic_cnn from FLAGS.model_path, evaluates
    it on the CIFAR10 test set, then crafts one targeted JSMA adversarial
    example per (test input, target class) pair for the first
    FLAGS.nb_samples test inputs. Success rate and rates of perturbed
    features are printed; the fraction of failed attacks is stored in the
    local AccuracyReport, which is not returned.

    :param argv: unused; accepted so the function can serve as an entry point
    :return: None
    """
    # Object used to keep track of key accuracies
    report = AccuracyReport()

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10
    img_shape = (img_rows, img_cols, channels)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Sanity check: labels are expected to have one column per class
    assert Y_train.shape[1] == nb_classes

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None,) + img_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    model_path = FLAGS.model_path
    nb_samples = FLAGS.nb_samples

    from cnn_models import make_basic_cnn
    model = make_basic_cnn('fp_',
                           input_shape=(None,) + img_shape,
                           nb_filters=FLAGS.nb_filters)

    preds = model(x)
    print("Defined TensorFlow model graph with %d parameters" % model.n_params)

    def evaluate(eval_params):
        # Evaluate the model on legitimate test examples
        return model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)

    model_load(sess, model_path)
    print('Restored model from %s' % model_path)
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = evaluate(eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, nb_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, nb_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_viz_data = np.zeros((nb_classes, nb_classes) + img_shape, dtype='f')

    from cleverhans.attacks import SaliencyMapMethod
    jsma = SaliencyMapMethod(model, sess=sess)

    # Attack parameters shared by every call; only 'y_target' changes
    jsma_params = {
        'gamma': FLAGS.gamma,
        'theta': 1.,
        'symbolic_impl': True,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    # Handle of the pair-visualization figure, passed back to pair_visual on
    # every call
    figure = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(nb_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, nb_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, img_shape)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the fraction of modified features
            changed = adv_x.reshape(-1) != X_test[sample_ind].reshape(-1)
            percent_perturb = float(np.sum(changed)) / adv_x.size

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, img_shape),
                    np.reshape(adv_x, img_shape), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, img_shape)

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * nb_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm. The row of
    # the true class is never attacked and stays at zero, so divide by the
    # number of attempts rather than by the size of the array.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only:
    # select the successful entries instead of zeroing the failed ones, which
    # would still count in the denominator of the mean.
    succ_mask = (results == 1)
    if int(np.sum(succ_mask)):
        percent_perturb_succ = np.mean(perturbations[succ_mask])
    else:
        # No successful attack: the average is undefined
        percent_perturb_succ = float('nan')
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)
Ejemplo n.º 10
0
def _count_modified_neighbors(modified):
    """
    Count modified pixels and how many of their 8-connected neighbours are
    modified as well.

    :param modified: 2-D boolean array, True where a pixel was perturbed
    :return: tuple ``(nb_modified, nb_neighbor_pairs)`` where
             ``nb_modified`` is the number of True pixels and
             ``nb_neighbor_pairs`` is the sum, over every True pixel, of the
             number of True pixels among its (up to 8) in-bounds neighbours
    """
    modified = np.asarray(modified, dtype=bool)
    rows, cols = modified.shape
    # Zero-pad by one pixel so that border and corner pixels need no special
    # casing: out-of-image neighbours simply count as "not modified".
    padded = np.pad(modified.astype(np.int64), 1, mode='constant')
    neighbor_counts = np.zeros((rows, cols), dtype=np.int64)
    for d_row in (-1, 0, 1):
        for d_col in (-1, 0, 1):
            if d_row == 0 and d_col == 0:
                continue
            neighbor_counts += padded[1 + d_row:1 + d_row + rows,
                                      1 + d_col:1 + d_col + cols]
    return int(modified.sum()), int(neighbor_counts[modified].sum())


def _rate_summary(values):
    """
    Return ``(mean, min, max)`` of a 1-D array, or NaNs when it is empty.

    :param values: 1-D numpy array of perturbation rates
    :return: tuple of three floats
    """
    if values.size == 0:
        nan = float('nan')
        return nan, nan, nan
    return float(np.mean(values)), float(np.min(values)), float(
        np.max(values))


def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS,
                        batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA).

    When the module-level ``TRAIN_NEW`` flag equals 1 the model is trained
    and saved to "test.joblib"; otherwise it is loaded from that file.
    Randomly drawn test images (seeded with the module-level ``SEED``) are
    then attacked towards every other class, and success / distortion /
    pixel-adjacency statistics are printed.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with ``clean_train_clean_eval`` and
             ``clean_train_adv_eval`` filled in
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session (default configuration)
    sess = tf.Session(config=tf.ConfigProto())
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_picklable_cnn()

    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }

    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    if TRAIN_NEW == 1:
        with sess.as_default():
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            save("test.joblib", model)
    else:
        # NOTE(review): load() presumably unpickles the file -- only point it
        # at model files from a trusted source.
        with sess.as_default():
            model = load("test.joblib")
        assert len(model.get_params()) > 0
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Adjacency statistics accumulated over ALL attacked samples and targets
    total_neighbors = 0
    total_modified = 0
    nb_test = x_test.shape[0]

    # Loop over the samples we want to perturb into adversarial examples
    seed(SEED)
    for sample_ind in xrange(0, source_samples):
        # Draw the test image to attack. The upper bound is nb_test - 1 so
        # the index is valid whether randint treats its upper bound as
        # inclusive (random.randint) or exclusive (numpy) -- TODO confirm
        # which one is imported.
        img = randint(0, nb_test - 1)
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[img:(img + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[img]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features, relative to the image that
            # was actually attacked (x_test[img], not x_test[sample_ind])
            adv_x_flat = adv_x.reshape(-1)
            sample_flat = sample.reshape(-1)
            nb_changed = np.where(adv_x_flat != sample_flat)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_flat.shape[0]

            # Mark pixels whose intensity increased by at least one 8-bit
            # level. This is intended to mirror the saturating 8-bit
            # conversion of the PNG write/read round-trip it replaces
            # (negative differences clamp to 0); a pixel counts as modified
            # if any of its channels changed.
            diff = np.reshape(adv_x - sample, (img_rows, img_cols, nchannels))
            modified = (np.rint(diff * 255) > 0).any(axis=-1)
            nb_modified, nb_neighbors = _count_modified_neighbors(modified)
            total_modified += nb_modified
            total_neighbors += nb_neighbors

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)
            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Guard against the case where no pixel was modified at all
    if total_modified > 0:
        avg_neighbors = float(total_neighbors) / total_modified
    else:
        avg_neighbors = float('nan')
    print("average neighbors per modified pixel ", avg_neighbors)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.8f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Distortion statistics. Slots that were never written (or where nothing
    # changed) stay at 0 and are excluded, so that the average is taken over
    # the same population as the min and max.
    attacked = perturbations > 0
    percent_perturbed, min_perturbed, max_perturbed = _rate_summary(
        perturbations[attacked])
    print('Avg. rate of perturbed features {0:.8f}'.format(percent_perturbed))
    print('MIN of perturbed features {0:.8f}'.format(min_perturbed))
    print('MAX of perturbed features {0:.8f}'.format(max_perturbed))

    # Same statistics restricted to attacks that reached their target class
    succeeded = attacked & (results > 0)
    percent_perturb_succ, min_perturb_succ, max_perturb_succ = _rate_summary(
        perturbations[succeeded])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(percent_perturb_succ))
    print('Min of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(min_perturb_succ))
    print('Max of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(max_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Ejemplo n.º 11
0
def mnist_tutorial_jsma(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA).

    Trains a basic CNN, evaluates it on the clean test set, then attacks the
    first ``source_samples`` test images towards every other class and
    prints success and distortion statistics.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object with ``clean_train_clean_eval`` and
             ``clean_train_adv_eval`` filled in
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {"batch_size": batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print("Crafting " + str(source_samples) + " * " + str(nb_classes - 1) +
          " adversarial examples")

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype="i")

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype="f")

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype="f")

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        "theta": 1.0,
        "gamma": 0.1,
        "clip_min": 0.0,
        "clip_max": 1.0,
        "y_target": None,
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print("--------------------------------------")
        print("Attacking input %i/%i" % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print("Generating adv. example for target class %i" % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params["y_target"] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the fraction of input features the attack modified
            adv_x_flat = adv_x.reshape(-1)
            sample_flat = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_flat != sample_flat)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_flat.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure,
                )

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = (nb_classes - 1) * source_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print("Avg. rate of successful adv. examples {0:.4f}".format(succ_rate))
    report.clean_train_adv_eval = 1.0 - succ_rate

    # Compute the average distortion introduced by the algorithm. Slots that
    # were never written (or where nothing changed) stay at 0 and are skipped.
    attacked = perturbations != 0
    if attacked.any():
        percent_perturbed = np.mean(perturbations[attacked])
    else:
        percent_perturbed = float("nan")
    print("Avg. rate of perturbed features {0:.4f}".format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # Failed attacks are excluded from the mean rather than counted as zeros,
    # which would scale the average down by the success rate.
    succeeded = attacked & (results == 1)
    if succeeded.any():
        percent_perturb_succ = np.mean(perturbations[succeeded])
    else:
        percent_perturb_succ = float("nan")
    print("Avg. rate of perturbed features for successful "
          "adversarial examples {0:.4f}".format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt

        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report