Beispiel #1
0
def main(argv):
    checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if checkpoint is None:
        raise ValueError("Couldn't find latest checkpoint in " +
                         FLAGS.checkpoint_dir)

    train_start = 0
    train_end = 60000
    test_start = 0
    test_end = 10000
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    assert Y_train.shape[1] == 10

    # NOTE: for compatibility with Madry Lab downloadable checkpoints,
    # we cannot enclose this in a scope or do anything else that would
    # change the automatic naming of the variables.
    model = MadryMNIST()

    x_input = tf.placeholder(tf.float32, shape=[None, 784])
    x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    y = tf.placeholder(tf.float32, shape=[None, 10])

    if FLAGS.attack_type == 'fgsm':
        fgsm = FastGradientMethod(model)
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
        adv_x = fgsm.generate(x_image, **fgsm_params)
    elif FLAGS.attack_type == 'bim':
        bim = BasicIterativeMethod(model)
        bim_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                      'nb_iter': 50,
                      'eps_iter': .01}
        adv_x = bim.generate(x_image, **bim_params)
    else:
        raise ValueError(FLAGS.attack_type)
    preds_adv = model.get_probs(adv_x)

    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, checkpoint)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': FLAGS.batch_size}
        t1 = time.time()
        acc = model_eval(
            sess, x_image, y, preds_adv, X_test, Y_test, args=eval_par)
        t2 = time.time()
        print("Took", t2 - t1, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
Beispiel #2
0
def get_logits_over_interval(sess, model, x_data, fgsm_params,
                             min_epsilon=-10., max_epsilon=10.,
                             num_points=21):
    """Get logits when the input is perturbed in an interval in adv direction.

    Args:
        sess: Tf session
        model: Model for which we wish to get logits.
        x_data: Numpy array corresponding to single data.
                point of shape [height, width, channels].
        fgsm_params: Parameters for generating adversarial examples.
        min_epsilon: Minimum value of epsilon over the interval.
        max_epsilon: Maximum value of epsilon over the interval.
        num_points: Number of points used to interpolate.

    Returns:
        Numpy array containing logits.

    Raises:
        ValueError if min_epsilon is larger than max_epsilon.
    """
    # Get the height, width and number of channels
    height = x_data.shape[0]
    width = x_data.shape[1]
    channels = x_data.shape[2]
    size = height * width * channels

    x_data = np.expand_dims(x_data, axis=0)
    import tensorflow as tf
    from cleverhans.attacks import FastGradientMethod

    # Define the data placeholder
    x = tf.placeholder(dtype=tf.float32,
                       shape=[1, height,
                              width,
                              channels],
                       name='x')
    # Define adv_x
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)

    if min_epsilon > max_epsilon:
        raise ValueError('Minimum epsilon is less than maximum epsilon')

    eta = tf.nn.l2_normalize(adv_x - x, dim=0)
    epsilon = tf.reshape(tf.lin_space(float(min_epsilon),
                                      float(max_epsilon),
                                      num_points),
                         (num_points, 1, 1, 1))
    lin_batch = x + epsilon * eta
    logits = model.get_logits(lin_batch)
    with sess.as_default():
        log_prob_adv_array = sess.run(logits,
                                      feed_dict={x: x_data})
    return log_prob_adv_array
Beispiel #3
0
def baseline_deepfool(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      nb_epochs=6,
                      batch_size=128,
                      learning_rate=0.001,
                      clean_train=True,
                      testing=False,
                      backprop_through_attack=False,
                      nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param clean_train: if true, train on clean examples
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    deepfool_params = {
        'nb_candidate': 10,
        'overshoot': 0.02,
        'max_iter': 50,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        #
        # HERE already trained model, thus we need a new one (model_2)
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_train,
                             Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        deepfool = DeepFool(model, sess=sess)
        adv_x = deepfool.generate(x, **deepfool_params)

        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on DeepFool adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_train,
                             Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")
        # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)

    deepfool2 = DeepFool(model_2, sess=sess)
    adv_x_2 = deepfool2.generate(x, **deepfool_params)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the atacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    #
    # let's generate DeepFool examples
    #
    # let's generate FGSM examples
    #
    fgsm = FastGradientMethod(model_2, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x_fgsm = fgsm.generate(x, **fgsm_params)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the atacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_fgsm = tf.stop_gradient(adv_x_fgsm)
    preds_2_fgsm = model_2(adv_x_fgsm)

    # DON'T WANT TO TRAIN on FGSM adv examples yet

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on FGSM adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_fgsm,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on FGSM adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

        # Accuracy of the DeepFool adv trained model on DeepFool examples
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on DeepFool adversarial examples: %0.4f' %
              accuracy)

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(nb_epochs=6, batch_size=128, train_end=-1, test_end=-1,
                   learning_rate=0.001):
    """
    MNIST cleverhans tutorial
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Train a pytorch MNIST model
    torch_model = PytorchMnistModel()
    if torch.cuda.is_available():
        torch_model = torch_model.cuda()
    report = AccuracyReport()

    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
        batch_size=batch_size)

    # Truncate the datasets so that our test run more quickly
    train_loader.dataset.train_data = train_loader.dataset.train_data[
                                      :train_end]
    test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end]

    # Train our model
    optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate)
    train_loss = []

    total = 0
    correct = 0
    step = 0
    for epoch in range(nb_epochs):
        for xs, ys in train_loader:
            xs, ys = Variable(xs), Variable(ys)
            if torch.cuda.is_available():
                xs, ys = xs.cuda(), ys.cuda()
            optimizer.zero_grad()
            preds = torch_model(xs)
            loss = F.nll_loss(preds, ys)
            loss.backward()  # calc gradients
            train_loss.append(loss.data.item())
            optimizer.step()  # update gradients

            preds_np = preds.data.cpu().numpy()
            correct += (np.argmax(preds_np, axis=1) == ys).sum()
            total += len(xs)
            step += 1
            if total % 1000 == 0:
                acc = float(correct) / total
                print('[%s] Training accuracy: %.2f%%' % (step, acc * 100))
                total = 0
                correct = 0

    # Evaluate on clean data
    total = 0
    correct = 0
    for xs, ys in test_loader:
        xs, ys = Variable(xs), Variable(ys)
        if torch.cuda.is_available():
            xs, ys = xs.cuda(), ys.cuda()

        preds = torch_model(xs)
        preds_np = preds.data.cpu().numpy()

        correct += (np.argmax(preds_np, axis=1) == ys).sum()
        total += len(xs)

    acc = float(correct) / total
    report.clean_train_clean_eval = acc
    print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100))

    # We use tf for evaluation on adversarial data
    sess = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(None, 1, 28, 28,))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(torch_model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an FGSM attack
    fgsm_op = FastGradientMethod(cleverhans_model, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x_op = fgsm_op.generate(x_op, **fgsm_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Run an evaluation of our model against fgsm
    total = 0
    correct = 0
    for xs, ys in test_loader:
        adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs})
        correct += (np.argmax(adv_preds, axis=1) == ys).sum()
        total += len(xs)

    acc = float(correct) / total
    print('Adv accuracy: {:.3f}'.format(acc * 100))
    report.clean_train_adv_eval = acc
    return report
Beispiel #5
0
def blackbox(gan,
             rec_data_path=None,
             batch_size=128,
             learning_rate=0.001,
             nb_epochs=10,
             holdout=150,
             data_aug=6,
             nb_epochs_s=10,
             lmbda=0.1,
             online_training=False,
             train_on_recs=False,
             test_on_dev=False,
             defense_type='none'):
    """MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    
    Args:
        train_start: index of first training set example
        train_end: index of last training set example
        test_start: index of first test set example
        test_end: index of last test set example
        defense_type: Type of defense against blackbox attacks
    
    Returns:
        a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    FLAGS = flags.FLAGS

    # Set logging level to see debug information.
    set_log_level(logging.WARNING)

    # Dictionary used to keep track and return key accuracies.
    accuracies = {}

    # Create TF session.
    adv_training = False
    if defense_type:
        if defense_type == 'defense_gan' and gan:
            sess = gan.sess
            gan_defense_flag = True
        else:
            gan_defense_flag = False
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
        if 'adv_tr' in defense_type:
            adv_training = True
    else:
        gan_defense_flag = False
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

    train_images, train_labels, test_images, test_labels = \
        get_cached_gan_data(gan, test_on_dev, orig_data_flag=True)

    x_shape, classes = list(train_images.shape[1:]), train_labels.shape[1]
    nb_classes = classes

    type_to_models = {
        'A': model_a,
        'B': model_b,
        'C': model_c,
        'D': model_d,
        'E': model_e,
        'F': model_f,
        'Q': model_q,
        'Y': model_y,
        'Z': model_z
    }

    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        bb_model = type_to_models[FLAGS.bb_model](
            input_shape=[None] + x_shape,
            nb_classes=train_labels.shape[1],
        )
    with tf.variable_scope("Substitute", reuse=tf.AUTO_REUSE):
        sub_model = type_to_models[FLAGS.sub_model](
            input_shape=[None] + x_shape,
            nb_classes=train_labels.shape[1],
        )

    if FLAGS.debug:
        train_images = train_images[:20 * batch_size]
        train_labels = train_labels[:20 * batch_size]
        debug_dir = os.path.join('debug', 'blackbox', FLAGS.debug_dir)
        ensure_dir(debug_dir)
        x_debug_test = test_images[:batch_size]

    # Initialize substitute training set reserved for adversary
    images_sub = test_images[:holdout]
    labels_sub = np.argmax(test_labels[:holdout], axis=1)

    print(labels_sub)

    # Redefine test set as remaining samples unavailable to adversaries
    if FLAGS.num_tests > 0:
        test_images = test_images[:FLAGS.num_tests]
        test_labels = test_labels[:FLAGS.num_tests]

    test_images = test_images[holdout:]
    test_labels = test_labels[holdout:]

    # Define input and output TF placeholders

    if FLAGS.image_dim[0] == 3:
        FLAGS.image_dim = [
            FLAGS.image_dim[1], FLAGS.image_dim[2], FLAGS.image_dim[0]
        ]

    images_tensor = tf.placeholder(tf.float32, shape=[None] + x_shape)
    labels_tensor = tf.placeholder(tf.float32, shape=(None, classes))

    rng = np.random.RandomState([11, 24, 1990])

    train_images_bb, train_labels_bb, test_images_bb, test_labels_bb = \
        train_images, train_labels, test_images, \
        test_labels

    cur_gan = gan
    if FLAGS.debug:
        train_images_bb = train_images_bb[:20 * batch_size]
        train_labels_bb = train_labels_bb[:20 * batch_size]

    # Prepare the black_box model.
    prep_bbox_out = prep_bbox(sess,
                              images_tensor,
                              labels_tensor,
                              train_images_bb,
                              train_labels_bb,
                              test_images_bb,
                              test_labels_bb,
                              nb_epochs,
                              batch_size,
                              learning_rate,
                              rng=rng,
                              gan=cur_gan,
                              adv_training=adv_training,
                              cnn_arch=bb_model)

    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    reconstructor = get_reconstructor(gan)
    recon_tensors, _ = reconstructor.reconstruct(images_tensor,
                                                 batch_size=batch_size,
                                                 reconstructor_id=2)

    model_sub, preds_sub = train_sub(sess,
                                     images_tensor,
                                     labels_tensor,
                                     model.get_logits(recon_tensors),
                                     images_sub,
                                     labels_sub,
                                     nb_classes,
                                     nb_epochs_s,
                                     batch_size,
                                     learning_rate,
                                     data_aug,
                                     lmbda,
                                     rng=rng,
                                     substitute_model=sub_model,
                                     dataset_name=gan.dataset_name)

    accuracies['sub'] = 0

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    eps = attack_config_dict[gan.dataset_name]['eps']
    min_val = attack_config_dict[gan.dataset_name]['clip_min']

    fgsm_par = {'eps': eps, 'ord': np.inf, 'clip_min': min_val, 'clip_max': 1.}

    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute.
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(images_tensor, **fgsm_par)

    if FLAGS.debug and gan is not None:  # To see some qualitative results.
        recon_tensors, _ = reconstructor.reconstruct(x_adv_sub,
                                                     batch_size=batch_size,
                                                     reconstructor_id=2)
        x_rec_orig, _ = reconstructor.reconstruct(images_tensor,
                                                  batch_size=batch_size,
                                                  reconstructor_id=3)

        x_adv_sub_val = sess.run(x_adv_sub,
                                 feed_dict={images_tensor: x_debug_test})
        x_rec_debug_val = sess.run(recon_tensors,
                                   feed_dict={images_tensor: x_debug_test})
        x_rec_orig_val = sess.run(x_rec_orig,
                                  feed_dict={images_tensor: x_debug_test})
        #sess.run(tf.local_variables_initializer())
        #x_rec_debug_val, x_rec_orig_val = sess.run([reconstructed_tensors, x_rec_orig], feed_dict={images_tensor: x_debug_test})

        save_images_files(x_adv_sub_val, output_dir=debug_dir, postfix='adv')

        postfix = 'gen_rec'
        save_images_files(x_rec_debug_val,
                          output_dir=debug_dir,
                          postfix=postfix)
        save_images_files(x_debug_test, output_dir=debug_dir, postfix='orig')
        save_images_files(x_rec_orig_val,
                          output_dir=debug_dir,
                          postfix='orig_rec')

    if gan_defense_flag:
        num_dims = len(images_tensor.get_shape())
        avg_inds = list(range(1, num_dims))

        recons_adv, zs = reconstructor.reconstruct(x_adv_sub,
                                                   batch_size=batch_size)

        diff_op = tf.reduce_mean(tf.square(x_adv_sub - recons_adv),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        acc_adv, diffs_mean, roc_info_adv = model_eval_gan(
            sess,
            images_tensor,
            labels_tensor,
            predictions=model.get_logits(recons_adv),
            test_images=test_images,
            test_labels=test_labels,
            args=eval_params,
            diff_op=diff_op,
            z_norm=z_norm,
            recons_adv=recons_adv,
            adv_x=x_adv_sub,
            debug=False)

        # reconstruction on clean images
        recons_clean, zs = reconstructor.reconstruct(images_tensor,
                                                     batch_size=batch_size)

        diff_op = tf.reduce_mean(tf.square(images_tensor - recons_clean),
                                 axis=avg_inds)
        z_norm = tf.reduce_sum(tf.square(zs), axis=1)

        acc_rec, diffs_mean_rec, roc_info_rec = model_eval_gan(
            sess,
            images_tensor,
            labels_tensor,
            model.get_logits(recons_clean),
            None,
            test_images=test_images,
            test_labels=test_labels,
            args=eval_params,
            diff_op=diff_op,
            z_norm=z_norm,
            recons_adv=recons_clean,
            adv_x=images_tensor,
            debug=False)

        print('Evaluation accuracy with reconstruction: {}'.format(acc_rec))
        print('Test accuracy of oracle on cleaned images : {}'.format(acc_adv))

        return {
            'acc_adv': acc_adv,
            'acc_rec': acc_rec,
            'roc_info_adv': roc_info_adv,
            'roc_info_rec': roc_info_rec
        }

    else:
        acc_adv = model_eval(sess,
                             images_tensor,
                             labels_tensor,
                             model.get_logits(x_adv_sub),
                             test_images,
                             test_labels,
                             args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(acc_adv))
        return {
            'acc_adv': acc_adv,
            'acc_rec': 0,
            'roc_info_adv': None,
            'roc_info_rec': None
        }
model_dict = {}
model_dict[0] = model
model_dict[1] = model_target

Xdata_dict = {}
Xdata_dict[0] = X_freq_s
Xdata_dict[1] = X_freq_t

indices = get_indices(model_dict, Xdata_dict, y_test, len(model_dict))
indices = np.random.choice(indices, 1000, replace=False)

#FGSM
print("FGSM")
fgsm_params = {'eps': 0.03, 'clip_min': 0., 'clip_max': 1.}

fgsm_attack = FastGradientMethod(wrap, sess=sess)
X_adv = np.zeros((len(indices), 224, 224, 3))
for i in range(0, len(indices), 100):
    X_adv[i:(i + 100)] = fgsm_attack.generate_np(X_test[indices[i:(i + 100)]],
                                                 **fgsm_params)
print("metrics")
print(metrics(model, X_adv, X_test, y_test, indices))
print(metrics(model_target, X_adv, X_test, y_test, indices))
X_adv_freq = filt(X_adv, type_freq_t, lim_freq_t1)
print(metrics(model_target, X_adv_freq, X_test, y_test, indices))

#MIM Diverse
print("MIM-DIVERSE")
mim_params = {
    'eps': 0.03,
    'eps_iter': 0.01,
Beispiel #7
0
class TestFastGradientMethod(CleverHansTest):
    def setUp(self):
        super(TestFastGradientMethod, self).setUp()
        import tensorflow as tf

        # The world's simplest neural network
        def my_model(x):
            W1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
            h1 = tf.nn.sigmoid(tf.matmul(x, W1))
            W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
            res = tf.nn.softmax(tf.matmul(h1, W2))
            return res

        self.sess = tf.Session()
        self.model = my_model
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def help_generate_np_gives_adversarial_example(self, ord):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=.5,
                                        ord=ord,
                                        clip_min=-5,
                                        clip_max=5)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1)**.5
        self.assertClose(delta, 0.5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        self.help_generate_np_gives_adversarial_example(np.infty)

    def test_generate_np_gives_adversarial_example_l1(self):
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        self.help_generate_np_gives_adversarial_example(2)

    def test_targeted_generate_np_gives_adversarial_example(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)
        random_labs = np.random.random_integers(0, 1, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        x_adv = self.attack.generate_np(x_val,
                                        eps=.5,
                                        ord=np.inf,
                                        clip_min=-5,
                                        clip_max=5,
                                        y_target=random_labs_one_hot)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val,
                                            eps=eps,
                                            ord=np.inf,
                                            clip_min=-5.0,
                                            clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=0.5,
                                        ord=np.inf,
                                        clip_min=-0.2,
                                        clip_max=0.1)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)

    def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):
        import tensorflow as tf

        x_val = np.random.rand(1, 2)
        x_val = np.array(x_val, dtype=np.float32)

        self.attack.generate_np(x_val,
                                eps=.3,
                                num_iterations=10,
                                clip_max=-5.0,
                                clip_min=-5.0,
                                xi=1e-6)

        old_grads = tf.gradients

        def fn(*x, **y):
            raise RuntimeError()

        tf.gradients = fn

        self.attack.generate_np(x_val,
                                eps=.2,
                                num_iterations=10,
                                clip_max=-4.0,
                                clip_min=-4.0,
                                xi=1e-5)

        tf.gradients = old_grads
Beispiel #8
0
#Load classifier model whose gradients will be used to create adversarial examples
keras_model = load_model('classifiers/fc-784-100-100-10-defender-model.h5')
atkr_clfr = load_model('classifiers/fc-784-200-200-100-10-attacker-model.h5')
backend.set_learning_phase(False)

data_train_shuffle = data_train
data_train1 = data_train_shuffle[0:30000, 0:784]
data_train2 = data_train_shuffle[30000:60000, 0:784]

#Create adversarial examples on testing data
sess = backend.get_session()
eta1 = 0.25
eta2 = 0.50
wrap = KerasModelWrapper(keras_model)
fgsm = FastGradientMethod(wrap, sess=sess)
adv_train_x1 = fgsm.generate_np(data_train1,
                                eps=eta1,
                                clip_min=0.,
                                clip_max=1.)
adv_train_x2 = fgsm.generate_np(data_train2,
                                eps=eta2,
                                clip_min=0.,
                                clip_max=1.)
adv_train_x = np.vstack([adv_train_x1, adv_train_x2])
adv_test_x = fgsm.generate_np(data_test, eps=eta1, clip_min=0., clip_max=1.)

#Total datasets
data_total_train = np.vstack([data_train, adv_train_x])
data_total_test = np.vstack([data_test, adv_test_x])
Beispiel #9
0
def main(argv):

    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {'batch_size': FLAGS.batch_size,
                         'clip_min': 0., 'clip_max': 255.}

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({'binary_search_steps': 1,
                                  'max_iterations': 100,
                                  'learning_rate': 0.1,
                                  'initial_const': 10,
                                  'batch_size': 10
                                  })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)

            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            for e in epsilons:
                t1 = time.time()
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess, x, y, preds_adv, X_test[
                    :nb_samples], Y_test[:nb_samples], args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess, x, y, preds_adv, X_test[
                :nb_samples], Y_test[:nb_samples], args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
Beispiel #10
0
class TestFastGradientMethod(CleverHansTest):
  def setUp(self):
    super(TestFastGradientMethod, self).setUp()

    self.sess = tf.Session()
    self.model = SimpleModel()
    self.attack = FastGradientMethod(self.model, sess=self.sess)

  def generate_adversarial_examples_np(self, ord, eps, **kwargs):
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)

    x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord,
                                    clip_min=-5, clip_max=5, **kwargs)
    if ord == np.inf:
      delta = np.max(np.abs(x_adv - x_val), axis=1)
    elif ord == 1:
      delta = np.sum(np.abs(x_adv - x_val), axis=1)
    elif ord == 2:
      delta = np.sum(np.square(x_adv - x_val), axis=1) ** .5

    return x_val, x_adv, delta

  def help_generate_np_gives_adversarial_example(self, ord, eps=.5,
                                                 **kwargs):
    x_val, x_adv, delta = self.generate_adversarial_examples_np(ord, eps,
                                                                **kwargs)
    self.assertClose(delta, eps)
    orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
    new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
    self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

  def test_invalid_input(self):
    x_val = -np.ones((2, 2), dtype='float32')
    with self.assertRaises(tf.errors.InvalidArgumentError) as context:
      self.attack.generate_np(x_val, eps=1., clip_min=0., clip_max=1.)
    self.assertTrue(context.exception)


  def test_generate_np_gives_adversarial_example_linfinity(self):
    self.help_generate_np_gives_adversarial_example(np.infty)

  def test_generate_np_gives_adversarial_example_l1(self):
    self.help_generate_np_gives_adversarial_example(1)

  def test_generate_np_gives_adversarial_example_l2(self):
    self.help_generate_np_gives_adversarial_example(2)

  def test_generate_respects_dtype(self):
    self.attack = FastGradientMethod(self.model, sess=self.sess,
                                     dtypestr='float64')
    x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
    x_adv = self.attack.generate(x)
    self.assertEqual(x_adv.dtype, tf.float64)

  def test_targeted_generate_np_gives_adversarial_example(self):
    random_labs = np.random.random_integers(0, 1, 100)
    random_labs_one_hot = np.zeros((100, 2))
    random_labs_one_hot[np.arange(100), random_labs] = 1

    _, x_adv, delta = self.generate_adversarial_examples_np(
        eps=.5, ord=np.inf, y_target=random_labs_one_hot)

    self.assertClose(delta, 0.5)

    new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
    self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

  def test_generate_np_can_be_called_with_different_eps(self):
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)

    for eps in [0.1, 0.2, 0.3, 0.4]:
      x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                      clip_min=-5.0, clip_max=5.0)

      delta = np.max(np.abs(x_adv - x_val), axis=1)
      self.assertClose(delta, eps)

  def test_generate_np_clip_works_as_expected(self):
    x_val = np.random.rand(100, 2)
    x_val = np.array(x_val, dtype=np.float32)

    x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                    clip_min=-0.2, clip_max=0.1,
                                    sanity_checks=False)

    self.assertClose(np.min(x_adv), -0.2)
    self.assertClose(np.max(x_adv), 0.1)
Beispiel #11
0
        max_learning_rate - min_learning_rate) * np.exp(-i / decay_speed)

    if i % 500 == 0 or i == 50000:
        counter += 1
        # Saves generated images
        samples = sess.run(GXsigmoid_test,
                           feed_dict={
                               GY: all_classes,
                               BS: num_classes
                           })
        fig = plot_generator(samples)
        plt.savefig(folder_out + "gen_" + str(i).zfill(6) + '.png',
                    bbox_inches='tight')
        plt.close(fig)

        attack_fgsm = FastGradientMethod(model_classifier, sess=sess)
        adv_x_np = attack_fgsm.generate_np(x_test, **fgsm_params)
        fig = plot_generator(adv_x_np[:num_classes])
        plt.savefig(folder_out + "adv_" + str(i).zfill(6) + '.png',
                    bbox_inches='tight')
        plt.close(fig)

        accu_test, c_loss_test, sigmoid_test, softmax_test, sum_c = sess.run(
            [
                accuracy, c_loss, max_output_sigmoid_test,
                max_output_softmax_test, c_sum
            ], {
                X: x_test,
                Y_: y_test
            })
        writer.add_summary(sum_c, i)
def train(cifar10_data, epochs, L, learning_rate, scale3, Delta2, epsilon2, eps2_ratio, alpha, perturbFM, fgsm_eps, total_eps, logfile):
  logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , epsilon \t %d \n"%(fgsm_eps, learning_rate, alpha, total_eps))
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    eps_benign = 1/(1+eps2_ratio)*(epsilon2)
    eps_adv = eps2_ratio/(1+eps2_ratio)*(epsilon2)
    
    # Parameters Declarification
    #with tf.variable_scope('conv1') as scope:
    kernel1 = _variable_with_weight_decay('kernel1',
                                         shape=[4, 4, 3, 128],
                                         stddev=np.sqrt(2.0/(5*5*256))/math.ceil(5 / 2),
                                         wd=0.0, collect=[AECODER_VARIABLES])
    biases1 = _bias_on_cpu('biases1', [128], tf.constant_initializer(0.0), collect=[AECODER_VARIABLES])
    
    shape     = kernel1.get_shape().as_list()
    w_t       = tf.reshape(kernel1, [-1, shape[-1]])
    w         = tf.transpose(w_t)
    sing_vals = tf.svd(w, compute_uv=False)
    sensitivity = tf.reduce_max(sing_vals)
    gamma = 2*Delta2/(L*sensitivity) #2*3*(14*14 + 2)*16/(L*sensitivity)
    
    #with tf.variable_scope('conv2') as scope:
    kernel2 = _variable_with_weight_decay('kernel2',
                                         shape=[5, 5, 128, 128],
                                         stddev=np.sqrt(2.0/(5*5*256))/math.ceil(5 / 2),
                                         wd=0.0, collect=[CONV_VARIABLES])
    biases2 = _bias_on_cpu('biases2', [128], tf.constant_initializer(0.1), collect=[CONV_VARIABLES])
    #with tf.variable_scope('conv3') as scope:
    kernel3 = _variable_with_weight_decay('kernel3',
                                         shape=[5, 5, 256, 256],
                                         stddev=np.sqrt(2.0/(5*5*256))/math.ceil(5 / 2),
                                         wd=0.0, collect=[CONV_VARIABLES])
    biases3 = _bias_on_cpu('biases3', [256], tf.constant_initializer(0.1), collect=[CONV_VARIABLES])
    #with tf.variable_scope('local4') as scope:
    kernel4 = _variable_with_weight_decay('kernel4', shape=[int(image_size/4)**2*256, hk], stddev=0.04, wd=0.004, collect=[CONV_VARIABLES])
    biases4 = _bias_on_cpu('biases4', [hk], tf.constant_initializer(0.1), collect=[CONV_VARIABLES])
        #with tf.variable_scope('local5') as scope:
    kernel5 = _variable_with_weight_decay('kernel5', [hk, 10],
                                                  stddev=np.sqrt(2.0/(int(image_size/4)**2*256))/math.ceil(5 / 2), wd=0.0, collect=[CONV_VARIABLES])
    biases5 = _bias_on_cpu('biases5', [10], tf.constant_initializer(0.1), collect=[CONV_VARIABLES])

    #scale2 = tf.Variable(tf.ones([hk]))
    #beta2 = tf.Variable(tf.zeros([hk]))
    
    params = [kernel1, biases1, kernel2, biases2, kernel3, biases3, kernel4, biases4, kernel5, biases5]
    ########
    
    # Build a Graph that computes the logits predictions from the
    # inference model.
    FM_h = tf.placeholder(tf.float32, [None, 14, 14, 128]);
    noise = tf.placeholder(tf.float32, [None, image_size, image_size, 3]);
    adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 3]);
    
    x = tf.placeholder(tf.float32, [None,image_size,image_size,3]);
    adv_x = tf.placeholder(tf.float32, [None,image_size,image_size,3]);
    
    # Auto-Encoder #
    Enc_Layer2 = EncLayer(inpt=adv_x, n_filter_in = 3, n_filter_out = 128, filter_size = 3, W=kernel1, b=biases1, activation=tf.nn.relu)
    pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_x)[0], Delta = Delta2, epsilon = epsilon2, batch_size = L, learning_rate= learning_rate, W = kernel1, b = biases1, perturbFMx = adv_noise, perturbFM_h = FM_h)
    Enc_Layer3 = EncLayer(inpt=x, n_filter_in = 3, n_filter_out = 128, filter_size = 3, W=kernel1, b=biases1, activation=tf.nn.relu)
    pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x)[0], Delta = Delta2, epsilon = epsilon2, batch_size = L, learning_rate= learning_rate, W = kernel1, b = biases1, perturbFMx = noise, perturbFM_h = FM_h)
    cost = tf.reduce_sum((Enc_Layer2.cost + Enc_Layer3.cost)/2.0);
    ###
    
    x_image = x + noise;
    y_conv = inference(x_image, FM_h, params);
    softmax_y_conv = tf.nn.softmax(y_conv)
    y_ = tf.placeholder(tf.float32, [None, 10]);
    
    adv_x += adv_noise
    y_adv_conv = inference(adv_x, FM_h, params)
    adv_y_ = tf.placeholder(tf.float32, [None, 10]);
    
    # Calculate loss. Apply Taylor Expansion for the output layer
    perturbW = perturbFM*params[8]
    loss = cifar10.TaylorExp(y_conv, y_, y_adv_conv, adv_y_, L, alpha, perturbW)
    
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    #pretrain_step = tf.train.AdamOptimizer(1e-4).minimize(pretrain_adv, global_step=global_step, var_list=[kernel1, biases1]);
    pretrain_var_list = tf.get_collection(AECODER_VARIABLES)
    train_var_list = tf.get_collection(CONV_VARIABLES)
    #print(pretrain_var_list)
    #print(train_var_list)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        pretrain_step = tf.train.AdamOptimizer(learning_rate).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list);
        train_op = cifar10.train(loss, global_step, learning_rate, _var_list= train_var_list)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    
    sess.run(kernel1.initializer)
    dp_epsilon=1.0
    _gamma = sess.run(gamma)
    _gamma_x = Delta2/L
    epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x)
    print(epsilon2_update/_gamma + epsilon2_update/_gamma_x)
    print(epsilon2_update)
    delta_r = fgsm_eps*(image_size**2);
    _sensitivityW = sess.run(sensitivity)
    delta_h = _sensitivityW*(14**2)
    #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used
    #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon)
    #dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon)
    dp_mult = (Delta2*dp_epsilon) / (L*epsilon2_update * (delta_h / 2 + delta_r))
    
    dynamic_eps = tf.placeholder(tf.float32);
    """y_test = inference(x, FM_h, params)
    softmax_y = tf.nn.softmax(y_test);
    c_x_adv = fgsm(x, softmax_y, eps=dynamic_eps/3, clip_min=-1.0, clip_max=1.0)
    x_adv = tf.reshape(c_x_adv, [L, image_size, image_size, 3])"""
    
    attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':False}
    
    ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', params=params, image_size=image_size, adv_noise = adv_noise)
    
    # define each attack method's tensor
    mu_alpha = tf.placeholder(tf.float32, [1]);
    attack_tensor_dict = {}
    # FastGradientMethod
    if attack_switch['fgsm']:
        print('creating attack tensor of FastGradientMethod')
        fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now
        x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now
        attack_tensor_dict['fgsm'] = x_adv_test_fgsm
    
    # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init)
    # default: eps_iter=0.05, nb_iter=10
    if attack_switch['ifgsm']:
        print('creating attack tensor of BasicIterativeMethod')
        ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/3, nb_iter=3, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm
    
    # MomentumIterativeMethod
    # default: eps_iter=0.06, nb_iter=10
    if attack_switch['mim']:
        print('creating attack tensor of MomentumIterativeMethod')
        mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/3, nb_iter=3, decay_factor=1.0, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['mim'] = x_adv_test_mim
    
    # MadryEtAl (Projected Grdient with random init, same as rand+fgsm)
    # default: eps_iter=0.01, nb_iter=40
    if attack_switch['madry']:
        print('creating attack tensor of MadryEtAl')
        madry_obj = MadryEtAl(model=ch_model_probs, sess=sess)
        #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/3, nb_iter=3, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['madry'] = x_adv_test_madry
    
    #====================== attack =========================
    
    #adv_logits, _ = inference(c_x_adv + W_conv1Noise, perturbFM, params)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
    sess.run(init)
    
    # Start the queue runners.
    #tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.summary.FileWriter(os.getcwd() + dirCheckpoint, sess.graph)
    
    # load the most recent models
    _global_step = 0
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        print(ckpt.model_checkpoint_path);
        saver.restore(sess, ckpt.model_checkpoint_path)
        _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
        print('No checkpoint file found')
    
    T = int(int(math.ceil(D/L))*epochs + 1) # number of steps
    step_for_epoch = int(math.ceil(D/L)); #number of steps for one epoch
    
    perturbH_test = np.random.laplace(0.0, 0, 14*14*128)
    perturbH_test = np.reshape(perturbH_test, [-1, 14, 14, 128]);
    
    #W_conv1Noise = np.random.laplace(0.0, Delta2/(L*epsilon2), 32 * 32 * 3).astype(np.float32)
    #W_conv1Noise = np.reshape(_W_conv1Noise, [32, 32, 3])
    
    perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*128)
    perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 128]);
    
    #_W_adv = np.random.laplace(0.0, 0, 32 * 32 * 3).astype(np.float32)
    #_W_adv = np.reshape(_W_adv, [32, 32, 3])
    #_perturbFM_h_adv = np.random.laplace(0.0, 0, 10*10*128)
    #_perturbFM_h_adv = np.reshape(_perturbFM_h_adv, [10, 10, 128]);
    
    test_size = len(cifar10_data.test.images)
    #beta = redistributeNoise(os.getcwd() + '/LRP_0_25_v12.txt')
    #BenignLNoise = generateIdLMNoise(image_size, Delta2, eps_benign, L) #generateNoise(image_size, Delta2, eps_benign, L, beta);
    #AdvLnoise = generateIdLMNoise(image_size, Delta2, eps_adv, L)
    Noise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L)
    #generateNoise(image_size, Delta2, eps_adv, L, beta);
    Noise_test = generateIdLMNoise(image_size, 0, epsilon2_update, L) #generateNoise(image_size, 0, 2*epsilon2, test_size, beta);
    
    emsemble_L = int(L/3)
    preT_epochs = 100
    pre_T = int(int(math.ceil(D/L))*preT_epochs + 1);
    """logfile.write("pretrain: \n")
    for step in range(_global_step, _global_step + pre_T):
        d_eps = random.random()*0.5;
        batch = cifar10_data.train.next_batch(L); #Get a random batch.
        adv_images = sess.run(x_adv, feed_dict = {x: batch[0], dynamic_eps: d_eps, FM_h: perturbH_test})
        for iter in range(0, 2):
            adv_images = sess.run(x_adv, feed_dict = {x: adv_images, dynamic_eps: d_eps, FM_h: perturbH_test})
        #sess.run(pretrain_step, feed_dict = {x: batch[0], noise: AdvLnoise, FM_h: perturbFM_h});
        batch = cifar10_data.train.next_batch(L);
        sess.run(pretrain_step, feed_dict = {x: np.append(batch[0], adv_images, axis = 0), noise: Noise, FM_h: perturbFM_h});
        if step % int(25*step_for_epoch) == 0:
            cost_value = sess.run(cost, feed_dict={x: cifar10_data.test.images, noise: Noise_test, FM_h: perturbH_test})/(test_size*128)
            logfile.write("step \t %d \t %g \n"%(step, cost_value))
            print(cost_value)
    print('pre_train finished')"""
    
    _global_step = 0
    for step in xrange(_global_step, _global_step + T):
      start_time = time.time()
      d_eps = random.random()*0.5;
      batch = cifar10_data.train.next_batch(emsemble_L); #Get a random batch.
      y_adv_batch = batch[1]
      """adv_images = sess.run(x_adv, feed_dict = {x: batch[0], dynamic_eps: d_eps, FM_h: perturbH_test})
      for iter in range(0, 2):
          adv_images = sess.run(x_adv, feed_dict = {x: adv_images, dynamic_eps: d_eps, FM_h: perturbH_test})"""
      adv_images_ifgsm = sess.run(attack_tensor_dict['ifgsm'], feed_dict ={x:batch[0], adv_noise: Noise, mu_alpha:[d_eps]})
      batch = cifar10_data.train.next_batch(emsemble_L);
      y_adv_batch = np.append(y_adv_batch, batch[1], axis = 0)
      adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict ={x:batch[0], adv_noise: Noise, mu_alpha:[d_eps]})
      batch = cifar10_data.train.next_batch(emsemble_L);
      y_adv_batch = np.append(y_adv_batch, batch[1], axis = 0)
      adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict ={x:batch[0], adv_noise: Noise, mu_alpha:[d_eps]})
      adv_images = np.append(np.append(adv_images_ifgsm, adv_images_mim, axis = 0),adv_images_madry, axis = 0)
      
      batch = cifar10_data.train.next_batch(L); #Get a random batch.

      sess.run(pretrain_step, feed_dict = {x: batch[0], adv_x: adv_images, adv_noise: Noise_test, noise: Noise, FM_h: perturbFM_h});
      _, loss_value = sess.run([train_op, loss], feed_dict = {x: batch[0], y_: batch[1], adv_x: adv_images, adv_y_: y_adv_batch, noise: Noise, adv_noise: Noise_test, FM_h: perturbFM_h})
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      
      # report the result periodically
      if step % (50*step_for_epoch) == 0 and step >= (300*step_for_epoch):
          '''predictions_form_argmax = np.zeros([test_size, 10])
          softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: cifar10_data.test.images, noise: Noise_test, FM_h: perturbH_test})
          argmax_predictions = np.argmax(softmax_predictions, axis=1)
          """for n_draws in range(0, 2000):
            _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2, L)
            _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*128)
            _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 128]);"""
          for j in range(test_size):
            pred = argmax_predictions[j]
            predictions_form_argmax[j, pred] += 2000;
          """softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: cifar10_data.test.images, noise: _BenignLNoise, FM_h: _perturbFM_h})
            argmax_predictions = np.argmax(softmax_predictions, axis=1)"""
          final_predictions = predictions_form_argmax;
          is_correct = []
          is_robust = []
          for j in range(test_size):
              is_correct.append(np.argmax(cifar10_data.test.labels[j]) == np.argmax(final_predictions[j]))
              robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / dp_mult
              is_robust.append(robustness_from_argmax >= fgsm_eps)
          acc = np.sum(is_correct)*1.0/test_size
          robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust)
          robust_utility = np.sum(is_robust)*1.0/test_size
          log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(step, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility)'''
          
          #===================adv samples=====================
          log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(step, total_eps)
          """adv_images_dict = {}
          for atk in attack_switch.keys():
              if attack_switch[atk]:
                  adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict ={x:cifar10_data.test.images})
          print("Done with the generating of Adversarial samples")"""
          #===================adv samples=====================
          adv_acc_dict = {}
          robust_adv_acc_dict = {}
          robust_adv_utility_dict = {}
          test_bach_size = 5000
          for atk in attack_switch.keys():
              print(atk)
              if atk not in adv_acc_dict:
                  adv_acc_dict[atk] = -1
                  robust_adv_acc_dict[atk] = -1
                  robust_adv_utility_dict[atk] = -1
              if attack_switch[atk]:
                  test_bach = cifar10_data.test.next_batch(test_bach_size)
                  adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict ={x:test_bach[0], adv_noise: Noise_test, mu_alpha:[fgsm_eps]})
                  print("Done adversarial examples")
                  ### PixelDP Robustness ###
                  predictions_form_argmax = np.zeros([test_bach_size, 10])
                  softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: Noise, FM_h: perturbFM_h})
                  argmax_predictions = np.argmax(softmax_predictions, axis=1)
                  for n_draws in range(0, 1000):
                      _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
                      _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*128)
                      _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 128]);
                      if n_draws == 500:
                          print("n_draws = 500")
                      for j in range(test_bach_size):
                          pred = argmax_predictions[j]
                          predictions_form_argmax[j, pred] += 1;
                      softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: (_BenignLNoise/10 + Noise), FM_h: perturbFM_h}) * sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: Noise, FM_h: (_perturbFM_h/10 + perturbFM_h)})
                      #softmax_predictions = sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h}) * sess.run(softmax_y_conv, feed_dict={x: adv_images_dict, noise: Noise, FM_h: (_perturbFM_h)})
                      argmax_predictions = np.argmax(softmax_predictions, axis=1)
                  final_predictions = predictions_form_argmax;
                  is_correct = []
                  is_robust = []
                  for j in range(test_bach_size):
                      is_correct.append(np.argmax(test_bach[1][j]) == np.argmax(final_predictions[j]))
                      robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=dp_epsilon, dp_delta=0.05, dp_mechanism='laplace') / dp_mult
                      is_robust.append(robustness_from_argmax >= fgsm_eps)
                  adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_bach_size
                  robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust)
                  robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_bach_size
                  ##############################
          for atk in attack_switch.keys():
              if attack_switch[atk]:
                  # added robust prediction
                  log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk] * robust_adv_utility_dict[atk])
          print(log_str)
          logfile.write(log_str + '\n')

      # Save the model checkpoint periodically.
      if step % (10*step_for_epoch) == 0 and (step > _global_step):
        num_examples_per_step = L
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                                             examples_per_sec, sec_per_batch))
      """if step % (50*step_for_epoch) == 0 and (step >= 900*step_for_epoch):
def evaluate_model(filepath,
                   train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   batch_size=128,
                   testing=False,
                   num_threads=None):
    """
  Run evaluation on a saved model
  :param filepath: path to model to evaluate
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param batch_size: size of evaluation batches
  """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.INFO)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    with sess.as_default():
        model = load(filepath)
    assert len(model.get_params()) > 0

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and
    # graph
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model.get_logits(adv_x)
    preds = model.get_logits(x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False)
    do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
Beispiel #14
0
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS,
                   holdout=HOLDOUT,
                   data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S,
                   lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
  MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :return: a dictionary with:
           * black-box model accuracy on test set
           * substitute model accuracy on test set
           * black-box model accuracy on adversarial examples transferred
             from the substitute model
  """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate, rng,
                              nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda, aug_batch_size, rng, img_rows, img_cols,
                              nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model.get_logits(x_adv_sub),
                          x_test,
                          y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
Beispiel #15
0
 def test_generate_respects_dtype(self):
   self.attack = FastGradientMethod(self.model, sess=self.sess,
                                    dtypestr='float64')
   x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
   x_adv = self.attack.generate(x)
   self.assertEqual(x_adv.dtype, tf.float64)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64, num_threads=None,
                   label_smoothing=True):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param clean_train: if true, train on clean examples
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth /
                               (nb_classes-1), 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])
    sess = tf.Session()

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=0.1)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = LossCrossEntropy(model2, smoothing=0.1, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the atacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
Beispiel #17
0
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   train_dir=TRAIN_DIR,
                   filename=FILENAME,
                   load_model=LOAD_MODEL,
                   testing=False,
                   label_smoothing=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    cifar10 = CIFAR10(train_start=train_start,
                      train_end=train_end,
                      test_start=test_start,
                      test_end=test_end)
    x_train, y_train = cifar10.get_set('train')
    x_test, y_test = cifar10.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows,
                      img_cols=img_cols,
                      channels=nchannels,
                      nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        #        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds_adv,
                         x_train,
                         y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows,
                        img_cols=img_cols,
                        channels=nchannels,
                        nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_test,
                              y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess,
          loss_2,
          x_train,
          y_train,
          evaluate=evaluate_2,
          args=train_params,
          rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              x_train,
                              y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    keras.layers.core.K.set_learning_phase(0)
    report = AccuracyReport()
    tf.set_random_seed(1234)
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         predictions,
                         X_test,
                         Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    # Train an MNIST model
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True,
                    rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    for i in xrange(4500):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')

    intervals = [128, 85, 64, 51, 43, 37, 32, 28, 26]

    for intervalIndex in range(9):
        startTime = time.time()
        print('NBinterval =  ', intervalIndex + 2, '; interval size = ',
              intervals[intervalIndex])
        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0
        TP = 0
        FN = 0
        FP = 0

        for i in range(1000):
            current_class = int(np.argmax(Y_test[i]))

            currentXLabel = model_argmax(sess, x, predictions,
                                         X_test[i:(i + 1)])
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            currentAdvXLabel = model_argmax(sess, x, predictions,
                                            adv_x[i:(i + 1)])
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            currentX = np.reshape(X_test[i:(i + 1)], (28, 28))
            currentX = scalarQuantization(currentX, intervals[intervalIndex])
            currentX = np.reshape(currentX, X_test[i:(i + 1)].shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
            currentAdvX = scalarQuantization(currentAdvX,
                                             intervals[intervalIndex])
            currentAdvX = np.reshape(currentAdvX, X_test[i:(i + 1)].shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            if (i + 1) % 1000 == 0:
                str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
                    test_number, original_classified_wrong_number,
                    disturbed_failure_number, TP, FN, FP, TTP)
                print(str1)

        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')
        Recall = TP / (TP + FN)
        Precision = TP / (TP + FP)
        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
Beispiel #19
0
def test_attacks(batch_size=128,
                 source_samples=10,
                 model_path=os.path.join("models", "mnist"),
                 targeted=True):
    """
    Test many attacks on MNIST with deep Bayes classifier.
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    from cleverhans.utils_mnist import data_mnist
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=0,
                                                  train_end=60000,
                                                  test_start=0,
                                                  test_end=10000)
    img_rows, img_cols, channels = X_train[0].shape
    nb_classes = Y_train.shape[1]

    # Define input TF placeholder
    batch_size = min(batch_size, source_samples)
    x = tf.placeholder(tf.float32,
                       shape=(batch_size, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(batch_size, nb_classes))

    # Define TF model graph
    model_name = str(sys.argv[1])
    if model_name == 'bayes':
        from load_bayes_classifier import BayesModel
        conv = True
        checkpoint = 0  #int(sys.argv[1])
        K = int(sys.argv[3])
        use_mean = True
        model = BayesModel(sess,
                           'mnist',
                           conv,
                           K,
                           checkpoint=checkpoint,
                           attack_snapshot=False,
                           use_mean=use_mean)
        if use_mean:
            model_name = 'bayes_mean_mlp'
        else:
            model_name = 'bayes_K%d' % K
    if model_name == 'cnn':
        from load_cnn_classifier import CNNModel
        model = CNNModel(sess, 'mnist')
    if model_name == 'wgan':
        from load_wgan_classifier import WGANModel
        conv = True
        checkpoint = 0  #int(sys.argv[1])
        K = int(sys.argv[3])
        T = int(sys.argv[4])
        model = WGANModel(sess, 'mnist', conv, K, T, checkpoint=checkpoint)
        model_name = 'wgan_K%d_T%d' % (K, T)

    preds = model.predict(x, softmax=True)  # output probabilities
    print("Defined TensorFlow model graph.")

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    # Craft adversarial examples
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # make adv inputs and labels for the attack if targeted
    if targeted:
        adv_inputs = np.array([[instance] * nb_classes
                               for instance in X_test[:source_samples]],
                              dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, 1))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
    else:
        adv_inputs = X_test[:source_samples]
        adv_ys = Y_test[:source_samples]

    # Instantiate an attack object
    attack_method = str(sys.argv[2])
    if attack_method == 'fgsm':
        from cleverhans.attacks import FastGradientMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = FastGradientMethod(model_prob, sess=sess)
        from attack_config import config_fgsm
        attack_params = config_fgsm(targeted, adv_ys)
    if attack_method == 'bim':
        from cleverhans.attacks import BasicIterativeMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = BasicIterativeMethod(model_prob, sess=sess)
        from attack_config import config_bim
        attack_params = config_bim(targeted, adv_ys)
    if attack_method == 'mim':
        from cleverhans.attacks import MomentumIterativeMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = MomentumIterativeMethod(model_prob, sess=sess)
        from attack_config import config_mim
        attack_params = config_mim(targeted, adv_ys)
    if attack_method == 'jsma':
        from cleverhans.attacks import SaliencyMapMethod
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = SaliencyMapMethod(model_prob, sess=sess)
        from attack_config import config_jsma
        attack_params = config_jsma(targeted, adv_ys)
    if attack_method == 'vat':
        from cleverhans.attacks import VirtualAdversarialMethod
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = VirtualAdversarialMethod(model_logit, sess=sess)
        from attack_config import config_vat
        attack_params = config_vat(targeted, adv_ys)
    if attack_method == 'cw':
        from cleverhans.attacks import CarliniWagnerL2
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = CarliniWagnerL2(model_logit, sess=sess)
        from attack_config import config_cw
        attack_params = config_cw(targeted, adv_ys)
    if attack_method == 'elastic':
        from cleverhans.attacks import ElasticNetMethod
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = ElasticNetMethod(model_logit, sess=sess)
        from attack_config import config_elastic
        attack_params = config_elastic(targeted, adv_ys)
    if attack_method == 'deepfool':
        from cleverhans.attacks import DeepFool
        model_logit = lambda x: model.predict(x, softmax=False)
        attack = DeepFool(model_logit, sess=sess)
        from attack_config import config_deepfool
        attack_params = config_deepfool(targeted, adv_ys)
    if attack_method == 'madry':
        from cleverhans.attacks import MadryEtAl
        model_prob = lambda x: model.predict(x, softmax=True)
        attack = MadryEtAl(model_prob, sess=sess)
        from attack_config import config_madry
        attack_params = config_madry(targeted, adv_ys)

    attack_params['batch_size'] = batch_size
    print('batchsize', batch_size)

    # perform the attack!
    adv = []
    n_batch = int(adv_inputs.shape[0] / batch_size)
    for i in xrange(n_batch):
        adv_batch = adv_inputs[i * batch_size:(i + 1) * batch_size]
        adv.append(attack.generate_np(adv_batch, **attack_params))
    adv = np.concatenate(adv, axis=0)

    for _ in xrange(5):
        y_adv = []
        for i in xrange(n_batch):
            adv_batch = adv[i * batch_size:(i + 1) * batch_size]
            y_adv.append(sess.run(preds, {x: adv_batch}))
        y_adv = np.concatenate(y_adv, axis=0)

        print('--------------------------------------')
        for i in xrange(10):
            print(np.argmax(y_adv[i * 10:(i + 1) * 10], 1))

    correct_pred = np.asarray(np.argmax(y_adv, 1) == np.argmax(adv_ys, 1),
                              dtype='f')
    adv_accuracy = np.mean(correct_pred)

    if not targeted:
        #        adv_accuracy, y_adv = model_eval(sess, x, y, preds, adv,
        #                                         adv_ys, args=eval_params,
        #                                         return_pred=True)
        #    else:
        #        adv_accuracy, y_adv = model_eval(sess, x, y, preds, adv,
        #                                         Y_test[:source_samples], args=eval_params,
        #                                         return_pred=True)
        adv_accuracy = 1. - adv_accuracy

    print('--------------------------------------')

    print(np.argmax(adv_ys[:10], 1))
    print(np.argmax(y_adv[:10], 1))
    for i in xrange(5):
        tmp = sess.run(preds, {x: adv[:100]})
        print(np.argmax(tmp[:10], 1))

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # visualisation
    vis_adv = True
    if vis_adv:
        N_vis = 100
        sys.path.append('../../utils')
        from visualisation import plot_images
        if channels == 1:
            shape = (img_rows, img_cols)
        else:
            shape = (img_rows, img_cols, channels)
        path = 'figs/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
        else:
            filename = filename + '_untargeted'
        plot_images(adv_inputs[:N_vis], shape, path, filename + '_data')
        plot_images(adv[:N_vis], shape, path, filename + '_adv')

    save_result = True
    if save_result:
        path = 'results/'
        filename = model_name + '_' + attack_method
        if targeted:
            filename = filename + '_targeted'
            y_input = adv_ys
        else:
            filename = filename + '_untargeted'
            y_input = Y_test[:source_samples]
        results = [adv_inputs, y_input, adv, y_adv]
        import pickle
        pickle.dump(results, open(path + filename + '.pkl', 'w'))
        print("results saved at %s.pkl" % filename)

    return report
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True,
                    rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         preds_adv,
                         X_train,
                         Y_train,
                         args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                save=False,
                rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")
    # Create TF session and set as Keras backend session


#     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
#     config = tf.ConfigProto(gpu_options=gpu_options)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         predictions,
                         X_test,
                         Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    # Train an MNIST model
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True,
                    rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    for i in xrange(4500):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')
    crosses = [
        np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]),
        np.array([[0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [1, 1, 1, 1, 1],
                  [0, 0, 1, 0, 0], [0, 0, 1, 0, 0]]),
        np.array([[0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1],
                  [0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0, 0]]),
        np.array([
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [1, 1, 1, 1, 1, 1, 1, 1, 1],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
        ])
    ]
    coefficient = [5, 9, 13, 17]
    #diamond filter test, kernel size: 3, 5, 7, 9
    kernelIndex = -1
    for kernelSize in xrange(3, 10, 2):
        startTime = time.time()
        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0
        TP = 0
        FN = 0
        FP = 0

        start = (kernelSize - 1) // 2
        end = 28 - start
        kernelIndex += 1
        print('cross filter')
        print(crosses[kernelIndex])
        for i in range(4500):
            current_class = int(np.argmax(Y_test[i]))

            currentXLabel = model_argmax(sess, x, predictions,
                                         X_test[i:(i + 1)])
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            currentAdvXLabel = model_argmax(sess, x, predictions,
                                            adv_x[i:(i + 1)])
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            currentX = np.reshape(X_test[i:(i + 1)], (28, 28))
            currentX = diamondAndCrossFilterOperations(
                currentX, crosses[kernelIndex], start, end,
                coefficient[kernelIndex])
            currentX = np.reshape(currentX, X_test[i:(i + 1)].shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
            currentAdvX = diamondAndCrossFilterOperations(
                currentAdvX, crosses[kernelIndex], start, end,
                coefficient[kernelIndex])
            currentAdvX = np.reshape(currentAdvX, X_test[i:(i + 1)].shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            if (i + 1) % 1000 == 0:
                str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
                    test_number, original_classified_wrong_number,
                    disturbed_failure_number, TP, FN, FP, TTP)
                print(str1)

        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')
        Recall = TP / (TP + FN)
        Precision = TP / (TP + FP)
        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
Beispiel #22
0
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an CIFAR10 model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    fgsm = FastGradientMethod(model)
    adv_x = fgsm.generate(x, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions_2 = model_2(x)
    fgsm_2 = FastGradientMethod(model_2)
    adv_x_2 = fgsm_2.generate(x, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Evaluate the accuracy of the adversarialy trained CIFAR10 model on
        # legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Evaluate the accuracy of the adversarially trained CIFAR10 model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv, evaluate=evaluate_2,
                args=train_params)

    # Evaluate the accuracy of the CIFAR10 model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions_2_adv, X_test, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))
Beispiel #23
0
def main(argv):

    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == 10.

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from madry_cifar10_model import make_madry_wresnet
        model = make_madry_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {
            'batch_size': FLAGS.batch_size,
            'clip_min': 0.,
            'clip_max': 255.
        }

        if FLAGS.attack_type == 'cwl2':
            from cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'batch_size': 10
            })

        else:  # eps and eps_iter in range 0-255
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)

            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            for e in epsilons:
                t1 = time.time()
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds_adv,
                                 X_test[:nb_samples],
                                 Y_test[:nb_samples],
                                 args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_test[:nb_samples],
                             Y_test[:nb_samples],
                             args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")
def main(_):
    tf.reset_default_graph()

    # Import data
    cifar = cf.cifar10(batchSize=FLAGS.batch_size, downloadDir=FLAGS.data_dir)
    cifar.preprocess()

    with tf.variable_scope('inputs'):
        # Create the model
        x = tf.placeholder(
            tf.float32,
            [None, FLAGS.img_width * FLAGS.img_height * FLAGS.img_channels])
        # Define loss and optimizer
        y_ = tf.placeholder(tf.float32, [None, FLAGS.num_classes])
        # Whether model is training
        train = tf.placeholder(tf.bool, [])

    # Reshape to use within a convolutional neural net.  Last dimension is for
    # 'features' - it would be 1 one for a grayscale image, 3 for an RGB image,
    # 4 for RGBA, etc.
    x_image = tf.reshape(
        x, [-1, FLAGS.img_width, FLAGS.img_height, FLAGS.img_channels])

    with tf.variable_scope('model'):
        # Build the graph for the deep net
        y_conv = deepnn(x_image, train)
        model = CallableModelWrapper(lambda _x: deepnn(_x, train), 'logits')

    # Define your loss function - softmax_cross_entropy
    with tf.variable_scope('x_entropy'):
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

    # Define your AdamOptimiser, using FLAGS.learning_rate to minimixe the loss function
    decayed_learning_rate = tf.train.exponential_decay(
        FLAGS.learning_rate, tf.Variable(0, trainable=False), 1000,
        FLAGS.learning_rate_decay)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimiser = tf.train.AdamOptimizer(decayed_learning_rate,
                                           name="Adam").minimize(cross_entropy)

    # calculate the prediction and the accuracy
    accuracy, acc_op = tf.metrics.accuracy(labels=tf.argmax(y_, axis=1),
                                           predictions=tf.argmax(y_conv,
                                                                 axis=1))
    adv_accuracy, adv_acc_op = tf.metrics.accuracy(
        labels=tf.argmax(y_, axis=1), predictions=tf.argmax(y_conv, axis=1))

    # summaries for TensorBoard visualisation
    loss_summary = tf.summary.scalar('Loss', cross_entropy)
    adv_loss_summary = tf.summary.scalar('Adversarial Loss', cross_entropy)
    acc_summary = tf.summary.scalar('Accuracy', accuracy)
    adv_acc_summary = tf.summary.scalar('Adv Accuracy', adv_accuracy)
    image_summary = tf.summary.image('Test Images', x_image)

    # saver for checkpoints
    saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    with tf.Session() as sess:
        with tf.variable_scope('model', reuse=True):
            fgsm = FastGradientMethod(model, sess=sess)
            adv_image_op = fgsm.generate(x_image,
                                         eps=FLAGS.fgsm_eps,
                                         clip_min=0.0,
                                         clip_max=1.0)

        summary_writer_train = tf.summary.FileWriter(run_log_dir + '_train',
                                                     sess.graph,
                                                     flush_secs=5)
        summary_writer_validation = tf.summary.FileWriter(run_log_dir +
                                                          '_validate',
                                                          sess.graph,
                                                          flush_secs=5)
        summary_writer_adversarial = tf.summary.FileWriter(run_log_dir +
                                                           '_adversarial',
                                                           sess.graph,
                                                           flush_secs=5)
        summary_writer_images = tf.summary.FileWriter(run_log_dir + '_images',
                                                      sess.graph,
                                                      flush_secs=5)
        summary_writer_images_adversarial = tf.summary.FileWriter(
            run_log_dir + '_images_adversarial', sess.graph, flush_secs=5)

        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        # Training and validation
        for step in range(FLAGS.max_steps):
            # Training: Backpropagation using train set
            (train_images, train_labels) = cifar.getTrainBatch()
            (test_images, test_labels) = cifar.getTestBatch()

            _, summary_str, train_images_adv = sess.run(
                [optimiser, loss_summary, adv_image_op],
                feed_dict={
                    x: train_images,
                    y_: train_labels,
                    train: True
                })
            _, summary_str_adv = sess.run([optimiser, adv_loss_summary],
                                          feed_dict={
                                              x_image: train_images_adv,
                                              y_: train_labels,
                                              train: True
                                          })

            if step % (FLAGS.log_frequency + 1) == 0:
                summary_writer_train.add_summary(summary_str, step)
                summary_writer_train.add_summary(summary_str_adv, step)

            ## Validation: Monitoring accuracy using validation set
            if step % FLAGS.log_frequency == 0:
                accuracy, summary_str, image_str, test_images_adv = sess.run(
                    [acc_op, acc_summary, image_summary, adv_image_op],
                    feed_dict={
                        x: test_images,
                        y_: test_labels,
                        train: False
                    })
                adv_accuracy, adv_summary_str, adv_image_str = sess.run(
                    [adv_acc_op, adv_acc_summary, image_summary],
                    feed_dict={
                        x_image: test_images_adv,
                        y_: test_labels,
                        train: False
                    })
                print('step %d, accuracy on validation batch: %g' %
                      (step, accuracy))
                summary_writer_validation.add_summary(summary_str, step)
                summary_writer_images.add_summary(image_str)
                summary_writer_adversarial.add_summary(adv_summary_str, step)
                summary_writer_images_adversarial.add_summary(adv_image_str)

            ## Save the model checkpoint periodically.
            if step % FLAGS.save_model == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(run_log_dir + '_train',
                                               'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

        # Testing

        # resetting the internal batch indexes
        cifar.reset()
        evaluated_images = 0
        test_accuracy = 0
        adv_test_accuracy = 0
        batch_count = 0

        # don't loop back when we reach the end of the test set
        while evaluated_images != cifar.nTestSamples:
            (testImages,
             testLabels) = cifar.getTestBatch(allowSmallerBatches=True)
            test_accuracy_temp, _, adv_images = sess.run(
                [acc_op, acc_summary, adv_image_op],
                feed_dict={
                    x: testImages,
                    y_: testLabels,
                    train: False
                })
            adv_test_accuracy_temp = sess.run(adv_acc_op,
                                              feed_dict={
                                                  x_image: adv_images,
                                                  y_: testLabels,
                                                  train: False
                                              })

            batch_count = batch_count + 1
            test_accuracy = test_accuracy + test_accuracy_temp
            adv_test_accuracy = adv_test_accuracy + adv_test_accuracy_temp
            evaluated_images = evaluated_images + testLabels.shape[0]

        test_accuracy = test_accuracy / batch_count
        adv_test_accuracy = adv_test_accuracy / batch_count
        print('test set: accuracy on test set: %0.3f' % test_accuracy)
        print('test set: accuracy on adversarial test set: %0.3f' %
              adv_test_accuracy)
Beispiel #25
0
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.1):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    model_train(sess,
                x,
                y,
                preds,
                X_train,
                Y_train,
                evaluate=evaluate,
                args=train_params)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(model, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params)

    return report
Beispiel #26
0
def test():
    """
    """
    tf.reset_default_graph()
    g = tf.get_default_graph()

    with g.as_default():
        # Placeholder nodes.
        images_holder = tf.placeholder(
            tf.float32,
            [None, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS, FLAGS.NUM_CHANNELS])
        label_holder = tf.placeholder(tf.float32, [None, FLAGS.NUM_CLASSES])
        is_training = tf.placeholder(tf.bool, ())

        # model
        model = model_mnist.RDPCNN(images_holder, label_holder,
                                   FLAGS.INPUT_SIGMA,
                                   is_training)  # for adv examples

        model_loss = model.loss()
        model_acc = model.cnn_accuracy

        # robust
        def inference(x):
            logits, _ = model.cnn.prediction(x)
            return logits

        def inference_prob(x):
            _, probs = model.cnn.prediction(x)
            return probs

        graph_dict = {}
        graph_dict["images_holder"] = images_holder
        graph_dict["label_holder"] = label_holder
        graph_dict["is_training"] = is_training

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config, graph=g) as sess:
        sess.run(tf.global_variables_initializer())
        # load model
        model.tf_load(sess, name=FLAGS.CNN_CKPT_RESTORE_NAME)

        # adv test
        ####################################################################################################
        x_advs = {}
        ch_model_logits = CallableModelWrapper(callable_fn=inference,
                                               output_layer='logits')
        ch_model_probs = CallableModelWrapper(callable_fn=inference_prob,
                                              output_layer='probs')
        # FastGradientMethod
        fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
        x_advs["fgsm"] = fgsm_obj.generate(x=images_holder,
                                           eps=FLAGS.ATTACK_SIZE,
                                           clip_min=0.0,
                                           clip_max=1.0)  # testing now

        # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init)
        # default: eps_iter=0.05, nb_iter=10
        ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess)
        x_advs["ifgsm"] = ifgsm_obj.generate(x=images_holder,
                                             eps=FLAGS.ATTACK_SIZE,
                                             eps_iter=FLAGS.ATTACK_SIZE / 10,
                                             nb_iter=10,
                                             clip_min=0.0,
                                             clip_max=1.0)

        # MomentumIterativeMethod
        # default: eps_iter=0.06, nb_iter=10
        mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess)
        x_advs["mim"] = mim_obj.generate(x=images_holder,
                                         eps=FLAGS.ATTACK_SIZE,
                                         eps_iter=FLAGS.ATTACK_SIZE / 10,
                                         nb_iter=10,
                                         decay_factor=1.0,
                                         clip_min=0.0,
                                         clip_max=1.0)

        # MadryEtAl (Projected Grdient with random init, same as rand+fgsm)
        # default: eps_iter=0.01, nb_iter=40
        madry_obj = MadryEtAl(model=ch_model_probs, sess=sess)
        x_advs["madry"] = madry_obj.generate(x=images_holder,
                                             eps=FLAGS.ATTACK_SIZE,
                                             eps_iter=FLAGS.ATTACK_SIZE / 10,
                                             nb_iter=10,
                                             clip_min=0.0,
                                             clip_max=1.0)
        graph_dict["x_advs"] = x_advs
        ####################################################################################################

        # tensorboard writer
        #test_writer = model_utils.init_writer(FLAGS.TEST_LOG_PATH, g)
        print("\nTest")
        if FLAGS.local:
            total_test_batch = 2
        else:
            total_test_batch = None
        dp_info = np.load(FLAGS.DP_INFO_NPY, allow_pickle=True).item()
        test_info(sess,
                  model,
                  None,
                  graph_dict,
                  dp_info,
                  FLAGS.TEST_LOG_FILENAME,
                  total_batch=total_test_batch)
        robust_info(sess, model, graph_dict, FLAGS.ROBUST_LOG_FILENAME)
Beispiel #27
0
class TestFastGradientMethod(CleverHansTest):
    def setUp(self):
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def generate_adversarial_examples_np(self, ord, eps, **kwargs):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord,
                                        clip_min=-5, clip_max=5, **kwargs)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1)**.5

        return x_val, x_adv, delta

    def help_generate_np_gives_adversarial_example(self, ord, eps=.5, **kwargs):
        x_val, x_adv, delta = self.generate_adversarial_examples_np(ord, eps,
                                                                    **kwargs)
        self.assertClose(delta, eps)
        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        self.help_generate_np_gives_adversarial_example(np.infty)

    def test_generate_np_gives_adversarial_example_l1(self):
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        self.help_generate_np_gives_adversarial_example(2)

    def test_generate_respects_dtype(self):
        x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
        x_adv = self.attack.generate(x)
        self.assertEqual(x_adv.dtype, tf.float64)

    def test_targeted_generate_np_gives_adversarial_example(self):
        random_labs = np.random.random_integers(0, 1, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        _, x_adv, delta = self.generate_adversarial_examples_np(
            eps=.5, ord=np.inf, y_target=random_labs_one_hot)

        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf,
                                            clip_min=-5.0, clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf,
                                        clip_min=-0.2, clip_max=0.1)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)

    def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):

        x_val = np.random.rand(1, 2)
        x_val = np.array(x_val, dtype=np.float32)

        self.attack.generate_np(x_val, eps=.3, num_iterations=10,
                                clip_max=-5.0, clip_min=-5.0,
                                xi=1e-6)

        old_grads = tf.gradients

        def fn(*x, **y):
            raise RuntimeError()
        tf.gradients = fn

        self.attack.generate_np(x_val, eps=.2, num_iterations=10,
                                clip_max=-4.0, clip_min=-4.0,
                                xi=1e-5)

        tf.gradients = old_grads
Beispiel #28
0
def attack_classifier(sess,
                      x,
                      y,
                      model,
                      x_test,
                      y_test,
                      attack_method="fgsm",
                      target=None,
                      batch_size=128):
    tf.set_random_seed(1822)
    set_log_level(logging.DEBUG)

    # Initialize attack
    if attack_method == "fgsm":
        from cleverhans.attacks import FastGradientMethod
        params = {'eps': 8 / 255, 'clip_min': 0., 'clip_max': 1.}
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = FastGradientMethod(model, sess=sess)

    elif attack_method == "basic_iterative":
        from cleverhans.attacks import BasicIterativeMethod
        params = {
            'eps': 8 / 255,
            'eps_iter': 1 / 255,
            'nb_iter': 10,
            'clip_min': 0.,
            'clip_max': 1.
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = BasicIterativeMethod(model, sess=sess)

    elif attack_method == "momentum_iterative":
        from cleverhans.attacks import MomentumIterativeMethod
        params = {
            'eps': 8 / 255,
            'eps_iter': 1 / 255,
            'nb_iter': 10,
            'clip_min': 0.,
            'clip_max': 1.
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = MomentumIterativeMethod(model, sess=sess)

    elif attack_method == "saliency":
        from cleverhans.attacks import SaliencyMapMethod
        params = {
            'theta': 8 / 255,
            'gamma': 0.1,
            'clip_min': 0.,
            'clip_max': 1.
        }
        assert target is None
        method = SaliencyMapMethod(model, sess=sess)

    elif attack_method == "virtual":
        from cleverhans.attacks import VirtualAdversarialMethod
        params = {
            'eps': 8 / 255,
            'num_iterations': 10,
            'xi': 1e-6,
            'clip_min': 0.,
            'clip_max': 1.
        }
        assert target is None
        method = VirtualAdversarialMethod(model, sess=sess)

    elif attack_method == "cw":
        from cleverhans.attacks import CarliniWagnerL2
        params = {
            "confidence": 0,
            "batch_size": 128,
            "learning_rate": 1e-4,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "abort_early": True,
            "initial_const": 1e-2,
            "clip_min": 0,
            "clip_max": 1
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = CarliniWagnerL2(model, sess=sess)

    elif attack_method == "elastic_net":
        from cleverhans.attacks import ElasticNetMethod
        params = {
            "fista": "FISTA",
            "beta": 0.1,
            "decision_rule": "EN",
            "confidence": 0,
            "batch_size": 128,
            "learning_rate": 1e-4,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "abort_early": True,
            "initial_const": 1e-2,
            "clip_min": 0,
            "clip_max": 1
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = ElasticNetMethod(model, sess=sess)

    elif attack_method == "deepfool":
        from cleverhans.attacks import DeepFool
        params = {
            "nb_candidate": 10,
            "overshoot": 1e-3,
            "max_iter": 100,
            "nb_classes": 10,
            "clip_min": 0,
            "clip_max": 1
        }
        assert target is None
        method = DeepFool(model, sess=sess)

    elif attack_method == "lbfgs":
        from cleverhans.attacks import LBFGS
        params = {
            'batch_size': 128,
            "binary_search_steps": 10,
            "max_iterations": 1000,
            "initial_const": 1e-2,
            'clip_min': 0.,
            'clip_max': 1.
        }
        assert target is not None
        params["y_target"] = tf.constant(
            np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = LBFGS(model, sess=sess)

    elif attack_method == "madry":
        from cleverhans.attacks import MadryEtAl
        params = {
            'eps': 8 / 255,
            'eps_iter': 1 / 255,
            'nb_iter': 10,
            'ord': np.inf,
            'clip_min': 0.,
            'clip_max': 1.
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
        method = MadryEtAl(model, sess=sess)

    elif attack_method == "SPSA":
        from cleverhans.attacks import SPSA
        params = {
            'epsilon': 1 / 255,
            'num_steps': 10,
            'is_targeted': False,
            'early_stop_loss_threshold': None,
            'learning_rate': 0.01,
            'delta': 0.01,
            'batch_size': 128,
            'spsa_iters': 1,
            'is_debug': False
        }
        if target is not None:
            params["y_target"] = tf.constant(
                np.repeat(np.eye(10)[target:target + 1], batch_size, axis=0))
            params["is_targeted"] = True
        method = SPSA(model, sess=sess)

    else:
        raise ValueError("Can not recognize this attack method")

    adv_x = method.generate(x, **params)
    num_batch = x_test.shape[0] // batch_size
    adv_imgs = []
    for i in range(num_batch):
        if (i + 1) * batch_size >= x_test.shape[0]:
            adv_imgs.append(
                sess.run(adv_x,
                         feed_dict={
                             x: x_test[i * batch_size:],
                             y: y_test[i * batch_size:]
                         }))
        else:
            adv_imgs.append(
                sess.run(adv_x,
                         feed_dict={
                             x: x_test[i * batch_size:(i + 1) * batch_size],
                             y: y_test[i * batch_size:(i + 1) * batch_size]
                         }))
    adv_imgs = np.concatenate(adv_imgs, axis=0)

    return adv_imgs
Beispiel #29
0
    def setUp(self):
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)
Beispiel #30
0
def dknn_tutorial():
    # Get MNIST data.
    mnist = MNIST()
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters.
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    with tf.Session() as sess:
        with tf.variable_scope('dknn'):
            # Define input TF placeholder.
            x = tf.placeholder(tf.float32,
                               shape=(None, img_rows, img_cols, nchannels))
            y = tf.placeholder(tf.float32, shape=(None, nb_classes))

            # Define a model.
            model = make_basic_picklable_cnn()
            preds = model.get_logits(x)
            loss = CrossEntropy(model, smoothing=0.)

            # Define the test set accuracy evaluation.
            def evaluate():
                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds,
                                 x_test,
                                 y_test,
                                 args={'batch_size': FLAGS.batch_size})
                print('Test accuracy on test examples: %0.4f' % acc)

            # Train the model
            train_params = {
                'nb_epochs': FLAGS.nb_epochs,
                'batch_size': FLAGS.batch_size,
                'learning_rate': FLAGS.lr
            }
            train(sess,
                  loss,
                  x_train,
                  y_train,
                  evaluate=evaluate,
                  args=train_params,
                  var_list=model.get_params())

            # Define callable that returns a dictionary of all activations for a dataset
            def get_activations(data):
                data_activations = {}
                for layer in layers:
                    layer_sym = tf.layers.flatten(model.get_layer(x, layer))
                    data_activations[layer] = batch_eval(
                        sess, [x], [layer_sym], [data],
                        args={'batch_size': FLAGS.batch_size})[0]
                return data_activations

            # Use a holdout of the test set to simulate calibration data for the DkNN.
            train_data = x_train
            train_labels = np.argmax(y_train, axis=1)
            cali_data = x_test[:FLAGS.nb_cali]
            y_cali = y_test[:FLAGS.nb_cali]
            cali_labels = np.argmax(y_cali, axis=1)
            test_data = x_test[FLAGS.nb_cali:]
            y_test = y_test[FLAGS.nb_cali:]

            # Extract representations for the training and calibration data at each layer of interest to the DkNN.
            layers = ['ReLU1', 'ReLU3', 'ReLU5', 'logits']

            # Wrap the model into a DkNNModel
            dknn = DkNNModel(FLAGS.neighbors,
                             layers,
                             get_activations,
                             train_data,
                             train_labels,
                             nb_classes,
                             scope='dknn')
            dknn.calibrate(cali_data, cali_labels)

            # Generate adversarial examples
            fgsm = FastGradientMethod(model, sess=sess)
            attack_params = {'eps': .25, 'clip_min': 0., 'clip_max': 1.}
            adv = sess.run(fgsm.generate(x, **attack_params),
                           feed_dict={x: test_data})

            # Test the DkNN on clean test data and FGSM test data
            for data_in, fname in zip([test_data, adv], ['test', 'adv']):
                dknn_preds = dknn.fprop_np(data_in)
                print(dknn_preds.shape)
                print(
                    np.mean(
                        np.argmax(dknn_preds, axis=1) == np.argmax(y_test,
                                                                   axis=1)))
                plot_reliability_diagram(dknn_preds, np.argmax(y_test, axis=1),
                                         '/tmp/dknn_' + fname + '.pdf')

    return True
Beispiel #31
0
        probs = output.op.inputs[0]
        return probs


eps = 2.0 * 16.0 / 255.0
batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3]
num_classes = 1001

tf.logging.set_verbosity(tf.logging.INFO)

with tf.Graph().as_default():
    x_input = tf.placeholder(tf.float32, shape=batch_shape)

    model = InceptionModel(num_classes)

    fgsm = FastGradientMethod(model)
    x_adv = fgsm.generate(x_input, eps=eps, clip_min=-1., clip_max=1.)

    saver = tf.train.Saver(slim.get_model_variables())
    session_creator = tf.train.ChiefSessionCreator(
        scaffold=tf.train.Scaffold(saver=saver),
        checkpoint_filename_with_path=FLAGS.checkpoint_path,
        master=FLAGS.master)

    with tf.train.MonitoredSession(session_creator=session_creator) as sess:
        for filenames, images in load_images(
                "../input/nips-2017-adversarial-learning-development-set/images/",
                batch_shape):
            adv_images = sess.run(x_adv, feed_dict={x_input: images})
            save_images(adv_images, filenames, "")
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=True):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    if label_smoothing:
        label_smooth = .1
        y_train = y_train.clip(label_smooth / (nb_classes-1),
                               1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
#        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = LossCrossEntropy(wrap, smoothing=0.1)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=0.1, attack=attack)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_test, y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, x_test,
                              y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, x_train, y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, x_train,
                              y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS,
                   num_threads=None,
                   label_smoothing=0.1):
    """
  MNIST cleverhans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the atacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess,
          loss2,
          x_train,
          y_train,
          evaluate=evaluate2,
          args=train_params,
          rng=rng,
          var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def mdt(model, data_dir, checkpoint_dir,
        train_dir='./tmp/cifar10_train',
        adversarial_dir='./tmp/cifar10_adv', batch_size=128,
        data_aug=False, data_norm=True):

    # train model
    if not tf.gfile.Exists(train_dir):
        # set input and get logits
        images, labels = mdt_cifar10_input.inputs(False, data_dir, batch_size,
                                                  data_aug, data_norm)

        labels = tf.cast(labels, tf.int64)
        # target = False
        # adv_output_layer = 'adv_bounddecoder6'
        # loss = adv_net_loss(images, model, labels, target, adv_output_layer, 0, 10)
        logits = model(images)
        loss = stand_loss(logits, labels)
        train_process(model, loss, images, label, train_dir, batch_size)
    
    # define dataset format
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Get predict tensor
    pred = model(x)

    sess = tf.Session()

    if not checkpoint_load(sess, checkpoint_dir):
        return False

    # fetch data
    cifar10_data.maybe_download_and_return_python(data_dir)
    X, Y = mdt_cifar10_input.numpy_input(True, data_dir)

    # create one-hot Y
    one_hot_Y = to_categorical(Y, nb_classes)

    # create mode feed
    train_feed = mode_feed(sess, True)
    eval_feed = mode_feed(sess, False)

    fgsm_params = {'eps': 1,
                   'clip_min': 0.,
                   'clip_max': 255.}
    fgsm = FastGradientMethod(model, sess=sess)
    adv_x = fgsm.generate(x, **fgsm_params)
    preds_adv = model.get_probs(adv_x)
 
    # eval model accuracy
    class_accuracy, accuracy = model_eval_each_class(sess, x, y, pred, 10, X, one_hot_Y,
                          feed=eval_feed,
                          args={'batch_size': 128})
    print('model accuracy: {0}'.format(accuracy))

    for i in range(10):
        print('class {0} accuracy: {1}'.format(i, class_accuracy[i]))

    # eval model's accuacy in cw adversarial examples
    fgsm_accuracy = model_eval(sess, x, y, preds_adv, X, one_hot_Y,
                             feed=eval_feed,
                             args={'batch_size': 128})
    print('model fgsm_accuracy: {0}'.format(fgsm_accuracy))


    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
                   

    X = X[:128]
    Y=one_hot_Y[:128]
    adv_feed = {x:X, y:one_hot_Y}
    adv_feed.update(eval_feed)
    sta = time.time()
    adv_X_ = sess.run(adv_x,feed_dict=adv_feed)
    end = time.time()
    duration = end - sta
    print('finished in {0} seconds'.format(duration))

    l2_dis = calculate_l2_dis(X/255, adv_X_/255)
    print('adversarial examples\' mean l2 distance: {0}'.format(l2_dis))
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.001,
                   nb_epochs=10,
                   holdout=150,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1,
                   epsilon=0.3):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)
    pyp = False
    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess,
                              x,
                              y,
                              X_train,
                              Y_train,
                              X_test,
                              Y_test,
                              nb_epochs,
                              batch_size,
                              learning_rate,
                              rng=rng)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess,
                              x,
                              y,
                              bbox_preds,
                              X_sub,
                              Y_sub,
                              nb_classes,
                              nb_epochs_s,
                              batch_size,
                              learning_rate,
                              data_aug,
                              lmbda,
                              rng=rng)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    for epstep in [epsilon * i for i in range(20)]:
        # Initialize the Fast Gradient Sign Method (FGSM) attack object.
        fgsm_par = {
            'eps': epstep,
            'ord': np.inf,
            'clip_min': 0.,
            'clip_max': 1.
        }
        fgsm = FastGradientMethod(model_sub, sess=sess)

        # Craft adversarial examples using the substitute
        eval_params = {'batch_size': batch_size}
        x_adv_sub = fgsm.generate(x, **fgsm_par)

        def find_error(glb, mdl):
            temparray = []
            for i in range(len(glb)):
                prd = np.argmax(mdl.predict(np.array([glb[i]])))
                if prd != np.argmax(Y_test[i]):
                    # print('--')
                    # print(prd)
                    # print('diff')
                    # print(np.argmax(Y_test[i]))
                    # print('--')
                    temparray.append([glb[i], Y_test[i], X_test[i], prd, i])
            return temparray

    # Evaluate the accuracy of the "black-box" model on adversarial examples

        accuracy = model_eval(sess,
                              x,
                              y,
                              model(x_adv_sub),
                              X_test,
                              Y_test,
                              args=eval_params)
        print(
            'Test accuracy of oracle on BB Adversarial Samples with epsilon = %s : '
            % epstep + str(accuracy))
        if pyp:

            x_adv_np = fgsm.generate_np(X_test[0:200], **fgsm_par)
            y_adv_np = find_error(x_adv_np, keras_global_model)
            from matplotlib import pyplot as plt
            plt.rc('figure', figsize=(12.0, 12.0))
            for j in range(len(y_adv_np) - 1):
                print(
                    str(y_adv_np[j][3]) + "predit, et le reel etait : " +
                    str(np.argmax(y_adv_np[j][1])))
                plt.imshow(y_adv_np[j][0].reshape((28, 28)),
                           cmap="gray",
                           label=str(np.argmax(y_adv_np[j][3])))
                plt.pause(1)
                print('---')
        accuracies['bbox_on_sub_adv_ex' + str(epstep)] = accuracy

    return accuracies
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150, data_aug=6,
                   nb_epochs_s=10, lmbda=0.1, aug_batch_size=512):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Initialize substitute training set reserved for adversary
    X_sub = x_test[:holdout]
    Y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None,     nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
Beispiel #37
0
class TestFastGradientMethod(CleverHansTest):
    def setUp(self):
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)

    def generate_adversarial_examples_np(self, ord, eps, **kwargs):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=eps,
                                        ord=ord,
                                        clip_min=-5,
                                        clip_max=5,
                                        **kwargs)
        if ord == np.inf:
            delta = np.max(np.abs(x_adv - x_val), axis=1)
        elif ord == 1:
            delta = np.sum(np.abs(x_adv - x_val), axis=1)
        elif ord == 2:
            delta = np.sum(np.square(x_adv - x_val), axis=1)**.5

        return x_val, x_adv, delta

    def help_generate_np_gives_adversarial_example(self,
                                                   ord,
                                                   eps=.5,
                                                   **kwargs):
        x_val, x_adv, delta = self.generate_adversarial_examples_np(
            ord, eps, **kwargs)
        self.assertClose(delta, eps)
        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.5)

    def test_generate_np_gives_adversarial_example_linfinity(self):
        self.help_generate_np_gives_adversarial_example(np.infty)

    def test_generate_np_gives_adversarial_example_l1(self):
        self.help_generate_np_gives_adversarial_example(1)

    def test_generate_np_gives_adversarial_example_l2(self):
        self.help_generate_np_gives_adversarial_example(2)

    def test_generate_respects_dtype(self):
        self.attack = FastGradientMethod(self.model,
                                         sess=self.sess,
                                         dtypestr='float64')
        x = tf.placeholder(dtype=tf.float64, shape=(100, 2))
        x_adv = self.attack.generate(x)
        self.assertEqual(x_adv.dtype, tf.float64)

    def test_targeted_generate_np_gives_adversarial_example(self):
        random_labs = np.random.random_integers(0, 1, 100)
        random_labs_one_hot = np.zeros((100, 2))
        random_labs_one_hot[np.arange(100), random_labs] = 1

        _, x_adv, delta = self.generate_adversarial_examples_np(
            eps=.5, ord=np.inf, y_target=random_labs_one_hot)

        self.assertClose(delta, 0.5)

        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(random_labs == new_labs) > 0.7)

    def test_generate_np_can_be_called_with_different_eps(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        for eps in [0.1, 0.2, 0.3, 0.4]:
            x_adv = self.attack.generate_np(x_val,
                                            eps=eps,
                                            ord=np.inf,
                                            clip_min=-5.0,
                                            clip_max=5.0)

            delta = np.max(np.abs(x_adv - x_val), axis=1)
            self.assertClose(delta, eps)

    def test_generate_np_clip_works_as_expected(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=0.5,
                                        ord=np.inf,
                                        clip_min=-0.2,
                                        clip_max=0.1)

        self.assertClose(np.min(x_adv), -0.2)
        self.assertClose(np.max(x_adv), 0.1)

    def test_generate_np_caches_graph_computation_for_eps_clip_or_xi(self):

        x_val = np.random.rand(1, 2)
        x_val = np.array(x_val, dtype=np.float32)

        self.attack.generate_np(x_val,
                                eps=.3,
                                num_iterations=10,
                                clip_max=-5.0,
                                clip_min=-5.0,
                                xi=1e-6)

        old_grads = tf.gradients

        def fn(*x, **y):
            raise RuntimeError()

        tf.gradients = fn

        self.attack.generate_np(x_val,
                                eps=.2,
                                num_iterations=10,
                                clip_max=-4.0,
                                clip_min=-4.0,
                                xi=1e-5)

        tf.gradients = old_grads
Beispiel #38
0
    def setUp(self):
        super(TestFastGradientMethod, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = FastGradientMethod(self.model, sess=self.sess)
Beispiel #39
0
                                        [dae_47.layers[2].output])
get_hidden_layer_output_64 = K.function([dae_64.layers[0].input],
                                        [dae_64.layers[2].output])
get_hidden_layer_output_80 = K.function([dae_80.layers[0].input],
                                        [dae_80.layers[2].output])
get_hidden_layer_output_94 = K.function([dae_94.layers[0].input],
                                        [dae_94.layers[2].output])
get_hidden_layer_output_157 = K.function([dae_157.layers[0].input],
                                         [dae_157.layers[2].output])
trad_ae_output = K.function([trad_ae.layers[0].input],
                            [trad_ae.layers[2].output])

#Create adversarial examples on testing data
sess = backend.get_session()
wrap = KerasModelWrapper(attacker_classifier)
fgsm = FastGradientMethod(wrap, sess=sess)
adv_test_x = fgsm.generate_np(data_test, eps=eta, clip_min=0., clip_max=1.)

wrap_adv_trn = KerasModelWrapper(attack_classifier_adv_trn)
fgsm_adv_trn = FastGradientMethod(wrap_adv_trn, sess=sess)
adv_test_x_adv_trn = fgsm_adv_trn.generate_np(data_test,
                                              eps=eta,
                                              clip_min=0.,
                                              clip_max=1.)

#Evaluate on clean data
scores = fc_classifier.evaluate(data_test, labels_test)
print("Accuracy of clean data without any defense")
print("Accuracy: %.2f%%" % (scores[1] * 100))

#Evaluate model after attacking data with no defense
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
              args=train_params, save=True)

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, X_train, Y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_train,
                         Y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = cnn_model()
    preds_2 = model_2(x)
    wrap_2 = KerasModelWrapper(model_2)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)
    preds_2_adv = model_2(fgsm2.generate(x, **fgsm_params))

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(sess, x, y, preds_2, X_train, Y_train,
          predictions_adv=preds_2_adv, evaluate=evaluate_2,
          args=train_params, save=False)

    # Get a random slice of the data for linear extrapolation plots
    random_idx = np.random.randint(0, X_train.shape[0])
    X_slice = X_train[random_idx]
    Y_slice = Y_train[random_idx]

    # Plot the linear extrapolation plot for clean model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice,
                              'lep_clean.png')

    # Plot the linear extrapolation plot for adv model
    log_prob_adv_array = get_logits_over_interval(
        sess, wrap_2, X_slice, fgsm_params)
    linear_extrapolation_plot(log_prob_adv_array, Y_slice,
                              'lep_adv.png')

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
Beispiel #41
0
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=10,
                   batch_size=128,
                   learning_rate=0.1,
                   nb_epochs=10,
                   holdout=150,
                   data_aug=6,
                   nb_epochs_s=10,
                   lmbda=0.1):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model(x_adv_sub),
                          X_test,
                          Y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies