Esempio n. 1
0
    def attack_single_step(self, x, eta, y):
        """
        Take one signed-gradient step and return the updated perturbation.

        The loss gradient is taken with respect to the perturbed input
        ``x + eta``; a step of size ``self.eps_iter`` is made along its sign,
        the result is optionally clipped to ``[self.clip_min, self.clip_max]``
        and the resulting perturbation is passed through ``clip_eta`` with
        ``self.ord`` and ``self.eps``.

        :param x: A tensor with the original input.
        :param eta: A tensor the same shape as x that holds the perturbation.
        :param y: A tensor with the target labels or ground-truth labels.
        :return: A tuple ``(eta, loss, loss_vector)``: the new perturbation,
                 the loss that was differentiated (negated when
                 ``self.targeted`` is set) and the un-negated loss computed
                 with ``mean=False``.
        """
        import tensorflow as tf
        from cleverhans.utils_tf import model_loss, clip_eta

        perturbed = x + eta
        probs = self.model.get_probs(perturbed)
        objective = model_loss(y, probs)
        unreduced_loss = model_loss(y, probs, mean=False)

        # A targeted attack flips the sign of the differentiated loss.
        if self.targeted:
            objective = -objective

        (input_grad,) = tf.gradients(objective, perturbed)
        perturbed = perturbed + self.eps_iter * tf.sign(input_grad)

        # Clipping is applied only when both bounds were supplied.
        if not (self.clip_min is None or self.clip_max is None):
            perturbed = tf.clip_by_value(perturbed, self.clip_min,
                                         self.clip_max)

        new_eta = clip_eta(perturbed - x, self.ord, self.eps)
        return new_eta, objective, unreduced_loss
    def attack_single_step(self, x, eta, y):
        """
        Given the original image and the perturbation computed so far, computes
        a new perturbation.

        The step direction depends on ``self.pgd_update``:

        * ``'sign'``  -- step along the sign of the loss gradient;
        * ``'plain'`` -- step along the gradient divided by its L2 norm, the
          norm being taken over axes 1, 2 and 3 of the gradient.

        :param x: A tensor with the original input.
        :param eta: A tensor the same shape as x that holds the perturbation.
        :param y: A tensor with the target labels or ground-truth labels.
        :return: A tensor holding the updated perturbation.
        :raises Exception: if ``self.pgd_update`` is neither ``'sign'`` nor
                           ``'plain'``.
        """
        from cleverhans.utils_tf import model_loss, clip_eta

        adv_x = x + eta
        preds = self.model.get_probs(adv_x)
        loss = model_loss(y, preds)
        if self.targeted:
            loss = -loss
        grad, = tf.gradients(loss, adv_x)
        if self.pgd_update == 'sign':
            adv_x = adv_x + self.eps_iter * tf.sign(grad)
        elif self.pgd_update == 'plain':
            grad_norm = tf.sqrt(
                tf.reduce_sum(tf.square(grad), axis=[1, 2, 3], keep_dims=True))
            # An all-zero gradient would otherwise yield 0 / 0 = NaN and
            # poison the whole perturbation; flooring the norm makes the step
            # zero in that case and leaves every other case unchanged.
            adv_x = adv_x + self.eps_iter * grad / tf.maximum(grad_norm, 1e-12)
        else:
            raise Exception('Wrong pgd_update.')
        if self.clip_min is not None and self.clip_max is not None:
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
        eta = adv_x - x
        eta = clip_eta(eta, self.ord, self.eps)
        return eta
Esempio n. 3
0
 def __init__(self, modelpath, model_file):
     """Load the model and create the label placeholder and loss tensor.

     :param modelpath: forwarded unchanged to ``self.load_model``.
     :param model_file: forwarded unchanged to ``self.load_model``.
     """
     # Fixed input geometry recorded on the instance.
     self.image_size_height = self.image_size_width = 224
     self.num_channels = 3

     loaded = self.load_model(modelpath, model_file)
     self.sess, self.model, self.logits = loaded

     # Label placeholder: open batch dimension, 4 columns.
     label_ph = tf.placeholder(tf.float32, shape=(None, 4))
     self.y = label_ph
     self.loss = utils_tf.model_loss(
         label_ph, self.model.outputs[0], mean=False)
     self.x = self.model.inputs[0]  # input tensor
Esempio n. 4
0
 def __init__(self, modelpath):
     """Load the model and create the label placeholder and loss tensor.

     :param modelpath: forwarded unchanged to ``self.load_model``.
     """
     # Fixed input geometry recorded on the instance.
     self.image_size_height = self.image_size_width = 224
     self.num_channels = 3

     session, input_tensor, output_tensor = self.load_model(modelpath)
     self.sess = session
     self.inputImage = input_tensor
     self.output1 = output_tensor

     # ``x`` and ``logits`` are aliases of the loaded input/output tensors.
     self.x = input_tensor
     self.logits = output_tensor

     # Label placeholder: open batch dimension, 1001 columns.
     label_ph = tf.placeholder(tf.float32, shape=(None, 1001))
     self.y = label_ph
     self.loss = utils_tf.model_loss(label_ph, output_tensor, mean=False)
Esempio n. 5
0
def fg(x, predictions):
    """Return the gradient, with respect to ``x``, of the loss of
    ``predictions`` against labels derived from ``predictions`` itself.

    :param x: tensor the gradient is taken with respect to.
    :param predictions: tensor of model outputs; axis 1 is reduced over to
                        find each row's maximum.
    :return: the single gradient tensor produced by ``tf.gradients``.
    """
    # Flag every entry equal to its row maximum, then rescale each row so the
    # flags sum to one (ties share the mass equally).
    row_max = tf.reduce_max(predictions, 1, keep_dims=True)
    is_top = tf.to_float(tf.equal(predictions, row_max))
    pseudo_labels = is_top / tf.reduce_sum(is_top, 1, keep_dims=True)

    unreduced_loss = utils_tf.model_loss(
        pseudo_labels, predictions, mean=False)

    (input_grad,) = tf.gradients(unreduced_loss, x)
    return input_grad
Esempio n. 6
0
def fgm_grad(x,
             preds,
             y=None,
             ord=np.inf,
             clip_min=None,
             clip_max=None,
             targeted=False):
    """
    Returns the normalized gradient of the fgm attack. Scaling by epsilon and
    clipping have to be done outside, which avoids costly recomputation of
    the gradient.

    :param x: the input tensor the gradient is taken with respect to.
    :param preds: the model's output tensor for ``x``.
    :param y: (optional) label tensor. When None, labels are derived from
              ``preds`` (entries equal to the row maximum).
    :param ord: norm used to normalize the gradient: np.inf, 1 or 2.
    :param clip_min: unused here; kept so the signature matches the other
                     fgm helpers.
    :param clip_max: unused here; see ``clip_min``.
    :param targeted: when True the loss is negated before differentiation.
    :return: a tensor with the normalized gradient.
    :raises NotImplementedError: for any ``ord`` other than np.inf, 1 or 2.
    """
    # Floor for the L1/L2 norms so that an all-zero gradient produces a zero
    # direction instead of 0 / 0 = NaN.
    avoid_zero_div = 1e-12

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
        y = tf.stop_gradient(y)
    # Rescale every label row so that it sums to one.
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(y, preds, mean=False)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if ord == np.inf:
        # Take sign of gradient
        normalized_grad = tf.sign(grad)
        # The following line should not change the numerical results.
        # It applies only because `normalized_grad` is the output of
        # a `sign` op, which has zero derivative anyway.
        # It should not be applied for the other norms, where the
        # perturbation has a non-zero derivative.
        normalized_grad = tf.stop_gradient(normalized_grad)
    elif ord == 1:
        # Reduce over every axis except the leading (batch) one.
        red_ind = list(range(1, len(x.get_shape())))
        l1_norm = tf.reduce_sum(
            tf.abs(grad), reduction_indices=red_ind, keep_dims=True)
        normalized_grad = grad / tf.maximum(avoid_zero_div, l1_norm)
    elif ord == 2:
        red_ind = list(range(1, len(x.get_shape())))
        square = tf.reduce_sum(tf.square(grad),
                               reduction_indices=red_ind,
                               keep_dims=True)
        normalized_grad = grad / tf.maximum(avoid_zero_div, tf.sqrt(square))
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    return normalized_grad
Esempio n. 7
0
    def body(i, old_adv_x, old_loss, labels=labels):
      """Find example with max loss value amongst batch of perturbations.

      Draws random perturbations of shape ``deltas_shape`` (axis 0 indexes the
      input, axis 1 the ``self.num_samples`` candidates, the remaining axes
      are the input dimensions), evaluates the model on every candidate,
      keeps the candidate with the largest loss for each input, and returns
      it only where it beats the loss carried in from the previous iteration.

      :param i: iteration counter.
      :param old_adv_x: best adversarial examples found so far.
      :param old_loss: loss values belonging to ``old_adv_x``.
      :param labels: label tensor, or None to derive labels from the model's
                     own predictions.
      :return: ``(i + 1, new_adv_x, new_loss)``.
      :raises NotImplementedError: if ``self.ord`` is not np.inf, 1 or 2.
      """
      deltas = tf.random_uniform(deltas_shape)
      # Every axis after (input, sample) belongs to a single perturbation;
      # norms must be reduced over all of them.
      sample_axes = list(range(2, len(deltas_shape)))
      broadcast_shape = deltas_shape[:2] + [1] * (len(deltas_shape) - 2)

      # generate uniform samples from the l^p unit ball interior
      if self.ord == np.inf:
        deltas *= 2. * self.eps
        deltas -= self.eps
      elif self.ord == 1:
        # ref: https://mathoverflow.net/questions/9185/how-to-generate-random-points-in-ell-p-balls  pylint: disable=line-too-long
        # NOTE(review): unlike the L-inf branch this one is not scaled by
        # self.eps and yields only non-negative deltas -- confirm intended.
        exp = -tf.log(deltas)
        shift = -tf.log(tf.random_uniform(deltas_shape[:2]))
        norm = tf.reduce_sum(tf.abs(exp), sample_axes)
        scale = tf.reshape(shift + norm, broadcast_shape)
        deltas = exp / scale
      elif self.ord == 2:
        # ref: https://blogs.sas.com/content/iml/2016/04/06/generate-points-uniformly-in-ball.html  pylint: disable=line-too-long
        # NOTE(review): this branch is not scaled by self.eps either.
        # reduce_prod over integers gives an integer tensor; cast it so that
        # the reciprocal below is a float exponent.
        dims = tf.to_float(tf.reduce_prod(deltas_shape[2:]))
        deltas = tf.pow(deltas, 1. / dims)
        # random_normal takes a shape, not a tensor of values.
        normal = tf.random_normal(deltas_shape)
        # keepdims belongs to the reduction so the norm broadcasts back over
        # the reduced axes.
        normal /= tf.sqrt(
            tf.reduce_sum(normal**2, axis=sample_axes, keepdims=True))
        deltas *= normal
      else:
        raise NotImplementedError('Only L-inf, L1 and L2 norms are '
                                  'currently implemented.')

      adv_x = tf.expand_dims(x, 1) + deltas

      if (self.clip_min is not None) and (self.clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

      # Fold the sample axis into the leading axis for the model call, then
      # unfold the predictions again.
      adv_x_r = tf.reshape(adv_x, [-1] + deltas_shape[2:])
      preds = self.model.get_probs(adv_x_r)
      preds_shape = preds.shape.as_list()
      preds = tf.reshape(preds, deltas_shape[:2] + preds_shape[1:])

      if labels is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, -1, keep_dims=True)
        labels = tf.to_float(tf.equal(preds, preds_max))
        labels = tf.stop_gradient(labels)
      else:
        # Repeat the supplied labels once per candidate perturbation. This
        # must only happen when labels were supplied: expanding None fails.
        labels = tf.expand_dims(labels, 1)
        labels = tf.tile(labels, [1, self.num_samples, 1])
      labels = labels / tf.reduce_sum(labels, -1, keep_dims=True)

      # Compute loss
      loss = utils_tf.model_loss(labels, preds, mean=False)
      if self.y_target is not None:
        loss = -loss

      # find the maximum loss value
      input_idx = tf.one_hot(tf.argmax(loss, axis=1), self.num_samples, axis=1)
      loss = tf.reduce_sum(loss * input_idx, axis=1)
      input_idx = tf.reshape(input_idx, broadcast_shape)
      adv_x = tf.reduce_sum(adv_x * input_idx, axis=1)

      # Keep the previous best wherever it still has the larger loss.
      condition = tf.greater(old_loss, loss)
      new_loss = tf.where(condition, old_loss, loss)
      new_adv_x = tf.where(condition, old_adv_x, adv_x)

      return i + 1, new_adv_x, new_loss
Esempio n. 8
0
def main(argv=None):
    """Run the attack optimization and write the final noisy images to disk.

    Builds the attack graph, feeds every image found in
    ``FLAGS.attack_srcdir`` for ``FLAGS.attack_epochs`` optimization steps,
    prints the losses after each step and finally saves the noisy inputs of
    the last step under ``optimization_output/<checkpoint>/noisy-set``.

    :param argv: unused.
    """
    print("going into setup")
    op, model, sess, pholders, varops = setup_attack_graph()

    data = load_many_images(FLAGS.attack_srcdir)
    num_images = len(data)

    feed_dict = {pholders['image_in']: data,
                 pholders['attack_target']: get_adv_target(nb_inputs=num_images),
                 pholders['noise_mask']: load_norm_mask(),
                 keras.backend.learning_phase(): 0}

    if FLAGS.printability_optimization:
        feed_dict[pholders['printable_colors']] = get_print_triplets(FLAGS.printability_tuples)

    # used to save checkpoints after each epoch (saving is currently
    # commented out below)
    saver = tf.train.Saver(max_to_keep=50)

    # Build the clean-image prediction ONCE, outside the epoch loop. Calling
    # model(...) inside the loop adds new ops to the graph on every
    # iteration, so the graph keeps growing for the whole run.
    if FLAGS.fullres_input:
        clean_input = tf.image.resize_images(
            pholders['image_in'], (FLAGS.img_rows, FLAGS.img_cols))
    else:
        clean_input = pholders['image_in']
    clean_pred = model(clean_input)

    # debug: sanity check to make sure the model isn't being adjusted
    # i.e. this should stay constant
    clean_model_loss = model_loss(pholders['attack_target'], clean_pred, mean=True)

    # Fetch order: optimizer step, adversarial loss, noisy inputs, clean
    # loss, clean predictions, adversarial predictions and -- for full
    # resolution input only -- the resized noisy input.
    fetches = [op,
               varops['adv_loss'],
               varops['noise_inputs'],
               clean_model_loss,
               clean_pred,
               varops['adv_pred']]
    if FLAGS.fullres_input:
        fetches.append(varops['resized_noise_in'])

    # Stays None when FLAGS.attack_epochs is 0, in which case there is
    # nothing to save.
    noisy_in = None

    for i in range(FLAGS.attack_epochs):
        print('Epoch %d'%i),
        sys.stdout.flush()

        results = sess.run(fetches, feed_dict=feed_dict)
        train_loss, noisy_in, clean_loss = results[1:4]
        # results[4] / results[5] are the clean / noisy class predictions and
        # results[6] (full resolution only) is the resized noisy input; they
        # are referred to as clean_classes, noisy_classes and rnin by the
        # disabled code below.

        print("adversarial loss %.5f model loss on clean img: %.5f"%(train_loss, clean_loss)),
        sys.stdout.flush()

        if FLAGS.printability_optimization:
            print("noise NPS %.5f"%sess.run(varops['printer_error'], feed_dict=feed_dict)),

        # num_misclassified = 0

        # for j in range(num_images):
        #     clean_classification = np.argmax(clean_classes[j])
        #     noise_classification = np.argmax(noisy_classes[j])
        #     if clean_classification != noise_classification:
        #         num_misclassified += 1

        # proportion_misclassified = float(num_misclassified)/float(num_images)
        # print('percent misclassified images %.1f'%(proportion_misclassified*100.0))


        # if i%FLAGS.save_frequency == 0 or proportion_misclassified > 0.9:
        #     saver.save(sess, os.path.join('optimization_output', FLAGS.checkpoint, 'model', FLAGS.checkpoint), global_step=i)
        #     imsave(os.path.join('optimization_output', FLAGS.checkpoint, "noisy_images", "noisyimg_%s_epoch_%d.png"%(FLAGS.checkpoint,i)), (noisy_in[0]*255).astype(int))

        #     if FLAGS.fullres_input:
        #         imsave(os.path.join('optimization_output', FLAGS.checkpoint, "nimage_downsized_%d.png"%i), rnin[0])
        #         imsave(os.path.join('optimization_output', FLAGS.checkpoint, "noise_downsized_%d.png"%i),sess.run(varops['noise']))


        print()
        ### end of epoch
    sess.close()

    if noisy_in is None:
        return

    # noisy_in is a plain array fetched before the session was closed.
    for i in range(num_images):
        imsave(os.path.join('optimization_output', FLAGS.checkpoint, "noisy-set", "%d.png"%(i)), (noisy_in[i]*255).astype(int))
Esempio n. 9
0
def fgm_range(x,
              preds,
              y=None,
              epsilons=[0.3],
              ord=np.inf,
              clip_min=None,
              clip_max=None,
              targeted=False):
    """
    This is a slight modification of the fast gradient method to
    return a series of fgm attacks with a set of different epsilons, in
    order to avoid the costly recomputation of the gradient.

    :param x: the input tensor the gradient is taken with respect to.
    :param preds: the model's output tensor for ``x``.
    :param y: (optional) label tensor. When None, labels are derived from
              ``preds`` (entries equal to the row maximum).
    :param epsilons: iterable of step sizes; one adversarial tensor is built
                     per entry. The default list is never mutated.
    :param ord: norm used to normalize the gradient: np.inf, 1 or 2.
    :param clip_min: (optional) lower bound for the adversarial examples.
    :param clip_max: (optional) upper bound for the adversarial examples.
                     Clipping is applied only when both bounds are given.
    :param targeted: when True the loss is negated before differentiation.
    :return: a list of adversarial tensors, in the order of ``epsilons``.
    :raises NotImplementedError: for any ``ord`` other than np.inf, 1 or 2.
    """
    # Floor for the L1/L2 norms so that an all-zero gradient produces a zero
    # direction instead of 0 / 0 = NaN.
    avoid_zero_div = 1e-12

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
        y = tf.stop_gradient(y)
    # Rescale every label row so that it sums to one.
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(y, preds, mean=False)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if ord == np.inf:
        # Take sign of gradient
        normalized_grad = tf.sign(grad)
        # The following line should not change the numerical results.
        # It applies only because `normalized_grad` is the output of
        # a `sign` op, which has zero derivative anyway.
        # It should not be applied for the other norms, where the
        # perturbation has a non-zero derivative.
        normalized_grad = tf.stop_gradient(normalized_grad)
    elif ord == 1:
        # Reduce over every axis except the leading (batch) one.
        red_ind = list(range(1, len(x.get_shape())))
        l1_norm = tf.reduce_sum(
            tf.abs(grad), reduction_indices=red_ind, keep_dims=True)
        normalized_grad = grad / tf.maximum(avoid_zero_div, l1_norm)
    elif ord == 2:
        red_ind = list(range(1, len(x.get_shape())))
        square = tf.reduce_sum(tf.square(grad),
                               reduction_indices=red_ind,
                               keep_dims=True)
        normalized_grad = grad / tf.maximum(avoid_zero_div, tf.sqrt(square))
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    # Scale the shared direction by each epsilon and add it to the original
    # example to obtain one adversarial example per epsilon.
    adv_xs = [x + eps * normalized_grad for eps in epsilons]

    # If clipping is needed, reset all values outside of [clip_min, clip_max].
    # The clipped tensors must replace the list entries: rebinding a loop
    # variable would leave the returned list unclipped.
    if (clip_min is not None) and (clip_max is not None):
        adv_xs = [tf.clip_by_value(adv_x, clip_min, clip_max)
                  for adv_x in adv_xs]

    return adv_xs
Esempio n. 10
0
def setup_attack_graph_two_masks():
    '''
    Sets up the attack graph assuming two different masks will be fed in.

    The optimized quantity is a single ``noise`` variable of image size. It is
    multiplied by the sum of the two masks, added to the input images and fed
    to the model; the optimizer minimizes the classification loss against
    ``attack_target`` plus a norm regularizer and, optionally, one
    printability term per mask.

    :return: a tuple of (the optimization op, the model, the session, the placeholders, the variable and other ops)
    :raises AssertionError: if ``FLAGS.fullres_input`` is set, or if
        ``FLAGS.initial_value_for_noise`` is given but does not hold exactly
        three comma-separated values.
    '''

    assert not (FLAGS.fullres_input
                ), "High resolution input not supported with two masks"

    # this handles the setup of the Keras model, the initialization of the TF session,
    # and the loading of the model parameters from previously saved values
    model, sess = setup_model_and_sess()

    # at this point, only the model variables exist; we will need this set later
    # in order to tell TF to not initialize those variables again
    model_vars = set(tf.global_variables())

    # will hold the placeholders so that they can be returned
    placeholders = {}

    # will hold the variables and operations defined from now on
    varops = {}

    # set up the placeholders -- these are "input" places to the computation graph that change from run to run
    # these are "filled in" for each session run by using a feed_dict
    placeholders['image_in'] = tf.placeholder(tf.float32,
                                              shape=(None, FLAGS.img_rows,
                                                     FLAGS.img_cols,
                                                     FLAGS.nb_channels))

    # attack_target is the one-hot vector for the class
    # we are trying to mimic when feeding image_in into the network
    placeholders['attack_target'] = tf.placeholder(tf.float32,
                                                   shape=(None,
                                                          FLAGS.nb_classes))

    # this is the first mask being applied to limit the region of the perturbations
    placeholders['mask1'] = tf.placeholder(tf.float32,
                                           shape=(FLAGS.img_rows,
                                                  FLAGS.img_cols,
                                                  FLAGS.nb_channels))

    # this is the second mask being applied to limit the region of the perturbations
    placeholders['mask2'] = tf.placeholder(tf.float32,
                                           shape=(FLAGS.img_rows,
                                                  FLAGS.img_cols,
                                                  FLAGS.nb_channels))

    # this is the sum of mask1 and mask2
    varops['combined_mask'] = tf.add(placeholders['mask1'],
                                     placeholders['mask2'])

    if FLAGS.printability_optimization:
        ####!!! Assumption: the printable tuples were all expanded to match
        ### the size of the image, so one tuple (x, y, z) gets replicated 32x32 times
        # we will have a different set of printable colors for each mask
        placeholders['printable_colors_region_1'] = tf.placeholder(
            tf.float32, shape=(None, 32, 32, 3))
        placeholders['printable_colors_region_2'] = tf.placeholder(
            tf.float32, shape=(None, 32, 32, 3))

    # the noise variable is what is actually being optimized
    # the values stored in variables are persisted across session runs (but not across program runs, unless saved)
    if FLAGS.initial_value_for_noise != "" and FLAGS.initial_value_for_noise != " ":
        # if a specific color for the initialization has been specified,
        # set the initial value of the noise to that color
        noise_init_color = np.float32(
            FLAGS.initial_value_for_noise.split(",")) / 255.0
        assert noise_init_color.shape == (
            3,
        ), "You must provide 3 comma-separated values or no value for the initial_value_for_noise argument"
        # the buffer starts uninitialized and is filled completely by the
        # broadcast assignment on the next line
        noise_init = np.ndarray(
            [FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels],
            dtype='float32')
        noise_init[:, :] = noise_init_color
        varops['noise'] = tf.Variable(
            noise_init,
            name='noise',
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])
    else:
        # no color given: start from uniform random noise in [0, 1).
        # The image size comes from FLAGS, like everywhere else in this
        # function; bare img_rows / img_cols are not defined here.
        varops['noise'] = tf.Variable(
            tf.random_uniform(
                [FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels], 0.0,
                1.0),
            name='noise',
            collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])

    if FLAGS.clipping:
        # from here on varops['noise'] refers to the clipped tensor; the
        # underlying variable stays reachable through the 'adv_var' collection
        varops['noise'] = tf.clip_by_value(varops['noise'],
                                           FLAGS.noise_clip_min,
                                           FLAGS.noise_clip_max)
        varops['noise_mul'] = tf.multiply(varops['combined_mask'],
                                          varops['noise'])
        varops['noise_inputs'] = tf.clip_by_value(
            tf.add(placeholders['image_in'], varops['noise_mul']),
            FLAGS.noisy_input_clip_min, FLAGS.noisy_input_clip_max)
    else:
        varops['noise_mul'] = tf.multiply(varops['combined_mask'],
                                          varops['noise'])
        varops['noise_inputs'] = tf.add(placeholders['image_in'],
                                        varops['noise_mul'])

    varops['adv_pred'] = model(varops['noise_inputs'])

    # Regularization term to control size of perturbation
    if FLAGS.regloss == 'l1':
        varops['reg_loss'] = FLAGS.attack_lambda * l1_norm(
            tf.multiply(varops['combined_mask'], varops['noise']))
    else:
        varops['reg_loss'] = FLAGS.attack_lambda * l2_norm(
            tf.multiply(varops['combined_mask'], varops['noise']))

    # Compares adv predictions to given predictions
    # Default to cross-entropy (as defined in the model_loss cleverhans utility)
    if FLAGS.optimization_loss == 'mse':
        varops['loss'] = l2_loss(placeholders['attack_target'],
                                 varops['adv_pred'])
    else:
        varops['loss'] = model_loss(placeholders['attack_target'],
                                    varops['adv_pred'],
                                    mean=True)

    if FLAGS.printability_optimization:
        # one printability term per masked region, each against its own set
        # of printable colors
        varops['nps1'] = get_nps_op(
            tf.multiply(varops['noise'], placeholders['mask1']),
            placeholders['printable_colors_region_1'])
        varops['nps2'] = get_nps_op(
            tf.multiply(varops['noise'], placeholders['mask2']),
            placeholders['printable_colors_region_2'])
        varops['adv_loss'] = varops['loss'] + varops['reg_loss'] + varops[
            'nps1'] + varops['nps2']
    else:
        varops['adv_loss'] = varops['loss'] + varops['reg_loss']

    # only the variables in the 'adv_var' collection (the noise) are trained
    op = tf.train.AdamOptimizer(learning_rate=FLAGS.optimization_rate,
                                beta1=FLAGS.adam_beta1,
                                beta2=FLAGS.adam_beta2,
                                epsilon=FLAGS.adam_epsilon).minimize(
                                    varops['adv_loss'],
                                    var_list=tf.get_collection('adv_var'))

    # initialize every global variable created since the model was loaded
    # (the noise variable and whatever the optimizer added)
    sess.run(tf.variables_initializer(set(tf.global_variables()) - model_vars))

    return op, model, sess, placeholders, varops
Esempio n. 11
0
def cifar_tutorial(train_start=0,
                   train_end=49000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64,
                   num_threads=None):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param clean_train: if true, train on clean examples
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    (X_train, Y_train), (X_test, Y_test) = cifar10.load_data()
    Y_train = np_utils.to_categorical(Y_train, 10)
    Y_test = np_utils.to_categorical(Y_test, 10)
    X_train = X_train.astype('float32')
    X_test = X_test.astype('float32')
    X_train /= 255
    X_test /= 255
    label_smooth = .1

    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))

    y = tf.placeholder(tf.float32, shape=([None, 10]))

    nb_classes = 10
    source_samples = 10
    img_rows = 32
    img_cols = 32
    channels = 3

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': FLAGS.fgsm_eps, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])
    prune_factor = FLAGS.prune_factor
    if clean_train:
        prune_percent = {
            'conv1_w': 5,
            'conv2_w': 5,
            'conv3_w': 5,
            'conv4_w': 5,
            'fc1_w': prune_factor,
            'fc2_w': prune_factor,
            'fc3_w': prune_factor
        }
        #model = make_resnet(x,10,[None,32,32,3],reuse = True,prune_percent = prune_percent)
        model = make_strong_cnn(nb_filters=nb_filters,
                                prune_percent=prune_percent)
        initialize_uninitialized_global_variables(sess)
        preds = model.get_probs(x)
        saver = tf.train.Saver()
        eval_par = {'batch_size': batch_size}

        def fgsm_combo():
            acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par)
            print('Test accuracy on legitimate examples: %0.4f\n' % acc)

            fgsm = FastGradientMethod(model, sess=sess)
            #initialize_uninitialized_global_variables(sess)
            adv_x = fgsm.generate(x, **fgsm_params)

            preds_adv = model.get_probs(adv_x)
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_test,
                             Y_test,
                             args=eval_par)

            print(
                'Test accuracy on adversarial examples generated by fgsm: %0.4f\n'
                % acc)
            bim = BasicIterativeMethod(model, sess=sess)
            adv_x = bim.generate(x)
            preds_adv = model.get_probs(adv_x)

            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_test,
                             Y_test,
                             args=eval_par)
            print(
                'Test accuracy on adversarial examples generated by IterativeMethod: %0.4f\n'
                % acc)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples

            acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        ckpt_name = './cifar_model.ckpt'

        if not FLAGS.resume:
            model_train(sess,
                        x,
                        y,
                        preds,
                        X_train,
                        Y_train,
                        evaluate=evaluate,
                        args=train_params,
                        rng=rng)
            saver.save(sess, ckpt_name)
        if FLAGS.resume:
            saver = tf.train.import_meta_graph(ckpt_name + '.meta')
            print("loading pretrain model")
            saver.restore(sess, ckpt_name)
            acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_par)
            print('Test accuracy on pretrained model: %0.4f\n' % acc)
        if not FLAGS.resume:
            import sys
            sys.exit()

        def do_jsma():
            print('Crafting ' + str(source_samples) + ' * ' +
                  str(nb_classes - 1) + ' adversarial examples')

            # Keep track of success (adversarial example classified in target)
            results = np.zeros((nb_classes, source_samples), dtype='i')

            # Rate of perturbed features for each test set example and target class
            perturbations = np.zeros((nb_classes, source_samples), dtype='f')

            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            # Instantiate a SaliencyMapMethod attack object
            jsma = SaliencyMapMethod(model, back='tf', sess=sess)
            jsma_params = {
                'theta': 1.,
                'gamma': 0.1,
                'clip_min': 0.,
                'clip_max': 1.,
                'y_target': None
            }

            figure = None
            # Loop over the samples we want to perturb into adversarial examples
            for sample_ind in xrange(0, source_samples):
                print('--------------------------------------')
                print('Attacking input %i/%i' %
                      (sample_ind + 1, source_samples))
                sample = X_test[sample_ind:(sample_ind + 1)]

                # We want to find an adversarial example for each possible target class
                # (i.e. all classes that differ from the label given in the dataset)
                current_class = int(np.argmax(Y_test[sample_ind]))
                target_classes = other_classes(nb_classes, current_class)

                # For the grid visualization, keep original images along the diagonal
                grid_viz_data[current_class,
                              current_class, :, :, :] = np.reshape(
                                  sample, (img_rows, img_cols, channels))

                # Loop over all target classes
                for target in target_classes:
                    print('Generating adv. example for target class %i' %
                          target)

                    # This call runs the Jacobian-based saliency map approach
                    one_hot_target = np.zeros((1, nb_classes),
                                              dtype=np.float32)
                    one_hot_target[0, target] = 1
                    jsma_params['y_target'] = one_hot_target
                    adv_x = jsma.generate_np(sample, **jsma_params)

                    # Check if success was achieved
                    res = int(model_argmax(sess, x, preds, adv_x) == target)

                    # Computer number of modified features
                    adv_x_reshape = adv_x.reshape(-1)
                    test_in_reshape = X_test[sample_ind].reshape(-1)
                    nb_changed = np.where(
                        adv_x_reshape != test_in_reshape)[0].shape[0]
                    percent_perturb = float(nb_changed) / adv_x.reshape(
                        -1).shape[0]

                    # Display the original and adversarial images side-by-side
                    if FLAGS.viz_enabled:
                        figure = pair_visual(
                            np.reshape(sample, (img_rows, img_cols)),
                            np.reshape(adv_x, (img_rows, img_cols)), figure)

                    # Add our adversarial example to our grid data
                    grid_viz_data[target, current_class, :, :, :] = np.reshape(
                        adv_x, (img_rows, img_cols, channels))

                    # Update the arrays for later analysis
                    results[target, sample_ind] = res
                    perturbations[target, sample_ind] = percent_perturb

            print('--------------------------------------')

            # Compute the number of adversarial examples that were successfully found
            nb_targets_tried = ((nb_classes - 1) * source_samples)
            succ_rate = float(np.sum(results)) / nb_targets_tried
            print('Avg. rate of successful adv. examples {0:.4f}'.format(
                succ_rate))
            report.clean_train_adv_eval = 1. - succ_rate

            # Compute the average distortion introduced by the algorithm
            percent_perturbed = np.mean(perturbations)
            print('Avg. rate of perturbed features {0:.4f}'.format(
                percent_perturbed))

            # Compute the average distortion introduced for successful samples only
            percent_perturb_succ = np.mean(perturbations * (results == 1))
            print('Avg. rate of perturbed features for successful '
                  'adversarial examples {0:.4f}'.format(percent_perturb_succ))
            if FLAGS.viz_enabled:
                import matplotlib.pyplot as plt
                plt.close(figure)
                _ = grid_visual(grid_viz_data)

            return report

        def do_cw():
            """
            Run the CarliniWagnerL2 attack and report its results.

            Reads ``FLAGS.targeted``, ``FLAGS.viz_enabled`` and
            ``FLAGS.attack_iterations`` together with the model, session, data
            and ``report`` object of the enclosing scope. Prints the success
            rate and the mean L2 norm of the perturbations, and stores
            ``1 - success rate`` in ``report.clean_train_adv_eval``.

            :return: the ``report`` object of the enclosing scope
            """
            targeted = FLAGS.targeted
            show_grid = FLAGS.viz_enabled
            nb_attacks = source_samples * nb_classes

            per_sample = str(nb_classes - 1) if targeted else '1'
            print('Crafting ' + str(source_samples) + ' * ' + per_sample +
                  ' adversarial examples')
            print("This could take some time ...")

            # Instantiate a CW attack object
            attack = CarliniWagnerL2(model, back='tf', sess=sess)

            # Pick the clean images to attack
            if show_grid:
                # The grid shows one source image per class: use the first
                # test example of every class.
                assert source_samples == nb_classes
                test_labels = np.argmax(Y_test, axis=1)
                idxs = [
                    np.where(test_labels == cls)[0][0]
                    for cls in range(nb_classes)
                ]
                sources = X_test[idxs]
            else:
                sources = X_test[:source_samples]

            if targeted:
                if show_grid:
                    # Initialize our array for grid visualization
                    grid_viz_data = np.zeros(
                        (nb_classes, nb_classes, img_rows, img_cols, channels),
                        dtype='f')

                # Every source image is repeated once per target class ...
                adv_inputs = np.array(
                    [[image] * nb_classes for image in sources],
                    dtype=np.float32).reshape(
                        (nb_attacks, img_rows, img_cols, channels))
                # ... and the k-th copy is paired with the one-hot label of
                # class k (identity matrix stacked once per source image).
                adv_ys = np.tile(np.eye(nb_classes, dtype=np.float32),
                                 (source_samples, 1))
                yname = "y_target"
                attack_batch = nb_attacks
            else:
                if show_grid:
                    # Initialize our array for grid visualization
                    grid_viz_data = np.zeros(
                        (nb_classes, 2, img_rows, img_cols, channels),
                        dtype='f')

                adv_inputs = sources
                adv_ys = None
                yname = "y"
                attack_batch = source_samples

            cw_params = {
                'binary_search_steps': 1,
                yname: adv_ys,
                'max_iterations': FLAGS.attack_iterations,
                'learning_rate': 0.1,
                'batch_size': attack_batch,
                'initial_const': 10
            }

            adv = attack.generate_np(adv_inputs, **cw_params)

            if targeted:
                # Fraction of examples classified as their target class
                adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                          args=eval_par)
            else:
                # Fraction of examples no longer classified as the true class
                if show_grid:
                    true_labels = Y_test[idxs]
                else:
                    true_labels = Y_test[:source_samples]
                adv_accuracy = 1 - model_eval(sess, x, y, preds, adv,
                                              true_labels, args=eval_par)

            if show_grid:
                if targeted:
                    for i in range(nb_classes):
                        for j in range(nb_classes):
                            grid_viz_data[i, j] = adv[i * nb_classes + j]
                else:
                    # Column 0 holds the clean image, column 1 the adversarial
                    for j in range(nb_classes):
                        grid_viz_data[j, 0] = adv_inputs[j]
                        grid_viz_data[j, 1] = adv[j]

                print(grid_viz_data.shape)

            print('--------------------------------------')

            # Report the rate of successfully crafted adversarial examples
            print('Avg. rate of successful adv. examples {0:.4f}'.format(
                adv_accuracy))
            report.clean_train_adv_eval = 1. - adv_accuracy

            # Report the average distortion introduced by the algorithm
            l2_norms = np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5
            mean_l2 = np.mean(l2_norms)
            print('Avg. L_2 norm of perturbations {0:.4f}'.format(mean_l2))

            # Finally, block & display a grid of all the adversarial examples
            if show_grid:
                # NOTE(review): plt is not used below; the import is kept so
                # that behavior stays identical to the previous version.
                import matplotlib.pyplot as plt
                _ = grid_visual(grid_viz_data)

            return report

        fgsm = FastGradientMethod(model, sess=sess)
        #model.test_mode()
        #initialize_uninitialized_global_variables(sess)

        fgsm_combo()
        do_cw()
        do_jsma()
        if not FLAGS.load_pruned_model:
            print("start iterative pruning")
            for i in range(FLAGS.prune_iterations):
                print("iterative %d" % (i))
                dict_nzidx = model.apply_prune(sess)
                trainer = tf.train.AdamOptimizer(learning_rate)
                preds = model.get_probs(x)
                loss = model_loss(y, preds)
                grads = trainer.compute_gradients(loss)
                grads = model.apply_prune_on_grads(grads, dict_nzidx)
                prune_args = {'trainer': trainer, 'grads': grads}
                train_params = {
                    'nb_epochs': FLAGS.retrain_epoch,
                    'batch_size': batch_size,
                    'learning_rate': FLAGS.retrain_lr
                }
                model_train(sess,
                            x,
                            y,
                            preds,
                            X_train,
                            Y_train,
                            evaluate=evaluate,
                            args=train_params,
                            rng=rng,
                            prune_args=prune_args,
                            retrainindex=i)

                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds_adv,
                                 X_test,
                                 Y_test,
                                 args=eval_par)
                print('Test accuracy on adversarial examples: %0.4f\n' % acc)
            saver.save(sess, './pruned_cifar_model.ckpt')
        else:
            print("load pruned model")
            saver = tf.train.import_meta_graph(
                './pruned_cifar_model.ckpt.meta')
            saver.restore(sess, './pruned_cifar_model.ckpt')
            print("before applying gradient inhibition\n")
            fgsm_combo()
            do_cw()
            do_jsma()
        if FLAGS.do_inhibition:
            model.inhibition(sess,
                             original_method=FLAGS.use_inhibition_original,
                             inhibition_eps=FLAGS.inhibition_eps)
        print("after applying gradient inhibition\n")
        fgsm_combo()
        do_cw()
        do_jsma()
Esempio n. 12
0
def setup_attack_graph():
    '''
    Sets up the attack graph.

    A noise variable is (optionally) clipped, multiplied by a mask, added to
    the input image and fed through the classifier. An Adam optimizer
    minimizes the classification loss plus FLAGS.attack_lambda times a norm
    of the masked noise and, if FLAGS.printability_optimization is set, a
    printability term. Only the variables in the 'adv_var' collection (the
    noise) are optimized.

    :return: a tuple of (the optimization op, the model, the session,
             the placeholders, the variable and other ops). The placeholders
             dict has the keys 'image_in', 'attack_target', 'noise_mask' and,
             with printability optimization, 'printable_colors'.
    '''

    # the NPS calculation also depends on the correct input being provided in the correct size,
    # so for now, the NPS calculation will stay undefined when the full resolution input is used
    assert not (
        FLAGS.printability_optimization and FLAGS.fullres_input
    ), "Printability optimization and full resolution input are not set up to work together. Set at least one of them to false."

    # set up the input (to the TF graph) image size once, so that every
    # operation that depends on the input size can simply reuse it
    if FLAGS.fullres_input:
        # with a full-res image, the noise and mask placeholders and vars also
        # have this size; the noisy image is resized back down to
        # FLAGS.img_rows, FLAGS.img_cols before entering the classifier
        img_rows = 300
        img_cols = 300
    else:
        # the model's own input size
        img_rows = FLAGS.img_rows
        img_cols = FLAGS.img_cols
    noise_shape = [img_rows, img_cols, FLAGS.nb_channels]

    # this handles the setup of the Keras model, the initialization of the TF session,
    # and the loading of the model parameters from previously saved values
    model, sess = setup_model_and_sess()

    # at this point, only the model variables exist; we will need this set later
    # in order to tell TF to not initialize those variables again
    model_vars = set(tf.global_variables())

    # will hold the placeholders so that they can be returned
    placeholders = {}

    # image_in: the input to the neural network
    placeholders['image_in'] = tf.placeholder(
        tf.float32, shape=(None, img_rows, img_cols, FLAGS.nb_channels))

    # attack_target: the one-hot vector for the class
    # we are trying to mimic when feeding image_in into the network
    placeholders['attack_target'] = tf.placeholder(
        tf.float32, shape=(None, FLAGS.nb_classes))

    # noise_mask: limits the region of the perturbations
    placeholders['noise_mask'] = tf.placeholder(
        tf.float32, shape=(img_rows, img_cols, FLAGS.nb_channels))

    if FLAGS.printability_optimization:
        # Assumption: the printable tuples were all expanded to match the size
        # of the noise, so one tuple (x, y, z) is replicated for every pixel.
        # The shape follows the noise shape (previously hard-coded 32x32x3) so
        # that it always matches 'noise_mul' in the printability term below.
        placeholders['printable_colors'] = tf.placeholder(
            tf.float32, shape=(None, img_rows, img_cols, FLAGS.nb_channels))

    # will hold the variables and operations defined from now on
    varops = {}

    # the noise variable is what is actually being optimized
    # the values stored in variables are persisted across session runs (but not across program runs, unless saved)
    init_spec = FLAGS.initial_value_for_noise.strip()
    if init_spec:
        # a specific color for the initialization has been specified:
        # fill the whole noise image with that color, scaled to [0, 1]
        noise_init_color = np.float32(init_spec.split(",")) / 255.0
        assert noise_init_color.shape == (
            3,
        ), "You must provide 3 comma-separated values or no value for the initial_value_for_noise argument"
        noise_init = np.empty(noise_shape, dtype='float32')
        noise_init[:, :] = noise_init_color
    else:
        # no color given (empty or whitespace-only flag): start from uniform noise
        noise_init = tf.random_uniform(noise_shape, 0.0, 1.0)
    varops['noise'] = tf.Variable(
        noise_init,
        name='noise',
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])

    # noise -> (clip) -> mask -> add to the image -> (clip)
    # note that with clipping, varops['noise'] refers to the clipped tensor
    if FLAGS.clipping:
        varops['noise'] = tf.clip_by_value(varops['noise'],
                                           FLAGS.noise_clip_min,
                                           FLAGS.noise_clip_max)
    varops['noise_mul'] = tf.multiply(placeholders['noise_mask'],
                                      varops['noise'])
    varops['noise_inputs'] = tf.add(placeholders['image_in'],
                                    varops['noise_mul'])
    if FLAGS.clipping:
        varops['noise_inputs'] = tf.clip_by_value(varops['noise_inputs'],
                                                  FLAGS.noisy_input_clip_min,
                                                  FLAGS.noisy_input_clip_max)

    # add a resize before feeding into the model if the input to the TF graph
    # was provided in full resolution (needs to be down-scaled to fit into the classification model)
    if FLAGS.fullres_input:
        varops['resized_noise_in'] = tf.image.resize_images(
            varops['noise_inputs'], (FLAGS.img_rows, FLAGS.img_cols))
        varops['adv_pred'] = model(varops['resized_noise_in'])
    else:
        # adv_pred is what comes out of the model (network) for a given input
        varops['adv_pred'] = model(varops['noise_inputs'])

    # Regularization term to control size of perturbation: the norm of the
    # masked noise (any regloss value other than 'l1' selects the l2 norm)
    norm_fn = l1_norm if FLAGS.regloss == 'l1' else l2_norm
    varops['reg_loss'] = FLAGS.attack_lambda * norm_fn(varops['noise_mul'])

    # Compares adv predictions to given predictions
    # Default to cross-entropy (as defined in the model_loss cleverhans utility)
    if FLAGS.optimization_loss == 'mse':
        varops['loss'] = l2_loss(placeholders['attack_target'],
                                 varops['adv_pred'])
    else:
        varops['loss'] = model_loss(placeholders['attack_target'],
                                    varops['adv_pred'],
                                    mean=True)

    if FLAGS.printability_optimization:
        # per pixel: distance of the masked noise to every printable color,
        # multiplied over the colors and summed over the pixels
        varops['printab_pixel_element_diff'] = tf.squared_difference(
            varops['noise_mul'], placeholders['printable_colors'])
        varops['printab_pixel_diff'] = tf.sqrt(
            tf.reduce_sum(varops['printab_pixel_element_diff'], 3))
        varops['printab_reduce_prod'] = tf.reduce_prod(
            varops['printab_pixel_diff'], 0)
        varops['printer_error'] = tf.reduce_sum(varops['printab_reduce_prod'])

    varops['adv_loss'] = varops['loss'] + varops['reg_loss']
    if FLAGS.printability_optimization:
        varops['adv_loss'] = varops['adv_loss'] + varops['printer_error']

    op = tf.train.AdamOptimizer(learning_rate=FLAGS.optimization_rate,
                                beta1=FLAGS.adam_beta1,
                                beta2=FLAGS.adam_beta2,
                                epsilon=FLAGS.adam_epsilon).minimize(
                                    varops['adv_loss'],
                                    var_list=tf.get_collection('adv_var'))

    # initialize everything created here (the noise and the optimizer's
    # variables) without touching the already loaded model variables
    sess.run(tf.variables_initializer(set(tf.global_variables()) - model_vars))

    return op, model, sess, placeholders, varops
def setup_attack_graph():
    '''
    This function sets up an attack graph
    based on the Robust Physical Perturbations
    optimization algorithm and returns the Tensorflow operation
    to run that graph, along with the model, session, variables, operations,
    and placeholders defined along the way in dictionaries
    addressed by their names.

    The model parameters are restored from FLAGS.model_path; every other
    global variable (the noise and the optimizer's variables) is initialized
    before returning.

    :return: a tuple of (operation, model, session, placeholders, varops)
    where operation is the Adam minimization op to run,
    model is the YadavModel built on the noisy inputs,
    session is the TF session used to run the attack,
    and placeholders and varops are dictionaries holding the placeholders,
    variables, and intermediate TF operations defined
    '''
    assert FLAGS.img_rows, 'img_rows needs to be defined in the parameters'
    assert FLAGS.img_cols, 'img_cols needs to be defined in the parameters'
    assert FLAGS.nb_channels, 'nb_channels needs to be defined in the parameters'
    assert FLAGS.nb_classes, 'nb_classes needs to be defined in the parameters'
    assert FLAGS.model_path, 'model_path needs to be defined in the parameters'
    assert FLAGS.printability_optimization is not None, \
            'printability_optimization needs to be defined in the parameters'
    if FLAGS.printability_optimization:
        assert FLAGS.printability_tuples, \
        'printability_tuples needs to be defined if printability_optimization is True'
    assert FLAGS.regloss, 'regloss needs to be defined in the parameters'
    assert FLAGS.optimization_loss, 'optimization_loss needs to be defined in the params'

    # begin by setting up the session that will be used
    sess = tf.Session()

    # place all placeholders in this dict
    # so that they can be returned for use outside of this function
    placeholders = {}

    # note that these are set to the size of the input,
    # resizing happens later (before building model) if different
    input_shape = (FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels)

    placeholders['image_in'] = tf.placeholder(
        tf.float32,
        shape=(None, FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels),
        name="noiseattack/image_in")

    # NOTE(review): the name below is missing a trailing "t"; it is left
    # unchanged because code outside this function may look the tensor up by name
    placeholders['attack_target'] = tf.placeholder(
        tf.float32,
        shape=(None, FLAGS.nb_classes),
        name="noiseattack/attack_targe")

    # resize later
    placeholders['noise_mask'] = tf.placeholder(
        tf.float32,
        shape=input_shape,
        name="noiseattack/noise_mask")

    if FLAGS.printability_optimization:
        # Assumption: the printable tuples were all expanded to match the size
        # of the input, so one tuple (x, y, z) is replicated for every pixel
        placeholders['printable_colors'] = tf.placeholder(
            tf.float32,
            shape=(None, FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels),
            name="noiseattack/printable_colors")

    # will hold the variables and operations defined from now on
    varops = {}

    varops['noise'] = tf.Variable(
        tf.random_normal(
            [FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels]),
        name='noiseattack/noise',
        collections=[tf.GraphKeys.GLOBAL_VARIABLES, 'adv_var'])

    # the following operations are these:
    # noise: a clipped value of the noise
    # noise_mul: the multiplication of the noise by the mask
    # noisy_inputs: the addition of the masked noise to the image
    if FLAGS.clipping:
        varops['noise'] = tf.clip_by_value(
            varops['noise'],
            FLAGS.noise_clip_min, FLAGS.noise_clip_max,
            name="noiseattack/noise_clipped")
        varops['noise_mul'] = tf.multiply(
            placeholders['noise_mask'], varops['noise'],
            name="noiseattack/noise_mul")
        varops['noisy_inputs'] = tf.clip_by_value(
            tf.add(placeholders['image_in'], varops['noise_mul']),
            FLAGS.noisy_input_clip_min, FLAGS.noisy_input_clip_max,
            name="noiseattack/noisy_inputs")
    else:
        varops['noise_mul'] = tf.multiply(
            placeholders['noise_mask'], varops['noise'],
            name="noiseattack/noise_mul")
        varops['noisy_inputs'] = tf.add(
            placeholders['image_in'], varops['noise_mul'],
            name="noiseattack/noisy_inputs")

    # down-scale the noisy image when the graph input is larger than the
    # input expected by the model
    if FLAGS.img_rows != FLAGS.input_rows or FLAGS.img_cols != FLAGS.input_cols:
        if FLAGS.resize_method == "convresize":
            assert FLAGS.img_rows == 32 and FLAGS.input_rows == 256, \
                    "Convresize only guaranteed to work with input 256 and a 32 model"
            # 8x8 kernel holding a 3x3 identity at every position, divided by
            # 64: each output channel is the mean of its own input channel
            # over an 8x8 window
            f = [[[[1, 0, 0], [0, 1, 0], [0, 0, 1]] for _ in range(8)]
                 for __ in range(8)]
            f = np.array(f).astype('float32') / (64.0)
            varops['noisy_inputs'] = tf.nn.conv2d(varops['noisy_inputs'],
                                                  tf.constant(f),
                                                  strides=[1, 8, 8, 1],
                                                  padding='SAME')
        elif FLAGS.resize_method == "avgpool":
            # floor division keeps the pooling size an integer on Python 3 too
            # NOTE(review): the factor is derived from the rows only and is
            # applied to both spatial dimensions
            s = FLAGS.input_rows // FLAGS.img_rows
            assert FLAGS.input_rows%FLAGS.img_rows == 0, \
                   "Input size should be a multiple of model input size, \
                   currently input: %d model input: %d"%(FLAGS.input_rows, FLAGS.img_rows)
            varops['noisy_inputs'] = tf.nn.avg_pool(varops['noisy_inputs'],
                                                    ksize=[1, s, s, 1],
                                                    strides=[1, s, s, 1],
                                                    padding='SAME')
        else:
            resize_methods = {
                "area": tf.image.ResizeMethod.AREA,
                "bicubic": tf.image.ResizeMethod.BICUBIC,
                "bilinear": tf.image.ResizeMethod.BILINEAR,
                "nearestneighbor": tf.image.ResizeMethod.NEAREST_NEIGHBOR,
            }
            if FLAGS.resize_method not in resize_methods:
                raise Exception("resize method needs to be one of: area, bicubic, bilinear, nearestneighbor")

            varops['noisy_inputs'] = tf.image.resize_images(
                varops['noisy_inputs'],
                size=(FLAGS.img_rows, FLAGS.img_cols),
                method=resize_methods[FLAGS.resize_method])

    # instantiate the model
    model = YadavModel(train=False, custom_input=varops['noisy_inputs'])

    # everything not under the "noiseattack" name prefix belongs to the model;
    # a real list is needed because it is printed, handed to the Saver and
    # used again for the initialization below
    model_vars = [v for v in tf.global_variables()
                  if not str(v.name).startswith("noiseattack")]
    print([v.name for v in model_vars])
    # load the model
    saver = tf.train.Saver(var_list=model_vars)
    saver.restore(sess, FLAGS.model_path)
    print('Loaded the parameters for the model from %s' % (FLAGS.model_path,))

    # adv_pred is the output of the model for an image (or images) with noise
    varops['adv_pred'] = model.labels_pred

    # Regularization term to control size of perturbation
    use_regloss = FLAGS.regloss != "none"
    if use_regloss:
        if FLAGS.regloss == 'l1':
            varops['reg_loss'] = FLAGS.attack_lambda * \
                l1_norm(tf.multiply(placeholders['noise_mask'], varops['noise']))
        elif FLAGS.regloss == 'l2':
            varops['reg_loss'] = FLAGS.attack_lambda * \
                l2_norm(tf.multiply(placeholders['noise_mask'], varops['noise']))
        else:
            raise Exception("Regloss may only be none or l1 or l2. Now%s"%FLAGS.regloss)

    # Compares adv predictions to given predictions
    if FLAGS.optimization_loss == 'justmse':
        varops['loss'] = l2_loss(placeholders['attack_target'], varops['adv_pred'])
    elif FLAGS.optimization_loss == "justcrossentropy":
        varops['loss'] = model_loss(placeholders['attack_target'], varops['adv_pred'], mean=True)
    elif FLAGS.optimization_loss == "justconfmin":
        varops['loss'] = _confmin_loss(varops['adv_pred'], FLAGS.true_class, FLAGS.target_class)
    elif FLAGS.optimization_loss == "confminandcrossentropy":
        varops['loss'] = model_loss(placeholders['attack_target'], varops['adv_pred'], mean=True) + \
                         _confmin_loss(varops['adv_pred'], FLAGS.true_class, FLAGS.target_class)
    else:
        raise Exception("Optimization_loss needs to be justmse, justcrossentropy, "
                        "justconfmin or confminandcrossentropy. Now %s"
                        % FLAGS.optimization_loss)

    if FLAGS.printability_optimization:
        # per pixel: distance of the masked noise to every printable color,
        # multiplied over the colors and summed over the pixels
        varops['printab_pixel_element_diff'] = tf.squared_difference(
            varops['noise_mul'], placeholders['printable_colors'])
        varops['printab_pixel_diff'] = tf.sqrt(tf.reduce_sum(
            varops['printab_pixel_element_diff'], 3))
        varops['printab_reduce_prod'] = tf.reduce_prod(varops['printab_pixel_diff'], 0)
        varops['printer_error'] = tf.reduce_sum(varops['printab_reduce_prod'])

    # total loss: classification loss, plus the optional regularization and
    # printability terms
    varops['adv_loss'] = varops['loss']
    if use_regloss:
        varops['adv_loss'] = varops['adv_loss'] + varops['reg_loss']
    if FLAGS.printability_optimization:
        varops['adv_loss'] = varops['adv_loss'] + varops['printer_error']

    optimization_op = tf.train.AdamOptimizer(
        learning_rate=FLAGS.optimization_rate,
        beta1=FLAGS.adam_beta1,
        beta2=FLAGS.adam_beta2,
        epsilon=FLAGS.adam_epsilon).minimize(
            varops['adv_loss'],
            var_list=tf.get_collection('adv_var'))

    # initialize every global variable that was not restored from the checkpoint
    sess.run(tf.variables_initializer(set(tf.global_variables()) - set(model_vars)))
    print('Initialized the model variables')

    return optimization_op, model, sess, placeholders, varops
Esempio n. 14
0
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64):
    """
    Train a basic CNN on a mix of clean and FGSM examples of notMNIST.

    The data is read from "notmnist.npz" in the working directory. Each
    training step feeds one batch of clean examples; the graph concatenates
    them with their FastGradientMethod counterparts and minimizes the loss on
    the shuffled mix. After every epoch the accuracy on the clean and on the
    adversarial test set is printed. The trained model is saved to
    "./adv_trained_fgsm_model_mix_data_notmnist".

    :param train_start: not used by this function
    :param train_end: not used by this function
    :param test_start: only used to check the size of the test set
    :param test_end: only used to check the size of the test set
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: not used by this function
    :param testing: not used by this function
    :param backprop_through_attack: not used by this function
    :param nb_filters: passed to make_basic_cnn
    :return: None
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get notMNIST data
    with np.load("notmnist.npz") as data:
        X_train, Y_train, X_test, Y_test = data['examples_train'], data[
            'labels_train'], data['examples_test'], data['labels_test']

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "./"
    model_name = "adv_trained_fgsm_model_mix_data_notmnist"

    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([1992, 8, 3])

    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model(x)

    # Create TF session
    sess = tf.Session()

    # Release the session even if training, evaluation or saving fails
    try:
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model(adv_x)

        # Mix clean and adversarial examples of the fed batch. The
        # permutation is drawn once, at graph-construction time, and covers
        # exactly 2 * batch_size rows.
        mixed_x = tf.concat([x, adv_x], 0)
        mixed_y = tf.concat([y, y], 0)
        index_shuffle = list(range(batch_size * 2))
        rng.shuffle(index_shuffle)
        mixed_x = tf.gather(mixed_x, index_shuffle)
        mixed_y = tf.gather(mixed_y, index_shuffle)
        preds_mixed = model(mixed_x)

        loss = model_loss(mixed_y, preds_mixed)

        train_step = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_step = train_step.minimize(loss)

        tf.global_variables_initializer().run(session=sess)

        eval_params = {'batch_size': batch_size}

        for epoch in range(nb_epochs):
            print('Training for epoch %i/%i' % (epoch, nb_epochs - 1))

            # Compute number of batches
            nb_batches = int(math.ceil(float(len(X_train)) / batch_size))
            assert nb_batches * batch_size >= len(X_train)

            # Indices to shuffle training set
            index_shuf = list(range(len(X_train)))
            rng.shuffle(index_shuf)

            prev = time.time()
            for batch in range(nb_batches):
                print('--------------------------------------')
                print('batch: %i/%i' % (batch + 1, nb_batches))
                start, end = batch_indices(batch, len(X_train), batch_size)
                X_this_batch = X_train[index_shuf[start:end]]
                Y_this_batch = Y_train[index_shuf[start:end]]

                # Only clean data is fed; the adversarial half of the mix is
                # generated inside the graph
                feed_dict = {x: X_this_batch, y: Y_this_batch}
                train_step.run(feed_dict=feed_dict, session=sess)

            cur = time.time()
            _logger.info("Epoch " + str(epoch) + " took " + str(cur - prev) +
                         " seconds")

            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

            acc2 = model_eval(sess,
                              x,
                              y,
                              preds_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
            print('Test accuracy on adversarial examples: %0.4f' % acc2)

        print('Training finished.')

        save_path = os.path.join(model_path, model_name)
        saver = tf.train.Saver()
        saver.save(sess, save_path)
        _logger.info("Completed model training and saved at: " +
                     str(save_path))
    finally:
        # Close TF session
        sess.close()
Esempio n. 15
0
def effective_train_jsma(train_start=0,
                         train_end=20,
                         test_start=0,
                         test_end=10000,
                         viz_enabled=False,
                         nb_epochs=6,
                         batch_size=128,
                         nb_classes=10,
                         source_samples=10,
                         learning_rate=0.001):
    """
    Adversarially train the basic MNIST CNN on JSMA examples crafted on the
    fly, then evaluate it on the clean test set and save it.

    For every training batch, each sample is attacked once per wrong class
    with the Jacobian-based saliency map method. One optimizer step is then
    taken on ``0.4 * loss(clean) + 0.6 * loss(adversarial)``, where both terms
    use the true label of the attacked sample.

    :param train_start: forwarded to ``data_mnist`` (first training index)
    :param train_end: forwarded to ``data_mnist`` (last training index)
    :param test_start: forwarded to ``data_mnist`` (first test index)
    :param test_end: forwarded to ``data_mnist`` (last test index)
    :param viz_enabled: unused; kept so existing callers keep working
    :param nb_epochs: number of passes over the training set
    :param batch_size: number of clean samples attacked per training step
                       (also used as the evaluation batch size)
    :param nb_classes: number of classes; every sample is attacked towards
                       each of the ``nb_classes - 1`` other classes
    :param source_samples: unused; kept so existing callers keep working
    :param learning_rate: learning rate of the Adam optimizer
    :return: the ``AccuracyReport`` filled in by the per-epoch evaluation
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    model_path = "./"
    model_name = "adv_trained_jsma_model_alpha0.4_fortest"

    rng = np.random.RandomState([2017, 8, 30])

    # Define input TF placeholders
    x1 = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # for clean data
    x2 = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # for adv data
    y = tf.placeholder(tf.float32, shape=(None, 10))  # for adv clean targets

    # Initialize the model; both outputs share the same weights
    model = make_basic_cnn()
    preds = model(x1)
    preds_adv = model(x2)

    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    # A single attack object is enough: it reads the model's TF variables,
    # so it always attacks the current weights.  Re-creating it per batch
    # would only add a fresh copy of the attack graph every time.
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)

    # Define loss: weighted sum of the clean and the adversarial loss
    loss = 0.4 * model_loss(y, preds) + 0.6 * model_loss(y, preds_adv)

    train_step = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_step = train_step.minimize(loss)

    def evaluate_2(adv_examples_last_batch, adv_clean_labels_last_batch):
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x1,
                              y,
                              preds,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x2,
                              y,
                              preds_adv,
                              adv_examples_last_batch,
                              adv_clean_labels_last_batch,
                              args=eval_params)
        print('Test accuracy on last batch of adversarial examples: %0.4f' %
              accuracy)
        report.adv_train_adv_eval = accuracy

    def craft_batch(X_this_batch, Y_this_batch):
        # Attack every sample of the batch towards every other class and
        # return (adversarial inputs, clean inputs, clean labels), aligned
        # row by row.
        adv_examples = []
        adv_clean_examples = []
        adv_clean_labels = []

        # The batch can hold fewer than batch_size samples when the training
        # set itself is smaller than batch_size, so iterate over what is
        # actually there.
        nb_samples = len(X_this_batch)
        for sample_ind in range(nb_samples):
            print('Attacking input %i/%i' % (sample_ind + 1, nb_samples))

            sample = X_this_batch[sample_ind:(sample_ind + 1)]

            # We want to find an adversarial example for each possible
            # target class (i.e. all classes that differ from the label
            # given in the dataset)
            current_class = int(np.argmax(Y_this_batch[sample_ind]))
            target_classes = other_classes(nb_classes, current_class)
            print('Current class is ', current_class)

            for target in target_classes:
                print('Generating adv. example for target class %i' % target)

                # One-hot encode the (fake) target class for the attack
                one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
                one_hot_target[0, target] = 1
                jsma_params['y_target'] = one_hot_target

                # numpy array of shape (1, 28, 28, 1), not a Tensor
                adv_x = jsma.generate_np(sample, **jsma_params)

                # Keep the adversarial example together with the clean
                # sample and the true label it was generated from
                adv_examples.append(adv_x)
                adv_clean_examples.append(sample)
                adv_clean_labels.append(Y_this_batch[sample_ind])

        # nb_samples * (nb_classes - 1) rows each
        return (np.reshape(adv_examples, (-1, 28, 28, 1)),
                np.reshape(adv_clean_examples, (-1, 28, 28, 1)),
                np.asarray(adv_clean_labels))

    try:
        with sess.as_default():
            tf.global_variables_initializer().run()

            for epoch in range(nb_epochs):
                print('Training for epoch %i/%i' % (epoch, nb_epochs - 1))

                # Compute number of batches
                nb_batches = int(math.ceil(float(len(X_train)) / batch_size))
                assert nb_batches * batch_size >= len(X_train)

                # Indices to shuffle training set
                index_shuf = list(range(len(X_train)))
                rng.shuffle(index_shuf)

                prev = time.time()
                for batch in range(nb_batches):
                    print('--------------------------------------')
                    print('batch: %i/%i' % (batch + 1, nb_batches))

                    # Batch boundaries do not depend on the sample, so they
                    # are computed once per batch
                    start, end = batch_indices(batch, len(X_train),
                                               batch_size)
                    X_this_batch = X_train[index_shuf[start:end]]
                    Y_this_batch = Y_train[index_shuf[start:end]]

                    (adv_examples, adv_clean_examples,
                     adv_clean_labels) = craft_batch(X_this_batch,
                                                     Y_this_batch)

                    # Perform one training step
                    feed_dict = {
                        x1: adv_clean_examples,
                        x2: adv_examples,
                        y: adv_clean_labels
                    }
                    train_step.run(feed_dict=feed_dict)

                cur = time.time()
                _logger.info("Epoch " + str(epoch) + " took " +
                             str(cur - prev) + " seconds")

                evaluate_2(adv_examples, adv_clean_labels)
            print('Training finished.')

            # Report on clean test data; `preds` already is model(x1), so no
            # additional copy of the network is added to the graph
            eval_par = {'batch_size': 10}
            acc_clean = model_eval(sess,
                                   x1,
                                   y,
                                   preds,
                                   X_test,
                                   Y_test,
                                   args=eval_par)
            print('Test accuracy on legitimate examples: %0.4f\n' % acc_clean)

            save_path = os.path.join(model_path, model_name)
            saver = tf.train.Saver()
            saver.save(sess, save_path)
            _logger.info("Completed model training and saved at: " +
                         str(save_path))
    finally:
        # Close TF session even when training or saving failed
        sess.close()

    return report
def main(argv=None):
    """
    Run the adversarial-noise optimization and checkpoint the best epochs.

    Every ``.png`` file in ``FLAGS.attack_srcdir`` is loaded and preprocessed,
    then the attack op is run for ``FLAGS.attack_epochs`` epochs.  After each
    epoch the same images are classified again with an all-zero noise mask
    and the share of images that changed class to ``FLAGS.target_class`` is
    printed.  A checkpoint is written whenever that share (or, on ties, the
    adversarial loss) improves.

    The progress output is written with ``sys.stdout.write`` so that this
    function behaves the same on Python 2 and Python 3.

    :param argv: unused; presumably present for ``tf.app.run`` compatibility
    :raises ValueError: if ``FLAGS.attack_srcdir`` contains no ``.png`` file
    """
    with tf.device(FLAGS.device):
        print("Parameters")
        flag_values = FLAGS.__dict__["__flags"]
        for k in sorted(flag_values.keys()):
            print("%s %s" % (k, flag_values[k]))

        op, model_obj, sess, pholders, varops = setup_attack_graph()

        model = varops['adv_pred']

        # Built as a list (not a lazy map) because its length is needed
        data = [
            preprocess_yadav(
                read_img(os.path.join(FLAGS.attack_srcdir, fname)))
            for fname in os.listdir(FLAGS.attack_srcdir)
            if fname.endswith(".png")
        ]
        num_images = len(data)
        if num_images == 0:
            # Otherwise the misclassification rate below divides by zero
            raise ValueError("No .png images found in attack_srcdir: %s" %
                             FLAGS.attack_srcdir)

        feed_dict = {
            pholders['image_in']: data,
            pholders['attack_target']: get_adv_target(nb_inputs=num_images),
            pholders['noise_mask']: read_img(FLAGS.attack_mask) / 255.0,
            model_obj.keep_prob: 1.0
        }

        if FLAGS.printability_optimization:
            feed_dict[pholders['printable_colors']] = get_print_triplets()

        # used to save checkpoints after each epoch
        saver = tf.train.Saver(max_to_keep=5)

        clean_model_loss = model_loss(pholders['attack_target'],
                                      varops['adv_pred'],
                                      mean=True)

        # An all-zero mask switches the noise off for the clean evaluation
        zero_mask = np.zeros(
            [FLAGS.input_rows, FLAGS.input_cols, FLAGS.nb_channels])

        latest_misrate = FLAGS.min_rate_to_save
        latest_loss = 10000

        for i in range(FLAGS.attack_epochs):
            # The status line of an epoch is emitted piece by piece so that
            # progress is visible while the session calls are running
            sys.stdout.write('Epoch %d ' % i)
            sys.stdout.flush()
            _, train_loss, mod_loss, noisy_in, noisy_classes = sess.run(
                (op,
                 varops['adv_loss'],
                 varops['loss'],
                 varops['noisy_inputs'],
                 varops['adv_pred']),
                feed_dict=feed_dict)

            if FLAGS.regloss != "none":
                reg_loss = sess.run(varops['reg_loss'], feed_dict=feed_dict)
            else:
                reg_loss = 0

            clean_loss, clean_classes = sess.run(
                (clean_model_loss, model),
                feed_dict={
                    pholders['image_in']: data,
                    pholders['attack_target']:
                    get_adv_target(nb_inputs=num_images),
                    pholders['noise_mask']: zero_mask,
                    model_obj.keep_prob: 1.0
                })

            sys.stdout.write(
                "adversarial loss %.5f reg loss %.5f model loss %.5f model loss on clean img: %.5f "
                % (train_loss, reg_loss, mod_loss, clean_loss))
            sys.stdout.flush()

            if FLAGS.printability_optimization:
                sys.stdout.write(
                    "noise NPS %.5f " %
                    sess.run(varops['printer_error'], feed_dict=feed_dict))

            # An image counts as misclassified only when the noise moved it
            # to the target class and it was not in that class to begin with
            num_misclassified = 0
            for clean_row, noisy_row in zip(clean_classes, noisy_classes):
                clean_classification = np.argmax(clean_row)
                noise_classification = np.argmax(noisy_row)
                if clean_classification != noise_classification and noise_classification == FLAGS.target_class:
                    num_misclassified += 1

            proportion_misclassified = float(num_misclassified) / float(
                num_images)
            sys.stdout.write('percent misclassified images %.1f\n' %
                             (proportion_misclassified * 100.0))

            if proportion_misclassified > latest_misrate or \
                    (proportion_misclassified == latest_misrate and train_loss < latest_loss) \
                    or ("octagon" in FLAGS.attack_mask and train_loss < latest_loss):
                latest_misrate = proportion_misclassified
                latest_loss = train_loss
                saver.save(sess,
                           os.path.join('optimization_output',
                                        FLAGS.checkpoint, 'model',
                                        FLAGS.checkpoint),
                           global_step=i)
            if FLAGS.save_all_noisy_images:
                write_img(
                    os.path.join(
                        'optimization_output', FLAGS.checkpoint,
                        "noisy_images",
                        "noisyimg_%s_epoch_%d.png" % (FLAGS.checkpoint, i)),
                    ((noisy_in[0] + 0.5) * 255).astype(int))
Esempio n. 17
0
def fgm(x, preds, y=None, eps=0.3, ord=np.inf, clip_min=None, clip_max=None):
    """
    TensorFlow implementation of the Fast Gradient Method with a smoothed
    gradient: the loss gradient is blurred channel by channel with a 7x7
    kernel from ``gkern(7, 2)`` before the perturbation is derived from it.

    :param x: the input placeholder
    :param preds: the model's output tensor
    :param y: (optional) A placeholder for the model labels. Only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
                     (clipping is applied only when both bounds are given)
    :return: a tensor for the adversarial example
    :raises NotImplementedError: if ``ord`` is not np.inf, 1 or 2
    """

    # Reject unsupported norms before any op is added to the graph
    if ord not in (np.inf, 1, 2):
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(y, preds, mean=False)

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    # Smooth the gradient.  depthwise_conv2d needs a filter of shape
    # (height, width, in_channels, 1), so the 2-D kernel is repeated once
    # per input channel.  Fall back to 3 channels when the channel count is
    # not known statically.
    static_shape = x.get_shape()
    nb_channels = None
    if static_shape.ndims is not None:
        nb_channels = static_shape.as_list()[-1]
    if nb_channels is None:
        nb_channels = 3

    kernel = gkern(7, 2).astype(np.float32)
    stack_kernel = np.repeat(kernel.T[:, :, np.newaxis, np.newaxis],
                             nb_channels,
                             axis=2)

    grad = tf.nn.depthwise_conv2d(grad,
                                  stack_kernel,
                                  strides=[1, 1, 1, 1],
                                  padding='SAME')

    # Lower bound for the norms below so that an all-zero gradient yields a
    # zero perturbation instead of NaN
    avoid_zero_div = 1e-12

    if ord == np.inf:
        # Take sign of gradient
        signed_grad = tf.sign(grad)
    elif ord == 1:
        reduc_ind = list(range(1, len(x.get_shape())))
        abs_sum = tf.reduce_sum(tf.abs(grad),
                                reduction_indices=reduc_ind,
                                keep_dims=True)
        signed_grad = grad / tf.maximum(avoid_zero_div, abs_sum)
    else:
        # ord == 2
        reduc_ind = list(range(1, len(x.get_shape())))
        square_sum = tf.reduce_sum(tf.square(grad),
                                   reduction_indices=reduc_ind,
                                   keep_dims=True)
        signed_grad = grad / tf.sqrt(tf.maximum(avoid_zero_div, square_sum))

    # Multiply by constant epsilon
    scaled_signed_grad = eps * signed_grad

    # Add perturbation to original example to obtain adversarial example
    adv_x = tf.stop_gradient(x + scaled_signed_grad)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) and (clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    return adv_x
Esempio n. 18
0
        def body(i, old_adv_x, old_loss, labels=labels):
            """Find example with max loss value amongst batch of perturbations.

            Draws ``self.num_samples`` random perturbations per input from
            the l^p ball of radius ``self.eps``, keeps for every input the
            perturbed example with the largest loss, and merges it with the
            best example found so far.

            :param i: iteration counter (presumably of a ``tf.while_loop``)
            :param old_adv_x: best adversarial examples found so far
            :param old_loss: loss values belonging to ``old_adv_x``
            :param labels: one-hot labels, or None to use the model's own
                           predictions as labels
            :return: tuple ``(i + 1, new_adv_x, new_loss)``
            :raises NotImplementedError: if ``self.ord`` is not np.inf, 1 or 2
            """
            nb_dims = len(deltas_shape)
            # Axes of a single input, i.e. all but (batch, sample)
            input_axes = list(range(2, nb_dims))
            # Shape of one scalar per (batch, sample) that broadcasts
            # against deltas
            per_sample_shape = deltas_shape[:2] + [1] * (nb_dims - 2)
            # Smallest positive float32: keeps -log(u) below finite
            tiny = np.finfo(np.float32).tiny

            # generate uniform samples from the l^p ball of radius eps
            if self.ord == np.inf:
                deltas = tf.random_uniform(deltas_shape)
                deltas *= 2. * self.eps
                deltas -= self.eps
            elif self.ord == 1:
                # ref: https://mathoverflow.net/questions/9185/how-to-generate-random-points-in-ell-p-balls  pylint: disable=line-too-long
                # Signed exponentials divided by (their l1 norm + one extra
                # exponential) are uniform in the l1 unit ball.
                exp = -tf.log(
                    tf.random_uniform(deltas_shape, minval=tiny, maxval=1.))
                signs = tf.sign(
                    tf.random_uniform(deltas_shape, minval=-1., maxval=1.))
                shift = -tf.log(
                    tf.random_uniform(deltas_shape[:2],
                                      minval=tiny,
                                      maxval=1.))
                norm = tf.reduce_sum(exp, input_axes)
                scale = tf.reshape(shift + norm, per_sample_shape)
                deltas = self.eps * signs * exp / scale
            elif self.ord == 2:
                # ref: https://blogs.sas.com/content/iml/2016/04/06/generate-points-uniformly-in-ball.html  pylint: disable=line-too-long
                # A uniform direction (normalised Gaussian) times a radius
                # u ** (1 / dims), with one u per sample, is uniform in the
                # l2 unit ball.
                dims = float(np.prod(deltas_shape[2:]))
                radius = tf.pow(tf.random_uniform(per_sample_shape),
                                1. / dims)
                normal = tf.random_normal(deltas_shape)
                normal /= tf.sqrt(
                    tf.reduce_sum(normal**2, axis=input_axes,
                                  keep_dims=True))
                deltas = self.eps * radius * normal
            else:
                raise NotImplementedError('Only L-inf, L1 and L2 norms are '
                                          'currently implemented.')

            adv_x = tf.expand_dims(x, 1) + deltas

            if (self.clip_min is not None) and (self.clip_max is not None):
                adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

            # Fold the sample axis into the batch axis for the model call
            adv_x_r = tf.reshape(adv_x, [-1] + deltas_shape[2:])
            preds = self.model.get_probs(adv_x_r)
            preds_shape = preds.shape.as_list()
            preds = tf.reshape(preds, deltas_shape[:2] + preds_shape[1:])

            if labels is None:
                # Using model predictions as ground truth to avoid label leaking
                preds_max = tf.reduce_max(preds, -1, keep_dims=True)
                labels = tf.to_float(tf.equal(preds, preds_max))
                labels = tf.stop_gradient(labels)
            else:
                # Repeat the given labels once per sampled perturbation
                labels = tf.expand_dims(labels, 1)
                labels = tf.tile(labels, [1, self.num_samples, 1])
            labels = labels / tf.reduce_sum(labels, -1, keep_dims=True)

            # Compute loss
            loss = utils_tf.model_loss(labels, preds, mean=False)
            if self.y_target is not None:
                loss = -loss

            # find the maximum loss value
            input_idx = tf.one_hot(tf.argmax(loss, axis=1),
                                   self.num_samples,
                                   axis=1)
            loss = tf.reduce_sum(loss * input_idx, axis=1)
            input_idx = tf.reshape(input_idx, per_sample_shape)
            adv_x = tf.reduce_sum(adv_x * input_idx, axis=1)

            # Keep the previous example wherever it still has the larger loss
            condition = tf.greater(old_loss, loss)
            new_loss = tf.where(condition, old_loss, loss)
            new_adv_x = tf.where(condition, old_adv_x, adv_x)

            return i + 1, new_adv_x, new_loss
Esempio n. 19
0
# Build the CNN with the input geometry and class count taken from FLAGS,
# and print its layer summary.
model = cnn_model(img_rows=FLAGS.img_rows, img_cols=FLAGS.img_cols, channels=FLAGS.nb_channels, nb_classes=FLAGS.nb_classes)
model.summary()

# Holds the input placeholders, keyed by name, so that they can be returned.
placeholders = {}
img_rows = FLAGS.img_rows
img_cols = FLAGS.img_cols
# Batch of input images: (batch, img_rows, img_cols, nb_channels).
placeholders['image_in'] = tf.placeholder(tf.float32, shape = (None, img_rows, img_cols, FLAGS.nb_channels))
# One label row per image: (batch, nb_classes).
placeholders['True_labels'] = tf.placeholder(tf.float32, shape = (None, FLAGS.nb_classes))

# Holds the tensors and operations defined from now on.
varops = {}
varops['pred'] = model(placeholders['image_in'])
varops['loss'] = model_loss(placeholders['True_labels'], varops['pred'], mean=True)

# `imgs` and `labels` are not defined in this part of the script; they must
# come from earlier code.  The Keras learning phase is fed as 0
# (NOTE(review): 0 is the Keras test/inference phase - confirm that this is
# intended while the optimizer below is run).
feed_dict = {placeholders['image_in']: imgs, 
             placeholders['True_labels']: labels, 
             keras.backend.learning_phase(): 0}


# Create TF session and set as Keras backend session
sess = tf.Session()
keras.backend.set_session(sess)
print("Created TensorFlow session and set Keras backend.")

# Training op minimizing varops['loss'].  Adadelta is the optimizer in use;
# the commented-out Adam and Adagrad lines are kept as alternatives.
#op = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999, epsilon=1e-08, use_locking=False,name='Adam').minimize(varops['loss'])
op = tf.train.AdadeltaOptimizer(learning_rate=0.1, rho=0.95, epsilon=1e-08, use_locking=False,name='Adadelta').minimize(varops['loss'])
#op = tf.train.AdagradOptimizer(learning_rate=0.01, initial_accumulator_value=0.1, use_locking=False,name='Adagrad').minimize(varops['loss'])