Example #1
        model.copy(sess, model_fix)
        saver = tf.train.Saver()

        attack_denoiser = LinfPGDAttack(model, config['epsilon'], config['k'],
                                        config['a'], config['random_start'],
                                        config['loss_func'])

    # progressive feature matching
    fea_matching = init_fea(sess, model, model_fix, distance_flag='L_inf')

    # saver.save(sess, os.path.join(model_dir, 'checkpoint'), global_step=adv_ep)

    for ii in range(max_num_training_steps):
        # loop over all adversarial data
        x_batch_nat, y_batch = mnist.train.next_batch(batch_size)
        x_batch_adv = attack_denoiser.perturb(x_batch_nat, y_batch, sess)
        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

        for i, tag_i in enumerate(fea_matching.tag_list):
            # layer by layer
            fea_matching.apply(sess, x_batch_nat, x_batch_adv, y_batch, tag_i)

            # monitor the accuracy
            if ii % 100 == 0:

                ######## training error
                nat_dict = {model.x_input: x_batch_nat, model.y_input: y_batch}
                nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
                hist_nat_acc += [nat_acc]
                adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}
                adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
Example #2
def advs_train(dataset='cifar-10',
               loss_name='ce',
               epochs=120,
               dynamic_epoch=100,
               batch_size=128,
               fosc_max=0.5,
               epsilon=0.031):
    """
    Adversarial training with PGD attack.
    """
    print(
        'DynamicAdvsTrain - Data set: %s, loss: %s, epochs: %s, dynamic_epoch: %s, batch: %s, epsilon: %s'
        % (dataset, loss_name, epochs, dynamic_epoch, batch_size, epsilon))

    X_train, Y_train, X_test, Y_test = get_data(dataset,
                                                clip_min=0.,
                                                clip_max=1.,
                                                onehot=True)

    n_images = X_train.shape[0]
    image_shape = X_train.shape[1:]
    n_class = Y_train.shape[1]
    print("n_images:", n_images, "n_class:", n_class, "image_shape:",
          image_shape)

    model = get_model(dataset,
                      input_shape=image_shape,
                      n_class=n_class,
                      softmax=True)
    # model.summary()

    # create loss
    if loss_name == 'ce':
        loss = cross_entropy
    else:
        print("New loss function should be defined first.")
        return

    optimizer = SGD(lr=0.01, decay=1e-4, momentum=0.9)

    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    # data augmentation
    if dataset in ['mnist']:
        datagen = ImageDataGenerator()
    elif dataset in ['cifar-10']:
        datagen = ImageDataGenerator(rotation_range=10,
                                     width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     horizontal_flip=True)
    else:
        datagen = ImageDataGenerator(width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     horizontal_flip=True)

    datagen.fit(X_train)

    # pgd attack for training
    attack = LinfPGDAttack(model,
                           epsilon=epsilon,
                           eps_iter=epsilon / 4,
                           nb_iter=10,
                           random_start=True,
                           loss_func='xent',
                           clip_min=np.min(X_train),
                           clip_max=np.max(X_train))

    # initialize logger
    mylogger = Logger(K.get_session(),
                      model,
                      X_train,
                      Y_train,
                      X_test,
                      Y_test,
                      dataset,
                      loss_name,
                      epochs,
                      suffix='%s' % epsilon)

    batch_iterator = datagen.flow(X_train, Y_train, batch_size=batch_size)

    start_time = time.time()

    for ep in range(epochs):
        # learning rate decay
        if (ep + 1) == 60:
            lr = float(K.get_value(model.optimizer.lr))
            K.set_value(model.optimizer.lr, lr / 10.0)

        if (ep + 1) == 100:
            lr = float(K.get_value(model.optimizer.lr))
            K.set_value(model.optimizer.lr, lr / 10.0)
        lr = float(K.get_value(model.optimizer.lr))

        # a simple linear decrease of fosc
        fosc = fosc_max - fosc_max * (ep * 1.0 / dynamic_epoch)
        fosc = np.max([fosc, 0.0])
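        # illustrative values: with fosc_max=0.5 and dynamic_epoch=100, fosc
        # is 0.5 at ep=0, 0.25 at ep=50, and 0.0 from ep=100 onward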

        steps_per_epoch = int(X_train.shape[0] / batch_size)
        pbar = tqdm(range(steps_per_epoch))
        for it in pbar:
            batch_x, batch_y = batch_iterator.next()
            batch_advs, fosc_batch = attack.perturb(K.get_session(), batch_x,
                                                    batch_y, batch_size, ep,
                                                    fosc)

            probs = model.predict(batch_advs)
            loss_weight = np.max(-batch_y * np.log(probs + 1e-12), axis=1)
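            # batch_y is one-hot, so the max picks out the true-class term:
            # this is the per-example cross-entropy on the adversarial batch,
            # reused below as a per-sample training weight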

            if it == 0:
                fosc_all = fosc_batch
            else:
                fosc_all = np.concatenate((fosc_all, fosc_batch), axis=0)

            if ep == 0:
                loss, acc = model.train_on_batch(batch_advs, batch_y)
            else:
                loss, acc = model.train_on_batch(batch_advs,
                                                 batch_y,
                                                 sample_weight=loss_weight)
            pbar.set_postfix(acc='%.4f' % acc, loss='%.4f' % loss)

        print('All time:', time.time() - start_time)

        log_path = './log'

        file_name = os.path.join(
            log_path, 'BatchSize_{}_Epoch_{}_fosc.npy'.format(batch_size, ep))
        np.save(file_name, fosc_all)

        val_loss, val_acc = model.evaluate(X_test,
                                           Y_test,
                                           batch_size=batch_size,
                                           verbose=0)
        logs = {
            'acc': acc,
            'loss': loss,
            'val_acc': val_acc,
            'val_loss': val_loss
        }

        print(
            "Epoch %s - loss: %.4f - acc: %.4f - val_loss: %.4f - val_acc: %.4f"
            % (ep, loss, acc, val_loss, val_acc))

        # save the log and model every epoch
        mylogger.on_epoch_end(epoch=ep, logs=logs)
        model.save_weights("model/advs_%s_%s_%s_%s.hdf5" %
                           (dataset, loss_name, epsilon, ep))
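
# A hedged driver sketch for advs_train above (not part of the original
# excerpt); the keyword arguments mirror the signature, and the values are
# simply its defaults.
if __name__ == '__main__':
    advs_train(dataset='cifar-10',
               loss_name='ce',
               epochs=120,
               dynamic_epoch=100,
               batch_size=128,
               fosc_max=0.5,
               epsilon=0.031)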
Example #3
shutil.copy('config.json', model_dir)

with tf.Session() as sess:
    # Initialize the summary writer, global variables, and our time counter.
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    training_time = 0.0

    # Main training loop
    for ii in range(max_num_training_steps):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        # Compute Adversarial Perturbations
        start = timer()
        x_batch_adv = attack.perturb(x_batch,
                                     y_batch,
                                     sess,
                                     trades=args.trades)
        end = timer()
        training_time += end - start

        full_dict = {
            x_nat_input: x_batch,
            x_adv_input: x_batch_adv,
            y_input: y_batch
        }

        # Output to stdout
        if ii % num_output_steps == 0:
            nat_acc_batch, adv_acc_batch, xent_batch, kl_batch, grad_reg_loss_batch = sess.run(
                [
                    nat_acc, adv_acc, adv_mean_xent
def train(tf_seed, np_seed, train_steps, only_finetune, finetune_train_steps,
          out_steps, summary_steps, checkpoint_steps, step_size_schedule,
          weight_decay, momentum, train_batch_size, do_advtrain, do_advreg,
          epsilon, pgd_steps, step_size, random_start, loss_func, replay_m,
          model_dir, source_model_dir, dataset, data_dir, beta, gamma,
          disc_update_steps, adv_update_steps_per_iter, disc_layers,
          disc_base_channels, steps_before_adv_opt, steps_before_adv_training,
          adv_encoder_type, enc_output_activation, sep_opt_version,
          grad_image_ratio, final_grad_image_ratio, num_grad_image_ratios,
          normalize_zero_mean, eval_adv_attack, same_optimizer,
          only_fully_connected, disc_avg_pool_hw, finetuned_source_model_dir,
          train_finetune_source_model, finetune_img_random_pert,
          img_random_pert, model_suffix, model_type, **kwargs):
    tf.set_random_seed(tf_seed)
    np.random.seed(np_seed)

    # Add pgd params to model name
    if do_advtrain:
        model_dir = model_dir + '_AdvTrain'
        if epsilon != 8:
            model_dir = model_dir + '_ep%d' % (epsilon)
        if random_start != True:
            model_dir = model_dir + '_norandstart'
        if pgd_steps != 7:
            model_dir = model_dir + '_%dsteps' % (pgd_steps)
        if step_size != 2:
            model_dir = model_dir + '_stepsize%d' % (step_size)
        model_dir = model_dir + '-{}-'.format(model_type)

    model_dir = model_dir + 'IGAM-%s_b%d' % (
        dataset, train_batch_size)  # TODO: replace with non-defaults

    if tf_seed != 451760341:
        model_dir = model_dir + '_tf_seed%d' % (tf_seed)
    if np_seed != 216105420:
        model_dir = model_dir + '_np_seed%d' % (np_seed)

    model_dir = model_dir + model_suffix

    # Setting up the data and the model
    data_path = data_dir  #"./datasets/tiny-imagenet/tiny-imagenet-200"
    raw_data = tinyimagenet_input.TinyImagenetData(data_path)
    global_step = tf.train.get_or_create_global_step()
    increment_global_step_op = tf.assign(global_step, global_step + 1)
    reset_global_step_op = tf.assign(global_step, 0)

    if model_type == "igamsource":
        model = ModelTinyImagenetSource(
            mode='train',
            dataset='tinyimagenet',
            train_batch_size=train_batch_size,
            normalize_zero_mean=normalize_zero_mean)
    else:
        model = ModelTinyImagnet(mode='train',
                                 dataset='tinyimagenet',
                                 train_batch_size=train_batch_size,
                                 normalize_zero_mean=normalize_zero_mean)

    # Setting up the optimizers
    boundaries = [int(sss[0]) for sss in step_size_schedule][1:]
    values = [sss[1] for sss in step_size_schedule]
    learning_rate = tf.train.piecewise_constant(tf.cast(global_step, tf.int32),
                                                boundaries, values)
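    # e.g. (assumed schedule format) step_size_schedule =
    # [[0, 0.1], [40000, 0.01], [60000, 0.001]] gives lr 0.1 until step
    # 40000, then 0.01 until step 60000, then 0.001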
    c_optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)

    t_vars = tf.trainable_variables()
    C_vars = [var for var in t_vars if 'classifier' in var.name]

    classification_c_loss = model.mean_xent + weight_decay * model.weight_decay_loss
    total_loss = classification_c_loss

    classification_final_grads = c_optimizer.compute_gradients(
        classification_c_loss, var_list=t_vars)
    classification_no_pert_grad = [(tf.zeros_like(v),
                                    v) if 'perturbation' in v.name else (g, v)
                                   for g, v in classification_final_grads]
    c_classification_min_step = c_optimizer.apply_gradients(
        classification_no_pert_grad)

    # Setting up the Tensorboard and checkpoint outputs
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    saver = tf.train.Saver(max_to_keep=1)
    tf.summary.scalar('C accuracy', model.accuracy)
    tf.summary.scalar('C xent', model.xent / train_batch_size)
    merged_summaries = tf.summary.merge_all()

    # Set up adversary
    attack = LinfPGDAttack(model,
                           epsilon,
                           pgd_steps,
                           step_size,
                           random_start,
                           loss_func,
                           dataset=dataset)

    with tf.Session() as sess:
        print(
            'important params >>> \n model dir: %s \n dataset: %s \n training batch size: %d \n'
            % (model_dir, dataset, train_batch_size))
        # initialize data augmentation
        data = tinyimagenet_input.AugmentedTinyImagenetData(
            raw_data, sess, model)

        # Initialize the summary writer, global variables, and our time counter.
        summary_writer = tf.summary.FileWriter(model_dir + '/train',
                                               sess.graph)
        eval_summary_writer = tf.summary.FileWriter(model_dir + '/eval')
        sess.run(tf.global_variables_initializer())

        # Main training loop
        for ii in tqdm(range(train_steps)):
            x_batch, y_batch = data.train_data.get_next_batch(
                train_batch_size, multiple_passes=True)
            if img_random_pert and not (do_advtrain and random_start
                                        and ii >= steps_before_adv_training):
                x_batch = x_batch + np.random.uniform(-epsilon, epsilon,
                                                      x_batch.shape)
                x_batch = np.clip(x_batch, 0, 255)  # ensure valid pixel range

            labels_source_modelgrad_disc = np.ones_like(y_batch,
                                                        dtype=np.int64)
            nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

            # Generate adversarial training examples
            if do_advtrain and ii >= steps_before_adv_training:
                x_batch_adv = attack.perturb(x_batch, y_batch, sess)

                train_dict = {
                    model.x_input: x_batch_adv,
                    model.y_input: y_batch
                }
            else:
                train_dict = nat_dict

            # Output to stdout
            if ii % summary_steps == 0:
                train_acc, train_c_loss, summary = sess.run(
                    [model.accuracy, total_loss, merged_summaries],
                    feed_dict=train_dict)
                summary_writer.add_summary(summary, global_step.eval(sess))

                x_eval_batch, y_eval_batch = data.eval_data.get_next_batch(
                    train_batch_size, multiple_passes=True)
                if img_random_pert and not (do_advtrain and random_start):
                    x_eval_batch = x_eval_batch + np.random.uniform(
                        -epsilon, epsilon, x_eval_batch.shape)
                    x_eval_batch = np.clip(x_eval_batch, 0,
                                           255)  # ensure valid pixel range

                labels_source_modelgrad_disc = np.ones_like(y_eval_batch,
                                                            dtype=np.int64)
                eval_nat_dict = {
                    model.x_input: x_eval_batch,
                    model.y_input: y_eval_batch
                }
                if do_advtrain:
                    x_eval_batch_adv = attack.perturb(x_eval_batch,
                                                      y_eval_batch, sess)
                    eval_dict = {
                        model.x_input: x_eval_batch_adv,
                        model.y_input: y_eval_batch
                    }
                else:
                    eval_dict = eval_nat_dict

                val_acc, val_c_loss, summary = sess.run(
                    [model.accuracy, total_loss, merged_summaries],
                    feed_dict=eval_dict)
                eval_summary_writer.add_summary(summary,
                                                global_step.eval(sess))
                print('Step {}:    ({})'.format(ii, datetime.now()))
                print(
                    '    training nat accuracy {:.4}% -- validation nat accuracy {:.4}%'
                    .format(train_acc * 100, val_acc * 100))
                print('    training nat c loss: {}'.format(train_c_loss))
                print('    validation nat c loss: {}'.format(val_c_loss))

                sys.stdout.flush()
            # Tensorboard summaries
            elif ii % out_steps == 0:
                nat_acc, nat_c_loss = sess.run([model.accuracy, total_loss],
                                               feed_dict=train_dict)
                print('Step {}:    ({})'.format(ii, datetime.now()))
                print('    training nat accuracy {:.4}%'.format(nat_acc * 100))
                print('    training nat c loss: {}'.format(nat_c_loss))

            # Write a checkpoint
            if (ii + 1) % checkpoint_steps == 0:
                saver.save(sess,
                           os.path.join(model_dir, 'checkpoint'),
                           global_step=global_step)

            sess.run(c_classification_min_step, feed_dict=train_dict)
            sess.run(increment_global_step_op)

        # full test evaluation
        raw_data = tinyimagenet_input.TinyImagenetData(data_path)

        data_size = raw_data.eval_data.n
        # note: any final partial batch is skipped
        # (eval_steps = data_size // train_batch_size + 1 would include it)
        eval_steps = data_size // train_batch_size
        total_num_correct = 0
        for ii in tqdm(range(eval_steps)):
            x_eval_batch, y_eval_batch = raw_data.eval_data.get_next_batch(
                train_batch_size, multiple_passes=False)
            eval_dict = {
                model.x_input: x_eval_batch,
                model.y_input: y_eval_batch
            }
            num_correct = sess.run(model.num_correct, feed_dict=eval_dict)
            total_num_correct += num_correct
        eval_acc = total_num_correct / data_size

        clean_eval_file_path = os.path.join(model_dir,
                                            'full_clean_eval_acc.txt')
        with open(clean_eval_file_path, "a+") as f:
            f.write("Full clean eval_acc: {}%".format(eval_acc * 100))
        print("Full clean eval_acc: {}%".format(eval_acc * 100))

        devices = sess.list_devices()
        print("sess' device names:")
        for d in devices:
            print(d.name)

    return model_dir
    ##
    #
    # Load the neural network that has not been adversarially trained
    model_path_raw = '/home/zrs/Desktop/adversarial_attacks/cnn_model/model.ckpt'  # model checkpoint path
    load_path = saver.restore(sess, model_path_raw)

for step in range(5):
    #    print (step)
    #epoch = int(epoch / 2)

    for i in range(epoch):
        p = (i / epoch)
        print(p, step)
        batch_x = train_X[i * batch_size:(i + 1) * batch_size]
        batch_y = train_Y[i * batch_size:(i + 1) * batch_size]
        batch_x_adv = attack.perturb(batch_x, batch_y, sess)

        X_final = np.concatenate([batch_x, batch_x_adv])
        y_final = np.concatenate([batch_y, batch_y])

        lr = 0.0001 / (1. + 10 * p)**0.75
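        # inverse-decay annealing: lr falls smoothly from 1e-4 at p=0 to
        # roughly 1.7e-5 at p=1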
        #        sess.run(model.optimizer, feed_dict={model.x_input: batch_x, model.y_input: batch_y, model.learning_rate_1 : lr})
        sess.run(model.optimizer,
                 feed_dict={
                     model.x_input: X_final,
                     model.y_input: y_final,
                     model.learning_rate_1: lr
                 })

#        tensorboard_summaries
#        tensorboard_num = i + epoch * step
class SpatialAttack:
  def __init__(self, model, config, method=None, worstofk=None,
               attack_limits=None, fo_epsilon=2.0, fo_step_size=2.,
               fo_num_steps=5):
    self.model = model
    self.grid_store = []

    if config.use_linf:
        self.linf_attack = LinfPGDAttack(
            model, config, fo_epsilon, fo_step_size, fo_num_steps)
    else:
        self.linf_attack = None

    self.use_spatial = config.use_spatial
    if config.use_spatial:
      # Attack method
        if method == None:
          self.method = config.spatial_method
        else:
          self.method = method

        # Attack parameters
        if attack_limits == None:
          self.limits = config.spatial_limits
        else:
          self.limits = attack_limits

        if config.only_rotation:
            self.limits = [0,0,self.limits[2]]

        if config.only_translation:
            self.limits = [self.limits[0],self.limits[1],0]

        # Attack method parameters
        if self.method == 'grid':
            self.granularity = config.grid_granularity
        elif self.method == 'random':
          if worstofk == None:
            self.random_tries = config.random_tries
          else:
            self.random_tries = worstofk
        elif self.method == 'fo':
            self.fo_attack = SpatialPGDAttack(
                model, config, fo_epsilon, fo_step_size, fo_num_steps)
        else:
            raise NotImplementedError


  def perturb(self, x_nat, y, sess):
      if not self.use_spatial:
          t = np.zeros([len(x_nat), 3])
          if self.linf_attack:
              x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
          else:
              x = x_nat
          return x, t
      if self.method == 'grid':
          return self.perturb_grid(x_nat, y, sess, -1)
      elif self.method == 'fo':
          return self.fo_attack.perturb(x_nat, y, sess)
      else: # random
          return self.perturb_grid(x_nat, y, sess, self.random_tries)

  def perturb_grid(self, x_nat, y, sess, random_tries=-1):
    n = len(x_nat)
    if random_tries > 0:
        # subsampling this list from the grid is a bad idea; instead we
        # randomize each example over the full continuous range
        grid = [(42, 42, 42) for _ in range(random_tries)] # dummy list
    else: # exhaustive grid
        grid = product(*list(np.linspace(-l, l, num=g)
                             for l, g in zip(self.limits, self.granularity)))

    worst_x = np.copy(x_nat)
    worst_t = np.zeros([n, 3])
    max_xent = np.zeros(n)
    all_correct = np.ones(n).astype(bool)

    for tx, ty, r in grid:
        if random_tries > 0:
            # randomize each example separately
            t = np.stack([np.random.uniform(-l, l, n) for l in self.limits],
                         axis=1)
        else:
            t = np.stack(repeat([tx, ty, r], n))

        if self.linf_attack:
            x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
        else:
            x = x_nat

        curr_dict = {self.model.x_input: x,
                     self.model.y_input: y,
                     self.model.is_training: False,
                     self.model.transform: t}

        cur_xent, cur_correct = sess.run([self.model.y_xent,
                                          self.model.correct_prediction],
                                         feed_dict = curr_dict) # shape (bsize,)
        cur_xent = np.asarray(cur_xent)
        cur_correct = np.asarray(cur_correct)

        # Select indices to update: we choose the misclassified transformation
        # of maximum xent (or just highest xent if everything else is correct).
        idx = (cur_xent > max_xent) & (cur_correct == all_correct)
        idx = idx | (cur_correct < all_correct)
        max_xent = np.maximum(cur_xent, max_xent)
        all_correct = cur_correct & all_correct

        idx = np.expand_dims(idx, axis=-1) # shape (bsize, 1)
        worst_t = np.where(idx, t, worst_t) # shape (bsize, 3)

        idx = np.expand_dims(idx, axis=-1)
        idx = np.expand_dims(idx, axis=-1) # shape (bsize, 1, 1, 1)
        worst_x = np.where(idx, x, worst_x)  # shape (bsize, 32, 32, 3)

    return worst_x, worst_t
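
# A hedged usage sketch for the SpatialAttack class above (not from the
# original excerpt); model, config, x_batch, y_batch, and sess are assumed
# to come from the surrounding training script.
def worst_of_k_batch(model, config, x_batch, y_batch, sess, k=10):
    # worst-of-k random spatial attack: sample k random (tx, ty, rot)
    # triples per example and keep the one with the highest xent
    attack = SpatialAttack(model, config, method='random', worstofk=k)
    x_adv, trans = attack.perturb(x_batch, y_batch, sess)
    return x_adv, trans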
Example #7
    # Initialize the summary writer, global variables, and our time counter.
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    saver.restore(
        sess,
        '/home/hope-yao/Documents/mnist_challenge/models/a_very_robust_model_madry/checkpoint-99900'
    )
    training_time = 0.0

    # Main training loop
    for ii in range(max_num_training_steps):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        # Compute Adversarial Perturbations
        start = timer()
        x_batch_adv = attack.perturb(x_batch, y_batch, sess)
        end = timer()
        training_time += end - start

        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

        # Output to stdout
        if ii % num_output_steps == 0:
            nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
            adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
            print('Step {}:    ({})'.format(ii, datetime.now()))
            print('    training nat accuracy {:.4}%'.format(nat_acc * 100))
            print('    training adv accuracy {:.4}%'.format(adv_acc * 100))
            if ii != 0:
        if i % report_batch == 1:
            np_adv_image = []
            np_benign_image = []
            np_label = []
            np_pgd_image = []
            np_pred_normal = []
            np_detection_normal = []
            np_pred_adv = []
            np_detection_adv = []
            np_pred_pgd = []
            np_detection_pgd = []

        x_train_val, y_train_val = get_data(sess)
        #print(x_train_val[0])
        #exit()
        x_train_perturbed = pgd_attack.perturb(x_train_val, y_train_val, sess)

        fdict = {content: x_train_perturbed, label: y_train_val}
        _acc = sess.run(norm_acc, feed_dict=fdict)

        fdict = {content: x_train_val, label: y_train_val}

        grad_attack()
        x_train_style = sess.run(adv_img, feed_dict=fdict)
        print("result normal:")
        _, p_set_normal, p_det_normal = gaussdetect.detect(
            x_train_val, y_train_val, batch_size=BATCH_SIZE)
        print("result pgd:")
        _, p_set_pgd, p_det_pgd = gaussdetect.detect(x_train_perturbed,
                                                     y_train_val,
                                                     batch_size=BATCH_SIZE)
    cur_ckpt = args.ckpt

    with tf.Session() as sess:
        for i in range(args.atta_loop):
            x_batch = mnist.test.images[batch_start:batch_start + 500]
            y_batch = mnist.test.labels[batch_start:batch_start + 500]
            x_batch_adv = x_batch.copy()
            path = args.log_prefix + str(i + 1) + ".log"
            print(path)
            log_file = open(path, 'w')

            print(os.path.join(model_dir, "checkpoint-" + str(cur_ckpt)))

            model_ckpt = os.path.join(model_dir, "checkpoint-" + str(cur_ckpt))
            saver.restore(sess, model_ckpt)

            x_batch_adv = attack.perturb(x_batch, y_batch, sess, log_file)

            nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
            adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

            nat_loss = sess.run(model.mean_xent, feed_dict=nat_dict)
            loss = sess.run(model.mean_xent, feed_dict=adv_dict)

            print("adv loss:     {}".format(loss))
            print("nat-loss: {}".format(nat_loss))

            log_file.close()
            batch_start += 500
Example #10
def train_model(dataset, config, plotter, adversarial, mixed):

    clear_session()

    # Set seeds
    tf.set_random_seed(config['random_seed'])
    np.random.seed(config['random_seed'])

    # Set save directory
    model_dir = ""
    if adversarial:
        if mixed:
            model_dir = config['model_dir_adv_mixed']
        else:
            model_dir = config['model_dir_adv']
    else:
        if mixed:
            model_dir = config['model_dir_mixed']
        else:
            model_dir = config['model_dir']
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set dataset
    x_vals = dataset['X_train']
    y_vals = dataset['Y_train']

    # Get parameters
    batch_size = config['batch_size']
    weight_decay = config['weight_decay']
    C = 1.0 / (batch_size * weight_decay)
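    # soft-margin penalty: C = 1 / (n * lambda), the usual correspondence
    # between an L2 weight-decay coefficient and the SVM slack penalty C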
    # C = config['C']
    learning_rate = config['learning_rate']

    # Setup tensorflow objects
    svm_model = Model(batch_size, C=C)
    global_step = tf.compat.v1.train.get_or_create_global_step()

    attack = LinfPGDAttack(svm_model,
                           config['epsilon'],
                           config['k'],
                           config['a'],
                           config['random_start'],
                           config['momentum'],
                           config['beta'],
                           config['random_seed'],
                           plotter=plotter)

    # Set optimizer for model training
    my_opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_step = my_opt.minimize(svm_model.loss)
    init = tf.global_variables_initializer()  # initialize_all_variables is deprecated

    # Variables used during training
    X = None
    Y = None

    clean_loss_history = []
    clean_accuracy_history = []
    robust_loss_history = []
    robust_accuracy_history = []

    train_history = {}

    X_adv = None
    X_adv_save = None
    Y_save = None
    Y = None

    A = None
    b = None

    # Start tensorflow session
    with tf.Session() as sess:
        sess.run(init)

        # Training: Batch Gradient Descent
        for i in range(100):  # Orig: 100

            # Create randomly selected batch
            rand_index = np.random.choice(len(x_vals), size=batch_size)
            X = x_vals[rand_index]
            Y = np.transpose([y_vals[rand_index]])

            # In case of adversarial training we perturb the batch data
            X_adv = None
            if adversarial:
                X_adv = attack.perturb(X, Y, sess, debug=i == 40)
                X = X_adv

            # Storing batch set performance
            clean_loss = sess.run(svm_model.loss,
                                  feed_dict={
                                      svm_model.x_input: X,
                                      svm_model.y_input: Y
                                  })
            clean_acc = sess.run(svm_model.accuracy,
                                 feed_dict={
                                     svm_model.x_input: X,
                                     svm_model.y_input: Y,
                                     svm_model.prediction_grid: X
                                 })

            robust_loss = 0
            robust_acc = 0

            if adversarial:
                robust_loss = sess.run(svm_model.loss,
                                       feed_dict={
                                           svm_model.x_input: X_adv,
                                           svm_model.y_input: Y
                                       })
                robust_acc = sess.run(svm_model.accuracy,
                                      feed_dict={
                                          svm_model.x_input: X_adv,
                                          svm_model.y_input: Y,
                                          svm_model.prediction_grid: X_adv
                                      })

            if (i + 1) % 1 == 0:
                print('\nStep #' + str(i + 1))
                print('Clean Loss = ' + str(clean_loss))
                print('Clean Accuracy = ' + str(clean_acc))
                if adversarial:
                    print('Robust Loss = ' + str(robust_loss))
                    print('Robust Accuracy = ' + str(robust_acc))

            clean_loss_history.append(str(clean_loss[0][0]))
            clean_accuracy_history.append(str(clean_acc))
            if adversarial:
                robust_loss_history.append(str(robust_loss[0][0]))
                robust_accuracy_history.append(str(robust_acc))

            # Train model
            if adversarial:
                if not mixed:
                    X_adv_save = X_adv
                    Y_save = Y
                # print(X_adv - X)
                sess.run(train_step,
                         feed_dict={
                             svm_model.x_input: X_adv,
                             svm_model.y_input: Y
                         })
            else:
                sess.run(train_step,
                         feed_dict={
                             svm_model.x_input: X,
                             svm_model.y_input: Y
                         })

            plotter.plot(sess,
                         model=svm_model,
                         X=X,
                         Y=Y,
                         train_iter=i,
                         pgd_attack=False)

        # Save model
        A = sess.run(svm_model.A)
        b = sess.run(svm_model.b)
        saver = tf.train.Saver(max_to_keep=3)

        if adversarial:
            if mixed:
                saver.save(sess,
                           os.path.join(
                               config['model_dir_adv_mixed'], 'model_' +
                               'batch-size-' + str(config['batch_size']) +
                               '_C-' + str(config['C']) + '_learning-rate-' +
                               str(config['learning_rate'])),
                           global_step=global_step)
            else:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir_adv'], 'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' + str(config['C']) +
                        '_learning-rate-' + str(config['learning_rate'])),
                    global_step=global_step)
        else:
            if mixed:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir_mixed'], 'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' + str(config['C']) +
                        '_learning-rate-' + str(config['learning_rate'])),
                    global_step=global_step)
            else:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir'], 'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' + str(config['C']) +
                        '_learning-rate-' + str(config['learning_rate'])),
                    global_step=global_step)

    train_history['clean loss'] = clean_loss_history
    train_history['clean accuracy'] = clean_accuracy_history
    train_history['robust loss'] = robust_loss_history
    train_history['robust accuracy'] = robust_accuracy_history
    train_history['A'] = A
    train_history['b'] = b

    # print(train_history)

    data = {'X': X_adv_save, 'Y': Y_save}

    # print(data)

    with open('gaussian_perturbed_train_test.npz', 'wb') as f:
        pickle.dump(data, f, protocol=2)

    return train_history
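
# A hedged driver sketch for train_model above (not part of the original
# excerpt); dataset, config, and plotter are assumed to be built by the
# surrounding project:
#
#   history = train_model(dataset, config, plotter,
#                         adversarial=True, mixed=False)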
Example #11
            targeted=False, num_classes=10, elementwise_best=True)

n_total = 0
n_correct = 0

train_adv_data = []
train_adv_labels = []

for i, (img, label) in enumerate(train_dataloader):
    #img = img.expand(img.data.shape[0], 3, 28, 28)
    
    batch_size = img.shape[0]
    img = img.cuda()
    label = label.cuda()

    adv_img = attacker.perturb(img, label)
    train_adv_data.extend(adv_img.cpu().numpy())
    train_adv_labels.extend(label.cpu().numpy())

    adv_output = model(input_data=adv_img)
    pred = adv_output.data.max(1, keepdim=True)[1]
    n_correct += pred.eq(label.data.view_as(pred)).cpu().sum()
    n_total += batch_size
    print('Process {}'.format(n_total))

accu = n_correct.data.numpy() * 1.0 / n_total

print('Adv acc:', accu)

adv_data_save_path_train = 'dataset/adv_mnist/train'
os.makedirs(adv_data_save_path_train, exist_ok=True)
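
# Hedged continuation sketch (file names are illustrative, not from the
# original): persist the collected adversarial examples with numpy.
np.save(os.path.join(adv_data_save_path_train, 'adv_images.npy'),
        np.array(train_adv_data))
np.save(os.path.join(adv_data_save_path_train, 'adv_labels.npy'),
        np.array(train_adv_labels))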
Example #12
def evaluate_checkpoint(filename, weight_prune, tolerance, relu_prune,
                        relu_prune_frac):
    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, filename)
        print('restored checkpoint for {}'.format(filename))
        print('First eval - no changes')

        x_single_train = mnist.train.images[0:1, :]
        y_single_train = mnist.train.labels[0:1]
        dict_nat_single = {
            model.x_input: x_single_train,
            model.x_input_natural: x_single_train,
            model.y_input: y_single_train
        }

        # Get the variables
        c1_v = [x for x in tf.global_variables() if x.op.name == 'Variable'][0]
        c1_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_1'][0]
        c2_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable_2'][0]
        c2_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_3'][0]
        fc_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable_4'][0]
        fc_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_5'][0]
        sm_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable_6'][0]
        sm_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_7'][0]

        # Save values in the final variables
        c1, c1b, c2, c2b, fc, fcb, sm, smb = sess.run(
            [c1_v, c1_b, c2_v, c2_b, fc_v, fc_b, sm_v, sm_b],
            feed_dict=dict_nat_single)

        if do_eval:
            # Iterate over the eval samples batch-by-batch
            num_batches = int(math.ceil(num_eval_examples / eval_batch_size))
            total_corr_nat = 0
            total_corr_adv = 0
            tot_unstable1n = 0
            tot_unstable2n = 0
            tot_unstable3n = 0

            for ibatch in range(num_batches):
                bstart = ibatch * eval_batch_size
                bend = min(bstart + eval_batch_size, num_eval_examples)

                x_batch = mnist.test.images[bstart:bend, :]
                y_batch = mnist.test.labels[bstart:bend]

                dict_nat = {
                    model.x_input: x_batch,
                    model.x_input_natural: x_batch,
                    model.y_input: y_batch
                }

                x_batch_adv = attack.perturb(x_batch, y_batch, sess)

                dict_adv = {
                    model.x_input: x_batch_adv,
                    model.x_input_natural: x_batch,
                    model.y_input: y_batch
                }

                cur_corr_nat = sess.run(model.num_correct, feed_dict=dict_nat)
                cur_corr_adv = sess.run(model.num_correct, feed_dict=dict_adv)

                total_corr_nat += cur_corr_nat
                total_corr_adv += cur_corr_adv

                un1n, un2n, un3n = \
                  sess.run([model.unstable1, model.unstable2, \
                            model.unstable3],
                            feed_dict = dict_nat)

                tot_unstable1n += np.sum(un1n)
                tot_unstable2n += np.sum(un2n)
                tot_unstable3n += np.sum(un3n)

            avg_un1n = tot_unstable1n / num_eval_examples
            avg_un2n = tot_unstable2n / num_eval_examples
            avg_un3n = tot_unstable3n / num_eval_examples
            acc_nat = total_corr_nat / num_eval_examples
            acc_adv = total_corr_adv / num_eval_examples

            print('natural: {:.2f}%'.format(100 * acc_nat))
            print('adversarial: {:.2f}%'.format(100 * acc_adv))
            print('  un1n, un2n, un3n: {}, {}, {}'.format(
                avg_un1n, avg_un2n, avg_un3n))

        if weight_prune:
            print('Second eval - prune small weights')

            # Hardcoded variables
            prune_small_weights([c1_v, c2_v, fc_v], sess, tolerance)

            # These are now the correct values for the masked model (no need to re-fix nonzeros)
            c1, c1b, c2, c2b, fc, fcb, sm, smb = sess.run(
                [c1_v, c1_b, c2_v, c2_b, fc_v, fc_b, sm_v, sm_b],
                feed_dict=dict_nat_single)

            if do_eval:
                # Iterate over the eval samples batch-by-batch
                num_batches = int(
                    math.ceil(num_eval_examples / eval_batch_size))
                total_corr_nat = 0
                total_corr_adv = 0
                tot_unstable1n = 0
                tot_unstable2n = 0
                tot_unstable3n = 0

                for ibatch in range(num_batches):
                    bstart = ibatch * eval_batch_size
                    bend = min(bstart + eval_batch_size, num_eval_examples)

                    x_batch = mnist.test.images[bstart:bend, :]
                    y_batch = mnist.test.labels[bstart:bend]

                    dict_nat = {
                        model.x_input: x_batch,
                        model.x_input_natural: x_batch,
                        model.y_input: y_batch
                    }

                    x_batch_adv = attack.perturb(x_batch, y_batch, sess)

                    dict_adv = {
                        model.x_input: x_batch_adv,
                        model.x_input_natural: x_batch,
                        model.y_input: y_batch
                    }

                    cur_corr_nat = sess.run(model.num_correct,
                                            feed_dict=dict_nat)
                    cur_corr_adv = sess.run(model.num_correct,
                                            feed_dict=dict_adv)

                    total_corr_nat += cur_corr_nat
                    total_corr_adv += cur_corr_adv

                    un1n, un2n, un3n = \
                      sess.run([model.unstable1, model.unstable2, \
                                model.unstable3],
                                feed_dict = dict_nat)

                    tot_unstable1n += np.sum(un1n)
                    tot_unstable2n += np.sum(un2n)
                    tot_unstable3n += np.sum(un3n)

                avg_un1n = tot_unstable1n / num_eval_examples
                avg_un2n = tot_unstable2n / num_eval_examples
                avg_un3n = tot_unstable3n / num_eval_examples
                acc_nat = total_corr_nat / num_eval_examples
                acc_adv = total_corr_adv / num_eval_examples

                print('natural: {:.2f}%'.format(100 * acc_nat))
                print('adversarial: {:.2f}%'.format(100 * acc_adv))
                print('  un1n, un2n, un3n: {}, {}, {}'.format(
                    avg_un1n, avg_un2n, avg_un3n))

        if relu_prune:
            print('Third eval - prune relus')

            # Count how often each ReLU is active; units whose count is
            # (close to) 0 or 55000 over the training set can be pruned
            h1_rc = tf.reduce_sum(tf.cast(model.h_1 > 0, tf.int32), axis=0)
            h2_rc = tf.reduce_sum(tf.cast(model.h_2 > 0, tf.int32), axis=0)
            hfc_rc = tf.reduce_sum(tf.cast(model.h_fc_pre_relu > 0, tf.int32),
                                   axis=0)

            # Iterate over the training samples batch-by-batch to do relu count
            num_training_batches = int(
                math.ceil(num_training_examples / eval_batch_size))

            # Count ReLU activations on adversarial training examples only, since the DNN is trained on them
            tot_rc1 = 0
            tot_rc2 = 0
            tot_rfc = 0

            for ibatch in range(num_training_batches):
                bstart = ibatch * eval_batch_size
                bend = min(bstart + eval_batch_size, num_training_examples)

                x_batch = mnist.train.images[bstart:bend, :]
                y_batch = mnist.train.labels[bstart:bend]
                x_batch_adv = attack.perturb(x_batch, y_batch, sess)

                dict_adv = {
                    model.x_input: x_batch_adv,
                    model.x_input_natural: x_batch,
                    model.y_input: y_batch
                }

                rc1_adv = sess.run(h1_rc, feed_dict=dict_adv)
                rc2_adv = sess.run(h2_rc, feed_dict=dict_adv)
                rfc_adv = sess.run(hfc_rc, feed_dict=dict_adv)
                tot_rc1 += rc1_adv
                tot_rc2 += rc2_adv
                tot_rfc += rfc_adv

            def get_ops(adv, relu_prune_frac):
                num_to_remove = int(num_training_examples * relu_prune_frac)
                assert (num_to_remove <= num_training_examples / 2 + 1)
                linear_relus = adv >= (num_training_examples - num_to_remove)
                zero_relus = adv <= num_to_remove
                ops = np.zeros(adv.shape)
                ops[linear_relus] = 1
                ops[zero_relus] = -1
                print("number of relus left: ", len(ops[ops == 0]))
                return ops
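            # illustrative numbers: with num_training_examples=55000 and
            # relu_prune_frac=0.1, num_to_remove=5500; units active on at
            # least 49500 inputs become identity (+1), units active on at
            # most 5500 inputs become zero (-1), the rest remain ReLUs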

            c1_ops = get_ops(tot_rc1, relu_prune_frac)
            c2_ops = get_ops(tot_rc2, relu_prune_frac)
            fc_ops = get_ops(tot_rfc, relu_prune_frac)

            if do_eval:
                mask_model = models.MNIST_naive_ia_masked.Model(
                    config, c1_ops, c2_ops, fc_ops)
                mask_model_attack = LinfPGDAttack(mask_model,
                                                  config['epsilon'],
                                                  config['k'], config['a'],
                                                  config['random_start'],
                                                  config['loss_func'])

                print("Created masked model")

                # Copy variables over from main model
                new_c1_v = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_8'
                ][0]
                new_c1_b = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_9'
                ][0]
                new_c2_v = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_10'
                ][0]
                new_c2_b = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_11'
                ][0]
                new_fc_v = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_12'
                ][0]
                new_fc_b = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_13'
                ][0]
                new_sm_v = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_14'
                ][0]
                new_sm_b = [
                    x for x in tf.global_variables()
                    if x.op.name == 'Variable_15'
                ][0]

                new_c1_v.assign(c1).eval()
                new_c1_b.assign(c1b).eval()
                new_c2_v.assign(c2).eval()
                new_c2_b.assign(c2b).eval()
                new_fc_v.assign(fc).eval()
                new_fc_b.assign(fcb).eval()
                new_sm_v.assign(sm).eval()
                new_sm_b.assign(smb).eval()

                # Iterate over the eval samples batch-by-batch
                num_batches = int(
                    math.ceil(num_eval_examples / eval_batch_size))
                total_corr_nat = 0
                total_corr_adv = 0
                tot_unstable1n = 0
                tot_unstable2n = 0
                tot_unstable3n = 0

                for ibatch in range(num_batches):
                    bstart = ibatch * eval_batch_size
                    bend = min(bstart + eval_batch_size, num_eval_examples)

                    x_batch = mnist.test.images[bstart:bend, :]
                    y_batch = mnist.test.labels[bstart:bend]

                    dict_nat = {
                        mask_model.x_input: x_batch,
                        mask_model.x_input_natural: x_batch,
                        mask_model.y_input: y_batch
                    }

                    x_batch_adv = mask_model_attack.perturb(
                        x_batch, y_batch, sess)

                    dict_adv = {
                        mask_model.x_input: x_batch_adv,
                        mask_model.x_input_natural: x_batch,
                        mask_model.y_input: y_batch
                    }

                    cur_corr_nat = sess.run(mask_model.num_correct,
                                            feed_dict=dict_nat)
                    cur_corr_adv = sess.run(mask_model.num_correct,
                                            feed_dict=dict_adv)

                    total_corr_nat += cur_corr_nat
                    total_corr_adv += cur_corr_adv

                    un1n, un2n, un3n = \
                      sess.run([mask_model.unstable1, mask_model.unstable2, \
                                mask_model.unstable3],
                                feed_dict = dict_nat)

                    tot_unstable1n += np.sum(un1n)
                    tot_unstable2n += np.sum(un2n)
                    tot_unstable3n += np.sum(un3n)

                avg_un1n = tot_unstable1n / num_eval_examples
                avg_un2n = tot_unstable2n / num_eval_examples
                avg_un3n = tot_unstable3n / num_eval_examples
                acc_nat = total_corr_nat / num_eval_examples
                acc_adv = total_corr_adv / num_eval_examples

                print('natural: {:.2f}%'.format(100 * acc_nat))
                print('adversarial: {:.2f}%'.format(100 * acc_adv))
                print('  un1n, un2n, un3n: {}, {}, {}'.format(
                    avg_un1n, avg_un2n, avg_un3n))

        new_model_weights = {
            'c1_w': c1,
            'c1_b': c1b,
            'c2_w': c2,
            'c2_b': c2b,
            'fc_w': fc,
            'fc_b': fcb,
            'sm_w': sm,
            'sm_b': smb,
        }
        if relu_prune:
            new_model_weights['c1_m'] = c1_ops
            new_model_weights['c2_m'] = c2_ops
            new_model_weights['fc_m'] = fc_ops
    return new_model_weights
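
# A hedged usage sketch for evaluate_checkpoint above (not part of the
# original excerpt); the tolerance and pruning fraction are illustrative:
#
#   weights = evaluate_checkpoint(tf.train.latest_checkpoint(model_dir),
#                                 weight_prune=True, tolerance=1e-3,
#                                 relu_prune=True, relu_prune_frac=0.1)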
Example #13
            model_number = i + 1
            path = args.log_prefix + str(model_number) + ".log"
            print(path)
            log_file = open(path, 'w')
            log_loss = [0 for x in range(args.atta_max_step + 1)]
            total_nat_loss = 0
            total_adv_loss = 0
            for batch_start in range(s, s + 256, 64):
                x_batch = cifar.train_data.xs[batch_start:batch_start + 64]
                y_batch = cifar.train_data.ys[batch_start:batch_start + 64]

                saver.restore(sess, model_ckpt)

                x_batch_adv = attack.perturb(x_batch,
                                             y_batch,
                                             sess,
                                             log_loss,
                                             step=args.atta_max_step)

                # nat_dict = {model.x_input: x_batch,
                #             model.y_input: y_batch}
                # adv_dict = {model.x_input: x_batch_adv,
                #             model.y_input: y_batch}
                #
                # nat_loss = sess.run(model.mean_xent, feed_dict=nat_dict)
                # loss = sess.run(model.mean_xent, feed_dict=adv_dict)
                #
                # print("adv loss:     {}".format(loss))
                # print("nat-loss: {}".format(nat_loss))
                # print("per:      {}%".format(loss / nat_loss * 100))
            for ii in range(args.atta_max_step):
Example #14
                                                       global_step=global_step)
    # saver_pretrained = tf.train.Saver(var_list = [v for v in tf.trainable_variables() if v.name in ['Variable_8:0','Variable_9:0']])
    # saver_pretrained.restore(sess, './models/pretrained_robust_model/95000/checkpoint_0-95000')
    # saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    # saver.restore(sess,'/home/hope-yao/Documents/mnist_challenge_voting/denoiser')
    training_time = 0.0

    for ii in range(5000):
        x_batch, y_batch = mnist.train.next_batch(batch_size)
        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

        if 0:
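            # note: this adversarial branch is permanently disabled ("if 0"),
            # so this loop performs natural pre-training only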
            # Compute Adversarial Perturbations
            start = timer()
            x_batch_adv = attack.perturb(x_batch, y_batch, sess)
            end = timer()
            training_time += end - start
            adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

        sess.run(pre_train_step, feed_dict=nat_dict)

        if ii % 100 == 0:
            x_batch, y_batch = mnist.test.next_batch(batch_size)
            nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
            # Output to stdout
            nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
            #adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
            print('Step {}:    ({})'.format(ii, datetime.now()))
            print('    test nat accuracy {:.4}%'.format(nat_acc * 100))
            #print('    test adv accuracy {:.4}%'.format(adv_acc * 100))
Example #15
        y_batch = np.eye(num_classes)[y_batch]  # one-hot encoding

        # Compute Adversarial Perturbations
        start = timer()
        if config['AVmixup']:
            x_batch_adv, y_batch_adv = attack.perturb_avmixup(
                x_batch,
                y_batch,
                config['gamma'],
                config['lambda1'],
                config['lambda2'],
                sess,
                is_training=True)
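            # AVmixup (adversarial vertex mixup): interpolate inputs toward
            # the adversarial vertex and soften the labels accordingly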
        else:
            x_batch_adv = attack.perturb(x_batch,
                                         y_batch,
                                         sess,
                                         is_training=True)
            y_batch_adv = y_batch
        end = timer()
        training_time += end - start

        nat_dict = {
            model.x_input: x_batch,
            model.is_training: True,
            model.y_input: y_batch
        }

        adv_dict = {
            model.x_input: x_batch_adv,
            model.is_training: True,
            model.y_input: y_batch_adv
Example #16
class SpatialAttack:
  def __init__(self, model, config):
    self.model = model
    self.grid_store = []

    if config.use_linf:
        self.linf_attack = LinfPGDAttack(model, config)
    else:
        self.linf_attack = None

    self.use_spatial = config.use_spatial
    self.attack_method = config.attack_method
    if config.use_spatial:
        self.method = config.spatial_method
        self.limits = config.spatial_limits

        if self.method == 'grid':
            self.granularity = config.grid_granularity
        elif self.method == 'random':
            self.random_tries = config.random_tries
        elif self.method == 'max':
            self.random_tries = config.random_tries

  def perturb(self, x_nat, y, sess):
      if not self.use_spatial:
          t = np.zeros([len(x_nat), 3])
          if self.linf_attack:
              x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
          else:
              x = x_nat
          return x, t
      if self.method == 'grid':
          return self.perturb_grid(x_nat, y, sess, -1)
      else: # random
          return self.perturb_grid(x_nat, y, sess, self.random_tries)

  def perturb_grid(self, x_nat, y, sess, random_tries=-1):
    n = len(x_nat)
    if random_tries > 0:
        # subsampling this list from the grid is a bad idea; instead we
        # randomize each example over the full continuous range
        grid = [(42, 42, 42) for _ in range(random_tries)] # dummy list
    else: # exhaustive grid
        grid = product(*list(np.linspace(-l, l, num=g)
                             for l, g in zip(self.limits, self.granularity)))

    worst_x = np.copy(x_nat)
    worst_t = np.zeros([n, 3])
    max_xent = np.zeros(n)
    all_correct = np.ones(n).astype(bool)

    for tx, ty, r in grid:
        if random_tries > 0:
            if self.method == 'max':
                # In config, specify limits as [0 0 90] for 0 translation
                # but 90 rotation (either 0 or 90 is selected, nothing in between)
                t = np.stack([np.random.randint(0, 1 + 1, n) * l
                              for l in self.limits], axis=1)
            else:
                # Allows to set spatial limits in different ways like:
                # limits = [3,3,30] - original [low, high) for each element
                # limits = [[-3,3],[0,3],[20,30]] - within range
                # limits = [3,[3],[20,30]] - mix, if list_len == 1 do original
                temp = []
                for l in self.limits:
                    if isinstance(l, list):
                        if len(l) == 2:
                            temp.append(np.random.uniform(l[0], l[1], n))
                        elif len(l) == 1:
                            temp.append(np.random.uniform(-l[0], l[0], n))
                        else:
                            raise ValueError
                    else:
                        temp.append(np.random.uniform(-l, l, n))

                t = np.stack(temp, axis=1)
        else:
            t = np.stack(list(repeat([tx, ty, r], n)))

        if self.linf_attack:
            x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
        else:
            if self.attack_method == 'invert':
                # IPython.embed()
                x = v_invert_image(x_nat)
            elif self.attack_method == 'edge':
                x = canny_image(x_nat)
            else:
                x = x_nat

        curr_dict = {self.model.x_input: x,
                     self.model.y_input: y,
                     self.model.is_training: False,
                     self.model.transform: t}

        cur_xent, cur_correct = sess.run([self.model.y_xent,
                                          self.model.correct_prediction],
                                         feed_dict=curr_dict)  # shape (bsize,)
        cur_xent = np.asarray(cur_xent)
        cur_correct = np.asarray(cur_correct)

        # Select indices to update: we choose the misclassified transformation
        # of maximum xent (or just the highest xent if everything is still
        # correct).
        idx = (cur_xent > max_xent) & (cur_correct == all_correct)
        idx = idx | (cur_correct < all_correct)
        max_xent = np.maximum(cur_xent, max_xent)
        all_correct = cur_correct & all_correct

        idx = np.expand_dims(idx, axis=-1) # shape (bsize, 1)
        worst_t = np.where(idx, t, worst_t) # shape (bsize, 3)

        idx = np.expand_dims(idx, axis=-1)
        idx = np.expand_dims(idx, axis=-1)  # shape (bsize, 1, 1, 1)
        worst_x = np.where(idx, x, worst_x)  # shape (bsize, 32, 32, 3)

    return worst_x, worst_t
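
For orientation, here is a minimal driver for a SpatialAttack like the one above. The config object and the model/sess/x_batch/y_batch names are hypothetical stand-ins introduced only for illustration; the fields mirror exactly the attributes that __init__ reads.

from types import SimpleNamespace

# Hypothetical config exposing the attributes SpatialAttack reads.
config = SimpleNamespace(
    use_linf=False,             # skip the extra L-inf PGD step
    use_spatial=True,
    attack_method=None,         # no 'invert'/'edge' preprocessing
    spatial_method='random',    # 'grid' enumerates; 'random' samples worst-of-k
    spatial_limits=[3, 3, 30],  # max |dx|, |dy| in pixels, |rotation| in degrees
    random_tries=10,
)

attack = SpatialAttack(model, config)  # model, sess, batches assumed to exist
# returns the worst transformed images and the chosen (dx, dy, angle) per example
x_adv, t_adv = attack.perturb(x_batch, y_batch, sess)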
Example #17
        summary_writer_eval = tf.summary.FileWriter(eval_dir)
    sess.run(tf.global_variables_initializer())
    # checkpoint = tf.train.latest_checkpoint(model_dir)
    # saver.restore(sess, checkpoint)

    training_time = 0.0

    # Main training loop
    for ii in range(max_num_training_steps + 1):
        x_batch, y_batch = training_data.get_next_batch(batch_size,
                                                        multiple_passes=True)

        # Compute Adversarial Perturbations
        start = timer()
        if adv_training:
            x_batch_adv = attack.perturb(x_batch, y_batch, sess,
                                         ii / max_num_training_steps)
        else:
            x_batch_adv = x_batch
        end = timer()
        training_time += end - start

        nat_dict = {
            model.x_input: x_batch,
            model.x_input_natural: x_batch,
            model.y_input: y_batch
        }

        adv_dict = {
            model.x_input: x_batch_adv,
            model.x_input_natural: x_batch,
            model.y_input: y_batch
        }
Example #18
def main(cfg):
    img_size = cfg['img_size']
    batch_size = cfg['batch_size']
    num_glimpse = cfg['num_glimpse']
    glimpse_size = cfg['glimpse_size']
    lr = cfg['lr']
    input_images = tf.placeholder(tf.float32,
                                  shape=(batch_size, img_size, img_size, 1))
    input_label = tf.placeholder(tf.int64, shape=(batch_size,))

    # build classifier
    #model = Model_att(input_images, input_label, glimpse_size, num_glimpse)
    # model = Model_madry(input_images, input_label)
    model = Model_crop(input_images, input_label)

    # setup attacker
    attack = LinfPGDAttack(model,
                           epsilon=0.3,
                           k=40,
                           a=0.01,
                           random_start=True,
                           loss_func='xent')

    ## OPTIMIZER ##
    learning_rate = tf.Variable(lr)  # learning rate for optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.5)
    grads = optimizer.compute_gradients(model.xent)
    train_op = optimizer.apply_gradients(grads)
    saver = tf.train.Saver()
    ## training starts ###
    FLAGS = tf.app.flags.FLAGS
    tfconfig = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True,
    )
    tfconfig.gpu_options.allow_growth = True
    sess = tf.Session(config=tfconfig)
    init = tf.global_variables_initializer()
    sess.run(init)
    mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
    hist = {
        'train_acc': [],
        'train_adv_acc': [],
        'test_acc': [],
        'test_adv_acc': [],
        'train_loss': [],
        'test_loss': [],
        'train_adv_loss': [],
        'test_adv_loss': []
    }
    train_iters = 500000
    for itr in tqdm(range(train_iters)):
        x_batch_train, y_batch_train = mnist.train.next_batch(batch_size)
        if True:  # adversarial training branch
            x_batch_train_adv = attack.perturb(
                x_batch_train.reshape(batch_size, img_size, img_size, 1),
                y_batch_train, sess)
            adv_dict_train = {
                input_images:
                x_batch_train_adv.reshape(batch_size, img_size, img_size, 1),
                input_label:
                y_batch_train
            }
            nat_dict_train = {
                input_images:
                x_batch_train.reshape(batch_size, img_size, img_size, 1),
                input_label:
                y_batch_train
            }
            sess.run(train_op, feed_dict=adv_dict_train)
        else:  # nat train
            nat_dict_train = {
                input_images:
                x_batch_train.reshape(batch_size, img_size, img_size, 1),
                input_label:
                y_batch_train
            }
            sess.run(train_op, feed_dict=nat_dict_train)

        if itr % 100 == 0:
            y_pred, train_loss_i = sess.run([model.y_pred, model.xent],
                                            feed_dict=nat_dict_train)
            counts = np.asarray([
                np.argmax(np.bincount(y_pred[:, i])) for i in range(batch_size)
            ])
            train_acc_i = np.mean(counts == nat_dict_train[input_label])
            x_batch_test, y_batch_test = mnist.test.next_batch(batch_size)
            nat_dict_test = {
                input_images:
                x_batch_test.reshape(batch_size, img_size, img_size, 1),
                input_label:
                y_batch_test
            }
            y_pred, test_loss_i = sess.run([model.y_pred, model.xent],
                                           feed_dict=nat_dict_test)
            counts = np.asarray([
                np.argmax(np.bincount(y_pred[:, i])) for i in range(batch_size)
            ])
            test_acc_i = np.mean(counts == nat_dict_test[input_label])
            print(
                "iter: {}, train_acc:{}  test_acc:{} train_loss:{}  test_loss:{} "
                .format(itr, train_acc_i, test_acc_i, train_loss_i,
                        test_loss_i))

            x_batch_train_adv = attack.perturb(
                x_batch_train.reshape(batch_size, img_size, img_size, 1),
                y_batch_train, sess)
            adv_dict_train = {
                input_images:
                x_batch_train_adv.reshape(batch_size, img_size, img_size, 1),
                input_label:
                y_batch_train
            }
            y_pred, train_adv_loss_i = sess.run([model.y_pred, model.xent],
                                                feed_dict=adv_dict_train)
            counts = np.asarray([
                np.argmax(np.bincount(y_pred[:, i])) for i in range(batch_size)
            ])
            train_adv_acc_i = np.mean(counts == adv_dict_train[input_label])
            x_batch_test_adv = attack.perturb(
                x_batch_test.reshape(batch_size, img_size, img_size, 1),
                y_batch_test, sess)
            adv_dict_test = {
                input_images:
                x_batch_test_adv.reshape(batch_size, img_size, img_size, 1),
                input_label:
                y_batch_test
            }
            y_pred, test_adv_loss_i = sess.run([model.y_pred, model.xent],
                                               feed_dict=adv_dict_test)
            counts = np.asarray([
                np.argmax(np.bincount(y_pred[:, i])) for i in range(batch_size)
            ])
            test_adv_acc_i = np.mean(counts == adv_dict_test[input_label])
            print(
                "iter: {}, train_adv_acc:{}  test_adv_acc:{} train_adv_loss:{}  test_adv_loss:{} "
                .format(itr, train_adv_acc_i, test_adv_acc_i, train_adv_loss_i,
                        test_adv_loss_i))
            hist['train_acc'] += [train_acc_i]
            hist['train_adv_acc'] += [train_adv_acc_i]
            hist['test_acc'] += [test_acc_i]
            hist['test_adv_acc'] += [test_adv_acc_i]
            hist['train_loss'] += [train_loss_i]
            hist['test_loss'] += [test_loss_i]
            hist['train_adv_loss'] += [train_adv_loss_i]
            hist['test_adv_loss'] += [test_adv_loss_i]
            np.save('hist', hist)
            saver.save(sess, 'crop_ckpt')
    print('done')
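
Nearly every snippet on this page builds a Madry-style LinfPGDAttack(model, epsilon, k, a, random_start, loss_func) and calls its perturb method. Purely as a reference point, the following is a minimal sketch of what such a perturb step conventionally does; it assumes a model exposing x_input, y_input, and a scalar cross-entropy loss model.xent, and it is not the exact class these examples import.

import numpy as np
import tensorflow as tf

class LinfPGDAttackSketch:
    def __init__(self, model, epsilon, k, a, random_start):
        self.model = model
        self.epsilon = epsilon  # L-inf radius (0.3 above)
        self.k = k              # number of PGD steps (40 above)
        self.a = a              # step size per iteration (0.01 above)
        self.random_start = random_start
        # gradient of the cross-entropy loss w.r.t. the input batch
        self.grad = tf.gradients(model.xent, model.x_input)[0]

    def perturb(self, x_nat, y, sess):
        if self.random_start:
            x = x_nat + np.random.uniform(-self.epsilon, self.epsilon,
                                          x_nat.shape)
        else:
            x = np.copy(x_nat)
        for _ in range(self.k):
            g = sess.run(self.grad, feed_dict={self.model.x_input: x,
                                               self.model.y_input: y})
            x = x + self.a * np.sign(g)  # one signed-gradient ascent step
            # project back into the epsilon-ball and the valid pixel range
            x = np.clip(x, x_nat - self.epsilon, x_nat + self.epsilon)
            x = np.clip(x, 0., 1.)
        return x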
Example #19
        saver.restore(sess, model_path)

        total_nat_corr = 0
        total_adv_corr = 0
        nat_acc = 0
        adv_acc = 0
        # print(cifar.eval_data.xs.shape)
        for batch_start in range(0, data_size, batch_size):
            # print(batch_start)
            batch_end = min(batch_start + batch_size, data_size)
            # size = batch_end - batch_start
            # print(size)
            x_batch = cifar.eval_data.xs[batch_start:batch_end]
            y_batch = cifar.eval_data.ys[batch_start:batch_end]
            # x_batch, y_batch = cifar.eval_data.get_next_batch(batch_size, multiple_passes=True)
            x_batch_adv = attack.perturb(x_batch, y_batch, sess, step=100)

            batch_s = x_batch.shape[0]
            # print(batch_s)

            nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
            adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

            nat_corr = sess.run(model.num_correct, feed_dict=nat_dict)
            adv_corr = sess.run(model.num_correct, feed_dict=adv_dict)
            print("batch nat corr: {}, adv corr: {}".format(
                nat_corr, adv_corr))
            total_nat_corr += nat_corr
            total_adv_corr += adv_corr

        nat_acc = total_nat_corr / data_size
Example #20
	os.makedirs(model_dir)

shutil.copy('config.json', model_dir)
training_time = 0.0
for epoch in range(max_num_training_steps):
	print("Epoch: {}".format(epoch))
	running_loss = 0.0 
	for data in tqdm(trainloader):

		inputs, labels = data 
		inputs, labels = Variable(inputs), Variable(labels)
		optimizer.zero_grad() 
	
		# Compute Adversarial Perturbations
		start = timer()
		x_adv = attack.perturb(inputs.data.numpy(), labels.data.numpy())
		x_adv_v = Variable(torch.FloatTensor(x_adv))
		end = timer()
		training_time += end - start

		outputs = net(x_adv_v)
		loss = criterion(outputs, labels)
		loss.backward()
		optimizer.step()

		running_loss += loss.item()  # loss.data[0] fails on 0-dim tensors in modern PyTorch

		natural_outputs = natural_net(inputs)
		loss = criterion(natural_outputs, labels)
		loss.backward()
		natural_optimizer.step()
Example #21
shutil.copy('config.json', model_dir)

with tf.Session() as sess:
    # Initialize the summary writer, global variables, and our time counter.
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    training_time = 0.0

    # Main training loop
    for ii in range(max_num_training_steps):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        # Compute Adversarial Perturbations
        start = timer()
        x_batch_adv = attack.perturb(x_batch, y_batch, sess)
        sp_x_batch_adv = sp_attack.perturb(x_batch, y_batch, sess)

        convention_adv_test = attack.perturb(x_batch, y_batch, sess, False)
        spatial_adv_test = sp_attack.perturb(x_batch, y_batch, sess, False)

        end = timer()
        training_time += end - start

        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

        sp_nat_dict = {sp_model.x_input: x_batch, sp_model.y_input: y_batch}

    sp_adv_dict = {sp_model.x_input: sp_x_batch_adv, sp_model.y_input: y_batch}
Example #22
class SpatialAttack:
    def __init__(self, model, config, method=None, worstofk=None,
                 attack_limits=None, fo_epsilon=2.0, fo_step_size=2.,
                 fo_num_steps=5):
        self.model = model
        self.grid_store = []

        if config.use_linf:
            self.linf_attack = LinfPGDAttack(
                model, config, fo_epsilon, fo_step_size, fo_num_steps)
        else:
            self.linf_attack = None

        self.use_spatial = config.use_spatial
        if config.use_spatial:
            # Attack method
            if method is None:
                self.method = config.spatial_method
            else:
                self.method = method

            # Attack parameters
            if attack_limits is None:
                self.limits = config.spatial_limits
            else:
                self.limits = attack_limits

            if config.only_rotation:
                self.limits = [0, 0, self.limits[2]]

            if config.only_translation:
                self.limits = [self.limits[0], self.limits[1], 0]

            # Attack method parameters
            if self.method == 'grid':
                self.granularity = config.grid_granularity
            elif self.method == 'random':
                if worstofk is None:
                    self.random_tries = config.random_tries
                else:
                    self.random_tries = worstofk
            elif self.method == 'fo':
                self.fo_attack = SpatialPGDAttack(
                    model, config, fo_epsilon, fo_step_size, fo_num_steps)
            else:
                raise NotImplementedError

    def perturb(self, x_nat, y, max_func, sess):
        if not self.use_spatial:
            t = np.zeros([len(x_nat), 3])
            if self.linf_attack:
                x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
            else:
                x = x_nat
            return x, t
        if self.method == 'grid':
            return self.perturb_grid(x_nat, y, max_func, sess, -1)
        elif self.method == 'fo':
            return self.fo_attack.perturb(x_nat, y, sess)
        else: # random
            return self.perturb_grid(x_nat, y, max_func, sess, self.random_tries)

    def perturb_grid(self, x_nat, y, max_func, sess, random_tries=-1):
        n = len(x_nat)
        if random_tries > 0:
            # subsampling this list from the grid is a bad idea, instead we
            # will randomize each example from the full continuous range
            grid = [(42, 42, 42) for _ in range(random_tries)] # dummy list
        else: # exhaustive grid
            grid = product(*list(np.linspace(-l, l, num=g)
                           for l, g in zip(self.limits, self.granularity)))

        worst_x = np.copy(x_nat)
        worst_t = np.zeros([n, 3])
        k = 0

        if self.linf_attack:
            raise NotImplementedError
        else:
            x = x_nat

        no_op = np.zeros([n, 3])
        # The regularizer-based criteria need the natural pre-softmax logits
        # f(x); note that f(x) is the model's output, not the true label y.
        if max_func != "cce":
            nat_dict = {self.model.x_input: x,
                        self.model.y_input: y,
                        self.model.is_training: False,
                        self.model.transform: no_op}
            f_x_nat_presoftmax = sess.run(self.model.pre_softmax,
                                          feed_dict=nat_dict)

        for tx, ty, r in grid:
            if random_tries > 0:
                # randomize each example separately
                t = np.stack([np.random.uniform(-l, l, n)
                              for l in self.limits], axis=1)
            else:
                t = np.stack(list(repeat([tx, ty, r], n)))

            adv_dict = {self.model.x_input: x,
                        self.model.y_input: y,
                        self.model.is_training: False,
                        self.model.transform: t}

            if max_func == "cce":  # w.r.t. the cce, not regularizer
                adv_loss = sess.run(self.model.y_xent,
                                    feed_dict=adv_dict)  # shape (bsize,)

            elif max_func == "l2":  # w.r.t. the regularizer
                f_x_adv_presoftmax = sess.run(self.model.pre_softmax,
                                              feed_dict=adv_dict)  # shape (bsize,)
                adv_loss = self.l2_reg_loss(f_x_nat_presoftmax, f_x_adv_presoftmax)

            elif max_func == "kl":
                f_x_adv_presoftmax = sess.run(self.model.pre_softmax,
                                              feed_dict=adv_dict)  # shape (bsize,)
                adv_loss = self.kl_reg_loss(f_x_nat_presoftmax, f_x_adv_presoftmax)

            else:
                raise NotImplementedError

            adv_loss = np.asarray(adv_loss)
            # update the examples whose loss exceeds the previous maximum
            if k == 0:
                # in the first iteration, update all examples
                idx = np.ones(n, dtype=bool)
                max_adv_loss = adv_loss
            else:
                idx = adv_loss > max_adv_loss
                max_adv_loss = np.maximum(adv_loss, max_adv_loss)
            idx = np.expand_dims(idx, axis=-1)  # shape (bsize, 1)
            worst_t = np.where(idx, t, worst_t)  # shape (bsize, 3)

            idx = np.expand_dims(idx, axis=-1)
            idx = np.expand_dims(idx, axis=-1)  # shape (bsize, 1, 1, 1)
            worst_x = np.where(idx, x, worst_x)  # shape (bsize, 32, 32, 3)
            k += 1
        return worst_x, worst_t

    def l2_reg_loss(self, dist_a, dist_b):
        assert dist_a.shape == dist_b.shape
        return np.sum(np.square(dist_a - dist_b), axis=1)

    # pass the presoftmax in
    def kl_reg_loss(self, dist_nat, dist_adv):
        assert dist_nat.shape == dist_adv.shape
        #  compute KL-div of f(x) and f(x')
        epsilon = 1e-08  # small scalar for numerical stability

        prob_adv = scipy.special.softmax(dist_adv, axis=1) + epsilon
        prob_nat = scipy.special.softmax(dist_nat, axis=1) + epsilon

        # scipy.stats.entropy computes the KL divergence when given two
        # distributions (despite its name)
        return scipy.stats.entropy(np.transpose(prob_nat), np.transpose(prob_adv))
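
In formulas, the two regularizers at the end of this example are, per example with logits f(x) and f(x'):

    \mathrm{L2}(x, x') = \sum_{c} \big(f(x)_c - f(x')_c\big)^2,
    \qquad
    \mathrm{KL}(p \,\|\, q) = \sum_{c} p_c \log \frac{p_c}{q_c},
    \quad p = \mathrm{softmax}(f(x)) + \varepsilon,\;
          q = \mathrm{softmax}(f(x')) + \varepsilon,

with \varepsilon = 10^{-8} added for numerical stability. scipy.stats.entropy(pk, qk) evaluates this KL sum over axis 0 (after renormalizing both arguments), which is why both probability matrices are transposed to put the class axis first.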