def get_Mahalanobis_score_adv(test_data, gaussian_score, grads, magnitude, scale):
    grad_file = os.path.join(characteristics_dir, 'gradients_{}.npy'.format(set))
    # if os.path.exists(grad_file):
    #     print('loading gradients from {}'.format(grad_file))
    #     gradients = np.load(grad_file)
    # else:
    gradients = batch_eval(sess, [x], grads, [test_data], FLAGS.batch_size)[0]
    # print('Saving gradients to {}'.format(grad_file))
    # np.save(grad_file, gradients)

    # Binarize the gradient direction and map it to {-1, +1}, mirroring the
    # preprocessing in the official deep_Mahalanobis_detector (this assumes the
    # `grads` tensors already yield sign values, so clip(min=0) leaves 0/1 entries).
    gradients = gradients.clip(min=0)
    gradients = (gradients - 0.5) * 2

    # Per-channel scaling constants taken from the official deep_Mahalanobis_detector repo:
    # https://github.com/pokaxpoka/deep_Mahalanobis_detector
    # scale=1 by default, i.e. their original values are used unchanged.
    RED_SCALE   = 0.2023 * scale
    GREEN_SCALE = 0.1994 * scale
    BLUE_SCALE  = 0.2010 * scale

    gradients_scaled = np.zeros_like(gradients)
    gradients_scaled[:, :, :, 0] = gradients[:, :, :, 0] / RED_SCALE
    gradients_scaled[:, :, :, 1] = gradients[:, :, :, 1] / GREEN_SCALE
    gradients_scaled[:, :, :, 2] = gradients[:, :, :, 2] / BLUE_SCALE

    tempInputs = test_data - magnitude * gradients_scaled
    noise_gaussian_score = batch_eval(sess, [x], [gaussian_score], [tempInputs], FLAGS.batch_size)[0]

    Mahalanobis = np.max(noise_gaussian_score, axis=1)

    return Mahalanobis
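For context, `gaussian_score` above is assumed to be the per-class Gaussian (Mahalanobis) confidence tensor of the official detector, and the returned score is its per-sample maximum over classes. A minimal NumPy sketch of that score, assuming class means and a shared precision matrix estimated elsewhere (all names below are illustrative):

import numpy as np

def mahalanobis_confidence(features, class_means, precision):
    """Hypothetical NumPy analogue of the `gaussian_score` tensor:
    score[n, c] = -(f_n - mu_c)^T P (f_n - mu_c)."""
    scores = []
    for mu in class_means:                       # one mean vector per class
        d = features - mu                        # (N, D)
        scores.append(-np.einsum('nd,de,ne->n', d, precision, d))
    return np.stack(scores, axis=1)              # (N, num_classes)

# toy usage
rng = np.random.RandomState(0)
feats = rng.randn(8, 4).astype(np.float32)
means = rng.randn(3, 4).astype(np.float32)
prec = np.eye(4, dtype=np.float32)
print(mahalanobis_confidence(feats, means, prec).shape)  # (8, 3)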
Example #2
def fast_gradient_sign_method(sess,
                              model,
                              X,
                              Y,
                              eps,
                              clip_min=None,
                              clip_max=None,
                              batch_size=256):
    """
    TODO
    :param sess:
    :param model: predictions or after-softmax
    :param X:
    :param Y:
    :param eps:
    :param clip_min:
    :param clip_max:
    :param batch_size:
    :return:
    """
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None, ) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None, ) + Y.shape[1:])
    adv_x = fgsm(x,
                 model(x),
                 eps=eps,
                 clip_min=clip_min,
                 clip_max=clip_max,
                 y=y)
    X_adv, = batch_eval(sess, [x, y], [adv_x], [X, Y],
                        feed={},
                        args={'batch_size': batch_size})
    return X_adv
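The core of `fgsm` is the single-step update x_adv = clip(x + eps * sign(grad of the loss w.r.t. x)). A minimal self-contained NumPy sketch of that step on a toy linear softmax classifier (all names below are illustrative, not part of the code above):

import numpy as np

def fgsm_step(x, grad, eps, clip_min=0.0, clip_max=1.0):
    """Single FGSM step: move each input by eps along the sign of the loss gradient."""
    x_adv = x + eps * np.sign(grad)
    return np.clip(x_adv, clip_min, clip_max)

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

# toy linear model: logits = x @ W.T, cross-entropy gradient w.r.t. x is (p - y) @ W
rng = np.random.RandomState(0)
W = rng.randn(10, 784) * 0.01
x = rng.rand(5, 784)                    # 5 "images" in [0, 1]
y = np.eye(10)[rng.randint(0, 10, 5)]   # one-hot labels
p = softmax(x @ W.T)
grad = (p - y) @ W
x_adv = fgsm_step(x, grad, eps=0.1)
print(np.abs(x_adv - x).max())          # at most 0.1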
Example #3
def batch_eval(*args, **kwargs):
    # Inside function to avoid circular import
    from cleverhans.evaluation import batch_eval
    warnings.warn("batch_eval has moved to cleverhans.evaluation. "
                  "batch_eval will be removed from utils_tf on or after "
                  "2019-03-09.")
    return batch_eval(*args, **kwargs)
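For reference, a batch_eval-style helper simply slices the NumPy inputs into batches, evaluates the requested outputs on each batch, and concatenates the results. A minimal framework-agnostic sketch, where `run_fn` stands in for `sess.run` with a feed dict (this is an illustration, not the cleverhans implementation):

import numpy as np

def simple_batch_eval(run_fn, numpy_inputs, batch_size):
    """Evaluate run_fn over numpy_inputs in batches and concatenate the outputs.
    run_fn takes a list of batched arrays and returns a list of output arrays."""
    n = len(numpy_inputs[0])
    outs = None
    for start in range(0, n, batch_size):
        batch = [arr[start:start + batch_size] for arr in numpy_inputs]
        batch_out = run_fn(batch)
        if outs is None:
            outs = [[o] for o in batch_out]
        else:
            for acc, o in zip(outs, batch_out):
                acc.append(o)
    return [np.concatenate(acc, axis=0) for acc in outs]

# toy usage: a "graph" that returns the shifted and squared versions of its single input
X = np.arange(10, dtype=np.float32).reshape(10, 1)
sums, squares = simple_batch_eval(lambda b: [b[0] + 1.0, b[0] ** 2], [X], batch_size=4)
print(sums.shape, squares.shape)  # (10, 1) (10, 1)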
Example #4
def basic_iterative_method(sess, model, X, Y, eps, eps_iter, nb_iter=50,
                           clip_min=None, clip_max=None, batch_size=256):
    """
    TODO
    :param sess:
    :param model: predictions or after-softmax
    :param X:
    :param Y:
    :param eps:
    :param eps_iter:
    :param nb_iter:
    :param clip_min:
    :param clip_max:
    :param batch_size:
    :return:
    """
    print("nb_iter",nb_iter)
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None,)+X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None,)+Y.shape[1:])
    # results will hold the adversarial inputs at each iteration of BIM;
    # thus it will have shape (nb_iter, n_samples, n_rows, n_cols, n_channels)
    results = np.zeros((nb_iter, X.shape[0],) + X.shape[1:])
    # Initialize adversarial samples as the original samples, set upper and
    # lower bounds
    X_adv = X
    X_min = X_adv - eps
    X_max = X_adv + eps
    print('Running BIM iterations...')
    # "its" is a dictionary that keeps track of the iteration at which each
    # sample becomes misclassified. The default value will be (nb_iter-1), the
    # very last iteration.
    def f(val):
        return lambda: val
    its = defaultdict(f(nb_iter-1))
    # Out keeps track of which samples have already been misclassified
    out = set()
    for i in tqdm(range(nb_iter)):
        adv_x = fgsm(
            x, model(x), eps=eps_iter,
            clip_min=clip_min, clip_max=clip_max, y=y
        )
        X_adv, = batch_eval(
            sess, [x, y], [adv_x],
            [X_adv, Y], feed={K.learning_phase(): 0},
            args={'batch_size': batch_size}
        )
        X_adv = np.maximum(np.minimum(X_adv, X_max), X_min)
        results[i] = X_adv
        # check misclassifieds
        predictions = model.predict_classes(X_adv, batch_size=512, verbose=0)
        misclassifieds = np.where(predictions != Y.argmax(axis=1))[0]
        for elt in misclassifieds:
            if elt not in out:
                its[elt] = i
                out.add(elt)

    return its, results
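Each BIM iteration above is an FGSM step of size eps_iter followed by a projection back into the eps-ball around the original inputs (the np.maximum/np.minimum line). A minimal sketch of that projection step, with illustrative names:

import numpy as np

def project_linf(x_adv, x_orig, eps, clip_min=0.0, clip_max=1.0):
    """Project x_adv into the L-infinity ball of radius eps around x_orig,
    then into the valid input range."""
    x_adv = np.clip(x_adv, x_orig - eps, x_orig + eps)
    return np.clip(x_adv, clip_min, clip_max)

# toy usage
x0 = np.full((2, 3), 0.5)
x_after_step = x0 + np.array([[0.2, -0.3, 0.05]] * 2)
print(project_linf(x_after_step, x0, eps=0.1))  # each entry within 0.1 of 0.5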
Example #5
def get_activations(self, data, batch_size=10):
    data_activations = {}
    for layer in self.layers:
        data_activations[layer] = batch_eval(
            sess=self.sess,
            tf_inputs=[self.x_ph],
            tf_outputs=[self.layer_sym_ph[layer]],
            numpy_inputs=[data],
            batch_size=batch_size)[0]
    return data_activations
def adaptive_fast_gradient_sign_method(sess,
                                       model,
                                       X,
                                       Y,
                                       eps,
                                       clip_min=None,
                                       clip_max=None,
                                       batch_size=256,
                                       log_dir=None,
                                       model_logits=None,
                                       binary_steps=12,
                                       dataset="cifar"):
    """
    TODO
    :param sess:
    :param model: predictions or after-softmax
    :param X:
    :param Y:
    :param eps:
    :param clip_min:
    :param clip_max:
    :param batch_size:
    :return:
    """
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None, ) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None, ) + Y.shape[1:])
    alpha = tf.placeholder(tf.float32, shape=(None, ) + (1, ))
    num_samples = np.shape(X)[0]
    ALPHA = 0.1 * np.ones((num_samples, 1))
    ub = 10.0 * np.ones(num_samples)
    lb = 0.0 * np.ones(num_samples)
    Best_X_adv = None
    for i in range(binary_steps):
        print(i)
        adv_x = adaptive_fgsm(x,
                              model(x),
                              eps=eps,
                              clip_min=clip_min,
                              clip_max=clip_max,
                              y=y,
                              log_dir=log_dir,
                              model_logits=model_logits,
                              alpha=alpha)
        X_adv, = batch_eval(sess, [x, y, alpha], [adv_x], [X, Y, ALPHA],
                            feed={K.learning_phase(): 0},
                            args={'batch_size': batch_size})

        if (i == 0):
            Best_X_adv = X_adv

        ALPHA, Best_X_adv = binary_refinement(sess, Best_X_adv, X_adv, Y,
                                              ALPHA, ub, lb, model, dataset)

    return Best_X_adv
    def estimate(i_batch):
        start = i_batch * batch_size
        end = np.minimum(len(X_test), (i_batch + 1) * batch_size)
        n_feed = end - start
        lid_batch       = np.zeros(shape=(n_feed, lid_dim))
        lid_batch_adv   = np.zeros(shape=(n_feed, lid_dim))
        lid_batch_noisy = np.zeros(shape=(n_feed, lid_dim))

        X_act       = batch_eval(sess, [x], model.net.values(), [X_test[start:end]]      , batch_size)
        X_adv_act   = batch_eval(sess, [x], model.net.values(), [X_test_adv[start:end]]  , batch_size)
        X_noisy_act = batch_eval(sess, [x], model.net.values(), [X_test_noisy[start:end]], batch_size)

        for i in range(len(model.net)):
            X_act[i]       = np.asarray(X_act[i]      , dtype=np.float32).reshape((n_feed, -1))
            X_adv_act[i]   = np.asarray(X_adv_act[i]  , dtype=np.float32).reshape((n_feed, -1))
            X_noisy_act[i] = np.asarray(X_noisy_act[i], dtype=np.float32).reshape((n_feed, -1))

            # random clean samples
            # Maximum likelihood estimation of local intrinsic dimensionality (LID)
            lid_batch[:, i]       = mle_batch(X_act[i], X_act[i]      , k=k)
            lid_batch_adv[:, i]   = mle_batch(X_act[i], X_adv_act[i]  , k=k)
            lid_batch_noisy[:, i] = mle_batch(X_act[i], X_noisy_act[i], k=k)

        return lid_batch, lid_batch_noisy, lid_batch_adv
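The mle_batch calls above compute the maximum-likelihood estimate of local intrinsic dimensionality (LID) from k-nearest-neighbor distances, LID = -k / sum_i log(r_i / r_k). A minimal NumPy/SciPy sketch of such an estimator (illustrative, not necessarily identical to the project's mle_batch):

import numpy as np
from scipy.spatial.distance import cdist

def mle_batch_sketch(data, batch, k):
    """MLE estimate of the local intrinsic dimensionality of each row of `batch`
    with respect to its k nearest neighbors in `data`."""
    k = min(k, len(data) - 1)
    dists = np.sort(cdist(batch, data), axis=1)[:, 1:k + 1]  # drop the 0/self distance
    return -k / np.sum(np.log(dists / dists[:, -1:]), axis=1)

# toy usage
rng = np.random.RandomState(0)
ref = rng.randn(100, 8)
print(mle_batch_sketch(ref, ref[:5], k=20).shape)  # (5,)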
def get_knn_layers(X, y):
    knn = {}

    train_features = batch_eval(sess, [x], model.net.values(), [X], FLAGS.batch_size)
    print('Fitting knn models on all layers: {}'.format(model.net.keys()))
    for layer_index, layer in enumerate(model.net.keys()):
        if len(train_features[layer_index].shape) == 4:
            train_features[layer_index] = np.asarray(train_features[layer_index], dtype=np.float32).reshape((X.shape[0], -1, train_features[layer_index].shape[-1]))
            train_features[layer_index] = np.mean(train_features[layer_index], axis=1)
        elif len(train_features[layer_index].shape) == 2:
            pass  # leave as is
        else:
            raise AssertionError('Expecting size of 2 or 4 but got {} for {}'.format(len(train_features[layer_index].shape), layer))

        knn[layer] = NearestNeighbors(n_neighbors=X.shape[0], p=2, n_jobs=20, algorithm='brute')
        knn[layer].fit(train_features[layer_index], y)

    del train_features
    return knn
def calc_all_ranks_and_dists(X, subset, knn):
    num_output = len(model.net.keys())
    n_neighbors = knn[next(iter(knn))].n_neighbors  # dict.keys() is not indexable in Python 3
    all_neighbor_ranks = -1 * np.ones((len(X), num_output, n_neighbors), dtype=np.int32)
    all_neighbor_dists = -1 * np.ones((len(X), num_output, n_neighbors), dtype=np.float32)

    features = batch_eval(sess, [x], model.net.values(), [X], FLAGS.batch_size)
    for layer_index, layer in enumerate(model.net.keys()):
        print('Calculating ranks and distances for subset {} for layer {}'.format(subset, layer))
        if len(features[layer_index].shape) == 4:
            features[layer_index] = np.asarray(features[layer_index], dtype=np.float32).reshape((X.shape[0], -1, features[layer_index].shape[-1]))
            features[layer_index] = np.mean(features[layer_index], axis=1)
        elif len(features[layer_index].shape) == 2:
            pass  # leave as is
        else:
            raise AssertionError('Expecting size of 2 or 4 but got {} for {}'.format(len(features[layer_index].shape), layer))

        all_neighbor_dists[:, layer_index], all_neighbor_ranks[:, layer_index] = \
            knn[layer].kneighbors(features[layer_index], return_distance=True)

    del features
    return all_neighbor_ranks, all_neighbor_dists
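Note that kneighbors returns distances first and neighbor indices second, which is the order the unpacking above relies on. A quick self-contained scikit-learn check:

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
train = rng.randn(50, 16).astype(np.float32)
query = rng.randn(4, 16).astype(np.float32)

nn = NearestNeighbors(n_neighbors=50, p=2, algorithm='brute').fit(train)
dists, ranks = nn.kneighbors(query, return_distance=True)  # distances first, indices second
print(dists.shape, ranks.shape)  # (4, 50) (4, 50)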
Example #11
checkpoint_path = os.path.join(model_dir, 'best_model.ckpt')
saver.restore(sess, checkpoint_path)

# predict labels from trainset
if USE_TRAIN_MINI:
    train_preds_file    = os.path.join(model_dir, 'x_train_mini_preds.npy')
    train_features_file = os.path.join(model_dir, 'x_train_mini_features.npy')
else:
    train_preds_file    = os.path.join(model_dir, 'x_train_preds.npy')
    train_features_file = os.path.join(model_dir, 'x_train_features.npy')
if not os.path.isfile(train_preds_file):
    tf_inputs    = [x, y]
    tf_outputs   = [preds, embeddings]
    numpy_inputs = [X_train, y_train]

    x_train_preds, x_train_features = batch_eval(sess, tf_inputs, tf_outputs, numpy_inputs, FLAGS.batch_size)
    x_train_preds = x_train_preds.astype(np.int32)
    np.save(train_preds_file, x_train_preds)
    np.save(train_features_file, x_train_features)
else:
    x_train_preds    = np.load(train_preds_file)
    x_train_features = np.load(train_features_file)

# predict labels from validation set
if not os.path.isfile(os.path.join(model_dir, 'x_val_preds.npy')):
    tf_inputs    = [x, y]
    tf_outputs   = [preds, embeddings]
    numpy_inputs = [X_val, y_val]

    x_val_preds, x_val_features = batch_eval(sess, tf_inputs, tf_outputs, numpy_inputs, FLAGS.batch_size)
    x_val_preds = x_val_preds.astype(np.int32)
    def do_eval(preds,
                x_set,
                y_set,
                report_key,
                is_adv=None,
                predictor=None,
                x_adv=None):
        if predictor is None:
            acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        else:
            do_eval(preds, x_set, y_set, report_key, is_adv=is_adv)
            if x_adv is not None:
                x_set_adv, = batch_eval(sess, [x], [x_adv], [x_set],
                                        batch_size=batch_size)
                assert x_set.shape == x_set_adv.shape
                x_set = x_set_adv
            n_batches = math.ceil(x_set.shape[0] / batch_size)
            p_set, p_det = np.concatenate([
                predictor.send(x_set[b * batch_size:(b + 1) * batch_size])
                for b in tqdm.trange(n_batches)
            ]).T
            acc = np.equal(p_set, y_set[:len(p_set)].argmax(-1)).mean()
            # if is_adv:
            # import IPython ; IPython.embed() ; exit(1)
            if FLAGS.save_debug_dict:
                debug_dict['x_set'] = x_set
                debug_dict['y_set'] = y_set
                ddfn = 'logs/debug_dict_{}.pkl'.format(
                    'adv' if is_adv else 'clean')
                if not os.path.exists(ddfn):
                    with open(ddfn, 'wb') as f:
                        pickle.dump(debug_dict, f)
                debug_dict.clear()
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples %s: %0.4f' %
                  (report_text, 'with correction'
                   if predictor is not None else 'without correction', acc))
            if is_adv is not None:
                label = 'test_acc_{}_{}'.format(
                    report_text, 'corrected' if predictor else 'uncorrected')
                swriter.add_scalar(label, acc)
                if predictor is not None:
                    detect = np.equal(p_det, is_adv).mean()
                    label = 'test_det_{}_{}'.format(
                        report_text,
                        'corrected' if predictor else 'uncorrected')
                    print(label, detect)
                    swriter.add_scalar(label, detect)
                    label = 'test_dac_{}_{}'.format(
                        report_text,
                        'corrected' if predictor else 'uncorrected')
                    swriter.add_scalar(
                        label,
                        np.equal(p_set,
                                 y_set[:len(p_set)].argmax(-1))[np.equal(
                                     p_det, is_adv)].mean())

        return acc
Example #13
def eval(sess,
         model_name,
         X_train,
         Y_train,
         X_test,
         Y_test,
         cnn=False,
         rbf=False):
    """ Load model saved in model_name.json and model_name_weights.h5 and 
    evaluate its accuracy on legitimate test samples and adversarial samples.
    Use cnn=True if the model is CNN based.
    """

    # load saved model
    print("Load model ... ")
    '''
    json = open('models/{}.json'.format(model_name), 'r')
    model = json.read()
    json.close()
    loaded_model = model_from_json(model)
    loaded_model.load_weights("models/{}_weights.h5".format(model_name))
    '''
    if rbf:
        loaded_model = load_model("rbfmodels/{}.h5".format(model_name),
                                  custom_objects={'RBFLayer': RBFLayer})
    else:
        loaded_model = load_model("models/{}.h5".format(model_name))

    # Set placeholders
    if cnn:
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 784))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    predictions = loaded_model(x)

    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args={"batch_size": 128})
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    # Using functions from /cleverhans/attacks_tf.py
    # Will be deprecated next year
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)

    # Using functions from /cleverhans/attacks.py (as specified by creators)
    # Does not work at the moment
    '''
    wrap = KerasModelWrapper(loaded_model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3}
                   #'y': y}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = tf.stop_gradient(adv_x)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)
    predictions_adv = loaded_model(adv_x)
    '''

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test_adv,
                          Y_test,
                          args={"batch_size": 128})
    print('Test accuracy on FGSM adversarial test examples: ' + str(accuracy))

    # Craft adversarial examples using Jacobian-based Saliency Map Approach (JSMA)
    wrap = KerasModelWrapper(loaded_model)
    jsma = SaliencyMapMethod(wrap, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
    adv_x = jsma.generate(x, **jsma_params)
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = loaded_model(adv_x)

    accuracy = model_eval(sess,
                          x,
                          y,
                          preds_adv,
                          X_test,
                          Y_test,
                          args={"batch_size": 512})
    print('Test accuracy on JSMA adversarial test examples: ' + str(accuracy))
    '''
    report = AccuracyReport()
    viz_enabled=VIZ_ENABLED
    source_samples=SOURCE_SAMPLES
    img_rows, img_cols, nchannels = 28, 28, 1
    nb_classes = 10

    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
        ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    wrap = KerasModelWrapper(loaded_model)
    jsma = SaliencyMapMethod(wrap, sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
      print('--------------------------------------')
      print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
      sample = X_test[sample_ind:(sample_ind + 1)]

      # We want to find an adversarial example for each possible target class
      # (i.e. all classes that differ from the label given in the dataset)
      current_class = int(np.argmax(y_test[sample_ind]))
      target_classes = other_classes(nb_classes, current_class)

      # For the grid visualization, keep original images along the diagonal
      grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
          sample, (img_rows, img_cols, nchannels))

      # Loop over all target classes
      for target in target_classes:
        print('Generating adv. example for target class %i' % target)

        # This call runs the Jacobian-based saliency map approach
        one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
        one_hot_target[0, target] = 1
        jsma_params['y_target'] = one_hot_target
        adv_x = jsma.generate_np(sample, **jsma_params)

        # Check if success was achieved
        res = int(model_argmax(sess, x, predictions, adv_x) == target)

        # Computer number of modified features
        adv_x_reshape = adv_x.reshape(-1)
        test_in_reshape = X_test[sample_ind].reshape(-1)
        nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
        percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

        # Display the original and adversarial images side-by-side
        if viz_enabled:
          figure = pair_visual(
              np.reshape(sample, (img_rows, img_cols, nchannels)),
              np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

        # Add our adversarial example to our grid data
        grid_viz_data[target, current_class, :, :, :] = np.reshape(
            adv_x, (img_rows, img_cols, nchannels))

        # Update the arrays for later analysis
        results[target, sample_ind] = res
        perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
      import matplotlib.pyplot as plt
      plt.close(figure)
      _ = grid_visual(grid_viz_data)
        
      #adv_x = jsma(sess, x, predictions, 10, X_test, Y_test, 0, 0.5, 0, 1)
      #X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)
      #accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test, args={ "batch_size" : 128 })
    '''
    sess.close()
def adaptive_basic_iterative_method(sess,
                                    model,
                                    X,
                                    Y,
                                    eps,
                                    eps_iter,
                                    nb_iter=50,
                                    clip_min=None,
                                    clip_max=None,
                                    batch_size=256,
                                    log_dir=None,
                                    model_logits=None,
                                    binary_steps=9,
                                    attack_type="bim-b",
                                    dataset="cifar"):
    """
    TODO
    :param sess:
    :param model: predictions or after-softmax
    :param X:
    :param Y:
    :param eps:
    :param eps_iter:
    :param nb_iter:
    :param clip_min:
    :param clip_max:
    :param batch_size:
    :return:
    """
    print("nb_iter", nb_iter)
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None, ) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None, ) + Y.shape[1:])
    alpha = tf.placeholder(tf.float32, shape=(None, ) + (1, ))
    num_samples = np.shape(X)[0]
    ALPHA = 0.1 * np.ones((num_samples, 1))
    ub = 10.0 * np.ones(num_samples)
    lb = 0.0 * np.ones(num_samples)
    Best_X_adv = None

    results = np.zeros((
        nb_iter,
        X.shape[0],
    ) + X.shape[1:])
    # Initialize adversarial samples as the original samples, set upper and
    # lower bounds
    X_adv = X
    X_min = X_adv - eps
    X_max = X_adv + eps
    print('Running BIM iterations...')

    # "its" is a dictionary that keeps track of the iteration at which each
    # sample becomes misclassified. The default value will be (nb_iter-1), the
    # very last iteration.
    def f(val):
        return lambda: val

    its = defaultdict(f(nb_iter - 1))
    # Out keeps track of which samples have already been misclassified
    out = set()
    for j in range(binary_steps):

        for i in tqdm(range(nb_iter)):
            adv_x = adaptive_fgsm(x,
                                  model(x),
                                  eps=eps_iter,
                                  clip_min=clip_min,
                                  clip_max=clip_max,
                                  y=y,
                                  log_dir=log_dir,
                                  model_logits=model_logits,
                                  alpha=alpha)
            X_adv, = batch_eval(sess, [x, y, alpha], [adv_x],
                                [X_adv, Y, ALPHA],
                                feed={K.learning_phase(): 0},
                                args={'batch_size': batch_size})
            X_adv = np.maximum(np.minimum(X_adv, X_max), X_min)
            results[i] = X_adv
            # check misclassifieds
            predictions = model.predict_classes(X_adv,
                                                batch_size=512,
                                                verbose=0)
            misclassifieds = np.where(predictions != Y.argmax(axis=1))[0]
            for elt in misclassifieds:
                if elt not in out:
                    its[elt] = i
                    out.add(elt)
            print(i)

        X_adv = results[-1]
        if (j == 0):
            Best_X_adv = X_adv
        ALPHA, Best_X_adv = binary_refinement(sess, Best_X_adv, X_adv, Y,
                                              ALPHA, ub, lb, model, dataset)
    return Best_X_adv
Example #15
def train_sub1(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
              nb_epochs_s, batch_size, learning_rate, data_aug, lmbda,
              aug_batch_size, rng, img_rows=48, img_cols=48,
              nchannels=3):
    """
    This function creates the substitute by alternatively
    augmenting the training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param x_sub: initial substitute training data
    :param y_sub: initial substitute training labels
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param rng: numpy.random.RandomState instance
    :return:
    """
    # Define TF model graph (for the black-box model)
    model_sub = ModelSubstitute('model_s', nb_classes)
    preds_sub = model_sub.get_logits(x)
    loss_sub = CrossEntropy(model_sub, smoothing=0)

    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # Train the substitute and augment dataset alternatively
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        #with TemporaryLogLevel(logging.WARNING, "cleverhans.utils.tf"):
        train(sess, loss_sub, x, y, x_sub,
              to_categorical(y_sub, nb_classes),
              init_all=False, args=train_params, rng=rng)
              #var_list=model_sub.get_params())


        # If we are not at last substitute training iteration, augment dataset
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            lmbda_coef = 2 * int(int(rho / 3) != 0) - 1
            # print(x.shape)
            # print(x_sub.shape)
            # print(y_sub.shape)
            #print(grads.shape)
            x_sub = jacobian_augmentation(sess, x, x_sub, y_sub, grads,
                                          lmbda_coef * lmbda, aug_batch_size)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box
            y_sub = np.hstack([y_sub, y_sub])
            x_sub_prev = x_sub[int(len(x_sub)/2):]
            eval_params = {'batch_size': batch_size}
            #tmp = batch_eval(sess, [x], [bbox_preds], [x_sub_prev],args=eval_params)
            tmp = batch_eval(sess, [x], [bbox_preds], [x_sub_prev],batch_size=batch_size)
            print(tmp)
            bbox_val = tmp[0]

            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            y_sub[int(len(x_sub)/2):] = np.argmax(bbox_val, axis=1)

    return model_sub, preds_sub
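jacobian_augmentation implements the dataset augmentation from arxiv.org/abs/1602.02697: each new point is x' = x + lambda * sign(dF_y(x)/dx), where F_y is the substitute's output for the point's current label (the lmbda_coef factor above periodically flips the sign). A minimal NumPy sketch of that step, assuming the per-class input gradients have already been evaluated (illustrative names):

import numpy as np

def jacobian_augmentation_step(x_sub, y_sub, class_grads, lmbda):
    """Double the substitute training set by perturbing each point along the
    sign of the Jacobian row for its current label."""
    grads_for_label = class_grads[y_sub, np.arange(len(x_sub))]  # pick dF_y/dx per sample
    x_new = x_sub + lmbda * np.sign(grads_for_label)
    return np.concatenate([x_sub, x_new], axis=0)

# toy usage: 6 samples of dimension 3, 4 classes
rng = np.random.RandomState(0)
x_sub = rng.rand(6, 3)
y_sub = rng.randint(0, 4, size=6)
class_grads = rng.randn(4, 6, 3)  # class_grads[c, n] = dF_c/dx_n
print(jacobian_augmentation_step(x_sub, y_sub, class_grads, lmbda=0.1).shape)  # (12, 3)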
Example #16
def evaluate_ch(model, config, sess, norm='l1', bound=None, verbose=True):
    dataset = config['data']
    num_eval_examples = config['num_eval_examples']
    eval_batch_size = config['eval_batch_size']

    if dataset == "mnist":
        from tensorflow.examples.tutorials.mnist import input_data
        mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
        X = mnist.test.images[0:num_eval_examples, :].reshape(-1, 28, 28, 1)
        Y = mnist.test.labels[0:num_eval_examples]
        x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    else:
        import cifar10_input
        data_path = config["data_path"]
        cifar = cifar10_input.CIFAR10Data(data_path)
        X = cifar.eval_data.xs[0:num_eval_examples, :].astype(np.float32) / 255.0
        Y = cifar.eval_data.ys[0:num_eval_examples]
        x_image = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
        assert norm == 'l1'

    if norm=='l2':
        attack = CarliniWagnerL2(model, sess)
        params = {'batch_size': eval_batch_size, 'binary_search_steps': 9}
    else:
        attack = ElasticNetMethod(model, sess, clip_min=0.0, clip_max=1.0)
        params = {'beta': 1e-2,
                  'decision_rule': 'L1',
                  'batch_size': eval_batch_size,
                  'learning_rate': 1e-2,
                  'max_iterations': 1000}

    if verbose:
        set_log_level(logging.DEBUG, name="cleverhans")
    
    y = tf.placeholder(tf.int64, shape=[None, 10])
    params['y'] = y
    adv_x = attack.generate(x_image, **params)
    preds_adv = model.get_predicted_class(adv_x)
    preds_nat = model.get_predicted_class(x_image)

    all_preds, all_preds_adv, all_adv_x = batch_eval(
        sess, [x_image, y], [preds_nat, preds_adv, adv_x], [X, one_hot(Y, 10)], batch_size=eval_batch_size)

    print('acc nat', np.mean(all_preds == Y))
    print('acc adv', np.mean(all_preds_adv == Y))

    if dataset == "cifar10":
        X *= 255.0
        all_adv_x *= 255.0

    if norm == 'l2':
        lps = np.sqrt(np.sum(np.square(all_adv_x - X), axis=(1,2,3)))
    else:
        lps = np.sum(np.abs(all_adv_x - X), axis=(1,2,3))
    print('mean lp: ', np.mean(lps))
    for b in [bound, bound/2.0, bound/4.0, bound/8.0]:
        print('lp={}, acc={}'.format(b, np.mean((all_preds_adv == Y) | (lps > b))))

    all_corr_adv = (all_preds_adv == Y)
    all_corr_nat = (all_preds == Y)
    return all_corr_nat, all_corr_adv, lps
Example #17
def generate(sess, model, X, Y, attack_method, dataset, attack_params):
    """
    detect adversarial examples
    :param model_name: the name of the target model. Models are named in the form of
                        model-<dataset>-<architecture>-<transform_type>.h5
    :param attack_method:  attack for generating adversarial examples
    :param X: examples to be attacked
    :param Y: correct label of the examples
    :return: adversarial examples
    """
    batch_size = 128

    img_rows, img_cols, nb_channels = X.shape[1:4]
    nb_classes = Y.shape[1]
    # label smoothing
    label_smoothing_rate = 0.1
    Y -= label_smoothing_rate * (Y - 1. / nb_classes)

    # to be able to call the model in the custom loss, we need to call it once before.
    # see https://github.com/tensorflow/tensorflow/issues/23769
    model(model.input)
    # wrap a keras model, making it fit the cleverhans framework
    wrap_model = KerasModelWrapper(model)

    # initialize the attack object
    attacker = None
    if attack_method == ATTACK.FGSM:
        """
        The Fast Gradient Sign Method,
        by Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy 2014
        link: https://arxiv.org/abs/1412.6572
        """
        attacker = FastGradientMethod(wrap_model, sess=sess)
    elif attack_method == ATTACK.JSMA:
        """
        The Jacobian-based Saliency Map Method
        by Nicolas Papernot, Patrick McDaniel, Somesh Jha, Matt Fredrikson, Z. Berkay Celik, Ananthram Swami 2016
        link: https://arxiv.org/abs/1511.07528
        """
        batch_size = 64
        attacker = SaliencyMapMethod(wrap_model, sess=sess)
    elif attack_method == ATTACK.CW_L2:
        """
        Untargeted attack
        """
        attacker = CarliniWagnerL2(wrap_model, sess=sess)

    elif attack_method == ATTACK.CW_Linf:
        """
        Untargeted attack
        """
        # TODO: bug fix --- cannot compute gradients correctly
        # attacker = CarliniWagnerLinf(wrap_model, sess=sess)
        raise NotImplementedError('CW_Linf is not supported yet (gradient computation bug).')

    elif attack_method == ATTACK.CW_L0:
        """
        Untargeted attack
        """
        # TODO: bug fix --- cannot compute gradients correctly
        # attacker = CarliniWagnerL0(wrap_model, sess=sess)
        raise NotImplementedError('CW_L0 is not supported yet (gradient computation bug).')

    elif attack_method == ATTACK.DEEPFOOL:
        """
        The DeepFool method is an untargeted, iterative attack
        which is based on an iterative linearization of the classifier.
        by Seyed-Mohsen Moosavi-Dezfooli, Alhussein Fawzi, Pascal Frossard, 2016
        link: https://arxiv.org/abs/1511.04599
        """
        batch_size = 64
        ord = attack_params['ord']
        attack_params.pop('ord')

        if ord == 2:
            # cleverhans supports only l2 norm so far.
            attacker = DeepFool(wrap_model, sess=sess)
        elif ord == np.inf:
            # TODO: cleverhans's DeepFool supports only the l2 norm so far.
            raise NotImplementedError('DeepFool with the l-inf norm is not implemented yet.')
        else:
            raise ValueError('DeepFool supports only l2 and l-inf norms.')

    elif attack_method == ATTACK.BIM:
        """
        The Basic Iterative Method (also, iterative FGSM)
        by Alexey Kurakin, Ian Goodfellow, Samy Bengio, 2016
        link: https://arxiv.org/abs/1607.02533
        """
        attacker = BasicIterativeMethod(wrap_model, back='tf', sess=sess)
    elif attack_method == ATTACK.PGD:
        """
        The Projected Gradient Descent approach.
        """
        attacker = ProjectedGradientDescent(wrap_model)
    elif attack_method == ATTACK.MIM:
        """
        The Momentum Iterative Method
        by Yinpeng Dong, Fangzhou Liao, Tianyu Pang, Hang Su, Jun Zhu, Xiaolin Hu, Jianguo Li, 2018
        link: https://arxiv.org/abs/1710.06081
        """
        attacker = MomentumIterativeMethod(wrap_model, sess=sess)
    else:
        raise ValueError('{} attack is not supported.'.format(attack_method.upper()))

    # define custom loss function for adversary
    compile_params = get_compile_params(dataset,
                                        get_adversarial_metric(model, attacker, attack_params))
    print(compile_params)
    print('#### Recompile the model')
    model.compile(optimizer=compile_params['optimizer'],
                  loss=keras.losses.categorical_crossentropy,
                  metrics=['accuracy', compile_params['metrics']])
    # define the graph
    print('define the graph')
    adv_x = attacker.generate(model.input, **attack_params)
    # consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)

    # generating adversarial examples
    print('generating adversarial example...')
    adv_examples, = batch_eval(sess, [model.input, wrap_model(adv_x)], [adv_x],
                               [X, Y], batch_size=batch_size)

    if MODE.DEBUG:
        score = model.evaluate(adv_examples, Y, verbose=2)
        print('*** Evaluation on adversarial examples: {}'.format(score))

    return adv_examples, Y
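As a possible alternative to building the attack graph and running batch_eval by hand, cleverhans attack objects also expose generate_np, which consumes NumPy arrays directly. A sketch only, assuming a Keras `model`, a TF `sess`, clean inputs `X`, and cleverhans 2.x/3.x; the eps/clip values are illustrative:

# Sketch: run FGSM on NumPy inputs directly via generate_np.
# Assumes `model`, `sess`, and `X` already exist in the surrounding script.
from cleverhans.attacks import FastGradientMethod
from cleverhans.utils_keras import KerasModelWrapper

wrap_model = KerasModelWrapper(model)
fgsm_attack = FastGradientMethod(wrap_model, sess=sess)
X_adv = fgsm_attack.generate_np(X, eps=0.03, clip_min=0.0, clip_max=1.0)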
Example #18
def train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
              nb_epochs_s, batch_size, learning_rate, data_aug, lmbda,
              aug_batch_size, rng, img_rows=48, img_cols=48,
              nchannels=3):
    """
    This function creates the substitute by alternatively
    augmenting the training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param x_sub: initial substitute training data
    :param y_sub: initial substitute training labels
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param rng: numpy.random.RandomState instance
    :return:
    """
    assert(y_sub.shape[1]>1)

    try:
        # If the checkpoint (or the saver) is not available, fall back to retraining.
        saver.restore(sess, "./model.ckpt")
        model_sub = tf.get_variable("logits", shape=[1])
        preds_sub = tf.get_variable("probs", shape=[1])
        return model_sub, preds_sub
    except Exception:
        print("Model ckpt was not found. Retraining the substitute.")

    # Define TF model graph (for the black-box model)
    model_sub = ModelSubstitute('model_s',nb_classes, session=sess, istrain=True)
    logits = model_sub.get_logits(x)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y))
    optimiser = tf.train.AdamOptimizer().minimize(loss)
    preds_sub = tf.nn.softmax(logits=logits)

    saver = tf.train.Saver()

    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)
    sess.run(tf.global_variables_initializer())

    def evaluate():
        acc = model_eval(sess, x, y, preds_sub, x_sub, y_sub, args=eval_params)
        print('Test accuracy on test examples: %0.4f' % (acc))

    # Train the substitute and augment dataset alternatively
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))

        # Iterate over minibatches of the substitute training set
        for s in range(len(x_sub) // batch_size):
            batch_xs = x_sub[s * batch_size: (s + 1) * batch_size]
            batch_ys = y_sub[s * batch_size: (s + 1) * batch_size]
            feed_dict = {x: batch_xs, y: batch_ys}
            _, lval, pre = sess.run([optimiser, loss, preds_sub], feed_dict=feed_dict)
        print("rho = {0}. loss: {1}".format(rho, sess.run(loss, feed_dict={x: batch_xs, y: batch_ys})))

        # If we are not at last substitute training iteration, augment dataset
        if 0: # rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            lmbda_coef = 2 * int(int(rho / 3) != 0) - 1
            y_sub_labels = np.argmax(y_sub, axis=1).reshape(-1,1)
            x_sub = jacobian_augmentation(sess, x, x_sub, y_sub_labels, grads,
                                          lmbda_coef * lmbda, aug_batch_size)

            # Label the newly generated synthetic points using the black-box
            new_y_sub_labels = np.vstack((y_sub_labels, y_sub_labels))
            x_sub_prev = x_sub[int(len(x_sub)/2):]
            eval_params = {'batch_size': batch_size}
            tmp = batch_eval(sess,[x],[bbox_preds],[x_sub_prev],batch_size=batch_size)
            bbox_val = tmp[0]

            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            tmp1 = np.argmax(bbox_val, axis=1)
            tmp2 = y_sub_labels[int(len(x_sub)/2):]
            new_y_sub_labels[int(len(x_sub)/2):] = np.argmax(bbox_val, axis=1).reshape(-1,1)
            y_sub = to_categorical(new_y_sub_labels, nb_classes)

    save_path = saver.save(sess, "./model.ckpt")
    print("Model saved in path: %s" % save_path)

    print(preds_sub.shape)

    return model_sub, preds_sub
def sample_estimator(num_classes, X, Y):
    num_output           = len(model.net)
    feature_list         = np.zeros(num_output, dtype=np.int32)   # indicates the number of features in every layer
    num_sample_per_class = np.zeros(num_classes)  # how many samples are per class
    for i, key in enumerate(model.net):
        feature_list[i] = model.net[key].shape[-1].value
    assert (feature_list > 0).all()

    list_features = []  # list_features[<layer>][<label>] is a list holding the features of a
                        # specific label at a specific layer, i.e. list_features[layer_index][label_index] = list
    for i in range(num_output):
        temp_list = []
        for j in range(num_classes):
            temp_list.append([])
        list_features.append(temp_list)

    out_features = batch_eval(sess, [x], model.net.values(), [X], FLAGS.batch_size)
    for i in range(num_output):
        if len(out_features[i].shape) == 4:
            out_features[i] = np.asarray(out_features[i], dtype=np.float32).reshape((X.shape[0], -1, out_features[i].shape[-1]))
            out_features[i] = np.mean(out_features[i], axis=1)
        elif len(out_features[i].shape) == 2:
            pass  # leave as is
        else:
            raise AssertionError('Expecting size of 2 or 4 but got {} for i={}'.format(len(out_features[i].shape), i))

    for i in range(X.shape[0]):
        label = Y[i]
        for layer in range(num_output):
            list_features_temp = out_features[layer][i].reshape(1, -1)
            list_features[layer][label].extend(list_features_temp)
        num_sample_per_class[label] += 1

    # stacking everything
    for layer in range(num_output):
        for label in range(num_classes):
            list_features[layer][label] = np.stack(list_features[layer][label])

    sample_class_mean = []
    for layer in range(num_output):
        num_feature = feature_list[layer]
        temp_list = np.zeros((num_classes, num_feature))
        for i in range(num_classes):
            temp_list[i] = np.mean(list_features[layer][i], axis=0)
        sample_class_mean.append(temp_list)

    precision = []
    group_lasso = sklearn.covariance.EmpiricalCovariance(assume_centered=False)
    for layer in range(num_output):
        D = 0
        for i in range(num_classes):
            if i == 0:
                D = list_features[layer][i] - sample_class_mean[layer][i]
            else:
                D = np.concatenate((D, list_features[layer][i] - sample_class_mean[layer][i]), 0)

        # find inverse
        group_lasso.fit(D)
        temp_precision = group_lasso.precision_
        precision.append(temp_precision)

    return sample_class_mean, precision