def get_Mahalanobis_score_adv(test_data, gaussian_score, grads, magnitude, scale):
    """Score inputs after perturbing them along the sign of a gradient.

    The gradient of ``grads`` is evaluated on ``test_data``, reduced to its
    sign, divided per colour channel by a standard deviation, and subtracted
    from the inputs with step size ``magnitude``. ``gaussian_score`` is then
    evaluated on the perturbed inputs and the maximum along axis 1 is returned
    for every sample.

    ``sess``, ``x``, ``FLAGS`` and ``batch_eval`` are taken from the enclosing
    scope.

    :param test_data: inputs to score; indexed as a 4-D array whose last axis
        holds the colour channels.
    :param gaussian_score: TF tensor evaluated on the perturbed inputs.
    :param grads: passed to ``batch_eval`` as the list of output tensors
        (presumably ``[d loss / d x]`` -- confirm against the caller).
    :param magnitude: step size of the perturbation.
    :param scale: multiplier applied to the per-channel standard deviations.
    :return: array with one score per sample.
    """
    gradients = batch_eval(sess, [x], grads, [test_data], FLAGS.batch_size)[0]

    # Reduce the gradient to its sign: +1 where it is >= 0, -1 elsewhere.
    # The previous ``clip(min=0)`` kept the raw positive magnitudes, so the
    # "(g - 0.5) * 2" step did not produce +/-1 as in the reference
    # implementation this code was ported from.
    gradients = (np.greater_equal(gradients, 0).astype(gradients.dtype) - 0.5) * 2

    # scale hyper params given from the official deep_Mahalanobis_detector repo:
    # https://github.com/pokaxpoka/deep_Mahalanobis_detector
    # scale=1 reproduces their parameters.
    channel_std = (0.2023 * scale, 0.1994 * scale, 0.2010 * scale)

    gradients_scaled = np.zeros_like(gradients)
    for channel, std in enumerate(channel_std):
        gradients_scaled[:, :, :, channel] = gradients[:, :, :, channel] / std

    tempInputs = test_data - magnitude * gradients_scaled
    noise_gaussian_score = batch_eval(
        sess, [x], [gaussian_score], [tempInputs], FLAGS.batch_size)[0]

    return np.max(noise_gaussian_score, axis=1)
def fast_gradient_sign_method(sess, model, X, Y, eps, clip_min=None,
                              clip_max=None, batch_size=256):
    """Craft adversarial examples for ``X`` with a single ``fgsm`` step.

    :param sess: TF session used to evaluate the attack graph.
    :param model: callable applied to the input placeholder; its output is
        handed to ``fgsm`` (predictions or after-softmax).
    :param X: numpy inputs; everything but the first axis fixes the
        placeholder shape.
    :param Y: numpy labels fed alongside ``X``.
    :param eps: perturbation size forwarded to ``fgsm``.
    :param clip_min: lower clipping bound forwarded to ``fgsm``.
    :param clip_max: upper clipping bound forwarded to ``fgsm``.
    :param batch_size: batch size used by ``batch_eval``.
    :return: the adversarial counterpart of ``X``.
    """
    # Placeholders mirroring the per-sample shapes of the data and labels.
    input_ph = tf.placeholder(tf.float32, shape=(None, ) + X.shape[1:])
    label_ph = tf.placeholder(tf.float32, shape=(None, ) + Y.shape[1:])

    adv_tensor = fgsm(
        input_ph,
        model(input_ph),
        eps=eps,
        clip_min=clip_min,
        clip_max=clip_max,
        y=label_ph,
    )

    outputs = batch_eval(
        sess,
        [input_ph, label_ph],
        [adv_tensor],
        [X, Y],
        feed={},
        args={'batch_size': batch_size},
    )
    # Exactly one output tensor was requested.
    (X_adv,) = outputs
    return X_adv
def batch_eval(*args, **kwargs):
    """Deprecated forwarder to ``cleverhans.evaluation.batch_eval``.

    Emits a warning about the move and then delegates the call unchanged.
    """
    # Imported lazily: a module-level import would be circular.
    from cleverhans.evaluation import batch_eval as _relocated_batch_eval

    notice = ("batch_eval has moved to cleverhans.evaluation. "
              "batch_eval will be removed from utils_tf on or after "
              "2019-03-09.")
    warnings.warn(notice)
    return _relocated_batch_eval(*args, **kwargs)
def basic_iterative_method(sess, model, X, Y, eps, eps_iter, nb_iter=50,
                           clip_min=None, clip_max=None, batch_size=256):
    """Run the Basic Iterative Method (repeated ``fgsm`` steps) on ``X``.

    :param sess: TF session used to evaluate the attack graph.
    :param model: Keras-style model; called on the input placeholder and
        queried with ``predict_classes`` (predictions or after-softmax).
    :param X: numpy inputs.
    :param Y: one-hot numpy labels (``argmax(axis=1)`` gives the class).
    :param eps: radius of the box around ``X`` the iterates are clipped to.
    :param eps_iter: step size of each ``fgsm`` step.
    :param nb_iter: number of iterations.
    :param clip_min: lower clipping bound forwarded to ``fgsm``.
    :param clip_max: upper clipping bound forwarded to ``fgsm``.
    :param batch_size: batch size used by ``batch_eval``.
    :return: ``(its, results)``. ``its`` maps a sample index to the first
        iteration at which it was misclassified (default ``nb_iter - 1``);
        ``results`` has shape ``(nb_iter,) + X.shape`` and holds the
        adversarial inputs after every iteration.
    """
    print("nb_iter", nb_iter)
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None,) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None,) + Y.shape[1:])

    # results holds the adversarial inputs at each iteration of BIM
    results = np.zeros((nb_iter, X.shape[0],) + X.shape[1:])

    # Initialize adversarial samples as the original samples, set upper and
    # lower bounds
    X_adv = X
    X_min = X_adv - eps
    X_max = X_adv + eps
    print('Running BIM iterations...')

    # Build the single-step attack graph once. It does not depend on the
    # iteration, and rebuilding it inside the loop adds a fresh copy of the
    # ops to the TF graph on every pass.
    adv_x = fgsm(
        x, model(x), eps=eps_iter,
        clip_min=clip_min, clip_max=clip_max, y=y
    )
    true_classes = Y.argmax(axis=1)

    # "its" keeps track of the iteration at which each sample becomes
    # misclassified. The default value is (nb_iter - 1), the last iteration.
    its = defaultdict(lambda: nb_iter - 1)
    # Out keeps track of which samples have already been misclassified
    out = set()
    for i in tqdm(range(nb_iter)):
        X_adv, = batch_eval(
            sess, [x, y], [adv_x],
            [X_adv, Y], feed={K.learning_phase(): 0},
            args={'batch_size': batch_size}
        )
        # Project back into the eps-box around the original inputs.
        X_adv = np.maximum(np.minimum(X_adv, X_max), X_min)
        results[i] = X_adv
        # check misclassifieds
        predictions = model.predict_classes(X_adv, batch_size=512, verbose=0)
        misclassifieds = np.where(predictions != true_classes)[0]
        for elt in misclassifieds:
            if elt not in out:
                its[elt] = i
                out.add(elt)

    return its, results
def get_activations(self, data, batch_size=10):
    """Evaluate every layer in ``self.layers`` on ``data``.

    :param data: numpy inputs fed to ``self.x_ph``.
    :param batch_size: batch size used by ``batch_eval``.
    :return: dict mapping each layer to the first output of ``batch_eval``
        for ``self.layer_sym_ph[layer]``.
    """
    def _evaluate(layer):
        # One tensor is requested, so only the first output is of interest.
        outputs = batch_eval(
            sess=self.sess,
            tf_inputs=[self.x_ph],
            tf_outputs=[self.layer_sym_ph[layer]],
            numpy_inputs=[data],
            batch_size=batch_size)
        return outputs[0]

    return {layer: _evaluate(layer) for layer in self.layers}
def adaptive_fast_gradient_sign_method(sess, model, X, Y, eps, clip_min=None,
                                       clip_max=None, batch_size=256,
                                       log_dir=None, model_logits=None,
                                       binary_steps=12, dataset="cifar"):
    """Run ``adaptive_fgsm`` while refining a per-sample ``alpha``.

    Each step evaluates the attack on the original ``X`` with the current
    ``ALPHA`` and hands the result to ``binary_refinement``, which returns the
    updated ``ALPHA`` and the best adversarial examples so far.

    :param sess: TF session used to evaluate the attack graph.
    :param model: callable applied to the input placeholder (predictions or
        after-softmax); also forwarded to ``binary_refinement``.
    :param X: numpy inputs.
    :param Y: numpy labels.
    :param eps: perturbation size forwarded to ``adaptive_fgsm``.
    :param clip_min: lower clipping bound forwarded to ``adaptive_fgsm``.
    :param clip_max: upper clipping bound forwarded to ``adaptive_fgsm``.
    :param batch_size: batch size used by ``batch_eval``.
    :param log_dir: forwarded to ``adaptive_fgsm``.
    :param model_logits: forwarded to ``adaptive_fgsm``.
    :param binary_steps: number of refinement steps.
    :param dataset: forwarded to ``binary_refinement``.
    :return: best adversarial examples found, or ``None`` when
        ``binary_steps`` is 0.
    """
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None, ) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None, ) + Y.shape[1:])
    alpha = tf.placeholder(tf.float32, shape=(None, ) + (1, ))

    num_samples = np.shape(X)[0]
    # Initial per-sample alpha and the search interval given to
    # binary_refinement.
    ALPHA = 0.1 * np.ones((num_samples, 1))
    ub = 10.0 * np.ones(num_samples)
    lb = 0.0 * np.ones(num_samples)

    # Build the attack graph once. alpha is a placeholder, so the same graph
    # serves every step; rebuilding it per step only grows the TF graph.
    adv_x = adaptive_fgsm(x, model(x), eps=eps, clip_min=clip_min,
                          clip_max=clip_max, y=y, log_dir=log_dir,
                          model_logits=model_logits, alpha=alpha)

    Best_X_adv = None
    for i in range(binary_steps):
        print(i)
        X_adv, = batch_eval(sess, [x, y, alpha], [adv_x], [X, Y, ALPHA],
                            feed={K.learning_phase(): 0},
                            args={'batch_size': batch_size})
        if i == 0:
            Best_X_adv = X_adv

        ALPHA, Best_X_adv = binary_refinement(sess, Best_X_adv, X_adv, Y,
                                              ALPHA, ub, lb, model, dataset)

    return Best_X_adv
def estimate(i_batch):
    """Compute per-layer ``mle_batch`` estimates for one batch of test data.

    The batch is taken from ``X_test``, ``X_test_adv`` and ``X_test_noisy``
    (enclosing scope). For every entry of ``model.net`` the activations are
    flattened per sample and compared against the clean activations of the
    same batch.

    :param i_batch: index of the batch to process.
    :return: ``(lid_batch, lid_batch_noisy, lid_batch_adv)``, each of shape
        ``(n_feed, lid_dim)``.
    """
    start = i_batch * batch_size
    end = np.minimum(len(X_test), (i_batch + 1) * batch_size)
    n_feed = end - start

    out_shape = (n_feed, lid_dim)
    lid_batch = np.zeros(shape=out_shape)
    lid_batch_adv = np.zeros(shape=out_shape)
    lid_batch_noisy = np.zeros(shape=out_shape)

    X_act = batch_eval(sess, [x], model.net.values(), [X_test[start:end]], batch_size)
    X_adv_act = batch_eval(sess, [x], model.net.values(), [X_test_adv[start:end]], batch_size)
    X_noisy_act = batch_eval(sess, [x], model.net.values(), [X_test_noisy[start:end]], batch_size)

    def _flatten(activation):
        # One row per sample, all remaining axes merged.
        return np.asarray(activation, dtype=np.float32).reshape((n_feed, -1))

    for i in range(len(model.net)):
        clean = _flatten(X_act[i])
        adversarial = _flatten(X_adv_act[i])
        noisy = _flatten(X_noisy_act[i])

        # Maximum likelihood estimation of local intrinsic dimensionality
        # (LID); the clean batch is the reference set in all three cases.
        lid_batch[:, i] = mle_batch(clean, clean, k=k)
        lid_batch_adv[:, i] = mle_batch(clean, adversarial, k=k)
        lid_batch_noisy[:, i] = mle_batch(clean, noisy, k=k)

    return lid_batch, lid_batch_noisy, lid_batch_adv
def get_knn_layers(X, y):
    """Fit one brute-force ``NearestNeighbors`` model per entry of ``model.net``.

    4-D layer outputs are reshaped to ``(n_samples, -1, channels)`` and
    averaged over axis 1; 2-D outputs are used unchanged.

    :param X: numpy inputs the layer features are computed from.
    :param y: passed as the second argument of ``fit``.
    :return: dict mapping layer name to the fitted model.
    :raises AssertionError: if a layer output is neither 2-D nor 4-D.
    """
    train_features = batch_eval(sess, [x], model.net.values(), [X], FLAGS.batch_size)
    print('Fitting knn models on all layers: {}'.format(model.net.keys()))

    knn = {}
    for layer_index, layer in enumerate(model.net.keys()):
        rank = len(train_features[layer_index].shape)
        if rank not in (2, 4):
            raise AssertionError('Expecting size of 2 or 4 but got {} for {}'.format(len(train_features[layer_index].shape), layer))

        if rank == 4:
            channels = train_features[layer_index].shape[-1]
            stacked = np.asarray(train_features[layer_index], dtype=np.float32)
            stacked = stacked.reshape((X.shape[0], -1, channels))
            # Average over the merged middle axis.
            train_features[layer_index] = np.mean(stacked, axis=1)

        # Every training sample is a neighbour, so queries return all of them.
        estimator = NearestNeighbors(n_neighbors=X.shape[0], p=2, n_jobs=20, algorithm='brute')
        estimator.fit(train_features[layer_index], y)
        knn[layer] = estimator

    del train_features
    return knn
def calc_all_ranks_and_dists(X, subset, knn):
    """Query every per-layer nearest-neighbour model with the features of ``X``.

    4-D layer outputs are reshaped to ``(n_samples, -1, channels)`` and
    averaged over axis 1; 2-D outputs are used unchanged.

    :param X: numpy inputs.
    :param subset: label used only in the progress message.
    :param knn: dict mapping layer name to a fitted neighbours model; the
        ``n_neighbors`` of its first value sizes the outputs.
    :return: ``(all_neighbor_ranks, all_neighbor_dists)``, both of shape
        ``(len(X), num_layers, n_neighbors)`` with dtypes int32 and float32.
    :raises AssertionError: if a layer output is neither 2-D nor 4-D.
    """
    num_output = len(model.net.keys())
    # dict.keys() is not subscriptable on Python 3; take the first model
    # through an iterator instead.
    n_neighbors = next(iter(knn.values())).n_neighbors

    out_shape = (len(X), num_output, n_neighbors)
    all_neighbor_ranks = np.full(out_shape, -1, dtype=np.int32)
    all_neighbor_dists = np.full(out_shape, -1, dtype=np.float32)

    features = batch_eval(sess, [x], model.net.values(), [X], FLAGS.batch_size)
    for layer_index, layer in enumerate(model.net.keys()):
        print('Calculating ranks and distances for subset {} for layer {}'.format(subset, layer))
        rank = len(features[layer_index].shape)
        if rank == 4:
            features[layer_index] = np.asarray(features[layer_index], dtype=np.float32).reshape(
                (X.shape[0], -1, features[layer_index].shape[-1]))
            features[layer_index] = np.mean(features[layer_index], axis=1)
        elif rank == 2:
            pass  # leave as is
        else:
            raise AssertionError('Expecting size of 2 or 4 but got {} for {}'.format(rank, layer))

        # kneighbors returns (distances, indices) when return_distance=True.
        all_neighbor_dists[:, layer_index], all_neighbor_ranks[:, layer_index] = \
            knn[layer].kneighbors(features[layer_index], return_distance=True)

    del features
    return all_neighbor_ranks, all_neighbor_dists
saver.restore(sess, checkpoint_path) # predict labels from trainset if USE_TRAIN_MINI: train_preds_file = os.path.join(model_dir, 'x_train_mini_preds.npy') train_features_file = os.path.join(model_dir, 'x_train_mini_features.npy') else: train_preds_file = os.path.join(model_dir, 'x_train_preds.npy') train_features_file = os.path.join(model_dir, 'x_train_features.npy') if not os.path.isfile(train_preds_file): tf_inputs = [x, y] tf_outputs = [preds, embeddings] numpy_inputs = [X_train, y_train] x_train_preds, x_train_features = batch_eval(sess, tf_inputs, tf_outputs, numpy_inputs, FLAGS.batch_size) x_train_preds = x_train_preds.astype(np.int32) np.save(train_preds_file, x_train_preds) np.save(train_features_file, x_train_features) else: x_train_preds = np.load(train_preds_file) x_train_features = np.load(train_features_file) # predict labels from validation set if not os.path.isfile(os.path.join(model_dir, 'x_val_preds.npy')): tf_inputs = [x, y] tf_outputs = [preds, embeddings] numpy_inputs = [X_val, y_val] x_val_preds, x_val_features = batch_eval(sess, tf_inputs, tf_outputs,
# Restore the weights stored in <model_dir>/best_model.ckpt into the session.
checkpoint_path = os.path.join(model_dir, 'best_model.ckpt')
saver.restore(sess, checkpoint_path)

# predict labels from trainset
# Predictions and features are cached as .npy files in model_dir; the "mini"
# training set uses its own file names.
if USE_TRAIN_MINI:
    train_preds_file = os.path.join(model_dir, 'x_train_mini_preds.npy')
    train_features_file = os.path.join(model_dir, 'x_train_mini_features.npy')
else:
    train_preds_file = os.path.join(model_dir, 'x_train_preds.npy')
    train_features_file = os.path.join(model_dir, 'x_train_features.npy')

# Only the predictions file is checked: if it exists, the features file is
# loaded as well without a separate existence check.
if not os.path.isfile(train_preds_file):
    tf_inputs = [x, y]
    tf_outputs = [preds, embeddings]
    numpy_inputs = [X_train, y_train]

    x_train_preds, x_train_features = batch_eval(sess, tf_inputs, tf_outputs, numpy_inputs, FLAGS.batch_size)
    x_train_preds = x_train_preds.astype(np.int32)
    np.save(train_preds_file, x_train_preds)
    np.save(train_features_file, x_train_features)
else:
    x_train_preds = np.load(train_preds_file)
    x_train_features = np.load(train_features_file)

# predict labels from validation set
# NOTE(review): the visible snippet ends after the astype() call below; the
# matching save/load branch is not shown here.
if not os.path.isfile(os.path.join(model_dir, 'x_val_preds.npy')):
    tf_inputs = [x, y]
    tf_outputs = [preds, embeddings]
    numpy_inputs = [X_val, y_val]

    x_val_preds, x_val_features = batch_eval(sess, tf_inputs, tf_outputs, numpy_inputs, FLAGS.batch_size)
    x_val_preds = x_val_preds.astype(np.int32)
def do_eval(preds, x_set, y_set, report_key, is_adv=None, predictor=None, x_adv=None):
    """Compute, print and log the accuracy on ``(x_set, y_set)``.

    Without a ``predictor`` the accuracy comes from ``model_eval`` on ``preds``.
    With a ``predictor`` the uncorrected accuracy is reported first through a
    recursive call, then the predictor is fed the data batch by batch via
    ``predictor.send`` and its first output column is compared with the labels.

    ``sess``, ``x``, ``y``, ``eval_params``, ``batch_size``, ``FLAGS``,
    ``debug_dict`` and ``swriter`` are taken from the enclosing scope.

    NOTE(review): the nesting of the statements below was reconstructed from a
    whitespace-collapsed source -- confirm against the original file.

    :param preds: TF tensor with the model predictions.
    :param x_set: numpy inputs.
    :param y_set: one-hot numpy labels (``argmax(-1)`` gives the class).
    :param report_key: not used in the body, only forwarded to the recursion.
    :param is_adv: ``None`` disables reporting; otherwise selects the
        'adversarial' / 'legitimate' wording and the detection target.
    :param predictor: optional object with a ``send`` method returning, per
        batch, rows that unpack into (prediction, detection) after ``.T``.
    :param x_adv: optional TF tensor; when given, ``x_set`` is replaced by its
        evaluation on ``x_set`` before the predictor is queried.
    :return: the accuracy.
    """
    if predictor is None:
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
    else:
        # Report the uncorrected accuracy on the same data first.
        do_eval(preds, x_set, y_set, report_key, is_adv=is_adv)
        if x_adv is not None:
            # Turn the inputs into their adversarial versions.
            x_set_adv, = batch_eval(sess, [x], [x_adv], [x_set], batch_size=batch_size)
            assert x_set.shape == x_set_adv.shape
            x_set = x_set_adv
        n_batches = math.ceil(x_set.shape[0] / batch_size)
        # Each send() yields one row per sample; transposing the concatenation
        # separates the two columns.
        p_set, p_det = np.concatenate([
            predictor.send(x_set[b * batch_size:(b + 1) * batch_size])
            for b in tqdm.trange(n_batches)
        ]).T
        # y_set is truncated to the number of predictions actually returned.
        acc = np.equal(p_set, y_set[:len(p_set)].argmax(-1)).mean()
        if FLAGS.save_debug_dict:
            debug_dict['x_set'] = x_set
            debug_dict['y_set'] = y_set
            ddfn = 'logs/debug_dict_{}.pkl'.format(
                'adv' if is_adv else 'clean')
            # An existing dump is never overwritten.
            if not os.path.exists(ddfn):
                with open(ddfn, 'wb') as f:
                    pickle.dump(debug_dict, f)
            debug_dict.clear()
    if is_adv is None:
        report_text = None
    elif is_adv:
        report_text = 'adversarial'
    else:
        report_text = 'legitimate'
    if report_text:
        print('Test accuracy on %s examples %s: %0.4f' %
              (report_text, 'with correction'
               if predictor is not None else 'without correction', acc))
    if is_adv is not None:
        label = 'test_acc_{}_{}'.format(
            report_text, 'corrected' if predictor else 'uncorrected')
        swriter.add_scalar(label, acc)
        if predictor is not None:
            # Detection rate: how often the detection column equals is_adv.
            detect = np.equal(p_det, is_adv).mean()
            label = 'test_det_{}_{}'.format(
                report_text, 'corrected' if predictor else 'uncorrected')
            print(label, detect)
            swriter.add_scalar(label, detect)
            label = 'test_dac_{}_{}'.format(
                report_text, 'corrected' if predictor else 'uncorrected')
            # Accuracy restricted to the samples whose detection was right.
            swriter.add_scalar(
                label,
                np.equal(p_set, y_set[:len(p_set)].argmax(-1))[np.equal(
                    p_det, is_adv)].mean())
    return acc
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False,
         rbf=False):
    """Evaluate a saved Keras model on clean, FGSM and JSMA test inputs.

    The model is loaded from ``models/<model_name>.h5`` (or
    ``rbfmodels/<model_name>.h5`` with the ``RBFLayer`` custom object when
    ``rbf`` is true). Three accuracies are printed: on ``X_test``, on FGSM
    examples crafted with ``eps=0.3``, and on the output of the JSMA attack
    graph. The session is closed before returning, also when an error occurs.

    :param sess: TF session; closed by this function.
    :param model_name: base name of the saved model file.
    :param X_train: unused.
    :param Y_train: unused.
    :param X_test: numpy test inputs.
    :param Y_test: numpy test labels, 10 columns.
    :param cnn: if true the input placeholder is ``(None, 28, 28, 1)``,
        otherwise ``(None, 784)``.
    :param rbf: selects the RBF model directory and custom layer.
    :return: None
    """
    try:
        # load saved model
        print("Load model ... ")
        if rbf:
            loaded_model = load_model("rbfmodels/{}.h5".format(model_name),
                                      custom_objects={'RBFLayer': RBFLayer})
        else:
            loaded_model = load_model("models/{}.h5".format(model_name))

        # Set placeholders
        if cnn:
            x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        else:
            x = tf.placeholder(tf.float32, shape=(None, 784))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        predictions = loaded_model(x)

        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args={"batch_size": 128})
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

        # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
        # Using functions from /cleverhans/attacks_tf.py
        adv_x = fgsm(x, predictions, eps=0.3)
        X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                              args={"batch_size": 128})
        print('Test accuracy on adversarial test examples: ' + str(accuracy))

        # Craft adversarial examples using Jacobian-based Saliency Map
        # Approach (JSMA). The attack is part of the graph here, so the clean
        # X_test is fed and perturbed on the fly.
        wrap = KerasModelWrapper(loaded_model)
        jsma = SaliencyMapMethod(wrap, sess=sess)
        jsma_params = {
            'theta': 1.,
            'gamma': 1,
            'clip_min': 0.,
            'clip_max': 1.,
            'y_target': None
        }
        adv_x = jsma.generate(x, **jsma_params)
        adv_x = tf.stop_gradient(adv_x)
        preds_adv = loaded_model(adv_x)
        accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                              args={"batch_size": 512})
        print('Test accuracy on adversarial test examples: ' + str(accuracy))
    finally:
        # The function always consumed the session on success; release it on
        # failure as well so it is not leaked.
        sess.close()
def adaptive_basic_iterative_method(sess, model, X, Y, eps, eps_iter,
                                    nb_iter=50, clip_min=None, clip_max=None,
                                    batch_size=256, log_dir=None,
                                    model_logits=None, binary_steps=9,
                                    attack_type="bim-b", dataset="cifar"):
    """Run iterated ``adaptive_fgsm`` steps while refining a per-sample alpha.

    For each of ``binary_steps`` rounds, ``nb_iter`` attack steps are applied
    (every iterate is clipped to the eps-box around ``X``) and the final
    iterate is handed to ``binary_refinement`` together with the current
    ``ALPHA``.

    :param sess: TF session used to evaluate the attack graph.
    :param model: Keras-style model; called on the input placeholder and
        queried with ``predict_classes`` (predictions or after-softmax).
    :param X: numpy inputs.
    :param Y: one-hot numpy labels.
    :param eps: radius of the box around ``X`` the iterates are clipped to.
    :param eps_iter: step size of each attack step.
    :param nb_iter: attack steps per round.
    :param clip_min: lower clipping bound forwarded to ``adaptive_fgsm``.
    :param clip_max: upper clipping bound forwarded to ``adaptive_fgsm``.
    :param batch_size: batch size used by ``batch_eval``.
    :param log_dir: forwarded to ``adaptive_fgsm``.
    :param model_logits: forwarded to ``adaptive_fgsm``.
    :param binary_steps: number of refinement rounds.
    :param attack_type: unused.
    :param dataset: forwarded to ``binary_refinement``.
    :return: best adversarial examples found, or ``None`` when
        ``binary_steps`` is 0.
    """
    print("nb_iter", nb_iter)
    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None, ) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None, ) + Y.shape[1:])
    alpha = tf.placeholder(tf.float32, shape=(None, ) + (1, ))

    num_samples = np.shape(X)[0]
    ALPHA = 0.1 * np.ones((num_samples, 1))
    ub = 10.0 * np.ones(num_samples)
    lb = 0.0 * np.ones(num_samples)
    Best_X_adv = None

    results = np.zeros((nb_iter, X.shape[0], ) + X.shape[1:])
    # Initialize adversarial samples as the original samples, set upper and
    # lower bounds
    X_adv = X
    X_min = X_adv - eps
    X_max = X_adv + eps
    print('Running BIM iterations...')

    # Build the attack graph once. alpha is a placeholder, so one graph
    # serves every step; the previous code rebuilt it
    # binary_steps * nb_iter times, growing the TF graph each time.
    adv_x = adaptive_fgsm(x, model(x), eps=eps_iter, clip_min=clip_min,
                          clip_max=clip_max, y=y, log_dir=log_dir,
                          model_logits=model_logits, alpha=alpha)
    true_classes = Y.argmax(axis=1)

    # "its" keeps track of the iteration at which each sample becomes
    # misclassified (default: the last iteration). It is bookkeeping only and
    # is not part of the return value.
    its = defaultdict(lambda: nb_iter - 1)
    # Out keeps track of which samples have already been misclassified
    out = set()
    for j in range(binary_steps):
        for i in tqdm(range(nb_iter)):
            X_adv, = batch_eval(sess, [x, y, alpha], [adv_x],
                                [X_adv, Y, ALPHA],
                                feed={K.learning_phase(): 0},
                                args={'batch_size': batch_size})
            X_adv = np.maximum(np.minimum(X_adv, X_max), X_min)
            results[i] = X_adv
            # check misclassifieds
            predictions = model.predict_classes(X_adv, batch_size=512,
                                                verbose=0)
            misclassifieds = np.where(predictions != true_classes)[0]
            for elt in misclassifieds:
                if elt not in out:
                    its[elt] = i
                    out.add(elt)
        print(i)
        # The next round starts from the last iterate of this one (a view
        # into ``results``), not from the original X.
        X_adv = results[-1]
        if j == 0:
            Best_X_adv = X_adv

        ALPHA, Best_X_adv = binary_refinement(sess, Best_X_adv, X_adv, Y,
                                              ALPHA, ub, lb, model, dataset)

    return Best_X_adv
def train_sub1(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes, nb_epochs_s,
               batch_size, learning_rate, data_aug, lmbda, aug_batch_size,
               rng, img_rows=48, img_cols=48, nchannels=3):
    """Train a substitute model, alternating training and data augmentation.

    In every round the substitute is trained on the current data; in all but
    the last round the data is then doubled by Jacobian-based augmentation
    and the new half is labelled with the black-box model's argmax.

    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param x_sub: initial substitute training data
    :param y_sub: initial substitute training labels
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param aug_batch_size: batch size passed to ``jacobian_augmentation``
    :param rng: numpy.random.RandomState instance
    :param img_rows: unused
    :param img_cols: unused
    :param nchannels: unused
    :return: ``(model_sub, preds_sub)``
    """
    # Substitute model graph and its loss.
    model_sub = ModelSubstitute('model_s', nb_classes)
    preds_sub = model_sub.get_logits(x)
    loss_sub = CrossEntropy(model_sub, smoothing=0)
    print("Defined TensorFlow model graph for the substitute.")

    # Symbolic Jacobian of the substitute's output w.r.t. its input.
    grads = jacobian_graph(preds_sub, x, nb_classes)

    last_round = data_aug - 1
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        one_hot_labels = to_categorical(y_sub, nb_classes)
        train(sess, loss_sub, x, y, x_sub, one_hot_labels,
              init_all=False, args=train_params, rng=rng)

        # Nothing to augment after the final training round.
        if rho >= last_round:
            continue

        print("Augmenting substitute training data.")
        # The step direction is -1 for the first three rounds, +1 afterwards.
        lmbda_coef = 1 if int(rho / 3) != 0 else -1
        x_sub = jacobian_augmentation(sess, x, x_sub, y_sub, grads,
                                      lmbda_coef * lmbda, aug_batch_size)

        print("Labeling substitute training data.")
        # The second half of x_sub holds the newly generated points.
        y_sub = np.hstack([y_sub, y_sub])
        half = int(len(x_sub)/2)
        x_sub_prev = x_sub[half:]
        tmp = batch_eval(sess, [x], [bbox_preds], [x_sub_prev],
                         batch_size=batch_size)
        print(tmp)
        bbox_val = tmp[0]
        # Only the label (argmax) of the black box is used, not its
        # probabilities.
        y_sub[half:] = np.argmax(bbox_val, axis=1)

    return model_sub, preds_sub
def evaluate_ch(model, config, sess, norm='l1', bound=None, verbose=True):
    """Attack ``model`` with a CleverHans attack and report the accuracies.

    Loads the MNIST or CIFAR-10 evaluation data selected by
    ``config['data']``, builds the attack graph, evaluates natural and
    adversarial predictions, and prints the accuracies, the mean perturbation
    norm and, when ``bound`` is given, the accuracy at ``bound``,
    ``bound/2``, ``bound/4`` and ``bound/8``.

    :param model: object with ``get_predicted_class``; also passed to the
        attack constructor.
    :param config: dict with ``data``, ``num_eval_examples``,
        ``eval_batch_size`` and, for non-MNIST data, ``data_path``.
    :param sess: TF session.
    :param norm: only ``'l1'`` is accepted (enforced by an assertion).
    :param bound: optional perturbation budget for the per-bound report;
        ``None`` skips that report.
    :param verbose: if true, sets the CleverHans log level to DEBUG.
    :return: ``(all_corr_nat, all_corr_adv, lps)``: boolean correctness of
        the natural and adversarial predictions and the per-sample
        perturbation norms.
    :raises AssertionError: if ``norm`` is not ``'l1'``.
    """
    dataset = config['data']
    num_eval_examples = config['num_eval_examples']
    eval_batch_size = config['eval_batch_size']

    if dataset == "mnist":
        from tensorflow.examples.tutorials.mnist import input_data
        mnist = input_data.read_data_sets('MNIST_data', one_hot=False)

        X = mnist.test.images[0:num_eval_examples, :].reshape(-1, 28, 28, 1)
        Y = mnist.test.labels[0:num_eval_examples]
        x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
    else:
        import cifar10_input
        data_path = config["data_path"]
        cifar = cifar10_input.CIFAR10Data(data_path)
        # Pixel values are rescaled from [0, 255] to [0, 1] for the attack.
        X = cifar.eval_data.xs[0:num_eval_examples, :].astype(np.float32) / 255.0
        Y = cifar.eval_data.ys[0:num_eval_examples]
        x_image = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])

    # NOTE: this assertion makes the 'l2' branches below unreachable; they
    # are kept as they were.
    assert norm == 'l1'

    if norm == 'l2':
        attack = CarliniWagnerL2(model, sess)
        params = {'batch_size': eval_batch_size, 'binary_search_steps': 9}
    else:
        attack = ElasticNetMethod(model, sess, clip_min=0.0, clip_max=1.0)
        params = {'beta': 1e-2,
                  'decision_rule': 'L1',
                  'batch_size': eval_batch_size,
                  'learning_rate': 1e-2,
                  'max_iterations': 1000}

    if verbose:
        set_log_level(logging.DEBUG, name="cleverhans")

    y = tf.placeholder(tf.int64, shape=[None, 10])
    params['y'] = y
    adv_x = attack.generate(x_image, **params)
    preds_adv = model.get_predicted_class(adv_x)
    preds_nat = model.get_predicted_class(x_image)

    all_preds, all_preds_adv, all_adv_x = batch_eval(
        sess, [x_image, y], [preds_nat, preds_adv, adv_x], [X, one_hot(Y, 10)],
        batch_size=eval_batch_size)

    print('acc nat', np.mean(all_preds == Y))
    print('acc adv', np.mean(all_preds_adv == Y))

    if dataset == "cifar10":
        # Report norms in the original [0, 255] pixel scale.
        X *= 255.0
        all_adv_x *= 255.0

    if norm == 'l2':
        lps = np.sqrt(np.sum(np.square(all_adv_x - X), axis=(1, 2, 3)))
    else:
        lps = np.sum(np.abs(all_adv_x - X), axis=(1, 2, 3))
    print('mean lp: ', np.mean(lps))

    # bound defaults to None; dividing None used to raise a TypeError after
    # the whole attack had already run.
    if bound is not None:
        for b in [bound, bound/2.0, bound/4.0, bound/8.0]:
            # A sample counts as robust if it is still classified correctly
            # or the perturbation exceeds the budget.
            print('lp={}, acc={}'.format(b, np.mean((all_preds_adv == Y) | (lps > b))))

    all_corr_adv = (all_preds_adv == Y)
    all_corr_nat = (all_preds == Y)
    return all_corr_nat, all_corr_adv, lps
def generate(sess, model, X, Y, attack_method, dataset, attack_params):
    """Generate adversarial examples for ``X`` with a CleverHans attack.

    The labels are smoothed (rate 0.1) on a copy, the model is recompiled
    with the metric returned by ``get_adversarial_metric``, and the attack
    graph is evaluated batch by batch.

    :param sess: TF session.
    :param model: Keras model to attack.
    :param X: examples to be attacked; must have exactly four axes.
    :param Y: one-hot labels of the examples; not modified.
    :param attack_method: one of the ``ATTACK`` constants.
    :param dataset: forwarded to ``get_compile_params``.
    :param attack_params: keyword arguments for the attack's ``generate``;
        for DeepFool it must contain ``'ord'``. Not modified.
    :return: ``(adv_examples, Y)`` with ``Y`` being the smoothed labels.
    :raises ValueError: if the attack is unknown, not implemented yet, or
        DeepFool is requested with an unsupported norm.
    """
    batch_size = 128

    # Unpacking doubles as a shape check: it fails unless X has four axes.
    img_rows, img_cols, nb_channels = X.shape[1:4]
    nb_classes = Y.shape[1]

    # label smoothing -- done on a copy so the caller's array is untouched
    # and repeated calls do not smooth the same labels again.
    label_smoothing_rate = 0.1
    Y = Y - label_smoothing_rate * (Y - 1. / nb_classes)

    # Work on a copy: the DeepFool branch removes the 'ord' entry.
    attack_params = dict(attack_params)

    # to be able to call the model in the custom loss, we need to call it once before.
    # see https://github.com/tensorflow/tensorflow/issues/23769
    model(model.input)
    # wrap a keras model, making it fit the cleverhans framework
    wrap_model = KerasModelWrapper(model)

    # initialize the attack object
    attacker = None
    if attack_method == ATTACK.FGSM:
        # The Fast Gradient Sign Method, Goodfellow, Shlens, Szegedy 2014
        # https://arxiv.org/abs/1412.6572
        attacker = FastGradientMethod(wrap_model, sess=sess)
    elif attack_method == ATTACK.JSMA:
        # The Jacobian-based Saliency Map Method, Papernot et al. 2016
        # https://arxiv.org/abs/1511.07528
        batch_size = 64
        attacker = SaliencyMapMethod(wrap_model, sess=sess)
    elif attack_method == ATTACK.CW_L2:
        # Untargeted attack
        attacker = CarliniWagnerL2(wrap_model, sess=sess)
    elif attack_method == ATTACK.CW_Linf:
        # TODO: bug fix --- cannot compute gradients correctly
        # attacker = CarliniWagnerLinf(wrap_model, sess=sess)
        pass
    elif attack_method == ATTACK.CW_L0:
        # TODO: bug fix --- cannot compute gradients correctly
        # attacker = CarliniWagnerL0(wrap_model, sess=sess)
        pass
    elif attack_method == ATTACK.DEEPFOOL:
        # The DeepFool Method, an untargeted iterative attack based on an
        # iterative linearization of the classifier.
        # Moosavi-Dezfooli, Fawzi, Frossard 2016
        # https://arxiv.org/abs/1511.04599
        batch_size = 64
        norm_order = attack_params.pop('ord')

        if norm_order == 2:
            # cleverhans supports only l2 norm so far.
            attacker = DeepFool(wrap_model, sess=sess)
        elif norm_order == np.inf:
            # TODO: l-inf variant
            pass
        else:
            raise ValueError('DeepFool supports only l2 and l-inf norms.')
    elif attack_method == ATTACK.BIM:
        # The Basic Iterative Method (iterative FGSM),
        # Kurakin, Goodfellow, Bengio 2016
        # https://arxiv.org/abs/1607.02533
        attacker = BasicIterativeMethod(wrap_model, back='tf', sess=sess)
    elif attack_method == ATTACK.PGD:
        # The Projected Gradient Descent approach.
        attacker = ProjectedGradientDescent(wrap_model)
    elif attack_method == ATTACK.MIM:
        # The Momentum Iterative Method, Dong et al. 2018
        # https://arxiv.org/abs/1710.06081
        attacker = MomentumIterativeMethod(wrap_model, sess=sess)
    else:
        raise ValueError('{} attack is not supported.'.format(attack_method.upper()))

    if attacker is None:
        # Recognised but unimplemented attacks used to continue with
        # attacker=None and fail later with an unrelated error.
        raise ValueError('{} attack is not implemented yet.'.format(attack_method))

    # define custom loss function for adversary
    compile_params = get_compile_params(dataset,
                                        get_adversarial_metric(model, attacker, attack_params))

    print(compile_params)

    print('#### Recompile the model')
    model.compile(optimizer=compile_params['optimizer'],
                  loss=keras.losses.categorical_crossentropy,
                  metrics=['accuracy', compile_params['metrics']])

    # define the graph
    print('define the graph')
    adv_x = attacker.generate(model.input, **attack_params)
    # consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)

    # generating adversarial examples
    print('generating adversarial example...')
    adv_examples, = batch_eval(sess, [model.input, wrap_model(adv_x)], [adv_x],
                               [X, Y], batch_size=batch_size)

    if MODE.DEBUG:
        score = model.evaluate(adv_examples, Y, verbose=2)
        print('*** Evaluation on adversarial examples: {}'.format(score))

    return adv_examples, Y
def train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes, nb_epochs_s,
              batch_size, learning_rate, data_aug, lmbda, aug_batch_size,
              rng, img_rows=48, img_cols=48, nchannels=3):
    """Train the substitute model on ``(x_sub, y_sub)`` and save a checkpoint.

    The substitute is trained with Adam on softmax cross-entropy for
    ``data_aug`` passes over the data. The Jacobian-based augmentation step
    is currently disabled (see the ``if 0`` block). After training the
    variables are saved to ``./model.ckpt``.

    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param x_sub: initial substitute training data
    :param y_sub: initial substitute training labels, one-hot (more than one
        column is asserted)
    :param nb_classes: number of output classes
    :param nb_epochs_s: unused
    :param batch_size: size of training batches
    :param learning_rate: unused (Adam runs with its defaults)
    :param data_aug: number of training passes
    :param lmbda: lambda from arxiv.org/abs/1602.02697 (augmentation only)
    :param aug_batch_size: batch size for the augmentation (augmentation only)
    :param rng: unused
    :param img_rows: unused
    :param img_cols: unused
    :param nchannels: unused
    :return: ``(model_sub, preds_sub)``
    """
    assert(y_sub.shape[1]>1)

    # NOTE: this function used to start with a try/except that called
    # ``saver.restore`` before ``saver`` was assigned. That always raised
    # UnboundLocalError, which a bare ``except`` swallowed, so the restore
    # path could never succeed. The dead attempt is removed; the message it
    # always printed is kept.
    print("Model ckpt is not found. Retrain substitute starts.")

    # Define TF model graph (for the black-box model)
    model_sub = ModelSubstitute('model_s', nb_classes, session=sess, istrain=True)
    logits = model_sub.get_logits(x)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y))
    optimiser = tf.train.AdamOptimizer().minimize(loss)
    preds_sub = tf.nn.softmax(logits=logits)
    saver = tf.train.Saver()

    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)
    sess.run(tf.global_variables_initializer())

    # Train the substitute and augment dataset alternatively
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        # Number of batches needed to cover the data once. The loop used to
        # run ``batch_size`` times, which fed empty batches for small data
        # sets and skipped samples for large ones.
        n_batches = (len(x_sub) + batch_size - 1) // batch_size
        feed_dict = None
        for s in range(n_batches):
            batch_xs = x_sub[s*batch_size: (s+1)*batch_size]
            batch_ys = y_sub[s*batch_size: (s+1)*batch_size]
            feed_dict = {x: batch_xs, y: batch_ys}
            sess.run([optimiser, loss, preds_sub], feed_dict=feed_dict)

        if feed_dict is not None:
            # Loss on the last batch, after its update.
            print("rho = {0}. loss : {1}".format(rho, sess.run(loss, feed_dict=feed_dict)))

        # If we are not at last substitute training iteration, augment dataset
        if 0:  # rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            lmbda_coef = 2 * int(int(rho / 3) != 0) - 1
            y_sub_labels = np.argmax(y_sub, axis=1).reshape(-1, 1)
            x_sub = jacobian_augmentation(sess, x, x_sub, y_sub_labels, grads,
                                          lmbda_coef * lmbda, aug_batch_size)

            # Label the newly generated synthetic points using the black-box
            new_y_sub_labels = np.vstack((y_sub_labels, y_sub_labels))
            x_sub_prev = x_sub[int(len(x_sub)/2):]
            tmp = batch_eval(sess, [x], [bbox_preds], [x_sub_prev], batch_size=batch_size)
            bbox_val = tmp[0]

            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            new_y_sub_labels[int(len(x_sub)/2):] = np.argmax(bbox_val, axis=1).reshape(-1, 1)
            y_sub = to_categorical(new_y_sub_labels, nb_classes)

    save_path = saver.save(sess, "./model.ckpt")
    print("Model saved in path: %s" % save_path)

    print(preds_sub.shape)
    print(model_sub.shape)

    return model_sub, preds_sub
def sample_estimator(num_classes, X, Y):
    """Estimate per-class feature means and a shared precision per layer.

    Features of every entry of ``model.net`` are computed for ``X`` (4-D
    outputs are reshaped to ``(n_samples, -1, channels)`` and averaged over
    axis 1), grouped by label, and summarised as class means. The precision
    matrix of each layer is fitted on the features centred by their class
    mean.

    :param num_classes: number of classes; labels index into ``range`` of it.
    :param X: numpy inputs.
    :param Y: integer label per sample.
    :return: ``(sample_class_mean, precision)``: two lists with one entry per
        layer, holding a ``(num_classes, num_features)`` array and the
        fitted ``precision_`` respectively.
    :raises AssertionError: if a layer has a non-positive feature count or an
        output that is neither 2-D nor 4-D.
    """
    num_output = len(model.net)

    # Number of features (size of the last axis) of every layer.
    feature_list = np.zeros(num_output, dtype=np.int32)
    for layer_idx, key in enumerate(model.net):
        feature_list[layer_idx] = model.net[key].shape[-1].value
    assert (feature_list > 0).all()

    num_sample_per_class = np.zeros(num_classes)  # how many samples are per class

    # per_layer_class[<layer>][<label>] collects the feature vectors of one
    # label in one layer.
    per_layer_class = [[[] for _ in range(num_classes)] for _ in range(num_output)]

    out_features = batch_eval(sess, [x], model.net.values(), [X], FLAGS.batch_size)
    for layer_idx in range(num_output):
        rank = len(out_features[layer_idx].shape)
        if rank == 4:
            channels = out_features[layer_idx].shape[-1]
            reshaped = np.asarray(out_features[layer_idx], dtype=np.float32).reshape((X.shape[0], -1, channels))
            out_features[layer_idx] = np.mean(reshaped, axis=1)
        elif rank != 2:
            raise AssertionError('Expecting size of 2 or 4 but got {} for i={}'.format(len(out_features[layer_idx].shape), layer_idx))

    # Distribute every sample's feature vector to its label's bucket.
    for sample_idx in range(X.shape[0]):
        label = Y[sample_idx]
        for layer_idx in range(num_output):
            per_layer_class[layer_idx][label].append(out_features[layer_idx][sample_idx].reshape(-1))
        num_sample_per_class[label] += 1

    # stacking everything
    for layer_idx in range(num_output):
        for label in range(num_classes):
            per_layer_class[layer_idx][label] = np.stack(per_layer_class[layer_idx][label])

    sample_class_mean = []
    for layer_idx in range(num_output):
        class_means = np.zeros((num_classes, feature_list[layer_idx]))
        for label in range(num_classes):
            class_means[label] = np.mean(per_layer_class[layer_idx][label], axis=0)
        sample_class_mean.append(class_means)

    precision = []
    group_lasso = sklearn.covariance.EmpiricalCovariance(assume_centered=False)
    for layer_idx in range(num_output):
        # Centre every class around its own mean and pool all classes.
        centred = np.concatenate(
            [per_layer_class[layer_idx][label] - sample_class_mean[layer_idx][label]
             for label in range(num_classes)],
            0)

        # find inverse
        group_lasso.fit(centred)
        precision.append(group_lasso.precision_)

    return sample_class_mean, precision