# Imports assumed by this excerpt: copy, NumPy and the Keras backend are
# used directly below; the helpers model_argmax, jacobian, saliency_map and
# apply_perturbations, as well as utils_tf and logger, are provided
# elsewhere in the surrounding codebase.
import copy

import numpy as np
from keras import backend as K


def jsma(sess, x, predictions, grads, sample, target, theta, gamma, increase,
         nb_classes, clip_min, clip_max, verbose=False):
    """
    TensorFlow implementation of the Jacobian-based saliency map approach
    (JSMA).
    :param sess: TF session
    :param x: the input placeholder
    :param predictions: the model's symbolic output (linear output,
        pre-softmax)
    :param grads: symbolic gradients
    :param sample: numpy array with sample input
    :param target: target class for sample input
    :param theta: delta for each feature adjustment
    :param gamma: a float between 0 and 1 indicating the maximum distortion
        percentage
    :param increase: boolean; true if we are increasing pixels, false
        otherwise
    :param nb_classes: integer indicating the number of classes in the model
    :param clip_min: optional parameter that can be used to set a minimum
        value for components of the example returned
    :param clip_max: optional parameter that can be used to set a maximum
        value for components of the example returned
    :param verbose: boolean; whether to print status updates or not
    :return: a tuple of (adversarial sample, success indicator (1 if the
        target class was reached, 0 otherwise), fraction of features
        perturbed)
    """

    # Copy the source sample and define the maximum number of features
    # (i.e. the maximum number of iterations) that we may perturb
    adv_x = copy.copy(sample)
    # count the number of features. For MNIST, 1x28x28 = 784; for
    # CIFAR, 3x32x32 = 3072; etc.
    nb_features = np.product(adv_x.shape[1:])
    # reshape sample for sake of standardization
    original_shape = adv_x.shape
    adv_x = np.reshape(adv_x, (1, nb_features))
    # compute maximum number of iterations
    max_iters = np.floor(nb_features * gamma / 2)
    if verbose:
        print('Maximum number of iterations: {0}'.format(max_iters))

    # Compute our initial search domain. We optimize the initial search domain
    # by removing all features that are already at their maximum values (if
    # increasing input features---otherwise, at their minimum value).
    if increase:
        search_domain = set([i for i in range(nb_features)
                             if adv_x[0, i] < clip_max])
    else:
        search_domain = set([i for i in range(nb_features)
                             if adv_x[0, i] > clip_min])

    # Initialize the loop variables
    iteration = 0
    adv_x_original_shape = np.reshape(adv_x, original_shape)
    current = model_argmax(sess, x, predictions, adv_x_original_shape,
                           feed={K.learning_phase(): 0})

    # Repeat this main loop until we have achieved misclassification
    while (current != target and iteration < max_iters and
           len(search_domain) > 1):
        # Reshape the adversarial example
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Compute the Jacobian components
        grads_target, grads_others = jacobian(sess, x, grads, target,
                                              adv_x_original_shape,
                                              nb_features, nb_classes,
                                              feed={K.learning_phase(): 0})

        # Compute the saliency map for each of our target classes
        # and return the two best candidate features for perturbation
        i, j, search_domain = saliency_map(grads_target, grads_others,
                                           search_domain, increase)

        # Apply the perturbation to the two input features selected previously
        adv_x = apply_perturbations(i, j, adv_x, increase, theta,
                                    clip_min, clip_max)

        # Update our current prediction by querying the model
        current = model_argmax(sess, x, predictions, adv_x_original_shape,
                               feed={K.learning_phase(): 0})

        # Update loop variables
        iteration += 1

        # This process may take a while, so outputting progress regularly
        if iteration % 5 == 0 and verbose:
            msg = 'Current iteration: {0} - Current Prediction: {1}'
            print(msg.format(iteration, current))

    # Compute the ratio of pixels perturbed by the algorithm
    percent_perturbed = float(iteration * 2) / nb_features

    # Report success when the adversarial example is misclassified in the
    # target class
    if current == target:
        if verbose:
            print('Successful')
        return np.reshape(adv_x, original_shape), 1, percent_perturbed
    else:
        if verbose:
            print('Unsuccessful')
        return np.reshape(adv_x, original_shape), 0, percent_perturbed
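# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not part of the original module): the
# loop above delegates the actual update step to `apply_perturbations`, which
# nudges the two selected features by `theta` and clips the result. A minimal
# NumPy version of that behavior could look like the following; the real
# helper defined elsewhere in this codebase may differ in detail.


def _apply_perturbations_sketch(i, j, adv_x, increase, theta,
                                clip_min, clip_max):
    """Hypothetical example: perturb features i and j of a (1, nb_features)
    array in place, clipping each component into [clip_min, clip_max]."""
    delta = theta if increase else -theta
    for idx in (i, j):
        adv_x[0, idx] = np.clip(adv_x[0, idx] + delta, clip_min, clip_max)
    return adv_x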
def jsma(self, sess, x, predictions, grads, sample, target, theta, gamma,
         clip_min, clip_max, feed=None):
    """
    TensorFlow implementation of the JSMA (see
    https://arxiv.org/abs/1511.07528 for details about the algorithm design
    choices).
    :param sess: TF session
    :param x: the input placeholder
    :param predictions: the model's symbolic output (the attack expects the
        probabilities, i.e., the output of the softmax, but will also work
        with logits typically)
    :param grads: symbolic gradients
    :param sample: numpy array with sample input
    :param target: target class for sample input
    :param theta: delta for each feature adjustment
    :param gamma: a float between 0 and 1 indicating the maximum distortion
        percentage
    :param clip_min: minimum value for components of the example returned
    :param clip_max: maximum value for components of the example returned
    :param feed: optional dictionary of values passed through to the model
        queries (model_argmax and jacobian)
    :return: a tuple of (adversarial sample, fraction of features perturbed,
        fraction perturbed when the model first got confused, fraction
        perturbed when the attack succeeded, original label, final
        prediction)
    """

    # Copy the source sample and define the maximum number of features
    # (i.e. the maximum number of iterations) that we may perturb
    adv_x = copy.copy(sample)
    # count the number of features. For MNIST, 1x28x28 = 784; for
    # CIFAR, 3x32x32 = 3072; etc.
    nb_features = np.product(adv_x.shape[1:])
    # reshape sample for sake of standardization
    original_shape = adv_x.shape
    adv_x = np.reshape(adv_x, (1, nb_features))
    # compute maximum number of iterations
    max_iters = np.floor(nb_features * gamma / 2)

    # Find number of classes based on grads
    nb_classes = len(grads)

    increase = bool(theta > 0)

    # Compute our initial search domain. We optimize the initial search domain
    # by removing all features that are already at their maximum values (if
    # increasing input features---otherwise, at their minimum value).
    if increase:
        search_domain = set([i for i in range(nb_features)
                             if adv_x[0, i] < clip_max])
    else:
        search_domain = set([i for i in range(nb_features)
                             if adv_x[0, i] > clip_min])

    # Initialize the loop variables
    iteration = 0
    adv_x_original_shape = np.reshape(adv_x, original_shape)
    current = utils_tf.model_argmax(sess, x, predictions,
                                    adv_x_original_shape, feed=feed)

    # charlee: Used to log when the model gets confused
    orig_label = current
    confused_at = 0
    success_at = 0
    # counters for consecutive confused/successful predictions; with the
    # current thresholds a single matching prediction is enough
    confuse_count = 0
    success_count = 0

    logger.debug("Starting JSMA attack up to {} iterations".format(max_iters))

    # Repeat this main loop until we have achieved misclassification
    while (success_at == 0 and iteration < max_iters and
           len(search_domain) > 1):
        # Reshape the adversarial example
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Compute the Jacobian components
        grads_target, grads_others = jacobian(sess, x, grads, target,
                                              adv_x_original_shape,
                                              nb_features, nb_classes,
                                              feed=feed)

        if iteration % ((max_iters + 1) // 5) == 0 and iteration > 0:
            logger.debug("Iteration {} of {}".format(iteration,
                                                     int(max_iters)))

        # Compute the saliency map for each of our target classes
        # and return the two best candidate features for perturbation
        i, j, search_domain = saliency_map(
            grads_target, grads_others, search_domain, increase)

        # Apply the perturbation to the two input features selected previously
        adv_x = apply_perturbations(
            i, j, adv_x, increase, theta, clip_min, clip_max)

        # Update our current prediction by querying the model
        current = utils_tf.model_argmax(sess, x, predictions,
                                        adv_x_original_shape, feed=feed)

        # Update loop variables
        iteration = iteration + 1

        # charlee: Record the iteration when the model gets confused
        if current != orig_label and confused_at == 0:
            confuse_count += 1
            if confuse_count >= 1:
                confused_at = iteration
        else:
            confuse_count = 0

        if current == target:
            success_count += 1
            if success_count >= 1:
                logger.info(
                    "Attack succeeded using {} iterations".format(iteration))
                success_at = iteration
        else:
            success_count = 0

    if success_at == 0:
        logger.info(("Failed to find adversarial example " +
                     "after {} iterations").format(iteration))

    # Compute the ratio of pixels perturbed by the algorithm, and express
    # the confusion/success iterations as the same kind of ratio
    percent_perturbed = float(iteration * 2) / nb_features
    confused_at = float(confused_at * 2) / nb_features
    success_at = float(success_at * 2) / nb_features

    # Return the adversarial example along with the perturbation statistics
    # and the original/final predictions
    return np.reshape(adv_x, original_shape), percent_perturbed, \
        confused_at, success_at, orig_label, current
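# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption, not part of the original module): both
# functions above rely on `saliency_map` to pick the pair of features whose
# combined gradients most increase the target class while decreasing the
# other classes (Papernot et al., https://arxiv.org/abs/1511.07528). A
# simplified NumPy version of that selection rule, for the increasing case
# only, might look like this; the real helper may differ in detail.


def _saliency_pair_sketch(grads_target, grads_others, search_domain):
    """Hypothetical example: select the best feature pair for one JSMA step.

    grads_target and grads_others are 1-D arrays of length nb_features
    holding the gradient of the target class and the summed gradients of
    the other classes with respect to each input feature. Returns the two
    chosen feature indices and the search domain with them removed.
    """
    candidates = np.array(sorted(search_domain))
    # Pairwise gradient sums over the candidate features only
    alpha = grads_target[candidates, None] + grads_target[None, candidates]
    beta = grads_others[candidates, None] + grads_others[None, candidates]
    # A valid pair must pull the input toward the target class (alpha > 0)
    # and away from the other classes (beta < 0); score valid pairs by
    # -alpha * beta and zero out everything else
    scores = np.where((alpha > 0) & (beta < 0), -alpha * beta, 0.0)
    np.fill_diagonal(scores, 0.0)  # the two features must be distinct
    k1, k2 = np.unravel_index(np.argmax(scores), scores.shape)
    p1, p2 = int(candidates[k1]), int(candidates[k2])
    # Each feature may be selected (and saturated) only once
    search_domain.discard(p1)
    search_domain.discard(p2)
    return p1, p2, search_domain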