Beispiel #1
0
def jsma(sess,
         x,
         predictions,
         grads,
         sample,
         target,
         theta,
         gamma,
         increase,
         nb_classes,
         clip_min,
         clip_max,
         verbose=False):
    """
    TensorFlow implementation of the jacobian-based saliency map method (JSMA).

    Starting from a copy of ``sample``, repeatedly picks a pair of input
    features via ``saliency_map`` and perturbs them via
    ``apply_perturbations`` until the model predicts ``target``, the
    iteration budget is used up, or fewer than two candidate features remain.

    :param sess: TF session
    :param x: the input placeholder
    :param predictions: the model's symbolic output (linear output,
        pre-softmax)
    :param grads: symbolic gradients, forwarded unchanged to ``jacobian``
    :param sample: numpy array with sample input; its first axis is treated
        as the batch axis and all remaining axes are flattened into features
    :param target: target class for sample input
    :param theta: delta for each feature adjustment
    :param gamma: a float between 0 - 1 indicating the maximum distortion
        percentage
    :param increase: boolean; true if we are increasing pixels, false otherwise
    :param nb_classes: integer indicating the number of classes in the model
    :param clip_min: minimum value for components of the example returned;
        when decreasing, features already at or below it are never selected
    :param clip_max: maximum value for components of the example returned;
        when increasing, features already at or above it are never selected
    :param verbose: boolean; whether to print status updates or not
    :return: a tuple ``(adv_x, success, percent_perturbed)`` where ``adv_x``
        is the (possibly) adversarial sample in the original shape,
        ``success`` is 1 if the final prediction equals ``target`` and 0
        otherwise, and ``percent_perturbed`` is ``2 * iterations /
        nb_features``
    """

    # Work on a copy of the sample and remember its shape so the result can
    # be returned in the same layout.
    adv_x = copy.copy(sample)
    original_shape = adv_x.shape

    # Count the number of features. For MNIST, 1x28x28 = 784; for
    # CIFAR, 3x32x32 = 3072; etc. np.prod is used because the np.product
    # alias no longer exists in NumPy 2.x.
    nb_features = int(np.prod(original_shape[1:]))

    # Flatten to (1, nb_features) so features can be addressed by one index.
    adv_x = np.reshape(adv_x, (1, nb_features))

    # Two features are perturbed per iteration, hence the division by two.
    max_iters = np.floor(nb_features * gamma / 2)
    if verbose:
        print('Maximum number of iterations: {0}'.format(max_iters))

    # Compute our initial search domain. Features that are already at their
    # maximum value (if increasing; otherwise at their minimum value) are
    # left out.
    if increase:
        search_domain = {
            idx for idx in range(nb_features) if adv_x[0, idx] < clip_max}
    else:
        search_domain = {
            idx for idx in range(nb_features) if adv_x[0, idx] > clip_min}

    # Initialize the loop variables
    iteration = 0
    adv_x_original_shape = np.reshape(adv_x, original_shape)
    current = model_argmax(sess,
                           x,
                           predictions,
                           adv_x_original_shape,
                           feed={K.learning_phase(): 0})

    # Repeat this main loop until we have achieved misclassification
    while (current != target and iteration < max_iters
           and len(search_domain) > 1):
        # Reshape the adversarial example
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Compute the Jacobian components
        grads_target, grads_others = jacobian(sess,
                                              x,
                                              grads,
                                              target,
                                              adv_x_original_shape,
                                              nb_features,
                                              nb_classes,
                                              feed={K.learning_phase(): 0})

        # Compute the saliency map and get the two best candidate features
        # for perturbation, plus the updated search domain
        i, j, search_domain = saliency_map(grads_target, grads_others,
                                           search_domain, increase)

        # Apply the perturbation to the two input features selected previously
        adv_x = apply_perturbations(i, j, adv_x, increase, theta, clip_min,
                                    clip_max)

        # Re-derive the model-shaped array from the perturbed sample so the
        # prediction below never depends on apply_perturbations mutating its
        # argument in place or on np.reshape having returned a view.
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Update our current prediction by querying the model
        current = model_argmax(sess,
                               x,
                               predictions,
                               adv_x_original_shape,
                               feed={K.learning_phase(): 0})

        # Update loop variables
        iteration += 1

        # This process may take a while, so outputting progress regularly
        if iteration % 5 == 0 and verbose:
            msg = 'Current iteration: {0} - Current Prediction: {1}'
            print(msg.format(iteration, current))

    # Compute the ratio of pixels perturbed by the algorithm
    percent_perturbed = float(iteration * 2) / nb_features

    # Report success when the adversarial example is misclassified in the
    # target class
    success = 1 if current == target else 0
    if verbose:
        print('Successful' if success else 'Unsuccesful')
    return np.reshape(adv_x, original_shape), success, percent_perturbed
Beispiel #2
0
    def jsma(self, sess, x, predictions, grads, sample, target, theta, gamma, clip_min,
            clip_max, feed=None):
        """
        TensorFlow implementation of the JSMA (see https://arxiv.org/abs/1511.07528
        for details about the algorithm design choices).

        In addition to running the attack, this variant records the
        iteration at which the prediction first differs from the original
        label and the iteration at which the target class is reached.

        :param sess: TF session
        :param x: the input placeholder
        :param predictions: the model's symbolic output (the attack expects the
                    probabilities, i.e., the output of the softmax, but will
                    also work with logits typically)
        :param grads: symbolic gradients; ``len(grads)`` is used as the
            number of classes
        :param sample: numpy array with sample input; its first axis is
            treated as the batch axis and the rest are flattened into features
        :param target: target class for sample input
        :param theta: delta for each feature adjustment; features are
            increased when ``theta > 0`` and decreased otherwise
        :param gamma: a float between 0 - 1 indicating the maximum distortion
            percentage
        :param clip_min: minimum value for components of the example returned
        :param clip_max: maximum value for components of the example returned
        :param feed: optional value forwarded as ``feed`` to ``model_argmax``
            and ``jacobian``
        :return: a tuple ``(adv_x, percent_perturbed, confused_at, success_at,
            orig_label, current)``. ``adv_x`` is the perturbed sample in its
            original shape; ``percent_perturbed``, ``confused_at`` and
            ``success_at`` are each ``2 * iterations / nb_features`` for,
            respectively, the total iterations run, the iteration at which
            the prediction first left ``orig_label`` and the iteration at
            which ``target`` was predicted (0.0 when that never happened);
            ``orig_label`` is the prediction before any perturbation and
            ``current`` is the final prediction.
        """

        # Work on a copy of the sample and remember its shape so the result
        # can be returned in the same layout.
        adv_x = copy.copy(sample)
        original_shape = adv_x.shape

        # Count the number of features. For MNIST, 1x28x28 = 784; for
        # CIFAR, 3x32x32 = 3072; etc. np.prod is used because the np.product
        # alias no longer exists in NumPy 2.x.
        nb_features = int(np.prod(original_shape[1:]))

        # Flatten to (1, nb_features) so features can be addressed by one index.
        adv_x = np.reshape(adv_x, (1, nb_features))

        # Two features are perturbed per iteration, hence the division by two.
        max_iters = np.floor(nb_features * gamma / 2)

        # Progress is logged roughly five times over the run. For very small
        # budgets the interval is 0; it is guarded below so the modulo never
        # divides by zero.
        log_interval = int((max_iters + 1) // 5)

        # Find number of classes based on grads
        nb_classes = len(grads)

        increase = bool(theta > 0)

        # Compute our initial search domain. Features that are already at
        # their maximum value (if increasing; otherwise at their minimum
        # value) are left out.
        if increase:
            search_domain = {idx for idx in range(nb_features)
                             if adv_x[0, idx] < clip_max}
        else:
            search_domain = {idx for idx in range(nb_features)
                             if adv_x[0, idx] > clip_min}

        # Initialize the loop variables
        iteration = 0
        adv_x_original_shape = np.reshape(adv_x, original_shape)
        current = utils_tf.model_argmax(sess, x, predictions, adv_x_original_shape,
                                        feed=feed)

        # Used to record when the model first gets confused (prediction
        # differs from the original label) and when the target is reached.
        orig_label = current
        confused_at = 0
        success_at = 0

        # Consecutive-hit counters. Both thresholds below are currently 1,
        # so the first matching prediction is recorded immediately.
        confuse_count = 0
        success_count = 0

        logger.debug("Starting JSMA attack up to {} iterations".format(max_iters))
        # Repeat this main loop until we have achieved misclassification
        while (success_at == 0 and iteration < max_iters and
               len(search_domain) > 1):
            # Reshape the adversarial example
            adv_x_original_shape = np.reshape(adv_x, original_shape)

            # Compute the Jacobian components
            grads_target, grads_others = jacobian(sess, x, grads, target,
                                                  adv_x_original_shape,
                                                  nb_features, nb_classes,
                                                  feed=feed)

            if (log_interval > 0 and iteration > 0
                    and iteration % log_interval == 0):
                logger.debug("Iteration {} of {}".format(iteration,
                                                        int(max_iters)))

            # Compute the saliency map and get the two best candidate
            # features for perturbation, plus the updated search domain
            i, j, search_domain = saliency_map(
                grads_target, grads_others, search_domain, increase)

            # Apply the perturbation to the two input features selected previously
            adv_x = apply_perturbations(
                i, j, adv_x, increase, theta, clip_min, clip_max)

            # Re-derive the model-shaped array from the perturbed sample so
            # the prediction below never depends on apply_perturbations
            # mutating its argument in place or on np.reshape having
            # returned a view.
            adv_x_original_shape = np.reshape(adv_x, original_shape)

            # Update our current prediction by querying the model
            current = utils_tf.model_argmax(sess, x, predictions,
                                            adv_x_original_shape, feed=feed)

            # Update loop variables
            iteration = iteration + 1

            # Record the iteration at which the model first gets confused
            if current != orig_label and confused_at == 0:
                confuse_count += 1
                if confuse_count >= 1:
                    confused_at = iteration
            else:
                confuse_count = 0

            # Record the iteration at which the target class is predicted;
            # setting success_at also ends the loop.
            if current == target:
                success_count += 1
                if success_count >= 1:
                    logger.info("Attack succeeded using {} iterations".format(iteration))
                    success_at = iteration
            else:
                success_count = 0

        if success_at == 0:
            logger.info(("Failed to find adversarial example " +
                        "after {} iterations").format(iteration))

        # Convert iteration counts to the ratio of features perturbed
        percent_perturbed = float(iteration * 2) / nb_features
        confused_at = float(confused_at * 2) / nb_features
        success_at = float(success_at * 2) / nb_features

        return (np.reshape(adv_x, original_shape), percent_perturbed,
                confused_at, success_at, orig_label, current)