Example 1
def _project_perturbation(perturbation,
                          epsilon,
                          input_image,
                          clip_min=None,
                          clip_max=None):
    """Project `perturbation` onto L-infinity ball of radius `epsilon`.
    Also project into hypercube such that the resulting adversarial example
    is between clip_min and clip_max, if applicable.
    """

    if clip_min is None or clip_max is None:
        raise NotImplementedError(
            "_project_perturbation currently has clipping "
            "hard-coded in.")

    # Ensure inputs are in the correct range
    with tf.control_dependencies([
            utils_tf.assert_less_equal(input_image,
                                       tf.cast(clip_max, input_image.dtype)),
            utils_tf.assert_greater_equal(input_image,
                                          tf.cast(clip_min,
                                                  input_image.dtype)),
    ]):
        clipped_perturbation = utils_tf.clip_by_value(perturbation, -epsilon,
                                                      epsilon)
        new_image = utils_tf.clip_by_value(input_image + clipped_perturbation,
                                           clip_min, clip_max)
        return new_image - input_image
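
For reference, a minimal sketch of the same two-step projection in plain TensorFlow, without the `utils_tf` wrappers or the range asserts (the helper name `project_linf` is an illustrative choice, not part of the library):

import tensorflow as tf

def project_linf(perturbation, epsilon, input_image, clip_min, clip_max):
    # Clip the perturbation into the L-infinity ball of radius epsilon.
    clipped = tf.clip_by_value(perturbation, -epsilon, epsilon)
    # Keep the perturbed image inside the valid data range.
    new_image = tf.clip_by_value(input_image + clipped, clip_min, clip_max)
    # Return the effective perturbation after both projections.
    return new_image - input_image
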
def add_noise(x, eps=0.3, clip_min=None, clip_max=None, type='Gaussian'):
    """
        :param x: the input placeholder
        :param logits: output of model.get_logits
        :param y: (optional) A placeholder for the model labels. If targeted
        is true, then provide the target label. Otherwise, only provide
        this parameter if you'd like to use true labels when crafting
        adversarial samples. Otherwise, model predictions are used as
        labels to avoid the "label leaking" effect (explained in this
        paper: https://arxiv.org/abs/1611.01236). Default is None.
        Labels should be one-hot-encoded.
        :param eps: the epsilon (input variation parameter)
        :param ord: (optional) Order of the norm (mimics NumPy).
        Possible values: np.inf, 1 or 2.
        :param clip_min: Minimum float value for adversarial example components
        :param clip_max: Maximum float value for adversarial example components
        :param targeted: Is the attack targeted or untargeted? Untargeted, the
        default, will try to make the label incorrect. Targeted
        will instead try to move in the direction of being more
        like y.
        :return: a tensor for the adversarial example
        """

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    if type == 'Gaussian':
        perturbation = tf.random.normal(x.shape, mean=0.0, stddev=eps)
    elif type == 'Uniform':
        perturbation = tf.random.uniform(x.shape, minval=-eps, maxval=eps)
    else:
        raise ValueError("Unknown noise type: %s" % type)

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + perturbation

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    return adv_x
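
A hypothetical usage sketch for `add_noise` (the MNIST-like input shape and the [0, 1] data range are assumptions; the batch size is fixed because the function samples noise with `x.shape`, which must be fully defined):

x = tf.placeholder(tf.float32, shape=(32, 28, 28, 1))
# Gaussian noise with standard deviation 0.1, clipped back into [0, 1].
adv_gaussian = add_noise(x, eps=0.1, clip_min=0., clip_max=1., type='Gaussian')
# Uniform noise drawn from [-0.3, 0.3].
adv_uniform = add_noise(x, eps=0.3, clip_min=0., clip_max=1., type='Uniform')
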
Example 3
  def fprop(self, x, **kwargs):
    output, _ = self.callable_fn(x, reuse=tf.AUTO_REUSE, **kwargs)

    # Do some sanity checking to reduce the chance that probs are used
    # as logits accidentally or vice versa
    if self.output_layer == 'probs':
      assert output.op.type == "Softmax"
      min_prob = tf.reduce_min(output)
      max_prob = tf.reduce_max(output)
      asserts = [utils_tf.assert_greater_equal(min_prob,
                                               tf.cast(0., min_prob.dtype)),
                 utils_tf.assert_less_equal(max_prob,
                                            tf.cast(1., max_prob.dtype))]
      with tf.control_dependencies(asserts):
        output = tf.identity(output)
    elif self.output_layer == 'logits':
      assert output.op.type != 'Softmax'

    return {self.output_layer: output}
Example 4
  def get_probs(self, x, **kwargs):
    """
    :param x: A symbolic representation (Tensor) of the network input
    :return: A symbolic representation (Tensor) of the output
             probabilities (i.e., the output values produced by the softmax
             layer).
    """
    d = self.fprop(x, **kwargs)
    if self.O_PROBS in d:
      output = d[self.O_PROBS]
      min_prob = tf.reduce_min(output)
      max_prob = tf.reduce_max(output)
      asserts = [utils_tf.assert_greater_equal(min_prob,
                                               tf.cast(0., min_prob.dtype)),
                 utils_tf.assert_less_equal(max_prob,
                                            tf.cast(1., max_prob.dtype))]
      with tf.control_dependencies(asserts):
        output = tf.identity(output)
      return output
    elif self.O_LOGITS in d:
      return tf.nn.softmax(logits=d[self.O_LOGITS])
    else:
      raise ValueError('Cannot find probs or logits.')
    def generate(self, x, **kwargs):
        """
        Generate symbolic graph for adversarial examples and return.

        :param x: The model's symbolic inputs.
        :param kwargs: See `parse_params`
        """
        # Parse and save attack-specific parameters
        assert self.parse_params(**kwargs)

        asserts = []

        # If a data range was specified, check that the input was in that range
        if self.clip_min is not None:
            asserts.append(
                utils_tf.assert_greater_equal(x,
                                              tf.cast(self.clip_min, x.dtype)))

        if self.clip_max is not None:
            asserts.append(
                utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype)))

        # Initialize loop variables
        if self.rand_init:
            eta = random_lp_vector(tf.shape(x),
                                   ord=1,
                                   eps=tf.cast(self.eps, x.dtype),
                                   dtype=x.dtype)
        else:
            eta = tf.zeros(tf.shape(x))

        # Clip eta
        eta = clip_eta(eta, ord=1, eps=self.eps)
        adv_x = x + eta
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        if self.y_target is not None:
            y = self.y_target
            targeted = True
        elif self.y is not None:
            y = self.y
            targeted = False
        else:
            model_preds = self.model.get_probs(x)
            preds_max = tf.reduce_max(model_preds, 1, keepdims=True)
            y = tf.to_float(tf.equal(model_preds, preds_max))
            y = tf.stop_gradient(y)
            targeted = False
            del model_preds

        y_kwarg = 'y_target' if targeted else 'y'

        def cond(i, _):
            """Iterate until requested number of iterations is completed"""
            return tf.less(i, self.nb_iter)

        def body(i, adv_x):
            """Do a projected gradient step"""

            labels, _ = self.get_or_guess_labels(adv_x, {y_kwarg: y})
            logits = self.model.get_logits(adv_x)

            adv_x = sparse_l1_descent(adv_x,
                                      logits,
                                      y=labels,
                                      eps=self.eps_iter,
                                      q=self.grad_sparsity,
                                      clip_min=self.clip_min,
                                      clip_max=self.clip_max,
                                      clip_grad=self.clip_grad,
                                      targeted=(self.y_target is not None),
                                      sanity_checks=self.sanity_checks)

            # Clipping perturbation eta to the l1-ball
            eta = adv_x - x
            eta = clip_eta(eta, ord=1, eps=self.eps)
            adv_x = x + eta

            # Redo the clipping.
            # Subtracting and re-adding eta can add some small numerical error.
            if self.clip_min is not None or self.clip_max is not None:
                adv_x = utils_tf.clip_by_value(adv_x, self.clip_min,
                                               self.clip_max)

            return i + 1, adv_x

        _, adv_x = tf.while_loop(cond,
                                 body, (tf.zeros([]), adv_x),
                                 back_prop=True,
                                 maximum_iterations=self.nb_iter)

        # Asserts run only on CPU.
        # When multi-GPU eval code tries to force all PGD ops onto GPU, this
        # can cause an error.
        common_dtype = tf.float32
        asserts.append(
            utils_tf.assert_less_equal(
                tf.cast(self.eps_iter, dtype=common_dtype),
                tf.cast(self.eps, dtype=common_dtype)))

        if self.sanity_checks:
            with tf.control_dependencies(asserts):
                adv_x = tf.identity(adv_x)

        return adv_x
def sparse_l1_descent(x,
                      logits,
                      y=None,
                      eps=1.0,
                      q=99,
                      clip_min=None,
                      clip_max=None,
                      clip_grad=False,
                      targeted=False,
                      sanity_checks=True):
    """
  TensorFlow implementation of the Dense L1 Descent Method.
  :param x: the input placeholder
  :param logits: output of model.get_logits
  :param y: (optional) A placeholder for the true labels. If targeted
            is true, then provide the target label. Otherwise, only provide
            this parameter if you'd like to use true labels when crafting
            adversarial samples. Otherwise, model predictions are used as
            labels to avoid the "label leaking" effect (explained in this
            paper: https://arxiv.org/abs/1611.01236). Default is None.
            Labels should be one-hot-encoded.
  :param eps: the epsilon (input variation parameter)
  :param q: the percentile above which gradient values are retained. Either a
            scalar or a vector of same length as the input batch dimension.
  :param clip_min: Minimum float value for adversarial example components
  :param clip_max: Maximum float value for adversarial example components
  :param clip_grad: (optional bool) Ignore gradient components
                    at positions where the input is already at the boundary
                    of the domain, and the update step will get clipped out.
  :param targeted: Is the attack targeted or untargeted? Untargeted, the
                   default, will try to make the label incorrect. Targeted
                   will instead try to move in the direction of being more
                   like y.
  :return: a tensor for the adversarial example
  """

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Make sure the caller has not passed probs by accident
    assert logits.op.type != 'Softmax'

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = reduce_max(logits, 1, keepdims=True)
        y = tf.to_float(tf.equal(logits, preds_max))
        y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Compute loss
    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if clip_grad:
        grad = utils_tf.zero_out_clipped_grads(grad, x, clip_min, clip_max)

    red_ind = list(range(1, len(grad.get_shape())))
    dim = tf.reduce_prod(tf.shape(x)[1:])

    abs_grad = tf.reshape(tf.abs(grad), (-1, dim))

    # if q is a scalar, broadcast it to a vector of same length as the batch dim
    q = tf.cast(tf.broadcast_to(q, tf.shape(x)[0:1]), tf.float32)
    k = tf.cast(tf.floor(q / 100 * tf.cast(dim, tf.float32)), tf.int32)

    # `tf.sort` is much faster than `tf.contrib.distributions.percentile`.
    # For TF <= 1.12, use `tf.nn.top_k` as `tf.sort` is not implemented.
    if LooseVersion(tf.__version__) <= LooseVersion('1.12.0'):
        # `tf.sort` is only available in TF 1.13 onwards
        sorted_grad = -tf.nn.top_k(-abs_grad, k=dim, sorted=True)[0]
    else:
        sorted_grad = tf.sort(abs_grad, axis=-1)

    idx = tf.stack((tf.range(tf.shape(abs_grad)[0]), k), -1)
    percentiles = tf.gather_nd(sorted_grad, idx)
    tied_for_max = tf.greater_equal(abs_grad, tf.expand_dims(percentiles, -1))
    tied_for_max = tf.reshape(tf.cast(tied_for_max, x.dtype), tf.shape(grad))
    num_ties = tf.reduce_sum(tied_for_max, red_ind, keepdims=True)

    optimal_perturbation = tf.sign(grad) * tied_for_max / num_ties

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + utils_tf.mul(eps, optimal_perturbation)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
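
The key step above is keeping only the gradient components at or above the q-th percentile of their absolute values and spreading the step evenly across them. A small single-example NumPy sketch of that thresholding (illustrative only; the TF code operates on batches):

import numpy as np

grad = np.random.randn(784)                      # flattened gradient for one example
q = 99                                           # keep roughly the top 1% of components
threshold = np.percentile(np.abs(grad), q)
tied_for_max = (np.abs(grad) >= threshold).astype(grad.dtype)
# Distribute the unit-L1 step over the retained components, as in the TF code.
perturbation = np.sign(grad) * tied_for_max / tied_for_max.sum()
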
Example 7
  def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param kwargs: See `parse_params`
    """
    # Parse and save attack-specific parameters
    assert self.parse_params(**kwargs)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if self.clip_min is not None:
      asserts.append(utils_tf.assert_greater_equal(x,
                                                   tf.cast(self.clip_min,
                                                           x.dtype)))

    if self.clip_max is not None:
      asserts.append(utils_tf.assert_less_equal(x,
                                                tf.cast(self.clip_max,
                                                        x.dtype)))

    # Initialize loop variables
    if self.rand_init:
      eta = tf.random_uniform(tf.shape(x),
                              tf.cast(-self.rand_minmax, x.dtype),
                              tf.cast(self.rand_minmax, x.dtype),
                              dtype=x.dtype)
    else:
      eta = tf.zeros(tf.shape(x))

    # Clip eta
    eta = clip_eta(eta, self.ord, self.eps)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
      adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    if self.y_target is not None:
      y = self.y_target
      targeted = True
    elif self.y is not None:
      y = self.y
      targeted = False
    else:
      model_preds = self.model.get_probs(x)
      preds_max = reduce_max(model_preds, 1, keepdims=True)
      y = tf.to_float(tf.equal(model_preds, preds_max))
      y = tf.stop_gradient(y)
      targeted = False
      del model_preds

    y_kwarg = 'y_target' if targeted else 'y'
    fgm_params = {
        'eps': self.eps_iter,
        y_kwarg: y,
        'ord': self.ord,
        'clip_min': self.clip_min,
        'clip_max': self.clip_max
    }
    if self.ord == 1:
      raise NotImplementedError("It's not clear that FGM is a good inner loop"
                                " step for PGD when ord=1, because ord=1 FGM "
                                " changes only one pixel at a time. We need "
                                " to rigorously test a strong ord=1 PGD "
                                "before enabling this feature.")

    # Use getattr() to avoid errors in eager execution attacks
    FGM = self.FGM_CLASS(
        self.model,
        sess=getattr(self, 'sess', None),
        dtypestr=self.dtypestr)

    def cond(i, _):
      return tf.less(i, self.nb_iter)

    def body(i, adv_x):
      adv_x = FGM.generate(adv_x, **fgm_params)

      # Clipping perturbation eta to self.ord norm ball
      eta = adv_x - x
      eta = clip_eta(eta, self.ord, self.eps)
      adv_x = x + eta

      # Redo the clipping.
      # FGM already did it, but subtracting and re-adding eta can add some
      # small numerical error.
      if self.clip_min is not None or self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

      return i + 1, adv_x

    _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True,
                             maximum_iterations=self.nb_iter)

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    common_dtype = tf.float64
    asserts.append(utils_tf.assert_less_equal(tf.cast(self.eps_iter,
                                                      dtype=common_dtype),
                                              tf.cast(self.eps, dtype=common_dtype)))
    if self.ord == np.inf and self.clip_min is not None:
      # The 1e-6 is needed to compensate for numerical error.
      # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
      # clip_max=.7
      asserts.append(utils_tf.assert_less_equal(tf.cast(self.eps, x.dtype),
                                                1e-6 + tf.cast(self.clip_max,
                                                               x.dtype)
                                                - tf.cast(self.clip_min,
                                                          x.dtype)))

    if self.sanity_checks:
      with tf.control_dependencies(asserts):
        adv_x = tf.identity(adv_x)

    return adv_x
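
The `generate` method above builds the projected-gradient loop with `tf.while_loop`; an equivalent unrolled sketch in plain TensorFlow 1.x graph code (the helper name and the `loss_fn` interface are assumptions made for illustration) may make the projection logic easier to follow:

def pgd_linf_sketch(x, loss_fn, eps=0.3, eps_iter=0.05, nb_iter=10,
                    clip_min=0., clip_max=1.):
    # Unrolled L-infinity PGD. `loss_fn` maps a batch of inputs to a batch of
    # per-example losses that the attack tries to increase.
    adv_x = x
    for _ in range(nb_iter):
        grad, = tf.gradients(tf.reduce_sum(loss_fn(adv_x)), adv_x)
        adv_x = adv_x + eps_iter * tf.sign(grad)              # FGM step
        adv_x = x + tf.clip_by_value(adv_x - x, -eps, eps)    # project onto the eps-ball
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)   # stay in the data range
    return tf.stop_gradient(adv_x)
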
Example 8
def projected_optimization(loss_fn,
                           input_image,
                           label,
                           epsilon,
                           num_steps,
                           clip_min=None,
                           clip_max=None,
                           optimizer=TensorAdam(),
                           project_perturbation=_project_perturbation,
                           early_stop_loss_threshold=None,
                           is_debug=False):
  """Generic projected optimization, generalized to work with approximate
  gradients. Used for e.g. the SPSA attack.

  Args:
    :param loss_fn: A callable which takes `input_image` and `label` as
                    arguments, and returns a batch of loss values. Same
                    interface as TensorOptimizer.
    :param input_image: Tensor, a batch of images
    :param label: Tensor, a batch of labels
    :param epsilon: float, the L-infinity norm of the maximum allowable
                    perturbation
    :param num_steps: int, the number of steps of gradient descent
    :param clip_min: float, minimum pixel value
    :param clip_max: float, maximum pixel value
    :param optimizer: A `TensorOptimizer` object
    :param project_perturbation: A function, which will be used to enforce
                                 some constraint. It should have the same
                                 signature as `_project_perturbation`.
    :param early_stop_loss_threshold: A float or None. If specified, the
        attack will end if the loss is below `early_stop_loss_threshold`.
        Enabling this option can have several different effects:
          - Setting the threshold to 0. guarantees that if a successful
            attack is found, it is returned. This increases the attack
            success rate, because without early stopping the optimizer can
            accidentally bounce back to a point where the attack fails.
          - Early stopping can make the attack run faster because it may run
            for fewer steps.
          - Early stopping can make the attack run slower because the loss
            must be calculated at each step. The loss is not calculated as
            part of the normal SPSA optimization procedure. For most
            reasonable choices of hyperparameters, early stopping makes the
            attack much faster because it decreases the number of steps
            dramatically.
    :param is_debug: A bool. If True, print debug info for attack progress.

  Returns:
    adversarial version of `input_image`, with L-infinity difference less than
      epsilon, which tries to minimize loss_fn.

  Note that this function is not intended as an Attack by itself. Rather, it
  is designed as a helper function which you can use to write your own attack
  methods. The method uses a tf.while_loop to optimize a loss function in
  a single sess.run() call.
  """
  assert num_steps is not None
  if is_debug:
    with tf.device("/cpu:0"):
      input_image = tf.Print(
          input_image, [],
          "Starting PGD attack with epsilon: %s" % epsilon)

  init_perturbation = tf.random_uniform(
      tf.shape(input_image),
      minval=tf.cast(-epsilon, input_image.dtype),
      maxval=tf.cast(epsilon, input_image.dtype),
      dtype=input_image.dtype)
  init_perturbation = project_perturbation(init_perturbation, epsilon,
                                           input_image, clip_min=clip_min,
                                           clip_max=clip_max)
  init_optim_state = optimizer.init_state([init_perturbation])
  nest = tf.contrib.framework.nest

  def loop_body(i, perturbation, flat_optim_state):
    """Update perturbation to input image."""
    optim_state = nest.pack_sequence_as(
        structure=init_optim_state, flat_sequence=flat_optim_state)

    def wrapped_loss_fn(x):
      return loss_fn(input_image + x, label)

    new_perturbation_list, new_optim_state = optimizer.minimize(
        wrapped_loss_fn, [perturbation], optim_state)
    projected_perturbation = project_perturbation(new_perturbation_list[0],
                                                  epsilon, input_image,
                                                  clip_min=clip_min,
                                                  clip_max=clip_max)

    # Be careful with this bool. A value of 0. is a valid threshold but evaluates to False, so we must explicitly
    # check whether the value is None.
    early_stop = early_stop_loss_threshold is not None
    compute_loss = is_debug or early_stop
    # Don't waste time building the loss graph if we're not going to use it
    if compute_loss:
      # NOTE: this step is not actually redundant with the optimizer step.
      # SPSA calculates the loss at randomly perturbed points but doesn't calculate the loss at the current point.
      loss = reduce_mean(wrapped_loss_fn(projected_perturbation), axis=0)

      if is_debug:
        with tf.device("/cpu:0"):
          loss = tf.Print(loss, [loss], "Total batch loss")

      if early_stop:
        i = tf.cond(tf.less(loss, early_stop_loss_threshold), lambda: float(num_steps), lambda: i)

    return i + 1, projected_perturbation, nest.flatten(new_optim_state)

  def cond(i, *_):
    return tf.less(i, num_steps)

  flat_init_optim_state = nest.flatten(init_optim_state)
  _, final_perturbation, _ = tf.while_loop(
      cond,
      loop_body,
      loop_vars=(tf.constant(0.), init_perturbation, flat_init_optim_state),
      parallel_iterations=1,
      back_prop=False,
      maximum_iterations=num_steps)
  if project_perturbation is _project_perturbation:
    # TODO: this assert looks totally wrong.
    # Not bothering to fix it now because it's only an assert.
    # 1) Multiplying by 1.1 gives a huge margin of error. This should probably
    #    take the difference and allow a tolerance of 1e-6 or something like
    #    that.
    # 2) I think it should probably check the *absolute value* of
    # final_perturbation
    perturbation_max = epsilon * 1.1
    check_diff = utils_tf.assert_less_equal(
        final_perturbation,
        tf.cast(perturbation_max, final_perturbation.dtype),
        message="final_perturbation must change no pixel by more than "
                "%s" % perturbation_max)
  else:
    # TODO: let caller pass in a check_diff function as well as
    # project_perturbation
    check_diff = tf.no_op()

  if clip_min is None or clip_max is None:
    raise NotImplementedError("This function only supports clipping for now")
  check_range = [utils_tf.assert_less_equal(input_image,
                                            tf.cast(clip_max,
                                                    input_image.dtype)),
                 utils_tf.assert_greater_equal(input_image,
                                               tf.cast(clip_min,
                                                       input_image.dtype))]

  with tf.control_dependencies([check_diff] + check_range):
    adversarial_image = input_image + final_perturbation
  return tf.stop_gradient(adversarial_image)
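
Because `projected_optimization` minimizes `loss_fn`, a targeted attack can be expressed with a loss such as the following sketch (the factory name and `model_logits_fn` are assumptions; any callable with the documented `(input_image, label)` interface works):

def make_targeted_loss_fn(model_logits_fn):
    # `model_logits_fn` maps a batch of images to logits.
    def loss_fn(images, labels):
        # Minimizing this cross-entropy drives the predictions toward the
        # one-hot target `labels`.
        return tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=labels, logits=model_logits_fn(images))
    return loss_fn
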
Example 9
    def generate(self, x, **kwargs):
        """
        Generate symbolic graph for adversarial examples and return.

        :param x: The model's symbolic inputs.
        :param kwargs: Keyword arguments. See `parse_params` for documentation.
        """
        # Parse and save attack-specific parameters
        assert self.parse_params(**kwargs)

        asserts = []

        # If a data range was specified, check that the input was in that range
        if self.clip_min is not None:
            asserts.append(
                utils_tf.assert_greater_equal(x,
                                              tf.cast(self.clip_min, x.dtype)))

        if self.clip_max is not None:
            asserts.append(
                utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype)))

        # Initialize loop variables
        momentum = tf.zeros_like(x)
        adv_x = x

        # Fix labels to the first model predictions for loss computation
        y, _nb_classes = self.get_or_guess_labels(x, kwargs)
        y = y / reduce_sum(y, 1, keepdims=True)
        targeted = (self.y_target is not None)

        def cond(i, _, __):
            """Iterate until number of iterations completed"""
            return tf.less(i, self.nb_iter)

        def body(i, ax, m):
            """Do a momentum step"""
            logits = self.model.get_logits(
                input_diversity(ax, self.prob, self.resc))
            loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
            if targeted:
                loss = -loss

            # Define gradient of loss wrt input
            grad, = tf.gradients(loss, ax)

            # Normalize current gradient and add it to the accumulated gradient
            red_ind = list(range(1, len(grad.get_shape())))
            avoid_zero_div = tf.cast(1e-12, grad.dtype)
            grad = grad / tf.maximum(
                avoid_zero_div,
                reduce_mean(tf.abs(grad), red_ind, keepdims=True))
            m = self.decay_factor * m + grad

            optimal_perturbation = optimize_linear(m, self.eps_iter, self.ord)
            if self.ord == 1:
                raise NotImplementedError(
                    "This attack hasn't been tested for ord=1. "
                    "It's not clear that FGM makes a good inner "
                    "loop step for iterative optimization since "
                    "it updates just one coordinate at a time.")

            # Update and clip adversarial example in current iteration
            ax = ax + optimal_perturbation
            ax = x + utils_tf.clip_eta(ax - x, self.ord, self.eps)

            if self.clip_min is not None and self.clip_max is not None:
                ax = utils_tf.clip_by_value(ax, self.clip_min, self.clip_max)

            ax = tf.stop_gradient(ax)

            return i + 1, ax, m

        _, adv_x, _ = tf.while_loop(cond,
                                    body, (tf.zeros([]), adv_x, momentum),
                                    back_prop=True,
                                    maximum_iterations=self.nb_iter)

        if self.sanity_checks:
            with tf.control_dependencies(asserts):
                adv_x = tf.identity(adv_x)

        return adv_x
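
A minimal sketch of the momentum update performed inside `body()` above, isolated as a standalone helper (the name `momentum_step` and the default hyperparameters are assumptions; the signed step corresponds to `optimize_linear` with ord=np.inf):

def momentum_step(grad, m, decay_factor=1.0, eps_iter=0.06):
    # Normalize the raw gradient by its per-example mean absolute value,
    # accumulate it into the momentum buffer, then take a signed step.
    red_ind = list(range(1, len(grad.get_shape())))
    avoid_zero_div = tf.cast(1e-12, grad.dtype)
    grad = grad / tf.maximum(avoid_zero_div,
                             tf.reduce_mean(tf.abs(grad), red_ind, keepdims=True))
    m = decay_factor * m + grad
    return eps_iter * tf.sign(m), m
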
Example 10
def fgm(x,
        logits,
        y=None,
        eps=0.3,
        ord=np.inf,
        clip_min=None,
        clip_max=None,
        targeted=False,
        sanity_checks=True):
    """
    TensorFlow implementation of the Fast Gradient Method.
    :param x: the input placeholder
    :param logits: output of model.get_logits
    :param y: (optional) A placeholder for the true labels. If targeted
              is true, then provide the target label. Otherwise, only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics NumPy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                     default, will try to make the label incorrect. Targeted
                     will instead try to move in the direction of being more
                     like y.
    :return: a tensor for the adversarial example
    """

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Make sure the caller has not passed probs by accident
    assert logits.op.type != 'Softmax'

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = reduce_max(logits, 1, keepdims=True)
        y = tf.to_float(tf.equal(logits, preds_max))
        y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Compute loss
    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    optimal_perturbation = optimize_linear(grad, eps, ord)

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + optimal_perturbation

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
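
A hypothetical end-to-end usage of `fgm` (the tiny linear "model" and the [0, 1] data range are assumptions made only to keep the sketch self-contained):

import numpy as np
import tensorflow as tf

x = tf.placeholder(tf.float32, shape=(None, 784))
w = tf.Variable(tf.random.normal([784, 10]))
logits = tf.matmul(x, w)          # stands in for model.get_logits(x)
adv_x = fgm(x, logits, eps=0.3, ord=np.inf, clip_min=0., clip_max=1.)
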
Example 11
  def generate(self, x, **kwargs):
    assert self.parse_params(**kwargs)

    asserts = []

    if self.clip_min is not None:
      asserts.append(utils_tf.assert_greater_equal(
        x, tf.cast(self.clip_min,x.dtype)))

    if self.clip_max is not None:
      asserts.append(utils_tf.assert_less_equal(
        x, tf.cast(self.clip_max, x.dtype)))

    m_cache = tf.zeros_like(x)
    v_cache = tf.zeros_like(x)
    adv_x = x

    y, _nb_classes = self.get_or_guess_labels(x, kwargs)
    y = y / reduce_sum(y, 1, keepdims=True)
    targeted = (self.y_target is not None)

    def save_batch(directory, images, labels, iteration, batch_idx):
      for idx, (image, label) in enumerate(zip(images, labels)):
        filename = "id{}_b{}_it{}_l{}.png".format(idx, batch_idx,
                                                  iteration, np.argmax(label))
        save_image_np(join(directory, filename), image)

    for i in range(self.nb_iter):
      self.logger.debug("Starting #{} iteration".format(i + 1))

      logits = self.model.get_logits(adv_x)
      loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
      if targeted:
        loss = -loss

      grad, = tf.gradients(loss, adv_x)

      red_ind = list(range(1, len(grad.get_shape())))
      avoid_zero_div = tf.cast(1e-8, grad.dtype)
      grad = grad / tf.maximum(
        avoid_zero_div,
        reduce_mean(tf.abs(grad), red_ind, keepdims=True))

      m_cache = self.betha1 * m_cache + (1 - self.betha1) * grad
      v_cache = self.betha2 * v_cache + (1 - self.betha2) * tf.square(grad)
      update = tf.divide(m_cache, tf.sqrt(v_cache + avoid_zero_div))

      optimal_perturbation = optimize_linear(update, self.eps_iter, self.ord)
      if self.ord == 1:
        raise NotImplementedError("This attack hasn't been tested for ord=1."
                                  "It's not clear that FGM makes a good inner "
                                  "loop step for iterative optimization since "
                                  "it updates just one coordinate at a time.")

      adv_x = adv_x + optimal_perturbation
      adv_x = x + utils_tf.clip_eta(adv_x - x, self.ord, self.eps)

      if self.clip_min is not None and self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

      adv_x = tf.stop_gradient(adv_x)

      if self.sanity_checks:
        with tf.control_dependencies(asserts):
          adv_x = tf.identity(adv_x)

      with self.sess.as_default():
        self.sess.run(self.init_op)
        for batch in range(self.nb_batches):
          adv_x_np, y_np = self.sess.run([adv_x, y])
          self.logger.debug("Saving attacked batch #{}".format(batch + 1))
          save_batch(self.adv_dir, adv_x_np, y_np, i, batch)
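
A minimal sketch of the Adam-style moment update used in the loop above (the helper name and the beta defaults are assumptions; unlike textbook Adam, no bias correction is applied, mirroring the snippet):

def adam_style_update(grad, m_cache, v_cache, beta1=0.9, beta2=0.999, eps=1e-8):
    # First and second moment estimates of the gradient.
    m_cache = beta1 * m_cache + (1 - beta1) * grad
    v_cache = beta2 * v_cache + (1 - beta2) * tf.square(grad)
    # Scale-invariant update direction used in place of the raw gradient.
    update = m_cache / tf.sqrt(v_cache + eps)
    return update, m_cache, v_cache
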
    def generate(self, x, **kwargs):
        """
        Generate symbolic graph for adversarial examples and return.

        :param x: The model's symbolic inputs.
        :param kwargs: See `parse_params`
        """
        # Parse and save attack-specific parameters
        assert self.parse_params(**kwargs)

        asserts = []

        # If a data range was specified, check that the input was in that range
        if self.clip_min is not None:
            asserts.append(
                utils_tf.assert_greater_equal(x,
                                              tf.cast(self.clip_min, x.dtype)))

        if self.clip_max is not None:
            asserts.append(
                utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype)))

        # Initialize loop variables
        if self.rand_init:
            eta = random_lp_vector(
                tf.shape(x),
                self.ord,
                tf.cast(self.rand_init_eps, x.dtype),
                dtype=x.dtype,
            )
        else:
            eta = tf.zeros(tf.shape(x))

        # Clip eta
        eta = clip_eta(eta, self.ord, self.eps)
        adv_x = x + eta
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        if self.y_target is not None:
            y = self.y_target
            targeted = True
        elif self.y is not None:
            y = self.y
            targeted = False
        else:
            model_preds = self.model.get_probs(x)
            preds_max = tf.reduce_max(model_preds, 1, keepdims=True)
            y = tf.to_float(tf.equal(model_preds, preds_max))
            y = tf.stop_gradient(y)
            targeted = False
            del model_preds

        y_kwarg = "y_target" if targeted else "y"

        fgm_params = {
            "eps": self.eps_iter,
            y_kwarg: y,
            "ord": self.ord,
            "loss_fn": self.loss_fn,
            "clip_min": self.clip_min,
            "clip_max": self.clip_max,
            "clip_grad": self.clip_grad,
        }
        if self.ord == 1:
            raise NotImplementedError(
                "FGM is not a good inner loop step for PGD "
                "when ord=1, because ord=1 FGM changes only "
                "one pixel at a time. Use the SparseL1Descent "
                "attack instead, which allows fine-grained "
                "control over the sparsity of the gradient "
                "updates.")

        # Use getattr() to avoid errors in eager execution attacks
        FGM = self.FGM_CLASS(self.model,
                             sess=getattr(self, "sess", None),
                             dtypestr=self.dtypestr)

        def cond(i, _):
            """Iterate until requested number of iterations is completed"""
            return tf.less(i, self.nb_iter)

        def body(i, adv_x):
            """Do a projected gradient step"""
            adv_x = FGM.generate(adv_x, **fgm_params)

            # Clipping perturbation eta to self.ord norm ball
            eta = adv_x - x
            eta = clip_eta(eta, self.ord, self.eps)
            adv_x = x + eta

            # Redo the clipping.
            # FGM already did it, but subtracting and re-adding eta can add some
            # small numerical error.
            if self.clip_min is not None or self.clip_max is not None:
                adv_x = utils_tf.clip_by_value(adv_x, self.clip_min,
                                               self.clip_max)

            return i + 1, adv_x

        _, adv_x = tf.while_loop(
            cond,
            body,
            (tf.zeros([]), adv_x),
            back_prop=True,
            maximum_iterations=self.nb_iter,
        )

        # Asserts run only on CPU.
        # When multi-GPU eval code tries to force all PGD ops onto GPU, this
        # can cause an error.
        common_dtype = tf.float32
        asserts.append(
            utils_tf.assert_less_equal(
                tf.cast(self.eps_iter, dtype=common_dtype),
                tf.cast(self.eps, dtype=common_dtype),
            ))
        if self.ord == np.inf and self.clip_min is not None:
            # The 1e-6 is needed to compensate for numerical error.
            # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
            # clip_max=.7
            asserts.append(
                utils_tf.assert_less_equal(
                    tf.cast(self.eps, x.dtype),
                    1e-6 + tf.cast(self.clip_max, x.dtype) -
                    tf.cast(self.clip_min, x.dtype),
                ))

        if self.sanity_checks:
            with tf.control_dependencies(asserts):
                adv_x = tf.identity(adv_x)

        return adv_x