Example #1
def margin_logit_loss(model_logits, label, nb_classes=10, num_classes=None):
    """Computes difference between logit for `label` and next highest logit.

    The loss is high when `label` is unlikely (targeted by default).
    This follows the same interface as `loss_fn` for TensorOptimizer and
    projected_optimization, i.e. it returns a batch of loss values.
    """
    if num_classes is not None:
        warnings.warn("`num_classes` is deprecated. Switch to `nb_classes`."
                      " `num_classes` may be removed on or after 2019-04-23.")
        nb_classes = num_classes
        del num_classes
    if "int" in str(label.dtype):
        logit_mask = tf.one_hot(label, depth=nb_classes, axis=-1)
    else:
        logit_mask = label
    if "int" in str(logit_mask.dtype):
        logit_mask = tf.to_float(logit_mask)
    try:
        label_logits = reduce_sum(logit_mask * model_logits, axis=-1)
    except TypeError:
        raise TypeError("Could not take row-wise dot product between "
                        "logit mask, of dtype " + str(logit_mask.dtype) +
                        " and model_logits, of dtype " +
                        str(model_logits.dtype))
    logits_with_target_label_neg_inf = model_logits - logit_mask * 99999
    highest_nonlabel_logits = reduce_max(logits_with_target_label_neg_inf,
                                         axis=-1)
    loss = highest_nonlabel_logits - label_logits
    return loss
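
A quick sanity check of the helper above, in TF 1.x graph mode. The toy logits and labels are made up for illustration, and the cleverhans `reduce_sum`/`reduce_max` wrappers used inside the function are assumed to be in scope:

import tensorflow as tf  # TF 1.x

toy_logits = tf.constant([[4.0, 1.0, 0.5],
                          [0.2, 0.1, 3.0]])   # batch of 2 examples, 3 classes
toy_labels = tf.constant([0, 1])              # integer labels are one-hot encoded internally
toy_loss = margin_logit_loss(toy_logits, toy_labels, nb_classes=3)

with tf.Session() as sess:
    # Row 0: highest non-label logit (1.0) - label logit (4.0) = -3.0
    # Row 1: highest non-label logit (3.0) - label logit (0.1) =  2.9
    print(sess.run(toy_loss))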
Example #2
 def get_or_guess_labels(self, x, kwargs):
   """
   Get the label to use in generating an adversarial example for x.
   The kwargs are fed directly from the kwargs of the attack.
   If 'y' is in kwargs, then assume it's an untargeted attack and
   use that as the label.
   If 'y_target' is in kwargs and is not None, then assume it's a
   targeted attack and use that as the label.
   Otherwise, use the model's prediction as the label and perform an
   untargeted attack.
   """
   if 'y' in kwargs and 'y_target' in kwargs:
     raise ValueError("Can not set both 'y' and 'y_target'.")
   elif 'y' in kwargs:
     labels = kwargs['y']
   elif 'y_target' in kwargs and kwargs['y_target'] is not None:
     labels = kwargs['y_target']
   else:
     preds = self.model.get_probs(x)
     preds_max = reduce_max(preds, 1, keepdims=True)
     original_predictions = tf.to_float(tf.equal(preds, preds_max))
     labels = tf.stop_gradient(original_predictions)
     del preds
   if isinstance(labels, np.ndarray):
     nb_classes = labels.shape[1]
   else:
     nb_classes = labels.get_shape().as_list()[1]
   return labels, nb_classes
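
The final `else` branch above turns the model's own probabilities into hard one-hot labels. A small illustration of that pattern with made-up values, using `tf.reduce_max` in place of the cleverhans `reduce_max` wrapper:

import tensorflow as tf  # TF 1.x

probs = tf.constant([[0.1, 0.7, 0.2],
                     [0.5, 0.3, 0.2]])
probs_max = tf.reduce_max(probs, 1, keepdims=True)              # [[0.7], [0.5]]
hard_labels = tf.stop_gradient(tf.to_float(tf.equal(probs, probs_max)))

with tf.Session() as sess:
    print(sess.run(hard_labels))   # [[0. 1. 0.]
                                   #  [1. 0. 0.]]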
Example #3
def spm(x, model, y=None, n_samples=None, dx_min=-0.1,
        dx_max=0.1, n_dxs=5, dy_min=-0.1, dy_max=0.1, n_dys=5,
        angle_min=-30, angle_max=30, n_angles=31, black_border_size=0):
  """
  TensorFlow implementation of the Spatial Transformation Method.
  :return: a tensor for the adversarial example
  """
  if y is None:
    preds = model.get_probs(x)
    # Using model predictions as ground truth to avoid label leaking
    preds_max = reduce_max(preds, 1, keepdims=True)
    y = tf.to_float(tf.equal(preds, preds_max))
    y = tf.stop_gradient(y)
    del preds
  y = y / reduce_sum(y, 1, keepdims=True)

  # Define the range of transformations
  dxs = np.linspace(dx_min, dx_max, n_dxs)
  dys = np.linspace(dy_min, dy_max, n_dys)
  angles = np.linspace(angle_min, angle_max, n_angles)

  if n_samples is None:
    import itertools
    transforms = list(itertools.product(*[dxs, dys, angles]))
  else:
    sampled_dxs = np.random.choice(dxs, n_samples)
    sampled_dys = np.random.choice(dys, n_samples)
    sampled_angles = np.random.choice(angles, n_samples)
    transforms = zip(sampled_dxs, sampled_dys, sampled_angles)
  transformed_ims = parallel_apply_transformations(
      x, transforms, black_border_size)

  def _compute_xent(x):
    preds = model.get_logits(x)
    return tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=y, logits=preds)

  all_xents = tf.map_fn(
      _compute_xent,
      transformed_ims,
      parallel_iterations=1)  # Must be 1 to avoid keras race conditions

  # Return the adv_x with worst accuracy

  # all_xents is n_total_samples x batch_size (SB)
  all_xents = tf.stack(all_xents)  # SB

  # We want the worst case sample, with the largest xent_loss
  worst_sample_idx = tf.argmax(all_xents, axis=0)  # B

  batch_size = tf.shape(x)[0]
  keys = tf.stack([
      tf.range(batch_size, dtype=tf.int32),
      tf.cast(worst_sample_idx, tf.int32)
  ], axis=1)
  transformed_ims_bshwc = tf.einsum('sbhwc->bshwc', transformed_ims)
  after_lookup = tf.gather_nd(transformed_ims_bshwc, keys)  # BHWC
  return after_lookup
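
The last few lines of `spm` select, for every batch element, the transformed image with the highest cross-entropy. The `einsum` transpose plus `gather_nd` is equivalent to the following NumPy indexing (shapes and values are illustrative only):

import numpy as np

S, B, H, W, C = 4, 2, 3, 3, 1                    # S candidate transforms, batch of B
all_xents = np.random.rand(S, B)                 # per-transform, per-example loss
transformed_ims = np.random.rand(S, B, H, W, C)

worst_sample_idx = np.argmax(all_xents, axis=0)  # (B,) index of the worst transform
adv_x = transformed_ims[worst_sample_idx, np.arange(B)]  # BHWC, same as the gather_nd
print(adv_x.shape)                               # (2, 3, 3, 1)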
Example #4
def l2_batch_normalize(x, epsilon=1e-12, scope=None):
    """
    Helper function to normalize a batch of vectors.
    :param x: the input placeholder
    :param epsilon: stabilizes division
    :return: the batch of l2 normalized vectors
    """
    with tf.name_scope(scope, "l2_batch_normalize") as scope:
        x_shape = tf.shape(x)
        x = tf.contrib.layers.flatten(x)
        x /= (epsilon + reduce_max(tf.abs(x), 1, keepdims=True))
        square_sum = reduce_sum(tf.square(x), 1, keepdims=True)
        x_inv_norm = tf.rsqrt(np.sqrt(epsilon) + square_sum)
        x_norm = tf.multiply(x, x_inv_norm)
        return tf.reshape(x_norm, x_shape, scope)
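
A minimal check (TF 1.x with `tf.contrib` available, and the cleverhans `reduce_max`/`reduce_sum` wrappers in scope for the helper itself) that the function above returns vectors with unit L2 norm, up to the epsilon term:

import numpy as np
import tensorflow as tf  # TF 1.x

inputs = tf.constant(np.random.randn(4, 8, 8, 3), dtype=tf.float32)
normalized = l2_batch_normalize(inputs)          # same shape as `inputs`
norms = tf.sqrt(tf.reduce_sum(tf.square(tf.reshape(normalized, [4, -1])), axis=1))

with tf.Session() as sess:
    print(sess.run(norms))                       # approximately [1. 1. 1. 1.]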
Example #5
def l2_batch_normalize(x, epsilon=1e-12, scope=None):
    """
  Helper function to normalize a batch of vectors.
  :param x: the input placeholder
  :param epsilon: stabilizes division
  :return: the batch of l2 normalized vector
  """
    with tf.name_scope(scope, "l2_batch_normalize") as name_scope:
        x_shape = tf.shape(x)
        x = tf.contrib.layers.flatten(x)
        x /= (epsilon + reduce_max(tf.abs(x), 1, keepdims=True))
        square_sum = reduce_sum(tf.square(x), 1, keepdims=True)
        x_inv_norm = tf.rsqrt(np.sqrt(epsilon) + square_sum)
        x_norm = tf.multiply(x, x_inv_norm)
        return tf.reshape(x_norm, x_shape, name_scope)
Example #6
def adv_image_dynamic_shape(temp_imgs, data_shape, label_shape, data_channel,
                            class_num, batch_size, sess, net):
    '''
    Create one adversarial image with sub-regions along the z-axis.
    The height and width of the input tensor are adapted to those of the input image.
    '''
    # construct graph
    [D, H, W] = temp_imgs[0].shape
    Hx = max(int((H+3)/4)*4, data_shape[1])
    Wx = max(int((W+3)/4)*4, data_shape[2])
    data_slice = data_shape[0]
    label_slice = label_shape[0]
    full_data_shape = [batch_size, data_slice, Hx, Wx, data_channel]
    x = tf.placeholder(tf.float32, full_data_shape)
    predicty = net(x, is_training=True)
    proby = tf.nn.softmax(predicty)

    preds_max = reduce_max(predicty, 1, keepdims=True)
    y = tf.to_float(tf.equal(predicty, preds_max))
    y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Create adversarial attack
    loss_func = LossFunction(n_class=class_num)
    loss = loss_func(predicty, y)
    fgsm = FastGradientMethod(net)
    adv_steps = 2
    fgsm_params = {'eps': 0.4/adv_steps, 'loss_func': loss}
    adv_x = fgsm.generate(x, **fgsm_params)

    new_data_shape = [data_slice, Hx, Wx]
    new_label_shape = [label_slice, Hx, Wx]

    print("Running adversarial attack with %d steps" % adv_steps)
    for i in range(adv_steps):
        temp_imgs = volume_probability_prediction(temp_imgs, new_data_shape, new_label_shape, data_channel,
                                              class_num, batch_size, sess, adv_x, x)
    return temp_imgs
Example #7
  def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param kwargs: See `parse_params`
    """
    # Parse and save attack-specific parameters
    assert self.parse_params(**kwargs)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if self.clip_min is not None:
      asserts.append(utils_tf.assert_greater_equal(x,
                                                   tf.cast(self.clip_min,
                                                           x.dtype)))

    if self.clip_max is not None:
      asserts.append(utils_tf.assert_less_equal(x,
                                                tf.cast(self.clip_max,
                                                        x.dtype)))

    # Initialize loop variables
    if self.rand_init:
      eta = tf.random_uniform(tf.shape(x),
                              tf.cast(-self.rand_minmax, x.dtype),
                              tf.cast(self.rand_minmax, x.dtype),
                              dtype=x.dtype)
    else:
      eta = tf.zeros(tf.shape(x))

    # Clip eta
    eta = clip_eta(eta, self.ord, self.eps)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
      adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    if self.y_target is not None:
      y = self.y_target
      targeted = True
    elif self.y is not None:
      y = self.y
      targeted = False
    else:
      model_preds = self.model.get_probs(x)
      preds_max = reduce_max(model_preds, 1, keepdims=True)
      y = tf.to_float(tf.equal(model_preds, preds_max))
      y = tf.stop_gradient(y)
      targeted = False
      del model_preds

    y_kwarg = 'y_target' if targeted else 'y'
    fgm_params = {
        'eps': self.eps_iter,
        y_kwarg: y,
        'ord': self.ord,
        'clip_min': self.clip_min,
        'clip_max': self.clip_max
    }
    if self.ord == 1:
      raise NotImplementedError("It's not clear that FGM is a good inner loop"
                                " step for PGD when ord=1, because ord=1 FGM "
                                " changes only one pixel at a time. We need "
                                " to rigorously test a strong ord=1 PGD "
                                "before enabling this feature.")

    # Use getattr() to avoid errors in eager execution attacks
    FGM = self.FGM_CLASS(
        self.model,
        sess=getattr(self, 'sess', None),
        dtypestr=self.dtypestr)

    def cond(i, _):
      return tf.less(i, self.nb_iter)

    def body(i, adv_x):
      adv_x = FGM.generate(adv_x, **fgm_params)

      # Clipping perturbation eta to self.ord norm ball
      eta = adv_x - x
      eta = clip_eta(eta, self.ord, self.eps)
      adv_x = x + eta

      # Redo the clipping.
      # FGM already did it, but subtracting and re-adding eta can add some
      # small numerical error.
      if self.clip_min is not None or self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

      return i + 1, adv_x

    _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True,
                             maximum_iterations=self.nb_iter)

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    common_dtype = tf.float64
    asserts.append(utils_tf.assert_less_equal(tf.cast(self.eps_iter,
                                                      dtype=common_dtype),
                                              tf.cast(self.eps, dtype=common_dtype)))
    if self.ord == np.inf and self.clip_min is not None:
      # The 1e-6 is needed to compensate for numerical error.
      # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
      # clip_max=.7
      asserts.append(utils_tf.assert_less_equal(tf.cast(self.eps, x.dtype),
                                                1e-6 + tf.cast(self.clip_max,
                                                               x.dtype)
                                                - tf.cast(self.clip_min,
                                                          x.dtype)))

    if self.sanity_checks:
      with tf.control_dependencies(asserts):
        adv_x = tf.identity(adv_x)

    return adv_x
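
Each loop iteration above projects the accumulated perturbation back onto the `eps` ball with `clip_eta`. The snippet below is not the library function, just a NumPy illustration of what that projection does for the two common norms:

import numpy as np

def project_eta(eta, ord_, eps):
    """Illustrative projection of `eta` onto an eps-ball (not cleverhans' clip_eta)."""
    if ord_ == np.inf:                            # clip every component to [-eps, eps]
        return np.clip(eta, -eps, eps)
    if ord_ == 2:                                 # rescale rows whose L2 norm exceeds eps
        flat = eta.reshape(eta.shape[0], -1)
        norms = np.maximum(np.linalg.norm(flat, axis=1, keepdims=True), 1e-12)
        return (flat * np.minimum(1.0, eps / norms)).reshape(eta.shape)
    raise NotImplementedError("ord must be np.inf or 2 in this sketch")

eta = np.array([[0.5, -0.9], [0.1, 0.2]])
print(project_eta(eta, np.inf, 0.3))              # [[ 0.3 -0.3]
                                                  #  [ 0.1  0.2]]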
Example #8
    def body(x_in, y_in, domain_in, i_in, cond_in):

        preds = model.get_probs(x_in)
        preds_onehot = tf.one_hot(tf.argmax(preds, axis=1), depth=nb_classes)

        # create the Jacobian graph
        list_derivatives = []
        for class_ind in xrange(nb_classes):
            derivatives = tf.gradients(preds[:, class_ind], x_in)
            list_derivatives.append(derivatives[0])
        grads = tf.reshape(tf.stack(list_derivatives),
                           shape=[nb_classes, -1, nb_features])

        # Compute the Jacobian components
        # To help with the computation later, reshape the target_class
        # and other_class to [nb_classes, -1, 1].
        # The last dimension is added to allow broadcasting later.
        target_class = tf.reshape(tf.transpose(y_in, perm=[1, 0]),
                                  shape=[nb_classes, -1, 1])
        other_classes = tf.cast(tf.not_equal(target_class, 1), tf_dtype)

        grads_target = reduce_sum(grads * target_class, axis=0)
        grads_other = reduce_sum(grads * other_classes, axis=0)

        # Remove the already-used input features from the search space
        # Subtract 2 times the maximum value from those values so that
        # they won't be picked later
        increase_coef = (4 * int(increase) - 2) \
            * tf.cast(tf.equal(domain_in, 0), tf_dtype)

        target_tmp = grads_target
        target_tmp -= increase_coef \
            * reduce_max(tf.abs(grads_target), axis=1, keepdims=True)
        target_sum = tf.reshape(target_tmp, shape=[-1, nb_features, 1]) \
            + tf.reshape(target_tmp, shape=[-1, 1, nb_features])

        other_tmp = grads_other
        other_tmp += increase_coef \
            * reduce_max(tf.abs(grads_other), axis=1, keepdims=True)
        other_sum = tf.reshape(other_tmp, shape=[-1, nb_features, 1]) \
            + tf.reshape(other_tmp, shape=[-1, 1, nb_features])

        # Create a mask to only keep features that match conditions
        if increase:
            scores_mask = ((target_sum > 0) & (other_sum < 0))
        else:
            scores_mask = ((target_sum < 0) & (other_sum > 0))

        # Create a 2D tensor of scores for each pair of candidate features
        scores = tf.cast(scores_mask, tf_dtype) \
            * (-target_sum * other_sum) * zero_diagonal

        # Extract the best two pixels
        best = tf.argmax(tf.reshape(scores,
                                    shape=[-1, nb_features * nb_features]),
                         axis=1)

        p1 = tf.mod(best, nb_features)
        p2 = tf.floordiv(best, nb_features)
        p1_one_hot = tf.one_hot(p1, depth=nb_features)
        p2_one_hot = tf.one_hot(p2, depth=nb_features)

        # Check if more modification is needed for each sample
        mod_not_done = tf.equal(reduce_sum(y_in * preds_onehot, axis=1), 0)
        cond = mod_not_done & (reduce_sum(domain_in, axis=1) >= 2)

        # Update the search domain
        cond_float = tf.reshape(tf.cast(cond, tf_dtype), shape=[-1, 1])
        to_mod = (p1_one_hot + p2_one_hot) * cond_float

        domain_out = domain_in - to_mod

        # Apply the modification to the images
        to_mod_reshape = tf.reshape(to_mod,
                                    shape=([-1] + x_in.shape[1:].as_list()))
        if increase:
            x_out = tf.minimum(clip_max, x_in + to_mod_reshape * theta)
        else:
            x_out = tf.maximum(clip_min, x_in - to_mod_reshape * theta)

        # Increase the iterator, and check if all misclassifications are done
        i_out = tf.add(i_in, 1)
        cond_out = reduce_any(cond)

        return x_out, y_in, domain_out, i_out, cond_out
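
In the body above, `best` indexes a flattened `nb_features x nb_features` score matrix, and the `mod`/`floordiv` pair recovers the two pixel indices. A tiny NumPy sketch of that decoding with made-up scores:

import numpy as np

nb_features = 4
scores = np.zeros((nb_features, nb_features))
scores[1, 3] = 5.0                       # pretend the pair (row 1, column 3) scores best

best = np.argmax(scores.reshape(-1))     # flat index 1 * nb_features + 3 = 7
p1 = best % nb_features                  # column index -> 3
p2 = best // nb_features                 # row index    -> 1
print(p1, p2)                            # 3 1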
Example #9
    def __init__(self, sess, model, beta, decision_rule, batch_size,
                 confidence, targeted, learning_rate, binary_search_steps,
                 max_iterations, abort_early, initial_const, clip_min,
                 clip_max, num_labels, shape, labels_shape):
        """
        EAD Attack

        Return a tensor that constructs adversarial examples for the given
        input. Generate uses tf.py_func in order to operate over tensors.

        :param sess: a TF session.
        :param model: a cleverhans.model.Model object.
        :param beta: Trades off L2 distortion with L1 distortion: higher
                     produces examples with lower L1 distortion, at the cost
                     of higher L2 (and typically Linf) distortion.
        :param decision_rule: EN or L1. Select the final adversarial example
                              from all successful examples based on the least
                              elastic-net or L1 distortion criterion.
        :param batch_size: Number of attacks to run simultaneously.
        :param confidence: Confidence of adversarial examples: higher produces
                           examples with larger l2 distortion, but more
                           strongly classified as adversarial.
        :param targeted: boolean controlling the behavior of the adversarial
                         examples produced. If set to False, they will be
                         misclassified in any wrong class. If set to True,
                         they will be misclassified in a chosen target class.
        :param learning_rate: The learning rate for the attack algorithm.
                              Smaller values produce better results but are
                              slower to converge.
        :param binary_search_steps: The number of times we perform binary
                                    search to find the optimal tradeoff-constant
                                    between norm of the perturbation and
                                    confidence of the classification. Set
                                    'initial_const' to a large value and fix
                                    this param to 1 for speed.
        :param max_iterations: The maximum number of iterations. Setting this
                               to a larger value will produce lower distortion
                               results. Using only a few iterations requires
                               a larger learning rate, and will produce larger
                               distortion results.
        :param abort_early: If true, allows early abort when the total loss
                            starts to increase (greatly speeds up attack, but
                            hurts performance, particularly on ImageNet).
        :param initial_const: The initial tradeoff-constant to use to tune the
                              relative importance of size of the perturbation
                              and confidence of classification. If
                              binary_search_steps is large, the initial
                              constant is not important. A smaller value of
                              this constant gives lower distortion results.
                              For computational efficiency, fix
                              binary_search_steps to 1 and set this param to
                              a large value.
        :param clip_min: (optional float) Minimum input component value.
        :param clip_max: (optional float) Maximum input component value.
        :param num_labels: the number of classes in the model's output.
        :param shape: the shape of the model's input tensor.
        """

        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.model = model
        self.decision_rule = decision_rule

        self.beta = beta
        self.beta_t = tf.cast(self.beta, tf_dtype)

        self.repeat = binary_search_steps >= 10

        self.shape = shape = tuple([batch_size] + list(shape))
        self.labels_shape = labels_shape = tuple([batch_size] + labels_shape)
        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='timg')
        self.newimg = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='newimg')
        self.slack = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='slack')
        # self.tlab = tf.Variable(np.zeros((batch_size, num_labels)), dtype=tf_dtype, name='tlab')
        self.tlab = tf.Variable(np.zeros(labels_shape), dtype=tf_dtype, name='tlab')
        self.const = tf.Variable(np.zeros(batch_size), dtype=tf_dtype, name='const')

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf_dtype, shape, name='assign_timg')
        self.assign_newimg = tf.placeholder(tf_dtype, shape, name='assign_newimg')
        self.assign_slack = tf.placeholder(tf_dtype, shape, name='assign_slack')
        # self.assign_tlab = tf.placeholder(tf_dtype, (batch_size, num_labels), name='assign_tlab')
        self.assign_tlab = tf.placeholder(tf_dtype, labels_shape, name='assign_tlab')
        self.assign_const = tf.placeholder(tf_dtype, [batch_size], name='assign_const')

        self.global_step = tf.Variable(0, trainable=False)
        self.global_step_t = tf.cast(self.global_step, tf_dtype)

        # Fast Iterative Shrinkage Thresholding
        self.zt = tf.divide(self.global_step_t, self.global_step_t + tf.cast(3, tf_dtype))
        cond1 = tf.cast(tf.greater(tf.subtract(self.slack, self.timg), self.beta_t), tf_dtype)
        cond2 = tf.cast(tf.less_equal(tf.abs(tf.subtract(self.slack, self.timg)), self.beta_t), tf_dtype)
        cond3 = tf.cast(tf.less(tf.subtract(self.slack, self.timg), tf.negative(self.beta_t)), tf_dtype)

        upper = tf.minimum(tf.subtract(self.slack, self.beta_t), tf.cast(self.clip_max, tf_dtype))
        lower = tf.maximum(tf.add(self.slack, self.beta_t), tf.cast(self.clip_min, tf_dtype))

        self.assign_newimg = tf.multiply(cond1, upper)
        self.assign_newimg += tf.multiply(cond2, self.timg)
        self.assign_newimg += tf.multiply(cond3, lower)

        self.assign_slack = self.assign_newimg
        self.assign_slack += tf.multiply(self.zt, self.assign_newimg - self.newimg)

        # --------------------------------
        self.setter = tf.assign(self.newimg, self.assign_newimg)
        self.setter_y = tf.assign(self.slack, self.assign_slack)

        # prediction BEFORE-SOFTMAX of the model
        self.output = model.get_logits(self.newimg)
        self.output_y = model.get_logits(self.slack)

        # distance to the input data
        self.l2dist = reduce_sum(tf.square(self.newimg - self.timg), list(range(1, len(shape))))
        self.l2dist_y = reduce_sum(tf.square(self.slack - self.timg), list(range(1, len(shape))))
        self.l1dist = reduce_sum(tf.abs(self.newimg - self.timg), list(range(1, len(shape))))
        self.l1dist_y = reduce_sum(tf.abs(self.slack - self.timg), list(range(1, len(shape))))
        self.elasticdist = self.l2dist + tf.multiply(self.l1dist, self.beta_t)
        self.elasticdist_y = self.l2dist_y + tf.multiply(self.l1dist_y, self.beta_t)
        if self.decision_rule == 'EN':
            self.crit = self.elasticdist
            self.crit_p = 'Elastic'
        else:
            self.crit = self.l1dist
            self.crit_p = 'L1'

        # compute the probability of the label class versus the maximum other
        real = reduce_sum((self.tlab) * self.output, 1)
        real_y = reduce_sum((self.tlab) * self.output_y, 1)
        other = reduce_max((1 - self.tlab) * self.output - (self.tlab * 10000), 1)
        other_y = reduce_max((1 - self.tlab) * self.output_y - (self.tlab * 10000), 1)

        if self.TARGETED:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(ZERO(), other - real + self.CONFIDENCE)
            loss1_y = tf.maximum(ZERO(), other_y - real_y + self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(ZERO(), real - other + self.CONFIDENCE)
            loss1_y = tf.maximum(ZERO(), real_y - other_y + self.CONFIDENCE)

        # sum up the losses
        self.loss21 = reduce_sum(self.l1dist)
        self.loss21_y = reduce_sum(self.l1dist_y)
        self.loss2 = reduce_sum(self.l2dist)
        self.loss2_y = reduce_sum(self.l2dist_y)
        self.loss1 = reduce_sum(self.const * loss1)
        self.loss1_y = reduce_sum(self.const * loss1_y)
        self.loss_opt = self.loss1_y + self.loss2_y
        self.loss = self.loss1 + self.loss2 + tf.multiply(self.beta_t, self.loss21)

        self.learning_rate = tf.train.polynomial_decay(self.LEARNING_RATE, self.global_step, self.MAX_ITERATIONS, 0,
                                                       power=0.5)

        # Setup the optimizer and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        self.train = optimizer.minimize(self.loss_opt, var_list=[self.slack], global_step=self.global_step)
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # these are the variables to initialize when we run
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))

        var_list = [self.global_step] + [self.slack] + [self.newimg] + new_vars
        self.init = tf.variables_initializer(var_list=var_list)
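
The `cond1`/`cond2`/`cond3` block above is the FISTA shrinkage-thresholding step: `slack` is pulled toward `timg` by `beta`, and snapped onto `timg` when it is already within `beta`. A NumPy sketch of that projection with illustrative values (clipping bounds assumed to be [0, 1]):

import numpy as np

def ista_shrink(slack, timg, beta, clip_min=0.0, clip_max=1.0):
    # Shrink `slack` toward `timg` by `beta`, clipped to the valid pixel range.
    diff = slack - timg
    upper = np.minimum(slack - beta, clip_max)    # used where diff >  beta
    lower = np.maximum(slack + beta, clip_min)    # used where diff < -beta
    return np.where(diff > beta, upper,
                    np.where(diff < -beta, lower, timg))

slack = np.array([0.90, 0.52, 0.10])
timg = np.array([0.50, 0.50, 0.50])
print(ista_shrink(slack, timg, beta=0.05))        # [0.85 0.5  0.15]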
Example #10
    def body(x_in, y_in, domain_in, i_in, cond_in, predictions):
        logits = model.get_logits(x_in)
        preds = tf.nn.softmax(logits)
        preds_onehot = tf.one_hot(tf.argmax(preds, axis=1), depth=nb_classes)
        tensor1 = tf.zeros((1, i_in * 10))
        tensor2 = tf.zeros((1, (max_iters - 1 - i_in) * 10))
        reshaped_preds = tf.concat([tensor1, preds, tensor2], 1)
        predictions = tf.add(predictions, reshaped_preds)

        list_derivatives = []
        for class_ind in xrange(nb_classes):
            derivatives = tf.gradients(logits[:, class_ind], x_in)
            list_derivatives.append(derivatives[0])

        if attack == "tjsma":
            grads0 = tf.reshape(tf.stack(list_derivatives),
                                shape=[nb_classes, -1, nb_features])

            grads = tf.reshape(1 - x_in, shape=[1, nb_features]) * grads0

            target_class = tf.reshape(tf.transpose(y_in, perm=[1, 0]),
                                      shape=[nb_classes, -1, 1])
            other_classes = tf.cast(tf.not_equal(target_class, 1), tf_dtype)

            grads_target = reduce_sum(grads * target_class, axis=0)

        else:
            grads = tf.reshape(tf.stack(list_derivatives),
                               shape=[nb_classes, -1, nb_features])

            target_class = tf.reshape(tf.transpose(y_in, perm=[1, 0]),
                                      shape=[nb_classes, -1, 1])
            other_classes = tf.cast(tf.not_equal(target_class, 1), tf_dtype)

            grads_target = reduce_sum(grads * target_class, axis=0)

        if attack == "tjsma" or attack == "wjsma":
            grads_other = reduce_sum(
                grads * other_classes *
                tf.reshape(preds, shape=[nb_classes, -1, 1]),
                axis=0)
        else:
            grads_other = reduce_sum(grads * other_classes, axis=0)

        increase_coef = (4 * int(increase) - 2) * tf.cast(
            tf.equal(domain_in, 0), tf_dtype)

        target_tmp = grads_target
        target_tmp -= increase_coef * reduce_max(
            tf.abs(grads_target), axis=1, keepdims=True)
        target_sum = tf.reshape(target_tmp, shape=[-1, nb_features, 1]) + \
            tf.reshape(target_tmp, shape=[-1, 1, nb_features])

        other_tmp = grads_other
        other_tmp += increase_coef * reduce_max(
            tf.abs(grads_other), axis=1, keepdims=True)
        other_sum = tf.reshape(other_tmp, shape=[-1, nb_features, 1]) + \
            tf.reshape(other_tmp, shape=[-1, 1, nb_features])

        if increase:
            scores_mask = ((target_sum > 0) & (other_sum < 0))
        else:
            scores_mask = ((target_sum < 0) & (other_sum > 0))

        scores = tf.cast(scores_mask,
                         tf_dtype) * (-target_sum * other_sum) * zero_diagonal

        best = tf.argmax(tf.reshape(scores,
                                    shape=[-1, nb_features * nb_features]),
                         axis=1)

        p1 = tf.mod(best, nb_features)
        p2 = tf.floordiv(best, nb_features)
        p1_one_hot = tf.one_hot(p1, depth=nb_features)
        p2_one_hot = tf.one_hot(p2, depth=nb_features)

        mod_not_done = tf.equal(reduce_sum(y_in * preds_onehot, axis=1), 0)
        cond = mod_not_done & (reduce_sum(domain_in, axis=1) >= 2)

        cond_float = tf.reshape(tf.cast(cond, tf_dtype), shape=[-1, 1])
        to_mod = (p1_one_hot + p2_one_hot) * cond_float

        domain_out = domain_in - to_mod

        to_mod_reshape = tf.reshape(to_mod,
                                    shape=([-1] + x_in.shape[1:].as_list()))

        if increase:
            x_out = tf.minimum(clip_max, x_in + to_mod_reshape * theta)
        else:
            x_out = tf.maximum(clip_min, x_in - to_mod_reshape * theta)

        i_out = tf.add(i_in, 1)
        cond_out = reduce_any(cond)

        return x_out, y_in, domain_out, i_out, cond_out, predictions
Example #11
def fgm(x,
        logits,
        y=None,
        eps=0.3,
        ord=np.inf,
        clip_min=None,
        clip_max=None,
        targeted=False,
        sanity_checks=True):
    """
    TensorFlow implementation of the Fast Gradient Method.
    :param x: the input placeholder
    :param logits: output of model.get_logits
    :param y: (optional) A placeholder for the true labels. If targeted
              is true, then provide the target label. Otherwise, only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics NumPy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                     default, will try to make the label incorrect. Targeted
                     will instead try to move in the direction of being more
                     like y.
    :return: a tensor for the adversarial example
    """

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Make sure the caller has not passed probs by accident
    assert logits.op.type != 'Softmax'

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = reduce_max(logits, 1, keepdims=True)
        y = tf.to_float(tf.equal(logits, preds_max))
        y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Compute loss
    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    optimal_perturbation = optimize_linear(grad, eps, ord)

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + optimal_perturbation

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
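
For `ord=np.inf`, `optimize_linear` reduces to the familiar FGSM step: move every input component by `eps` in the direction of the gradient's sign. A bare-bones NumPy version with a made-up gradient:

import numpy as np

grad = np.array([[ 0.02, -0.50, 0.00],
                 [-0.10,  0.30, 0.70]])   # pretend d(loss)/d(x) for a batch of 2
eps = 0.1
perturbation = eps * np.sign(grad)        # the ord=np.inf case of optimize_linear
print(perturbation)
# [[ 0.1 -0.1  0. ]
#  [-0.1  0.1  0.1]]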
Example #12
    def generate(self, x, **kwargs):
        """
    Generate symbolic graph for adversarial examples and return.
    :param x: The model's symbolic inputs.
    :param eps: (optional float) maximum distortion of adversarial example
                compared to original input
    :param eps_iter: (optional float) step size for each attack iteration
    :param nb_iter: (optional int) Number of attack iterations.
    :param rand_init: (optional) Whether to use random initialization
    :param y: (optional) A tensor with the true class labels
      NOTE: do not use smoothed labels here
    :param y_target: (optional) A tensor with the labels to target. Leave
                     y_target=None if y is also set. Labels should be
                     one-hot-encoded.
      NOTE: do not use smoothed labels here
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: (optional float) Minimum input component value
    :param clip_max: (optional float) Maximum input component value
    """
        # Parse and save attack-specific parameters
        assert self.parse_params(**kwargs)

        # Initialize loop variables
        if self.rand_init:
            eta = tf.random_uniform(tf.shape(x),
                                    tf.cast(-self.rand_minmax, x.dtype),
                                    tf.cast(self.rand_minmax, x.dtype),
                                    dtype=x.dtype)
        else:
            eta = tf.zeros(tf.shape(x))

        # Clip eta
        eta = clip_eta(eta, self.ord, self.eps)
        adv_x = x + eta
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        if self.y_target is not None:
            y = self.y_target
            targeted = True
        elif self.y is not None:
            y = self.y
            targeted = False
        else:
            model_preds = self.model.get_probs(x)
            preds_max = reduce_max(model_preds, 1, keepdims=True)
            y = tf.to_float(tf.equal(model_preds, preds_max))
            y = tf.stop_gradient(y)
            targeted = False
            del model_preds

        y_kwarg = 'y_target' if targeted else 'y'
        fgm_params = {
            'eps': self.eps_iter,
            y_kwarg: y,
            'ord': self.ord,
            'clip_min': self.clip_min,
            'clip_max': self.clip_max,
            'loss_func': self.loss_func
        }
        if self.ord == 1:
            raise NotImplementedError(
                "It's not clear that FGM is a good inner loop"
                " step for PGD when ord=1, because ord=1 FGM "
                " changes only one pixel at a time. We need "
                " to rigorously test a strong ord=1 PGD "
                "before enabling this feature.")

        # Use getattr() to avoid errors in eager execution attacks
        FGM = self.FGM_CLASS(self.model,
                             sess=getattr(self, 'sess', None),
                             dtypestr=self.dtypestr)

        def cond(i, _):
            return tf.less(i, self.nb_iter)

        def body(i, adv_x):
            #fgm_params['loss_func'] = self.loss_func#(labels=fgm_params['y'], logits=self.model.get_logits(adv_x))
            adv_x = FGM.generate(adv_x, **fgm_params)

            # Clipping perturbation eta to self.ord norm ball
            eta = adv_x - x
            eta = clip_eta(eta, self.ord, self.eps)
            adv_x = x + eta

            # Redo the clipping.
            # FGM already did it, but subtracting and re-adding eta can add some
            # small numerical error.
            if self.clip_min is not None or self.clip_max is not None:
                adv_x = utils_tf.clip_by_value(adv_x, self.clip_min,
                                               self.clip_max)

            return i + 1, adv_x

        _, adv_x = tf.while_loop(cond,
                                 body, [tf.zeros([]), adv_x],
                                 back_prop=True)

        asserts = []

        # Asserts run only on CPU.
        # When multi-GPU eval code tries to force all PGD ops onto GPU, this
        # can cause an error.
        with tf.device("/CPU:0"):
            asserts.append(tf.assert_less_equal(self.eps_iter, self.eps))
            if self.ord == np.inf and self.clip_min is not None:
                # The 1e-6 is needed to compensate for numerical error.
                # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
                # clip_max=.7
                asserts.append(
                    tf.assert_less_equal(self.eps,
                                         1e-6 + self.clip_max - self.clip_min))

        if self.sanity_checks:
            with tf.control_dependencies(asserts):
                adv_x = tf.identity(adv_x)

        return adv_x
Example #13
    def __init__(self, sess, model, ensemble, batch_size, confidence, targeted,
                 learning_rate, binary_search_steps, max_iterations,
                 abort_early, initial_const, clip_min, clip_max, num_labels,
                 shape):
        """
        """

        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.model = model
        self.ensemble = ensemble

        self.repeat = binary_search_steps >= 10

        self.shape = shape = tuple([batch_size] + list(shape))

        # the variable we're going to optimize over
        modifier = tf.Variable(np.zeros(shape, dtype=np_dtype))

        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='timg')
        self.tlab = tf.Variable(np.zeros((batch_size, num_labels)),
                                dtype=tf_dtype,
                                name='tlab')
        self.const = tf.Variable(np.zeros(batch_size),
                                 dtype=tf_dtype,
                                 name='const')

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf_dtype, shape, name='assign_timg')
        self.assign_tlab = tf.placeholder(tf_dtype, (batch_size, num_labels),
                                          name='assign_tlab')
        self.assign_const = tf.placeholder(tf_dtype, [batch_size],
                                           name='assign_const')

        # the resulting instance, tanh'd to keep bounded from clip_min
        # to clip_max
        self.newimg = (tf.tanh(modifier + self.timg) + 1) / 2
        self.newimg = self.newimg * (clip_max - clip_min) + clip_min

        # prediction BEFORE-SOFTMAX of the model
        self.output = model.get_logits(self.newimg)

        # distance to the input data
        self.other = (tf.tanh(self.timg) + 1) / \
            2 * (clip_max - clip_min) + clip_min
        self.l2dist = reduce_sum(tf.square(self.newimg - self.other),
                                 list(range(1, len(shape))))

        # compute the probability of the label class versus the maximum other
        real = reduce_sum((self.tlab) * self.output, 1)
        other = reduce_max((1 - self.tlab) * self.output - self.tlab * 10000,
                           1)

        if self.TARGETED:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(ZERO(), other - real + self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(ZERO(), real - other + self.CONFIDENCE)

        # sum up the losses
        self.loss2 = reduce_sum(self.l2dist)

        # ==================== Add ensemble part ==================== #
        # Get the number of small nets for each class
        self.n_nets = np.array([len(x) for x in self.ensemble])
        # Max number of small nets in one class
        n_nets_max = np.max(self.n_nets)

        # Gather all outputs from the ensemble
        all_nets = []
        for i in range(num_labels):
            class_nets = []
            for j in range(n_nets_max):
                if j < self.n_nets[i]:
                    class_nets.append(self.ensemble[i][j].get_logits(
                        self.newimg))
                else:
                    # Padding: append [0, 0] for classes that have the number
                    # of NNs less than n_nets_max
                    class_nets.append(tf.zeros([batch_size, 2]))
            all_nets.append(tf.stack(class_nets, axis=1))
        self.ensemble_out = tf.stack(all_nets, axis=1)

        # Based on output, see which set of the ensemble to use
        # Find label/class to look for in ensemble
        if self.TARGETED:
            label = tf.argmax(self.tlab, axis=1)
        else:
            # Output of original image
            self.orig_output = model.get_logits(self.other)
            label = tf.argmax(self.orig_output, axis=1)
        ind = tf.range(batch_size, dtype=tf.int64)
        ind_label = tf.stack([ind, label], axis=1)
        # Use gather_nd to do numpy slicing
        self.label_nets = tf.gather_nd(self.ensemble_out, ind_label)

        # DEBUG
        # print("self.ensemble_out: ", self.ensemble_out)
        # print("label: ", label)
        # print("ind_label: ", ind_label)
        # print("label_nets: ", self.label_nets)

        # Get the loss function for the small net part
        if self.TARGETED:
            diff = self.label_nets[:, :, 0] - self.label_nets[:, :, 1]
        else:
            diff = self.label_nets[:, :, 1] - self.label_nets[:, :, 0]
        # Find the largest difference among small nets
        max_diff = tf.reduce_max(diff, axis=1)
        # Add confidence margin and clip at zero
        ensemble_loss = tf.maximum(ZERO(), max_diff + self.CONFIDENCE)
        # The objective function only includes max(clf_loss, any_ensemble_loss)
        loss1 = tf.maximum(loss1, tf.squeeze(ensemble_loss))
        self.loss1 = reduce_sum(self.const * loss1)
        self.loss = self.loss1 + self.loss2

        # DEBUG
        # print("max_diff: ", max_diff)
        # print("ensemble_loss: ", ensemble_loss)
        # print("loss1: ", loss1)
        # print("reduce_sum loss1: ", self.loss1)

        # Setup the adam optimizer and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
        self.train = optimizer.minimize(self.loss, var_list=[modifier])
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # these are the variables to initialize when we run
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))

        self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
Example #14
    def __init__(self, sess, model, reconstructor, batch_size, confidence,
                 targeted, learning_rate, binary_search_steps, max_iterations,
                 abort_early, initial_const, clip_min, clip_max, num_labels,
                 shape):

        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.model = model
        self.reconstructor = reconstructor

        self.repeat = binary_search_steps >= 10

        self.shape = shape = tuple([batch_size] + list(shape))

        # the variable we're going to optimize over
        modifier = tf.Variable(np.zeros(shape, dtype=np_dtype))

        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='timg')
        self.tlab = tf.Variable(np.zeros((batch_size, num_labels)),
                                dtype=tf_dtype,
                                name='tlab')
        self.const = tf.Variable(np.zeros(batch_size),
                                 dtype=tf_dtype,
                                 name='const')

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf_dtype, shape, name='assign_timg')
        self.assign_tlab = tf.placeholder(tf_dtype, (batch_size, num_labels),
                                          name='assign_tlab')
        self.assign_const = tf.placeholder(tf_dtype, [batch_size],
                                           name='assign_const')

        # the resulting instance, tanh'd to keep bounded from clip_min
        # to clip_max
        self.newimg = (tf.tanh(modifier + self.timg) + 1) / 2
        self.newimg = self.newimg * (clip_max - clip_min) + clip_min

        recon_img = tf.stop_gradient(
            self.reconstructor.reconstruct(self.newimg,
                                           batch_size=batch_size)[0])
        recon_img = (tf.tanh(recon_img) + 1) / 2 * (clip_max -
                                                    clip_min) + clip_min

        # prediction BEFORE-SOFTMAX of the model
        self.output = model.get_logits(recon_img)

        # distance to the input data
        self.other = (tf.tanh(self.timg) + 1) / \
            2 * (clip_max - clip_min) + clip_min
        #self.l2dist = reduce_sum(
        #    tf.square(self.newimg - self.other), list(range(1, len(shape))))
        self.l2dist = reduce_sum(tf.square(recon_img - self.other),
                                 list(range(1, len(shape))))

        # compute the probability of the label class versus the maximum other
        real = reduce_sum((self.tlab) * self.output, 1)
        other = reduce_max((1 - self.tlab) * self.output - self.tlab * 10000,
                           1)

        if self.TARGETED:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(ZERO(), other - real + self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(ZERO(), real - other + self.CONFIDENCE)

        # sum up the losses
        self.loss2 = reduce_sum(self.l2dist)
        self.loss1 = reduce_sum(self.const * loss1)
        self.loss = self.loss1 + self.loss2

        # Setup the adam optimizer and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
        grads_and_vars = optimizer.compute_gradients(self.loss, [recon_img])
        grads_and_vars = [(grads_and_vars[0][0], modifier)]
        self.train = optimizer.apply_gradients(grads_and_vars)
        #self.train = optimizer.minimize(self.loss, var_list=[modifier])
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # these are the variables to initialize when we run
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))

        self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
Example #15
def sparse_l1_descent(x,
                      logits,
                      y=None,
                      eps=1.0,
                      q=99,
                      clip_min=None,
                      clip_max=None,
                      clip_grad=False,
                      targeted=False,
                      sanity_checks=True):
    """
  TensorFlow implementation of the Sparse L1 Descent Method.
  :param x: the input placeholder
  :param logits: output of model.get_logits
  :param y: (optional) A placeholder for the true labels. If targeted
            is true, then provide the target label. Otherwise, only provide
            this parameter if you'd like to use true labels when crafting
            adversarial samples. Otherwise, model predictions are used as
            labels to avoid the "label leaking" effect (explained in this
            paper: https://arxiv.org/abs/1611.01236). Default is None.
            Labels should be one-hot-encoded.
  :param eps: the epsilon (input variation parameter)
  :param q: the percentile above which gradient values are retained. Either a
            scalar or a vector of same length as the input batch dimension.
  :param clip_min: Minimum float value for adversarial example components
  :param clip_max: Maximum float value for adversarial example components
  :param clip_grad: (optional bool) Ignore gradient components
                    at positions where the input is already at the boundary
                    of the domain, and the update step will get clipped out.
  :param targeted: Is the attack targeted or untargeted? Untargeted, the
                   default, will try to make the label incorrect. Targeted
                   will instead try to move in the direction of being more
                   like y.
  :return: a tensor for the adversarial example
  """

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Make sure the caller has not passed probs by accident
    assert logits.op.type != 'Softmax'

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = reduce_max(logits, 1, keepdims=True)
        y = tf.to_float(tf.equal(logits, preds_max))
        y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Compute loss
    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if clip_grad:
        grad = utils_tf.zero_out_clipped_grads(grad, x, clip_min, clip_max)

    red_ind = list(range(1, len(grad.get_shape())))
    dim = tf.reduce_prod(tf.shape(x)[1:])

    abs_grad = tf.reshape(tf.abs(grad), (-1, dim))

    # if q is a scalar, broadcast it to a vector of same length as the batch dim
    q = tf.cast(tf.broadcast_to(q, tf.shape(x)[0:1]), tf.float32)
    k = tf.cast(tf.floor(q / 100 * tf.cast(dim, tf.float32)), tf.int32)

    # `tf.sort` is much faster than `tf.contrib.distributions.percentile`.
    # For TF <= 1.12, use `tf.nn.top_k` as `tf.sort` is not implemented.
    if LooseVersion(tf.__version__) <= LooseVersion('1.12.0'):
        # `tf.sort` is only available in TF 1.13 onwards
        sorted_grad = -tf.nn.top_k(-abs_grad, k=dim, sorted=True)[0]
    else:
        sorted_grad = tf.sort(abs_grad, axis=-1)

    idx = tf.stack((tf.range(tf.shape(abs_grad)[0]), k), -1)
    percentiles = tf.gather_nd(sorted_grad, idx)
    tied_for_max = tf.greater_equal(abs_grad, tf.expand_dims(percentiles, -1))
    tied_for_max = tf.reshape(tf.cast(tied_for_max, x.dtype), tf.shape(grad))
    num_ties = tf.reduce_sum(tied_for_max, red_ind, keepdims=True)

    optimal_perturbation = tf.sign(grad) * tied_for_max / num_ties

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + utils_tf.mul(eps, optimal_perturbation)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
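
The thresholding above keeps only the gradient components whose magnitude is at or above the q-th percentile and splits the `eps` budget evenly among them. A NumPy sketch of the same idea (NumPy's interpolated percentile is used here, whereas the TF code above uses an index-based cutoff):

import numpy as np

grad = np.array([[0.05, -0.80, 0.10, 0.79]])    # one example, 4 features
q = 75.0

cutoff = np.percentile(np.abs(grad), q, axis=1, keepdims=True)
tied_for_max = (np.abs(grad) >= cutoff).astype(float)
num_ties = tied_for_max.sum(axis=1, keepdims=True)
optimal_perturbation = np.sign(grad) * tied_for_max / num_ties
print(optimal_perturbation)                     # [[ 0. -1.  0.  0.]]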
Example #16
    def __init__(self, sess, model, batch_size, confidence, targeted,
                 learning_rate, binary_search_steps, max_iterations,
                 abort_early, initial_const, clip_min, clip_max, num_labels,
                 shape):
        """
    Return a tensor that constructs adversarial examples for the given
    input. Generate uses tf.py_func in order to operate over tensors.

    :param sess: a TF session.
    :param model: a cleverhans.model.Model object.
    :param batch_size: Number of attacks to run simultaneously.
    :param confidence: Confidence of adversarial examples: higher produces
                       examples with larger l2 distortion, but more
                       strongly classified as adversarial.
    :param targeted: boolean controlling the behavior of the adversarial
                     examples produced. If set to False, they will be
                     misclassified in any wrong class. If set to True,
                     they will be misclassified in a chosen target class.
    :param learning_rate: The learning rate for the attack algorithm.
                          Smaller values produce better results but are
                          slower to converge.
    :param binary_search_steps: The number of times we perform binary
                                search to find the optimal tradeoff-
                                constant between norm of the perturbation
                                and confidence of the classification.
    :param max_iterations: The maximum number of iterations. Setting this
                           to a larger value will produce lower distortion
                           results. Using only a few iterations requires
                           a larger learning rate, and will produce larger
                           distortion results.
    :param abort_early: If true, allows early aborts if gradient descent
                        is unable to make progress (i.e., gets stuck in
                        a local minimum).
    :param initial_const: The initial tradeoff-constant to use to tune the
                          relative importance of size of the perturbation
                          and confidence of classification.
                          If binary_search_steps is large, the initial
                          constant is not important. A smaller value of
                          this constant gives lower distortion results.
    :param clip_min: (optional float) Minimum input component value.
    :param clip_max: (optional float) Maximum input component value.
    :param num_labels: the number of classes in the model's output.
    :param shape: the shape of the model's input tensor.
    """

        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size
        self.clip_min = clip_min
        self.clip_max = clip_max
        self.model = model

        self.repeat = binary_search_steps >= 10

        self.shape = shape = tuple([batch_size] + list(shape))

        # the variable we're going to optimize over
        modifier = tf.Variable(np.zeros(shape, dtype=np_dtype))

        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='timg')
        self.tlab = tf.Variable(np.zeros((batch_size, num_labels)),
                                dtype=tf_dtype,
                                name='tlab')
        self.const = tf.Variable(np.zeros(batch_size),
                                 dtype=tf_dtype,
                                 name='const')

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf_dtype, shape, name='assign_timg')
        self.assign_tlab = tf.placeholder(tf_dtype, (batch_size, num_labels),
                                          name='assign_tlab')
        self.assign_const = tf.placeholder(tf_dtype, [batch_size],
                                           name='assign_const')

        # the resulting instance, tanh'd to keep bounded from clip_min
        # to clip_max
        self.newimg = (tf.tanh(modifier + self.timg) + 1) / 2
        self.newimg = self.newimg * (clip_max - clip_min) + clip_min

        # prediction BEFORE-SOFTMAX of the model
        self.output = model.get_logits(self.newimg)

        # distance to the input data
        self.other = (tf.tanh(self.timg) + 1) / \
            2 * (clip_max - clip_min) + clip_min
        self.l2dist = reduce_sum(tf.square(self.newimg - self.other),
                                 list(range(1, len(shape))))

        # compute the logit of the label class versus the maximum other logit
        real = reduce_sum((self.tlab) * self.output, 1)
        other = reduce_max((1 - self.tlab) * self.output - self.tlab * 10000,
                           1)

        if self.TARGETED:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(ZERO(), other - real + self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(ZERO(), real - other + self.CONFIDENCE)
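        # In both branches loss1 is the C&W margin: for targeted attacks
        #   f(x') = max(0, max_{i != t} Z(x')_i - Z(x')_t + kappa),
        # with the roles of `real` and `other` swapped when untargeted;
        # kappa = self.CONFIDENCE enforces a minimum logit gap.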

        # sum up the losses
        self.loss2 = reduce_sum(self.l2dist)
        self.loss1 = reduce_sum(self.const * loss1)
        self.loss = self.loss1 + self.loss2

        # Setup the adam optimizer and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
        self.train = optimizer.minimize(self.loss, var_list=[modifier])
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # these are the variables to initialize when we run
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))

        self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
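
The graph above only sets up the optimization problem; a separate driver loop still has to feed each batch, reset the perturbation variable, and run the Adam updates. The sketch below is a minimal, hypothetical driver (the `attack` handle, the `x_batch`/`y_batch` arrays, and the arctanh preprocessing convention are assumptions modeled on the original Carlini & Wagner code, not code shown above):

import numpy as np

def run_cw_batch(attack, x_batch, y_batch, const):
    # The graph keeps images in arctanh space: map [clip_min, clip_max]
    # into (-1, 1) first (the 0.999999 factor avoids arctanh(+/-1) = inf).
    scaled = (x_batch - attack.clip_min) / (attack.clip_max - attack.clip_min)
    timg = np.arctanh((scaled * 2 - 1) * 0.999999)

    # Load the batch, one-hot labels and per-example trade-off constants.
    attack.sess.run(attack.setup, feed_dict={attack.assign_timg: timg,
                                             attack.assign_tlab: y_batch,
                                             attack.assign_const: const})
    # Reset the modifier variable and the Adam slot variables.
    attack.sess.run(attack.init)

    for _ in range(attack.MAX_ITERATIONS):
        # One Adam step on loss = sum(const * hinge) + sum(l2dist).
        _, l2, scores, adv = attack.sess.run(
            [attack.train, attack.l2dist, attack.output, attack.newimg])
    return adv, l2, scores

A full driver would additionally binary-search `const` per example and keep the lowest-distortion successful adversarial example across search steps.
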
Example #17
0
def fgm(x,
        logits,
        y=None,
        eps=0.3,
        ord=np.inf,
        clip_min=None,
        clip_max=None,
        targeted=False):
    """
    TensorFlow implementation of the Fast Gradient Method.
    :param x: the input placeholder
    :param logits: output of model.get_logits
    :param y: (optional) A placeholder for the model labels. If targeted
                is true, then provide the target label. Otherwise, only
                provide this parameter if you'd like to use true labels when
                crafting adversarial samples; if it is left as None, model
                predictions are used as labels to avoid the "label leaking"
                effect (explained in this paper:
                https://arxiv.org/abs/1611.01236). Default is None.
                Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics NumPy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                    default, will try to make the label incorrect. Targeted
                    will instead try to move in the direction of being more
                    like y.
    :return: a tensor for the adversarial example
    """

    # Make sure the caller has not passed probs by accident
    assert logits.op.type != 'Softmax'

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = reduce_max(logits, 1, keepdims=True)
        y = tf.to_float(tf.equal(logits, preds_max))
        y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Compute loss
    # loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    # if targeted:
    #     loss = -loss
    # Hinge loss
    real = reduce_sum((y) * logits, 1)
    other = reduce_max((1 - y) * logits - y * 1e9, 1)
    if targeted:
        # if targeted, optimize for making the other class most likely
        loss = tf.maximum(ZERO(), other - real + 0.1)
    else:
        # if untargeted, optimize for making this class least likely.
        loss = tf.maximum(ZERO(), real - other + 0.1)
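    # The hinge margin above is a quantity the attack wants to minimize
    # (it reaches zero once the attack succeeds), but fgm moves x along the
    # positive gradient, i.e. it ascends the loss, so the sign is flipped.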
    loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if ord == np.inf:
        # Take sign of gradient
        normalized_grad = tf.sign(grad)
        # The following line should not change the numerical results.
        # It applies only because `normalized_grad` is the output of
        # a `sign` op, which has zero derivative anyway.
        # It should not be applied for the other norms, where the
        # perturbation has a non-zero derivative.
        normalized_grad = tf.stop_gradient(normalized_grad)
    elif ord == 1:
        red_ind = list(range(1, len(x.get_shape())))
        avoid_zero_div = 1e-12
        avoid_nan_norm = tf.maximum(
            avoid_zero_div,
            reduce_sum(tf.abs(grad), reduction_indices=red_ind, keepdims=True))
        normalized_grad = grad / avoid_nan_norm
    elif ord == 2:
        red_ind = list(range(1, len(x.get_shape())))
        avoid_zero_div = 1e-12
        square = tf.maximum(
            avoid_zero_div,
            reduce_sum(tf.square(grad),
                       reduction_indices=red_ind,
                       keepdims=True))
        normalized_grad = grad / tf.sqrt(square)
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    # Multiply by constant epsilon
    scaled_grad = eps * normalized_grad

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + scaled_grad

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) and (clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    return adv_x
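
A minimal, hypothetical way to wire this hinge-loss FGM variant into a graph (the placeholder shape, the `model` object with a CleverHans-style `get_logits`, and the `sess`/`x_batch` names are assumptions, not taken from the source):

import numpy as np
import tensorflow as tf

x_ph = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # illustrative shape
logits = model.get_logits(x_ph)  # assumes a cleverhans.model.Model instance

# Untargeted: labels default to the model's own predictions.
adv_x = fgm(x_ph, logits, eps=0.3, ord=np.inf, clip_min=0., clip_max=1.)

# Targeted: pass a one-hot target label and set targeted=True.
# adv_x_t = fgm(x_ph, logits, y=y_target_one_hot, eps=0.3, targeted=True)

adv_np = sess.run(adv_x, feed_dict={x_ph: x_batch})
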
Example #18
0
    def generate(self, x, **kwargs):
        """
        Generate symbolic graph for adversarial examples and return.

        :param x: The model's symbolic inputs.
        :param eps: (optional float) maximum distortion of adversarial example
                    compared to original input
        :param eps_iter: (optional float) step size for each attack iteration
        :param nb_iter: (optional int) Number of attack iterations.
        :param rand_init: (optional) Whether to use random initialization
        :param y: (optional) A tensor with the true class labels
            NOTE: do not use smoothed labels here
        :param y_target: (optional) A tensor with the labels to target. Leave
                            y_target=None if y is also set. Labels should be
                            one-hot-encoded.
            NOTE: do not use smoothed labels here
        :param ord: (optional) Order of the norm (mimics Numpy).
                    Possible values: np.inf, 1 or 2.
        :param clip_min: (optional float) Minimum input component value
        :param clip_max: (optional float) Maximum input component value
        """
        # Parse and save attack-specific parameters
        assert self.parse_params(**kwargs)

        # Initialize loop variables
        if self.rand_init:
            eta = tf.random_uniform(tf.shape(x),
                                    -self.rand_minmax,
                                    self.rand_minmax,
                                    dtype=self.tf_dtype)
        else:
            eta = tf.zeros(tf.shape(x))
        eta = clip_eta(eta, self.ord, self.eps)

        # Fix labels to the first model predictions for loss computation
        model_preds = self.model.get_output(x)
        preds_max = reduce_max(model_preds, 1, keepdims=True)
        if self.y_target is not None:
            y = self.y_target
            targeted = True
        elif self.y is not None:
            y = self.y
            targeted = False
        else:
            y = tf.to_float(tf.equal(model_preds, preds_max))
            y = tf.stop_gradient(y)
            targeted = False

        y_kwarg = 'y_target' if targeted else 'y'
        fgm_params = {
            'eps': self.eps_iter,
            y_kwarg: y,
            'ord': self.ord,
            'clip_min': self.clip_min,
            'clip_max': self.clip_max
        }

        # Use getattr() to avoid errors in eager execution attacks
        FGM = self.FGM_CLASS(self.model,
                             sess=getattr(self, 'sess', None),
                             dtypestr=self.dtypestr)

        def cond(i, _):
            return tf.less(i, self.nb_iter)

        def body(i, e):
            adv_x = FGM.generate(x + e, **fgm_params)

            # Clipping perturbation according to clip_min and clip_max
            if self.clip_min is not None and self.clip_max is not None:
                adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

            # Clipping perturbation eta to self.ord norm ball
            eta = adv_x - x
            eta = clip_eta(eta, self.ord, self.eps)
            return i + 1, eta

        _, eta = tf.while_loop(cond, body, [tf.zeros([]), eta], back_prop=True)

        # Define adversarial example (and clip if necessary)
        adv_x = x + eta
        if self.clip_min is not None or self.clip_max is not None:
            assert self.clip_min is not None and self.clip_max is not None
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        asserts = []

        # Asserts run only on CPU.
        # When multi-GPU eval code tries to force all PGD ops onto GPU, this
        # can cause an error.
        with tf.device("/CPU:0"):
            asserts.append(tf.assert_less_equal(self.eps_iter, self.eps))
            if self.ord == np.inf and self.clip_min is not None:
                # The 1e-6 is needed to compensate for numerical error.
                # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5, clip_max=.7
                asserts.append(
                    tf.assert_less_equal(self.eps,
                                         1e-6 + self.clip_max - self.clip_min))

        if self.sanity_checks:
            with tf.control_dependencies(asserts):
                adv_x = tf.identity(adv_x)

        return adv_x
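
This generate() builds a projected gradient attack: an optional random start inside the eps ball, nb_iter FGM steps of size eps_iter, and a projection of eta back onto the norm ball after every step. A hedged usage sketch follows; the class name `ProjectedGradientDescent` and all parameter values are assumptions (the method above could equally belong to a BasicIterativeMethod-style class):

attack = ProjectedGradientDescent(model, sess=sess)  # hypothetical class name
adv_x = attack.generate(x_ph,
                        eps=0.3,        # total L-inf budget
                        eps_iter=0.05,  # per-step FGM size (asserted <= eps)
                        nb_iter=10,
                        ord=np.inf,
                        rand_init=True,
                        clip_min=0.,
                        clip_max=1.)
adv_np = sess.run(adv_x, feed_dict={x_ph: x_batch})
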
Example #19
0
  def __init__(self, sess, model, batch_size, confidence, targeted,
               learning_rate, const_a_min, const_a_max, max_iterations,
               clip_min, clip_max, num_labels, shape):
    """
    Return a tensor that constructs adversarial examples for the given
    input. Generate uses tf.py_func in order to operate over tensors.

    :param sess: a TF session.
    :param model: a cleverhans.model.Model object.
    :param batch_size: Number of attacks to run simultaneously.
    :param confidence: Confidence of adversarial examples: higher produces
                       examples with larger l2 distortion, but more
                       strongly classified as adversarial.
    :param targeted: boolean controlling the behavior of the adversarial
                     examples produced. If set to False, they will be
                     misclassified in any wrong class. If set to True,
                     they will be misclassified in a chosen target class.
    :param learning_rate: The learning rate for the attack algorithm.
                          Smaller values produce better results but are
                          slower to converge.
    :param const_a_min: Lower bound for the constant a.
    :param const_a_max: Upper bound for the constant a.
    :param max_iterations: The maximum number of iterations. Setting this
                           to a larger value will produce lower distortion
                           results. Using only a few iterations requires
                           a larger learning rate, and will produce larger
                           distortion results.
    :param clip_min: (optional float) Minimum input component value.
    :param clip_max: (optional float) Maximum input component value.
    :param num_labels: the number of classes in the model's output.
    :param shape: the shape of the model's input tensor.
    """

    self.sess = sess
    self.TARGETED = targeted
    self.LEARNING_RATE = learning_rate
    self.MAX_ITERATIONS = max_iterations
    self.CONST_A_MIN = const_a_min
    self.CONST_A_MAX = const_a_max
    self.CONFIDENCE = confidence
    self.batch_size = batch_size
    self.clip_min = clip_min
    self.clip_max = clip_max
    self.model = model

    self.shape = shape = tuple([batch_size] + list(shape))

    # the variable we're going to optimize over
    modifier = tf.Variable(np.zeros(shape, dtype=np_dtype))

    # these are variables to be more efficient in sending data to tf
    self.timg = tf.Variable(np.zeros(shape), dtype=tf_dtype, name='timg')
    self.tlab = tf.Variable(
        np.zeros((batch_size, num_labels)), dtype=tf_dtype, name='tlab')
    self.const = tf.Variable(
        np.zeros(batch_size), dtype=tf_dtype, name='const')

    # and here's what we use to assign them
    self.assign_timg = tf.placeholder(tf_dtype, shape, name='assign_timg')
    self.assign_tlab = tf.placeholder(
        tf_dtype, (batch_size, num_labels), name='assign_tlab')
    self.assign_const = tf.placeholder(
        tf_dtype, [batch_size], name='assign_const')

    # the resulting instance, tanh'd to keep bounded from clip_min
    # to clip_max
    self.newimg = (tf.tanh(modifier + self.timg) + 1) / 2
    self.newimg = self.newimg * (clip_max - clip_min) + clip_min

    # prediction BEFORE-SOFTMAX of the model
    self.output = model.get_logits(self.newimg)

    # distance to the input data
    self.other = (tf.tanh(self.timg) + 1) / \
        2 * (clip_max - clip_min) + clip_min
    self.l2dist = reduce_sum(
        tf.square(self.newimg - self.other), list(range(1, len(shape))))

    # compute the logit of the label class versus the maximum other logit
    real = reduce_sum((self.tlab) * self.output, 1)
    other = reduce_max((1 - self.tlab) * self.output - self.tlab * 10000,
                       1)

    if self.TARGETED:
      # if targeted, optimize for making the other class most likely
      loss1 = tf.maximum(ZERO(), other - real + self.CONFIDENCE)
    else:
      # if untargeted, optimize for making this class least likely.
      loss1 = tf.maximum(ZERO(), real - other + self.CONFIDENCE)

    # sum up the losses
    self.loss2 = reduce_sum(self.l2dist)
    self.loss1 = reduce_sum(self.const * loss1)
    self.loss = self.loss1 + self.loss2
    
    # Set up the momentum optimizer (with inverse-time learning rate decay)
    # and keep track of the variables we're creating
    start_vars = set(x.name for x in tf.global_variables())
    batch_step = tf.Variable(99, trainable=False)
    learn_rate = tf.train.inverse_time_decay(
        learning_rate=self.LEARNING_RATE * 100,
        global_step=batch_step * batch_size,
        decay_steps=1.0, decay_rate=1.0)
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learn_rate, momentum=0.0, use_nesterov=False)
    # Passing batch_step to minimize() will increment it at each step
    self.train = optimizer.minimize(
        self.loss, var_list=[modifier], global_step=batch_step)
    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]

    # these are the variables to initialize when we run
    self.setup = []
    self.setup.append(self.timg.assign(self.assign_timg))
    self.setup.append(self.tlab.assign(self.assign_tlab))
    self.setup.append(self.const.assign(self.assign_const))

    self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
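
For reference, with decay_steps=1.0 and decay_rate=1.0 the schedule above reduces to lr = (LEARNING_RATE * 100) / (1 + batch_step * batch_size). The helper below is illustrative only (not part of the attack) and makes the effective step size explicit:

def effective_lr(base_lr, batch_step, batch_size,
                 decay_steps=1.0, decay_rate=1.0):
    # Same formula tf.train.inverse_time_decay applies (non-staircase mode).
    return (base_lr * 100) / (1 + decay_rate * (batch_step * batch_size) / decay_steps)

# With batch_size=1 and batch_step initialized to 99, the first update uses
# (base_lr * 100) / 100 = base_lr exactly, and later steps decay roughly as 1/t:
#   effective_lr(0.01,  99, 1)  -> 0.0100
#   effective_lr(0.01, 199, 1)  -> 0.0050
#   effective_lr(0.01, 999, 1)  -> 0.0010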