def testProjectToBallLInfTensorList(self):
  """Checks L-infinity projection of a list of tensors with radius 2.5."""
  radius = 2.5
  first = tf.constant([[1.0, -3.0], [-2.0, 4.0]])
  second = tf.constant([[0.0], [-5.0]])
  projection = utils.project_to_ball([first, second], radius,
                                     configs.NormType.INFINITY)
  actual = self.evaluate(projection)
  # Entries with magnitude above 2.5 are expected at +/-2.5; the rest are
  # expected unchanged.
  want = [[[1.0, -2.5], [-2.0, 2.5]], [[0.0], [-2.5]]]
  self.assertAllClose(actual, want)
def testProjectToBallL2SingleTensor(self):
  """Checks L2 projection of a single tensor onto the unit ball."""
  radius = 1.0
  rows = tf.constant([[3.0, -4.0], [-0.7, 2.4]])
  projection = utils.project_to_ball(rows, radius, configs.NormType.L2)
  actual = self.evaluate(projection)
  # Row norms are 5.0 and 2.5, so the expected scale factors are 0.2 and 0.4.
  want = [[0.6, -0.8], [-0.28, 0.96]]
  self.assertAllClose(actual, want)
def testProjectToBallL2MultipleFeatures(self, radius, factor1, factor2):
  """Checks L2 projection of a two-feature dict against per-sample factors.

  Args:
    radius: Radius passed to `utils.project_to_ball`.
    factor1: Expected scale factor for the first sample of every feature.
    factor2: Expected scale factor for the second sample of every feature.
  """
  # Squared entries of f1 add up to 25 + 9 + 49 + 169 = 252 for the first
  # sample and to 1 for the second sample.
  f1 = tf.constant([[[[0.0, 3.0, -4.0], [1.0, 2.0, -2.0]],
                     [[2.0, 3.0, 6.0], [3.0, 4.0, 12.0]]],
                    [[[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]],
                     [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]]])
  # Squared entries of f2 add up to 25 + 144 = 169 for the first sample and to
  # 0 for the second sample.
  f2 = tf.constant([[[3.0, 4.0], [12.0, 0.0]], [[0.0, 0.0], [0.0, 0.0]]])
  input_dict = {'f1': f1, 'f2': f2}
  actual = self.evaluate(
      utils.project_to_ball(input_dict, radius, configs.NormType.L2))

  # The same per-sample factor is expected for both features.
  sample_factors = (factor1, factor2)
  expected = {
      name: [tensor[idx] * factor for idx, factor in enumerate(sample_factors)]
      for name, tensor in input_dict.items()
  }
  for name in ('f1', 'f2'):
    for idx, want in enumerate(expected[name]):
      self.assertAllEqual(actual[name][idx], want)
def testProjectToBallL2TensorList(self):
  """Checks L2 projection of a list of tensors with radius 0.9."""
  radius = 0.9
  first = tf.constant([[1.0, -2.0], [-8.0, 1.0]])
  second = tf.constant([[2.0], [-4.0]])
  projection = utils.project_to_ball([first, second], radius,
                                     configs.NormType.L2)
  actual = self.evaluate(projection)
  # Per-sample norms taken over both tensors are sqrt(1 + 4 + 4) = 3.0 and
  # sqrt(64 + 1 + 16) = 9.0, so the expected scale factors are 0.3 and 0.1.
  want = [[[0.3, -0.6], [-0.8, 0.1]], [[0.6], [-0.4]]]
  self.assertAllClose(actual, want)
def testProjectToBallL2(self, eps, first_factor, second_factor):
  """Checks L2 projection of a single-feature dict against row factors.

  Args:
    eps: Radius passed to `utils.project_to_ball`.
    first_factor: Expected scale factor for the first row of 'f1'.
    second_factor: Expected scale factor for the second row of 'f1'.
  """
  # Row L2 norms of the input are 3.0 and 15.0.
  features = {'f1': tf.constant([[1.0, -2.0, 2.0], [2.0, 10.0, 11.0]])}
  projection = utils.project_to_ball(features, eps, configs.NormType.L2)
  actual = self.evaluate(projection)
  row_factors = tf.constant([[first_factor], [second_factor]])
  want = features['f1'] * row_factors
  self.assertAllEqual(actual['f1'], want)
def testProjectToBallLInfMultipleFeatures(self):
  """Checks L-infinity projection of a two-feature dict with radius 1.5."""
  radius = 1.5
  input_dict = {
      'f1': tf.constant([[1.0, 2.0, -4.0], [-1.0, 3.0, 5.0]]),
      'f2': tf.constant([[1.0, 6.0], [2.0, 4.0]]),
  }
  actual = self.evaluate(
      utils.project_to_ball(input_dict, radius, configs.NormType.INFINITY))
  # Entries with magnitude above 1.5 are expected at +/-1.5.
  expected = {
      'f1': tf.constant([[1.0, 1.5, -1.5], [-1.0, 1.5, 1.5]]),
      'f2': tf.constant([[1.0, 1.5], [1.5, 1.5]]),
  }
  for name, want in expected.items():
    self.assertAllEqual(actual[name], want)
def _apply_gradients(self, dense_features, keyed_grads,
                     dense_original_features):
  """Turns gradients into perturbed features.

  The gradients are masked with `self.feature_masks`, converted to a direction
  by `utils.maximize_within_unit_norm`, scaled by `adv_config.adv_step_size`
  and added to the offset `dense_features - dense_original_features`. When
  `adv_config.pgd_epsilon` is set, the result is projected into a ball of that
  radius. The perturbation is then added to the original features and passed
  through `_apply_feature_constraints`.

  Args:
    dense_features: A dictionary of tensors at which gradients are computed.
    keyed_grads: A dictionary of tensors representing the gradients.
    dense_original_features: A dictionary of tensors that the epsilon
      constraint (`adv_config.pgd_epsilon`) is measured from. Should have the
      same structure as `dense_features`.

  Returns:
    A dictionary of tensors keyed like `dense_original_features` holding the
    perturbed features. Features that have no perturbation are returned as is.
  """
  config = self._adv_config

  masked = {}
  for name, grad in keyed_grads.items():
    masked[name] = utils.apply_feature_mask(grad, self.feature_masks.get(name))
  directions = utils.maximize_within_unit_norm(masked, config.adv_grad_norm)

  # The perturbation is expressed relative to the original input point, so
  # the ball used for clipping below is centered on that point.
  perturbations = {}
  for name, direction in directions.items():
    stepped = direction * config.adv_step_size + dense_features[name]
    perturbations[name] = stepped - dense_original_features[name]
  if config.pgd_epsilon is not None:
    perturbations = utils.project_to_ball(perturbations, config.pgd_epsilon,
                                          config.adv_grad_norm)

  # Add the perturbation and enforce the configured feature constraints.
  perturbed = {}
  for name, original in dense_original_features.items():
    if name in perturbations:
      perturbed[name] = _apply_feature_constraints(
          original + tf.stop_gradient(perturbations[name]),
          self.feature_min.get(name), self.feature_max.get(name))
    else:
      # No gradient for this feature, hence nothing to add.
      perturbed[name] = original
  return perturbed
def _compute_perturbations(self, dense_features, keyed_grads,
                           dense_original_features):
  """Derives perturbations from gradients and features.

  The gradients are masked with `self.feature_masks`, converted to the
  steepest descent direction, scaled by the step size and added to the offset
  `dense_features - dense_original_features`. When `adv_config.pgd_epsilon` is
  set, the result is projected into the epsilon ball.

  Args:
    dense_features: A dictionary of tensors at which gradients are computed.
    keyed_grads: A dictionary of tensors representing the gradients.
    dense_original_features: A dictionary of tensors that the epsilon
      constraint (`adv_config.pgd_epsilon`) is measured from. Should have the
      same structure as `dense_features`.

  Returns:
    A dictionary of tensors with one entry per perturbation direction,
    representing the perturbations to be applied on
    `dense_original_features`.
  """
  config = self._adv_config

  masked = {}
  for name, grad in keyed_grads.items():
    masked[name] = utils.apply_feature_mask(grad, self.feature_masks.get(name))
  directions = utils.maximize_within_unit_norm(masked, config.adv_grad_norm)

  # The perturbation is expressed relative to the original input point, so
  # the ball used for clipping below is centered on that point.
  perturbations = {}
  for name, direction in directions.items():
    stepped = direction * config.adv_step_size + dense_features[name]
    perturbations[name] = stepped - dense_original_features[name]

  if config.pgd_epsilon is None:
    return perturbations
  return utils.project_to_ball(perturbations, config.pgd_epsilon,
                               config.adv_grad_norm)
def gen_neighbor(self, input_features, pgd_labels=None):
  """Generates adversarial neighbors and the corresponding weights.

  This function perturbs only *dense* tensors to generate adversarial
  neighbors. No perturbation will be applied on sparse tensors (e.g., string
  or int). Therefore, in the generated adversarial neighbors, the values of
  these sparse tensors will be kept the same as the input_features. In other
  words, if input_features is a dictionary mapping feature names to tensors,
  the dense features will be perturbed and the values of sparse features will
  remain the same.

  The perturbation is built over `adv_config.iterations` steps. Each step
  computes gradients of the current loss, masks them, takes a step of size
  `adv_config.adv_step_size`, optionally projects the accumulated difference
  from the original input into a ball of radius `adv_config.epsilon`, and
  applies the configured clip values. Between steps the loss is recomputed
  with `self._pgd_loss_fn` and `self._pgd_model_fn` on the perturbed input.

  Args:
    input_features: a dense (float32) tensor, a list of dense tensors, or a
      dictionary of feature names and dense tensors. The shape of the
      tensor(s) should be either:
      (a) pointwise samples: [batch_size, feat_len], or
      (b) sequence samples: [batch_size, seq_len, feat_len]
    pgd_labels: the labels corresponding to each input. This should have
      shape `[batch_size, 1]`. This is required for PGD-generated
      adversaries, and unused otherwise.

  Returns:
    adv_neighbor: the perturbed example, with the same shape and structure as
      input_features
    adv_weight: a dense (float32) tensor with shape of [batch_size, 1],
      representing the weight for each neighbor. It is filled with ones.

  Raises:
    ValueError: if some of the `input_features` cannot be perturbed due to
      (a) it is a `tf.SparseTensor`, (b) it has a non-differentiable type
      like string or integer, or (c) it is not involved in loss computation.
      This error is suppressed if `raise_invalid_gradient` is set to False
      (which is the default); a warning is logged instead for the non-Tensor
      case handled in this method. Cases (b) and (c) are presumably raised by
      `_compute_gradient` -- confirm there.
  """
  loss = self._labeled_loss
  gradient_tape = self._gradient_tape

  # Composes both features and feature_masks to dictionaries, so that the
  # feature_masks can be looked up by key.
  features = self._compose_as_dict(input_features)
  # Only `tf.Tensor` values are treated as dense (perturbable); everything
  # else lands in the sparse dictionary and is passed through unchanged.
  dense_original_features, sparse_original_features = self._split_dict(
      features, lambda feature: isinstance(feature, tf.Tensor))
  feature_masks = self._compose_as_dict(self._adv_config.feature_mask)
  feature_min = self._compose_as_dict(self._adv_config.clip_value_min)
  feature_max = self._compose_as_dict(self._adv_config.clip_value_max)
  if sparse_original_features:
    sparse_keys = str(sparse_original_features.keys())
    if self._raise_invalid_gradient:
      raise ValueError('Cannot perturb non-Tensor input: ' + sparse_keys)
    # Not raising: emit the warning once rather than on every call.
    logging.log_first_n(logging.WARNING,
                        'Cannot perturb non-Tensor input: %s', 1, sparse_keys)
  # The first iteration starts from the unperturbed input.
  dense_features = dense_original_features
  # NOTE(review): if `adv_config.iterations` is 0 the loop body never runs and
  # `adv_neighbor` is unbound after the loop -- confirm the config guarantees
  # at least one iteration.
  for t in range(self._adv_config.iterations):
    keyed_grads = self._compute_gradient(loss, dense_features, gradient_tape)
    masked_grads = {
        key: utils.apply_feature_mask(grad, feature_masks.get(key, None))
        for key, grad in keyed_grads.items()
    }
    unit_perturbations = utils.maximize_within_unit_norm(
        masked_grads, self._adv_config.adv_grad_norm)
    # The lambda parameter `t` is local to the lambda and does not rebind the
    # loop counter `t` used below.
    perturbations = tf.nest.map_structure(
        lambda t: t * self._adv_config.adv_step_size, unit_perturbations)
    # Clip perturbations into epsilon ball here. Note that this ball is
    # centered around the original input point.
    diff = {}
    bounded_diff = {}
    for key, perturb in perturbations.items():
      # Only include features for which perturbation occurred. There is
      # nothing to project for features without perturbations.
      diff[key] = dense_features[key] + perturb - dense_original_features[key]
    if self._adv_config.epsilon is not None:
      bounded_diff = utils.project_to_ball(diff, self._adv_config.epsilon,
                                           self._adv_config.adv_grad_norm)
    else:
      bounded_diff = diff
    # Backfill the rest of the dense features.
    for key, feature in dense_features.items():
      if key not in bounded_diff:
        bounded_diff[key] = feature - dense_original_features[key]
    # Sparse features are copied over untouched.
    adv_neighbor = dict(sparse_original_features)
    # NOTE(review): `bounded_diff[key]` is only read when `key` is in
    # `perturbations`, so the backfilled entries above look unused here and
    # such features fall back to their original value -- confirm intent.
    for key, feature in dense_original_features.items():
      adv_neighbor[key] = tf.stop_gradient(
          _apply_feature_constraints(
              feature + bounded_diff[key] if key in perturbations else feature,
              feature_min.get(key, None), feature_max.get(key, None)))

    # Update for the next iteration.
    if t < self._adv_config.iterations - 1:
      inputs_t = self._decompose_as(input_features, adv_neighbor)
      # Compute the new loss to calculate gradients with.
      features = self._compose_as_dict(inputs_t)
      dense_features, _ = self._split_dict(
          features, lambda feature: isinstance(feature, tf.Tensor))
      if gradient_tape is not None:
        with gradient_tape:
          # Gradient calculated against dense features only.
          gradient_tape.watch(dense_features)
          loss = self._pgd_loss_fn(pgd_labels, self._pgd_model_fn(inputs_t))
      else:
        loss = self._pgd_loss_fn(pgd_labels, self._pgd_model_fn(inputs_t))

  # Converts the perturbed examples back to their original structure.
  adv_neighbor = self._decompose_as(input_features, adv_neighbor)

  # `features` is whichever dictionary was composed last (the original input,
  # or the perturbed input of the latest recomputation); its first value
  # supplies the batch dimension.
  batch_size = tf.shape(list(features.values())[0])[0]
  adv_weight = tf.ones([batch_size, 1])

  return adv_neighbor, adv_weight
def testProjectToBallL2WithZero(self):
  """Checks that L2 projection of an all-zero feature returns all zeros."""
  shape = [2, 3]
  features = {'f1': tf.constant(0.0, shape=shape)}
  projection = utils.project_to_ball(features, 0.5, configs.NormType.L2)
  actual = self.evaluate(projection)
  want = tf.constant(0.0, shape=shape)
  self.assertAllEqual(actual['f1'], want)
def testProjectToBallL1(self):
  """Checks that requesting an L1 projection raises NotImplementedError."""
  row = tf.constant([[1.0, 2.0, -4.0]])
  with self.assertRaises(NotImplementedError):
    projection = utils.project_to_ball(row, 0.2, configs.NormType.L1)
    self.evaluate(projection)
def testProjectToBallLInfSingleTensor(self):
  """Checks L-infinity projection of a single tensor with radius 2.5."""
  radius = 2.5
  rows = tf.constant([[1.0, -3.0], [-2.0, 4.0]])
  projection = utils.project_to_ball(rows, radius, configs.NormType.INFINITY)
  actual = self.evaluate(projection)
  # Entries with magnitude above 2.5 are expected at +/-2.5.
  want = [[1.0, -2.5], [-2.0, 2.5]]
  self.assertAllClose(actual, want)
def testProjectToBallLInf(self, input_tensor, eps, expected_tensor):
  """Checks L-infinity projection of a single-feature dict.

  Args:
    input_tensor: Values used to build the 'f1' feature.
    eps: Radius passed to `utils.project_to_ball`.
    expected_tensor: Values the projected 'f1' feature must equal.
  """
  features = {'f1': tf.constant(input_tensor)}
  projection = utils.project_to_ball(features, eps,
                                     configs.NormType.INFINITY)
  actual = self.evaluate(projection)
  self.assertAllEqual(actual['f1'], tf.constant(expected_tensor))