def __call__(self, perturbed: T, outputs: T) -> T:
    outputs_, restore_type = ep.astensor_(outputs)
    del perturbed, outputs
    classes = outputs_.argmax(axis=-1)
    is_adv = classes == self.target_classes
    return restore_type(is_adv)
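# A tiny standalone numpy sketch of the targeted criterion above: an input
# counts as adversarial exactly when the model's argmax equals the attacker's
# target class. Values here are illustrative.
import numpy as np

outputs = np.array([[0.1, 0.9], [0.8, 0.2]])  # logits for two samples
target_classes = np.array([1, 1])
is_adv = outputs.argmax(axis=-1) == target_classes  # -> [True, False]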
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, Any] = None,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion, kwargs

    verify_input_bounds(x, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    norms = ep.norms.l2(flatten(direction), axis=-1)
    scale = epsilon / atleast_kd(norms, direction.ndim)
    scale = ep.minimum(scale, 1)

    x = x + scale * direction
    x = x.clip(min_, max_)

    return restore_type(x)
def test_astensor_restore_tensor(t: Tensor) -> None:
    r = t
    y, restore_type = ep.astensor_(r)
    assert (y == t).all()
    assert type(restore_type(y)) == type(r)
    y = y + 1
    assert type(restore_type(y)) == type(r)
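# A minimal usage sketch of the ep.astensor_ pattern exercised by the test
# above: astensor_ wraps a framework tensor into an eagerpy tensor and also
# returns a restore function that converts results back to the original type.
# The choice of PyTorch here is illustrative; any supported framework works.
import eagerpy as ep
import torch

x = torch.zeros(3)
t, restore_type = ep.astensor_(x)   # t is an eagerpy tensor wrapping x
t = t + 1                           # operations stay in eagerpy
assert isinstance(restore_type(t), torch.Tensor)  # back to the native type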
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[ep.Tensor] = None,
    **kwargs: Any,
) -> T:
    originals, restore_type = ep.astensor_(inputs)

    self._nqueries = {i: 0 for i in range(len(originals))}
    self._set_cos_sin_function(originals)
    self.theta_max = ep.ones(originals, len(originals)) * self._theta_max

    criterion = get_criterion(criterion)
    self._criterion_is_adversarial = get_is_adversarial(criterion, model)

    # Get starting point
    if starting_points is not None:
        best_advs = starting_points
    else:
        init_attack: MinimizationAttack = LinearSearchBlendedUniformNoiseAttack(steps=50)
        best_advs = init_attack.run(model, originals, criterion, early_stop=early_stop)
    assert self._is_adversarial(best_advs).all()

    # Initialize the direction orthogonalized with the first direction
    fd = best_advs - originals
    norm = ep.norms.l2(fd.flatten(1), axis=1)
    fd = fd / atleast_kd(norm, fd.ndim)
    self._directions_ortho = {i: v.expand_dims(0) for i, v in enumerate(fd)}

    # Load basis
    if "basis_params" in kwargs:
        self._basis = Basis(originals, **kwargs["basis_params"])
    else:
        self._basis = Basis(originals)

    for _ in range(self._steps):
        # Get candidates. Shape: (n_candidates, batch_size, image_size)
        candidates = self._get_candidates(originals, best_advs)
        candidates = candidates.transpose((1, 0, 2, 3, 4))

        best_candidates = ep.zeros_like(best_advs).raw
        for i, o in enumerate(originals):
            o_repeated = ep.concatenate([o.expand_dims(0)] * len(candidates[i]), axis=0)
            index = ep.argmax(self.distance(o_repeated, candidates[i])).raw
            best_candidates[i] = candidates[i][index].raw

        is_success = self.distance(best_candidates, originals) < self.distance(best_advs, originals)
        best_advs = ep.where(atleast_kd(is_success, best_candidates.ndim), ep.astensor(best_candidates), best_advs)

        if all(v > self._max_queries for v in self._nqueries.values()):
            print("Max queries attained for all the images.")
            break

    return restore_type(best_advs)
def __call__(self, model: Model, inputs: T, criterion: Union[Misclassification, T]) -> T: x, restore_type = ep.astensor_(inputs) criterion_ = get_criterion(criterion) del inputs, criterion N = len(x) if isinstance(criterion_, Misclassification): classes = criterion_.labels else: raise ValueError("unsupported criterion") if classes.shape != (N, ): raise ValueError( f"expected labels to have shape ({N},), got {classes.shape}") min_, max_ = model.bounds x_l2_norm = flatten(x.square()).sum(1) def loss_fun( x: ep.Tensor) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]: logits = model(x) scores = ep.softmax(logits) pred_scores = scores[range(N), classes] loss = pred_scores.sum() return loss, (scores, pred_scores) for i in range(self.steps): # (1) get the scores and gradients _, (scores, pred_scores), gradients = ep.value_aux_and_grad(loss_fun, x) pred = scores.argmax(-1) num_classes = scores.shape[-1] # (2) calculate gradient norm gradients_l2_norm = flatten(gradients.square()).sum(1) # (3) calculate delta a = self.stepsize * x_l2_norm * gradients_l2_norm b = pred_scores - 1.0 / num_classes delta = ep.minimum(a, b) # (4) stop the attack if an adversarial example has been found # this is not described in the paper but otherwise once the prob. drops # below chance level the likelihood is not decreased but increased is_not_adversarial = (pred == classes).float32() delta *= is_not_adversarial # (5) calculate & apply current perturbation a = atleast_kd(delta / gradients_l2_norm.square(), gradients.ndim) x -= a * gradients x = ep.clip(x, min_, max_) return restore_type(x)
def get_projected_gradients(x_current, x_orig, label, surrogate_model):
    if surrogate_model is None:
        return None
    device = surrogate_model.device

    source_direction_ = x_orig - x_current
    source_direction = source_direction_.numpy()
    source_norm = np.linalg.norm(source_direction)
    source_direction = source_direction / source_norm

    criterion = fb.criteria.Misclassification(torch.tensor([0], device=device))
    classes = criterion.labels
    loss_fn = get_loss_fn(surrogate_model, classes)

    # evaluate the gradient slightly ahead of x_current, along the source direction
    x0, restore_type = ep.astensor_(x_current + 1e-2 * source_direction_)
    _, gradients = value_and_grad(loss_fn, x0)
    gradients = gradients.numpy()

    # project the gradients onto the hyperplane orthogonal to the source direction
    dot = np.vdot(gradients, source_direction)
    projected_gradient = gradients - dot * source_direction

    norm_ = np.linalg.norm(projected_gradient)
    if norm_ > 1e-5:
        projected_gradient /= norm_
    projected_gradient = (-1.0) * projected_gradient
    return projected_gradient
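# A standalone numpy sketch of the projection used above: for a unit vector d,
# g - <g, d> d removes the component of g along d, leaving only the part of
# the gradient orthogonal to the source direction. Values are illustrative.
import numpy as np

d = np.array([1.0, 0.0])                    # unit source direction
g = np.array([2.0, 3.0])                    # gradient
g_perp = g - np.vdot(g, d) * d              # -> [0., 3.]
assert np.isclose(np.vdot(g_perp, d), 0.0)  # orthogonal to d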
def extract_target_logits(model: Model, inputs: ep.Tensor, labels: ep.Tensor):
    """This implementation uses any correctly classified sample as the target logit."""
    if not isinstance(labels, ep.Tensor):
        labels, _ = ep.astensor_(labels)

    num_classes = 10  # hack for CIFAR10
    result = np.zeros([num_classes, num_classes])
    present = np.zeros(num_classes)

    Z = model(inputs)  # all the logits
    if isinstance(Z, ep.Tensor):
        Z = Z.raw
    else:
        Z = Z.detach()

    for i in range(labels.shape[0]):
        t = labels[i].raw.item()
        if present[t]:
            continue
        z = Z[i : i + 1]
        if z.argmax() == t:
            result[t, :] = z.cpu()
            present[t] = 1
        if sum(present) == num_classes:
            break
    return result
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, T]) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs
    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    lower_bound = ep.zeros(x, len(x))
    upper_bound = ep.ones(x, len(x))
    epsilons = lower_bound
    for _ in range(self.binary_search_steps):
        eps = atleast_kd(epsilons, x.ndim)
        is_adv = is_adversarial(x + eps * direction)
        lower_bound = ep.where(is_adv, lower_bound, epsilons)
        upper_bound = ep.where(is_adv, epsilons, upper_bound)
        epsilons = (lower_bound + upper_bound) / 2

    epsilons = upper_bound
    eps = atleast_kd(epsilons, x.ndim)
    xp = x + eps * direction
    return restore_type(xp)
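# A scalar sketch of the bisection idea above: keep a bracket [lower, upper]
# where `upper` blends far enough to be adversarial and `lower` does not, and
# halve it each step. `is_adv` is a stand-in for the model query.
def bisect_blend(is_adv, steps: int = 10) -> float:
    lower, upper = 0.0, 1.0
    for _ in range(steps):
        mid = (lower + upper) / 2
        if is_adv(mid):
            upper = mid  # still adversarial: try a smaller blend
        else:
            lower = mid  # not adversarial: need a larger blend
    return upper  # the guaranteed-adversarial endpoint, as in the code above

# e.g. with a decision threshold at 0.3, bisect_blend(lambda e: e >= 0.3) ≈ 0.3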
def __call__(self, model: Model, inputs: T, criterion: Union[Misclassification, T]) -> T:
    x0, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion

    if not isinstance(criterion_, Misclassification):
        raise ValueError("unsupported criterion")
    labels = criterion_.labels

    def loss_fn(inputs: ep.Tensor) -> ep.Tensor:
        logits = model(inputs)
        return ep.crossentropy(logits, labels).sum()

    x = x0
    if self.random_start:
        x = x + ep.uniform(x, x.shape, -self.epsilon, self.epsilon)
        x = ep.clip(x, *model.bounds)

    for _ in range(self.steps):
        _, gradients = ep.value_and_grad(loss_fn, x)
        gradients = gradients.sign()
        x = x + self.stepsize * gradients
        x = x0 + ep.clip(x - x0, -self.epsilon, self.epsilon)
        x = ep.clip(x, *model.bounds)

    return restore_type(x)
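# A scalar numpy sketch of the L-infinity projection applied each iteration
# above: clipping the perturbation to [-epsilon, epsilon] keeps the iterate
# inside the epsilon-ball around the original input. Values are illustrative.
import numpy as np

x0 = np.array([0.2, 0.5, 0.9])  # original input
x = np.array([0.9, 0.4, 0.1])   # iterate after a gradient step
epsilon = 0.1
x_proj = x0 + np.clip(x - x0, -epsilon, epsilon)  # -> [0.3, 0.4, 0.8]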
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, T]) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs
    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    best = ep.ones(x, len(x))

    epsilon = 0.0
    stepsize = 1.0 / self.steps
    for _ in range(self.steps):
        # TODO: reduce the batch size to the ones that have not yet been successful
        is_adv = is_adversarial(x + epsilon * direction)
        is_best_adv = ep.logical_and(is_adv, best == 1)
        best = ep.where(is_best_adv, epsilon, best)

        if (best < 1).all():
            break
        epsilon += stepsize

    eps = atleast_kd(best, x.ndim)
    xp = x + eps * direction
    return restore_type(xp)
def __call__(self, perturbed: T, outputs: T) -> T:
    outputs_, restore_type = ep.astensor_(outputs)
    del perturbed, outputs
    classes = outputs_.argmax(axis=-1)
    assert classes.shape == self.labels.shape
    is_adv = classes != self.labels
    return restore_type(is_adv)
def __call__(  # type: ignore
    self,
    model: Model,
    inputs: T,
    criterion: Any,
    *,
    epsilons: Union[Sequence[Union[float, None]], float, None],
    **kwargs: Any,
) -> Union[Tuple[List[T], List[T], T], Tuple[T, T, T]]:
    x, restore_type = ep.astensor_(inputs)
    del inputs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    was_iterable = True
    if not isinstance(epsilons, Iterable):
        epsilons = [epsilons]
        was_iterable = False

    N = len(x)
    K = len(epsilons)

    # None means: just minimize, no early stopping, no limit on the perturbation size
    if any(eps is None for eps in epsilons):
        early_stop = None
    else:
        early_stop = min(epsilons)

    # run the actual attack
    xp = self.run(model, x, criterion, early_stop=early_stop, **kwargs)

    xpcs = []
    success = []
    for epsilon in epsilons:
        if epsilon is None:
            xpc = xp
        else:
            xpc = self.distance.clip_perturbation(x, xp, epsilon)
        is_adv = is_adversarial(xpc)

        xpcs.append(xpc)
        success.append(is_adv)

    success_ = ep.stack(success)
    assert success_.shape == (K, N)

    xp_ = restore_type(xp)
    xpcs_ = [restore_type(xpc) for xpc in xpcs]

    if was_iterable:
        return [xp_] * K, xpcs_, restore_type(success_)
    else:
        assert len(xpcs_) == 1
        return xp_, xpcs_[0], restore_type(success_.squeeze(axis=0))
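# A tiny standalone sketch of the epsilon bookkeeping above: a scalar epsilon
# is wrapped into a list so the body can treat both cases uniformly, and
# was_iterable controls whether the results are unwrapped again at the end.
from collections.abc import Iterable

def normalize_epsilons(epsilons):
    was_iterable = isinstance(epsilons, Iterable)
    eps_list = list(epsilons) if was_iterable else [epsilons]
    # None disables early stopping (pure minimization)
    early_stop = None if any(e is None for e in eps_list) else min(eps_list)
    return eps_list, early_stop, was_iterable

print(normalize_epsilons(0.03))          # ([0.03], 0.03, False)
print(normalize_epsilons([0.01, None]))  # ([0.01, None], None, True)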
def __call__(self, model: Model, inputs: T, criterion: Union[Misclassification, T]) -> T: x, restore_type = ep.astensor_(inputs) criterion_ = get_criterion(criterion) del inputs, criterion N = len(x) if isinstance(criterion_, Misclassification): classes = criterion_.labels else: raise ValueError("unsupported criterion") if classes.shape != (N, ): raise ValueError( f"expected labels to have shape ({N},), got {classes.shape}") bounds = model.bounds def loss_fun(delta: ep.Tensor, logits: ep.Tensor) -> ep.Tensor: assert x.shape[0] == logits.shape[0] assert delta.shape == x.shape x_hat = x + delta logits_hat = model(x_hat) loss = ep.kl_div_with_logits(logits, logits_hat).sum() return loss value_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=False) clean_logits = model(x) # start with random vector as search vector d = ep.normal(x, shape=x.shape, mean=0, stddev=1) for it in range(self.iterations): # normalize proposal to be unit vector d = d * self.xi / atleast_kd(ep.norms.l2(flatten(d), axis=-1), x.ndim) # use gradient of KL divergence as new search vector _, grad = value_and_grad(d, clean_logits) d = grad # rescale search vector d = (bounds[1] - bounds[0]) * d if ep.any(ep.norms.l2(flatten(d), axis=-1) < 1e-64): raise RuntimeError( "Gradient vanished; this can happen if xi is too small.") final_delta = (self.epsilon / ep.sqrt( (d**2).sum(keepdims=True, axis=(1, 2, 3))) * d) x_adv = ep.clip(x + final_delta, *bounds) return restore_type(x_adv)
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, Any] = None) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion

    min_, max_ = model.bounds
    x = min_ + max_ - x

    return restore_type(x)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    found = is_adversarial(x)
    results = x

    def grid_search_generator() -> Generator[Any, Any, Any]:
        dphis = np.linspace(-self.max_rot, self.max_rot, self.num_rots)
        dxs = np.linspace(-self.max_trans, self.max_trans, self.num_trans)
        dys = np.linspace(-self.max_trans, self.max_trans, self.num_trans)
        for dphi in dphis:
            for dx in dxs:
                for dy in dys:
                    yield dphi, dx, dy

    def random_search_generator() -> Generator[Any, Any, Any]:
        dphis = np.random.uniform(-self.max_rot, self.max_rot, self.random_steps)
        dxs = np.random.uniform(-self.max_trans, self.max_trans, self.random_steps)
        dys = np.random.uniform(-self.max_trans, self.max_trans, self.random_steps)
        for dphi, dx, dy in zip(dphis, dxs, dys):
            yield dphi, dx, dy

    gen = grid_search_generator() if self.grid_search else random_search_generator()
    for dphi, dx, dy in gen:
        # TODO: reduce the batch size to the ones that haven't been successful
        x_p = rotate_and_shift(x, translation=(dx, dy), rotation=dphi)
        is_adv = is_adversarial(x_p)
        new_adv = ep.logical_and(is_adv, found.logical_not())
        results = ep.where(atleast_kd(new_adv, x_p.ndim), x_p, results)
        found = ep.logical_or(new_adv, found)
        if found.all():
            break  # all images in batch misclassified

    return restore_type(results)
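# A compact standalone sketch of the grid enumeration above: itertools.product
# over the three linspaces yields the same (dphi, dx, dy) triples as the
# nested loops. Parameter values here are illustrative.
import itertools
import numpy as np

dphis = np.linspace(-30, 30, 5)  # rotations in degrees
dxs = np.linspace(-3, 3, 3)      # horizontal shifts in pixels
dys = np.linspace(-3, 3, 3)      # vertical shifts in pixels
for dphi, dx, dy in itertools.product(dphis, dxs, dys):
    pass  # 5 * 3 * 3 = 45 candidate transforms, tried until one succeeds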
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Misclassification, TargetedMisclassification, T],
    *,
    epsilon: float,
    mc: int,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x0, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    # gradient ascent on the true-class loss (untargeted attack) or
    # gradient descent on the target-class loss (targeted attack)
    if isinstance(criterion_, Misclassification):
        gradient_step_sign = 1.0
        classes = criterion_.labels
    elif hasattr(criterion_, "target_classes"):
        gradient_step_sign = -1.0
        classes = criterion_.target_classes  # type: ignore
    else:
        raise ValueError("unsupported criterion")

    loss_fn = self.get_loss_fn(model, classes)

    if self.abs_stepsize is None:
        stepsize = self.rel_stepsize * epsilon
    else:
        stepsize = self.abs_stepsize

    if self.random_start:
        x = self.get_random_start(x0, epsilon)
        x = ep.clip(x, *model.bounds)
    else:
        x = x0

    for _ in range(self.steps):
        # accumulate gradients over mc Monte Carlo samples
        gradient_sum = 0.0
        for _ in range(mc):
            _, gradients = self.value_and_grad(loss_fn, x)
            gradient_sum += gradients
        gradients = self.normalize(gradient_sum, x=x, bounds=model.bounds)
        x = x + gradient_step_sign * stepsize * gradients
        x = self.project(x, x0, epsilon)
        x = ep.clip(x, *model.bounds)

    return restore_type(x)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    self.process_raw()
    assert self.inputs is not None
    assert self.outputs is not None
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)

    result = x
    found = criterion(x, model(x))

    batch_size = len(x)

    # for every sample try every other sample
    index_pools: List[List[int]] = []
    for i in range(batch_size):
        indices = list(range(batch_size))
        indices.remove(i)
        np.random.shuffle(indices)
        index_pools.append(indices)

    for i in range(batch_size - 1):
        if found.all():
            break

        indices = np.array([pool[i] for pool in index_pools])

        xp = self.inputs[indices]
        yp = self.outputs[indices]
        is_adv = criterion(xp, yp)

        new_found = ep.logical_and(is_adv, found.logical_not())
        result = ep.where(atleast_kd(new_found, result.ndim), xp, result)
        found = ep.logical_or(found, new_found)

    return restore_type(result)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, Any] = None,
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion, kwargs

    min_, max_ = model.bounds
    x = min_ + max_ - x

    return restore_type(x)
def get_modified_loss(model, inputs, labels, untargeted_fn, targeted_fn, targeted, modifier):
    """
    Return the loss based on the modifiers. There are five modifiers:
    1. softmax: whether to attack the raw logits or the softmax outputs
    2. loss_diff: whether the untargeted loss is added to the targeted loss
    *3. indiv: additionally return the individual losses instead of only their sum
    *4. logits: additionally return the predictions (affected by the `softmax` modifier)
    *5. labels: take an array of labels and additionally return the logits of those
       labels and the logits of the top remaining class (for SQR)

    Modifiers 3, 4 and 5 do not change the loss itself; they only extend the
    return value for use by the attack algorithms.
    """
    logits = model(inputs)
    logits, restore_type = ep.astensor_(logits)
    outputs = logits
    if 'softmax' in modifier and modifier['softmax']:
        outputs = logits.softmax()

    if targeted:
        if 'loss_diff' in modifier and modifier['loss_diff']:
            ind_sorted = outputs.argsort(axis=1)
            ind = ind_sorted[:, -1]
            losses = targeted_fn(outputs, labels) + untargeted_fn(outputs, ind)
        else:
            losses = targeted_fn(outputs, labels)
    else:
        losses = untargeted_fn(outputs, labels)
    loss = losses.sum()

    result = [restore_type(loss)]
    if 'indiv' in modifier:
        result.append(restore_type(losses))
    if 'logits' in modifier:
        result.append(restore_type(outputs))
    if 'labels' in modifier:
        curr_idx = modifier['labels']
        u = np.arange(labels.shape[0])
        y_corr = logits[u, curr_idx]
        logits.raw[u, curr_idx] = -float('inf')
        y_others = logits.max(axis=-1)
        result.append([restore_type(y_corr), restore_type(y_others)])
    if len(result) == 1:
        result = result[0]
    return result
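# A hypothetical usage sketch of the function above (model, inputs, labels and
# the per-example loss functions are placeholders, not defined here): with
# 'indiv' set, the call returns both the summed loss and the per-example losses.
modifier = {'softmax': True, 'indiv': True}
loss, indiv_losses = get_modified_loss(
    model, inputs, labels,
    untargeted_fn, targeted_fn,
    targeted=False, modifier=modifier,
)

# without extra modifiers, only the scalar summed loss is returned:
loss = get_modified_loss(model, inputs, labels,
                         untargeted_fn, targeted_fn,
                         targeted=False, modifier={})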
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Misclassification, T],
    *,
    epsilon: float,
    mc: int,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x0, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    if not isinstance(criterion_, Misclassification):
        raise ValueError("unsupported criterion")
    labels = criterion_.labels

    loss_fn = self.get_loss_fn(model, labels)

    if self.abs_stepsize is None:
        stepsize = self.rel_stepsize * epsilon
    else:
        stepsize = self.abs_stepsize

    if self.random_start:
        x = self.get_random_start(x0, epsilon)
        x = ep.clip(x, *model.bounds)
    else:
        x = x0

    for _ in range(self.steps):
        # accumulate gradients over mc Monte Carlo samples
        gradients_cum = 0
        for _ in range(mc):
            _, gradients = self.value_and_grad(loss_fn, x)
            gradients_cum += gradients
        gradients = self.normalize(gradients_cum, x=x, bounds=model.bounds)
        x = x + stepsize * gradients
        x = self.project(x, x0, epsilon)
        x = ep.clip(x, *model.bounds)

    return restore_type(x)
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, Any] = None) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    norms = ep.norms.l2(flatten(direction), axis=-1)
    scale = self.epsilon / atleast_kd(norms, direction.ndim)
    scale = ep.minimum(scale, 1)

    x = x + scale * direction
    x = x.clip(min_, max_)

    return restore_type(x)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, Any] = None,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion, kwargs

    min_, max_ = model.bounds
    p = self.sample_noise(x)
    epsilons = self.get_epsilons(x, p, epsilon, min_=min_, max_=max_)
    x = x + epsilons * p
    x = x.clip(min_, max_)

    return restore_type(x)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    best = ep.ones(x, len(x))

    epsilon = 0.0
    stepsize = 1.0 / self.steps
    for _ in range(self.steps):
        # TODO: reduce the batch size to the ones that have not yet been successful
        is_adv = is_adversarial(x + epsilon * direction)
        is_best_adv = ep.logical_and(is_adv, best == 1)
        best = ep.where(is_best_adv, epsilon, best)

        if (best < 1).all():
            break  # pragma: no cover
        epsilon += stepsize

    eps = atleast_kd(best, x.ndim)
    xp = x + eps * direction
    return restore_type(xp)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, Any] = None,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion, kwargs

    min_, max_ = model.bounds
    p = self.sample_noise(x)
    norms = self.get_norms(p)
    p = p / atleast_kd(norms, p.ndim)
    x = x + epsilon * p
    x = x.clip(min_, max_)

    return restore_type(x)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, Any] = None,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x0, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    verify_input_bounds(x0, model)

    is_adversarial = get_is_adversarial(criterion_, model)

    min_, max_ = model.bounds

    result = x0
    if self.check_trivial:
        found = is_adversarial(result)
    else:
        found = ep.zeros(x0, len(result)).bool()

    for _ in range(self.repeats):
        if found.all():
            break

        p = self.sample_noise(x0)
        epsilons = self.get_epsilons(x0, p, epsilon, min_=min_, max_=max_)
        x = x0 + epsilons * p
        x = x.clip(min_, max_)

        is_adv = is_adversarial(x)
        is_new_adv = ep.logical_and(is_adv, ep.logical_not(found))
        result = ep.where(atleast_kd(is_new_adv, x.ndim), x, result)
        found = ep.logical_or(found, is_adv)

    return restore_type(result)
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, T]) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs
    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    best = self._attack(model, x, criterion)
    best_is_adv = is_adversarial(best)

    for _ in range(1, self._times):
        xp = self._attack(model, x, criterion)
        # assumes xp does not violate the perturbation size constraint
        is_adv = is_adversarial(xp)
        new_best = ep.logical_and(is_adv, best_is_adv.logical_not())
        best = ep.where(atleast_kd(new_best, best.ndim), xp, best)
        best_is_adv = ep.logical_or(is_adv, best_is_adv)

    return restore_type(best)
def __call__(  # type: ignore
    self,
    model: Model,
    inputs: T,
    criterion: Any,
    **kwargs: Any,
) -> Tuple[T, T, T]:
    x, restore_type = ep.astensor_(inputs)
    del inputs
    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if x.ndim != 4:
        raise NotImplementedError(
            "only implemented for inputs with two spatial dimensions (and one channel and one batch dimension)"
        )

    xp = self.run(model, x, criterion)
    success = is_adversarial(xp)

    xp_ = restore_type(xp)
    return xp_, xp_, restore_type(success)  # twice to match API
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    lower_bound = ep.zeros(x, len(x))
    upper_bound = ep.ones(x, len(x))
    epsilons = lower_bound
    for _ in range(self.binary_search_steps):
        eps = atleast_kd(epsilons, x.ndim)
        is_adv = is_adversarial(x + eps * direction)
        lower_bound = ep.where(is_adv, lower_bound, epsilons)
        upper_bound = ep.where(is_adv, epsilons, upper_bound)
        epsilons = (lower_bound + upper_bound) / 2

    epsilons = upper_bound
    eps = atleast_kd(epsilons, x.ndim)
    xp = x + eps * direction
    return restore_type(xp)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    self.process_raw()
    assert self.inputs is not None
    assert self.outputs is not None
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs
    criterion = get_criterion(criterion)

    result = x
    found = criterion(x, model(x))

    dataset_size = len(self.inputs)
    batch_size = len(x)

    # note: this loops until an adversarial has been found for every sample;
    # it does not terminate if the dataset contains none for some sample
    while not found.all():
        indices = np.random.randint(0, dataset_size, size=(batch_size,))

        xp = self.inputs[indices]
        yp = self.outputs[indices]
        is_adv = criterion(xp, yp)

        new_found = ep.logical_and(is_adv, found.logical_not())
        result = ep.where(atleast_kd(new_found, result.ndim), xp, result)
        found = ep.logical_or(found, new_found)

    return restore_type(result)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)

    min_, max_ = model.bounds

    logits = model(x)
    classes = logits.argsort(axis=-1).flip(axis=-1)
    if self.candidates is None:
        candidates = logits.shape[-1]  # pragma: no cover
    else:
        candidates = min(self.candidates, logits.shape[-1])
        if not candidates >= 2:
            raise ValueError(  # pragma: no cover
                f"expected the model output to have at least 2 classes, got {logits.shape[-1]}"
            )
        logging.info(f"Only testing the top-{candidates} classes")
        classes = classes[:, :candidates]

    N = len(x)
    rows = range(N)

    loss_fun = self._get_loss_fn(model, classes)
    loss_aux_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=True)

    x0 = x
    p_total = ep.zeros_like(x)
    for _ in range(self.steps):
        # let's first get the logits using k = 1 to see if we are done
        diffs = [loss_aux_and_grad(x, 1)]
        _, (_, logits), _ = diffs[0]

        is_adv = criterion(x, logits)
        if is_adv.all():
            break

        # then run all the other k's as well
        # we could avoid repeated forward passes and only repeat
        # the backward pass, but this cannot currently be done in eagerpy
        diffs += [loss_aux_and_grad(x, k) for k in range(2, candidates)]

        # we don't need the logits
        diffs_ = [(losses, grad) for _, (losses, _), grad in diffs]
        losses = ep.stack([lo for lo, _ in diffs_], axis=1)
        grads = ep.stack([g for _, g in diffs_], axis=1)
        assert losses.shape == (N, candidates - 1)
        assert grads.shape == (N, candidates - 1) + x0.shape[1:]

        # calculate the distances
        distances = self.get_distances(losses, grads)
        assert distances.shape == (N, candidates - 1)

        # determine the best directions
        best = distances.argmin(axis=1)
        distances = distances[rows, best]
        losses = losses[rows, best]
        grads = grads[rows, best]
        assert distances.shape == (N,)
        assert losses.shape == (N,)
        assert grads.shape == x0.shape

        # apply perturbation
        distances = distances + 1e-4  # for numerical stability
        p_step = self.get_perturbations(distances, grads)
        assert p_step.shape == x0.shape

        p_total += p_step
        # don't do anything for those that are already adversarial
        x = ep.where(atleast_kd(is_adv, x.ndim), x, x0 + (1.0 + self.overshoot) * p_total)
        x = ep.clip(x, min_, max_)

    return restore_type(x)
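# A hedged numpy sketch of what get_distances/get_perturbations compute in the
# L2 variant of DeepFool (the exact hooks live on a subclass and may differ):
# the distance to the decision boundary of candidate class k is approximated
# by |loss_k| / ||grad_k||, and the step moves that far along the gradient.
import numpy as np

def l2_distances(losses, grads):
    # losses: (N, K-1); grads: (N, K-1, D) flattened gradients
    return np.abs(losses) / (np.linalg.norm(grads, axis=-1) + 1e-8)

def l2_perturbations(distances, grads):
    # distances: (N,); grads: (N, D) gradient of the chosen candidate
    norms = np.linalg.norm(grads, axis=-1, keepdims=True) + 1e-8
    return distances[:, None] * grads / norms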