def normalize(
    self, gradients: ep.Tensor, *, x: ep.Tensor, bounds: Bounds
) -> ep.Tensor:
    bad_pos = ep.logical_or(
        ep.logical_and(x == bounds.lower, gradients < 0),
        ep.logical_and(x == bounds.upper, gradients > 0),
    )
    gradients = ep.where(bad_pos, ep.zeros_like(gradients), gradients)

    abs_gradients = gradients.abs()
    quantiles = np.quantile(flatten(abs_gradients).numpy(), q=self.quantile, axis=-1)
    keep = abs_gradients >= atleast_kd(
        ep.from_numpy(gradients, quantiles), gradients.ndim
    )
    e = ep.where(keep, gradients.sign(), ep.zeros_like(gradients))
    return normalize_lp_norms(e, p=1)
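For intuition, a minimal numpy-only sketch (with hypothetical values) of the quantile-based sparsification performed by normalize above: entries below the q-th magnitude quantile are dropped, the survivors keep only their sign, and the result is L1-normalized.

# Minimal sketch of the quantile-based sparsification above (numpy only).
# q=0.5 means entries below the median magnitude are dropped.
import numpy as np

g = np.array([0.05, -0.8, 0.3, -0.02])   # hypothetical per-pixel gradients
thresh = np.quantile(np.abs(g), q=0.5)   # per-example magnitude quantile
e = np.where(np.abs(g) >= thresh, np.sign(g), 0.0)
print(e)                    # [ 0. -1.  1.  0.] -> only the largest entries survive
print(e / np.abs(e).sum())  # [ 0. -0.5  0.5  0.], the L1-normalized step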
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, T]) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    best = ep.ones(x, len(x))

    epsilon = 0.0
    stepsize = 1.0 / self.steps
    for _ in range(self.steps):
        # TODO: reduce the batch size to the ones that have not yet been successful
        is_adv = is_adversarial(x + epsilon * direction)
        is_best_adv = ep.logical_and(is_adv, best == 1)
        best = ep.where(is_best_adv, epsilon, best)

        if (best < 1).all():
            break

        epsilon += stepsize

    eps = atleast_kd(best, x.ndim)
    xp = x + eps * direction
    return restore_type(xp)
def __call__(self, inputs, labels, *, steps=1000):
    x = ep.astensor(inputs)
    y = ep.astensor(labels)
    assert x.shape[0] == y.shape[0]
    assert y.ndim == 1

    assert x.ndim == 4
    if self.channel_axis == 1:
        h, w = x.shape[2:4]
    elif self.channel_axis == 3:
        h, w = x.shape[1:3]
    else:
        raise ValueError(
            f"expected 'channel_axis' to be 1 or 3, got {self.channel_axis}"
        )

    size = max(h, w)

    min_, max_ = self.model.bounds()

    x0 = x
    x0np = x0.numpy()
    epsilons = np.linspace(0, 1, num=steps + 1)[1:]

    logits = ep.astensor(self.model.forward(x0.tensor))
    classes = logits.argmax(axis=-1)
    is_adv = classes != labels
    found = is_adv

    result = x0

    for epsilon in epsilons:
        # TODO: reduce the batch size to the ones that haven't been successful
        sigmas = [epsilon * size] * 4
        sigmas[0] = 0
        sigmas[self.channel_axis] = 0

        # TODO: once we can implement gaussian_filter in eagerpy, avoid converting from numpy
        x = gaussian_filter(x0np, sigmas)
        x = np.clip(x, min_, max_)
        x = ep.from_numpy(x0, x)

        logits = ep.astensor(self.model.forward(x.tensor))
        classes = logits.argmax(axis=-1)
        is_adv = classes != labels

        new_adv = ep.logical_and(is_adv, found.logical_not())
        result = ep.where(atleast_kd(new_adv, x.ndim), x, result)
        found = ep.logical_or(new_adv, found)

        if found.all():
            break

    return result.tensor
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    found = is_adversarial(x)
    results = x

    def grid_search_generator() -> Generator[Any, Any, Any]:
        dphis = np.linspace(-self.max_rot, self.max_rot, self.num_rots)
        dxs = np.linspace(-self.max_trans, self.max_trans, self.num_trans)
        dys = np.linspace(-self.max_trans, self.max_trans, self.num_trans)
        for dphi in dphis:
            for dx in dxs:
                for dy in dys:
                    yield dphi, dx, dy

    def random_search_generator() -> Generator[Any, Any, Any]:
        dphis = np.random.uniform(-self.max_rot, self.max_rot, self.random_steps)
        dxs = np.random.uniform(-self.max_trans, self.max_trans, self.random_steps)
        dys = np.random.uniform(-self.max_trans, self.max_trans, self.random_steps)
        for dphi, dx, dy in zip(dphis, dxs, dys):
            yield dphi, dx, dy

    gen = grid_search_generator() if self.grid_search else random_search_generator()
    for dphi, dx, dy in gen:
        # TODO: reduce the batch size to the ones that haven't been successful
        x_p = rotate_and_shift(x, translation=(dx, dy), rotation=dphi)
        is_adv = is_adversarial(x_p)
        new_adv = ep.logical_and(is_adv, found.logical_not())

        results = ep.where(atleast_kd(new_adv, x_p.ndim), x_p, results)
        found = ep.logical_or(new_adv, found)

        if found.all():
            break  # all images in batch misclassified

    return restore_type(results)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    self.process_raw()
    assert self.inputs is not None
    assert self.outputs is not None
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)

    result = x
    found = criterion(x, model(x))

    batch_size = len(x)

    # for every sample try every other sample
    index_pools: List[List[int]] = []
    for i in range(batch_size):
        indices = list(range(batch_size))
        indices.remove(i)
        np.random.shuffle(indices)
        index_pools.append(indices)

    for i in range(batch_size - 1):
        if found.all():
            break

        indices = np.array([pool[i] for pool in index_pools])

        xp = self.inputs[indices]
        yp = self.outputs[indices]
        is_adv = criterion(xp, yp)

        new_found = ep.logical_and(is_adv, found.logical_not())
        result = ep.where(atleast_kd(new_found, result.ndim), xp, result)
        found = ep.logical_or(found, new_found)

    return restore_type(result)
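A tiny standalone illustration (hypothetical batch size) of the index pools built above: each sample gets a shuffled list of every other sample's index, so successive iterations try different dataset samples as candidate adversarials.

# Sketch of the index-pool construction above, for batch_size = 3.
import numpy as np

batch_size = 3
pools = []
for i in range(batch_size):
    indices = [j for j in range(batch_size) if j != i]  # every *other* index
    np.random.shuffle(indices)
    pools.append(indices)
print(pools)  # e.g. [[2, 1], [0, 2], [1, 0]]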
def __call__(
    self, inputs, labels, *, epsilon, criterion, repeats=100, check_trivial=True
):
    originals = ep.astensor(inputs)
    labels = ep.astensor(labels)

    def is_adversarial(p: ep.Tensor) -> ep.Tensor:
        """For each input in x, returns true if it is an adversarial for
        the given model and criterion"""
        logits = self.model.forward(p)
        return criterion(originals, labels, p, logits)

    x0 = ep.astensor(inputs)
    min_, max_ = self.model.bounds()

    result = x0
    if check_trivial:
        found = is_adversarial(result)
    else:
        found = ep.zeros(x0, len(result)).bool()

    for _ in range(repeats):
        if found.all():
            break

        p = self.sample_noise(x0)
        norms = self.get_norms(p)
        p = p / atleast_kd(norms, p.ndim)
        x = x0 + epsilon * p
        x = x.clip(min_, max_)

        is_adv = is_adversarial(x)
        is_new_adv = ep.logical_and(is_adv, ep.logical_not(found))
        result = ep.where(atleast_kd(is_new_adv, x.ndim), x, result)
        found = ep.logical_or(found, is_adv)

    return result.tensor
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    min_, max_ = model.bounds
    target = min_ + self.target * (max_ - min_)
    direction = target - x

    best = ep.ones(x, len(x))

    epsilon = 0.0
    stepsize = 1.0 / self.steps
    for _ in range(self.steps):
        # TODO: reduce the batch size to the ones that have not yet been successful
        is_adv = is_adversarial(x + epsilon * direction)
        is_best_adv = ep.logical_and(is_adv, best == 1)
        best = ep.where(is_best_adv, epsilon, best)

        if (best < 1).all():
            break  # pragma: no cover

        epsilon += stepsize

    eps = atleast_kd(best, x.ndim)
    xp = x + eps * direction
    return restore_type(xp)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, Any] = None,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x0, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    verify_input_bounds(x0, model)

    is_adversarial = get_is_adversarial(criterion_, model)

    min_, max_ = model.bounds

    result = x0
    if self.check_trivial:
        found = is_adversarial(result)
    else:
        found = ep.zeros(x0, len(result)).bool()

    for _ in range(self.repeats):
        if found.all():
            break

        p = self.sample_noise(x0)
        epsilons = self.get_epsilons(x0, p, epsilon, min_=min_, max_=max_)
        x = x0 + epsilons * p
        x = x.clip(min_, max_)

        is_adv = is_adversarial(x)
        is_new_adv = ep.logical_and(is_adv, ep.logical_not(found))
        result = ep.where(atleast_kd(is_new_adv, x.ndim), x, result)
        found = ep.logical_or(found, is_adv)

    return restore_type(result)
def __call__(self, model: Model, inputs: T, criterion: Union[Criterion, T]) -> T:
    x, restore_type = ep.astensor_(inputs)
    del inputs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    best = self._attack(model, x, criterion)
    best_is_adv = is_adversarial(best)

    for _ in range(1, self._times):
        xp = self._attack(model, x, criterion)
        # assumes xp does not violate the perturbation size constraint
        is_adv = is_adversarial(xp)
        new_best = ep.logical_and(is_adv, best_is_adv.logical_not())
        best = ep.where(atleast_kd(new_best, best.ndim), xp, best)
        best_is_adv = ep.logical_or(is_adv, best_is_adv)

    return restore_type(best)
def _apply_decision_rule(
    decision_rule: Union[Literal["EN"], Literal["L1"]],
    beta: float,
    best_advs: ep.Tensor,
    best_advs_norms: ep.Tensor,
    x_k: ep.Tensor,
    x: ep.Tensor,
    found_advs: ep.Tensor,
) -> Tuple[ep.Tensor, ep.Tensor]:
    if decision_rule == "EN":
        norms = beta * flatten(x_k - x).abs().sum(axis=-1) + flatten(
            x_k - x
        ).square().sum(axis=-1)
    else:  # decision rule = L1
        norms = flatten(x_k - x).abs().sum(axis=-1)

    new_best = ep.logical_and(norms < best_advs_norms, found_advs)
    new_best_kd = atleast_kd(new_best, best_advs.ndim)
    best_advs = ep.where(new_best_kd, x_k, best_advs)
    best_advs_norms = ep.where(new_best, norms, best_advs_norms)

    return best_advs, best_advs_norms
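A worked toy example (hypothetical numbers) of the two decision rules above: the elastic-net rule scores a candidate by beta * ||d||_1 + ||d||_2^2, while the L1 rule uses ||d||_1 alone.

# Toy illustration of the EN vs. L1 scores computed above.
import numpy as np

d = np.array([0.1, -0.2, 0.0, 0.4])  # perturbation x_k - x for one sample
beta = 1e-3
l1 = np.abs(d).sum()                 # 0.7
en = beta * l1 + (d ** 2).sum()      # 0.0007 + 0.21 = 0.2107
print(l1, en)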
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    self.process_raw()
    assert self.inputs is not None
    assert self.outputs is not None
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    criterion = get_criterion(criterion)

    result = x
    found = criterion(x, model(x))

    dataset_size = len(self.inputs)
    batch_size = len(x)

    while not found.all():
        indices = np.random.randint(0, dataset_size, size=(batch_size,))

        xp = self.inputs[indices]
        yp = self.outputs[indices]
        is_adv = criterion(xp, yp)

        new_found = ep.logical_and(is_adv, found.logical_not())
        result = ep.where(atleast_kd(new_found, result.ndim), xp, result)
        found = ep.logical_or(found, new_found)

    return restore_type(result)
def test_logical_and(t: Tensor) -> Tensor:
    return ep.logical_and(t < 3, t > 1)
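ep.logical_and works elementwise on boolean tensors from any EagerPy-supported backend; a numpy-backed sanity check of the test above:

import numpy as np
import eagerpy as ep

t = ep.astensor(np.array([0.0, 1.0, 2.0, 3.0]))
print(ep.logical_and(t < 3, t > 1).numpy())  # [False False  True False]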
def __call__(
    self,
    model: Model,
    inputs,
    labels,
    *,
    criterion=misclassification,
    channel_axis: Optional[int] = None,
):
    """
    Parameters
    ----------
    channel_axis
        The axis across which the noise should be the same (if across_channels
        is True). If None, will be automatically inferred from the model if
        possible.
    """
    inputs, labels, restore = wrap(inputs, labels)
    is_adversarial = get_is_adversarial(criterion, inputs, labels, model)

    x0 = inputs
    N = len(x0)
    shape = list(x0.shape)
    if self.across_channels and x0.ndim > 2:
        if channel_axis is None and not hasattr(model, "data_format"):
            raise ValueError(
                "cannot infer the data_format from the model, please specify"
                " channel_axis when calling the attack"
            )
        elif channel_axis is None:
            data_format = model.data_format  # type: ignore
            if (
                data_format is None
                or data_format != "channels_first"
                and data_format != "channels_last"
            ):
                raise ValueError(
                    f"expected data_format to be 'channels_first' or 'channels_last'"
                )
            channel_axis = 1 if data_format == "channels_first" else x0.ndim - 1
        elif not 0 <= channel_axis < x0.ndim:
            raise ValueError(f"expected channel_axis to be in [0, {x0.ndim})")
        shape[channel_axis] = 1

    min_, max_ = model.bounds()
    r = max_ - min_

    result = x0
    is_adv = is_adversarial(result)
    best_advs_norms = ep.where(is_adv, ep.zeros(x0, N), ep.full(x0, N, ep.inf))

    min_probability = ep.zeros(x0, N)
    max_probability = ep.ones(x0, N)
    stepsizes = max_probability / self.steps
    p = stepsizes

    for step in range(self.steps):
        # add salt and pepper
        u = ep.uniform(x0, shape)
        p_ = atleast_kd(p, x0.ndim)
        salt = (u >= 1 - p_ / 2).astype(x0.dtype) * r
        pepper = -(u < p_ / 2).astype(x0.dtype) * r
        x = x0 + salt + pepper
        x = ep.clip(x, min_, max_)

        # check if we found new best adversarials
        norms = flatten(x).square().sum(axis=-1).sqrt()
        closer = norms < best_advs_norms
        is_adv = is_adversarial(x)  # TODO: ignore those that are not closer anyway
        is_best_adv = ep.logical_and(is_adv, closer)

        # update results and search space
        result = ep.where(atleast_kd(is_best_adv, x.ndim), x, result)
        best_advs_norms = ep.where(is_best_adv, norms, best_advs_norms)

        min_probability = ep.where(is_best_adv, 0.5 * p, min_probability)
        # we set max_probability a bit higher than p because the relationship
        # between p and norms is not strictly monotonic
        max_probability = ep.where(
            is_best_adv, ep.minimum(p * 1.2, 1.0), max_probability
        )
        remaining = self.steps - step
        stepsizes = ep.where(
            is_best_adv, (max_probability - min_probability) / remaining, stepsizes
        )
        reset = p == max_probability
        p = ep.where(ep.logical_or(is_best_adv, reset), min_probability, p)
        p = ep.minimum(p + stepsizes, max_probability)

    return restore(result)
def __call__(self, perturbed: T, outputs: T) -> T:
    args, restore_type = ep.astensors_(perturbed, outputs)
    a = self.a(*args)
    b = self.b(*args)
    is_adv = ep.logical_and(a, b)
    return restore_type(is_adv)
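This __call__ computes the intersection of two criteria; in Foolbox this is what the & operator on criteria constructs. A minimal sketch (the combination of Misclassification and TargetedMisclassification is an illustrative choice, not from the source):

import numpy as np
import eagerpy as ep
import foolbox as fb

labels = ep.astensor(np.array([0, 1]))
target_classes = ep.astensor(np.array([2, 2]))

# `&` builds the AND-combination whose __call__ is shown above: an input only
# counts as adversarial if *both* criteria accept it (here: misclassified AND
# classified as the target class).
criterion = fb.criteria.Misclassification(labels) & fb.criteria.TargetedMisclassification(
    target_classes
)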
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Misclassification, TargetedMisclassification, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    N = len(x)

    if isinstance(criterion_, Misclassification):
        targeted = False
        classes = criterion_.labels
    elif isinstance(criterion_, TargetedMisclassification):
        targeted = True
        classes = criterion_.target_classes
    else:
        raise ValueError("unsupported criterion")

    if classes.shape != (N,):
        name = "target_classes" if targeted else "labels"
        raise ValueError(f"expected {name} to have shape ({N},), got {classes.shape}")

    stepsize = 1.0
    min_, max_ = model.bounds

    def loss_fn(inputs: ep.Tensor, labels: ep.Tensor) -> Tuple[ep.Tensor, ep.Tensor]:
        logits = model(inputs)

        sign = -1.0 if targeted else 1.0
        loss = sign * ep.crossentropy(logits, labels).sum()

        return loss, logits

    grad_and_logits = ep.value_and_grad_fn(x, loss_fn, has_aux=True)

    delta = ep.zeros_like(x)

    epsilon = self.init_epsilon * ep.ones(x, len(x))
    worst_norm = ep.norms.l2(flatten(ep.maximum(x - min_, max_ - x)), -1)

    best_l2 = worst_norm
    best_delta = delta
    adv_found = ep.zeros(x, len(x)).bool()

    for i in range(self.steps):
        # perform cosine annealing of LR starting from 1.0 to 0.01
        stepsize = (
            0.01 + (stepsize - 0.01) * (1 + math.cos(math.pi * i / self.steps)) / 2
        )

        x_adv = x + delta

        _, logits, gradients = grad_and_logits(x_adv, classes)
        gradients = normalize_gradient_l2_norms(gradients)
        is_adversarial = criterion_(x_adv, logits)

        l2 = ep.norms.l2(flatten(delta), axis=-1)
        is_smaller = l2 <= best_l2

        is_both = ep.logical_and(is_adversarial, is_smaller)
        adv_found = ep.logical_or(adv_found, is_adversarial)
        best_l2 = ep.where(is_both, l2, best_l2)

        best_delta = ep.where(atleast_kd(is_both, x.ndim), delta, best_delta)

        # do step
        delta = delta + stepsize * gradients

        epsilon = epsilon * ep.where(
            is_adversarial, 1.0 - self.gamma, 1.0 + self.gamma
        )
        epsilon = ep.minimum(epsilon, worst_norm)

        # project to epsilon ball
        delta *= atleast_kd(epsilon / ep.norms.l2(flatten(delta), -1), x.ndim)

        # clip to valid bounds
        delta = ep.clip(x + delta, *model.bounds) - x

    x_adv = x + best_delta

    return restore_type(x_adv)
def __call__(  # noqa: F811
    self,
    model: Model,
    inputs: T,
    criterion: Any,
    *,
    epsilons: Union[Sequence[Union[float, None]], float, None],
    **kwargs: Any,
) -> Union[Tuple[List[T], List[T], T], Tuple[T, T, T]]:
    x, restore_type = ep.astensor_(inputs)
    del inputs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)

    was_iterable = True
    if not isinstance(epsilons, Iterable):
        epsilons = [epsilons]
        was_iterable = False

    N = len(x)
    K = len(epsilons)

    for i in range(self.times):
        # run the attack
        xps, xpcs, success = self.attack(model, x, criterion, epsilons=epsilons, **kwargs)
        assert len(xps) == K
        assert len(xpcs) == K
        for xp in xps:
            assert xp.shape == x.shape
        for xpc in xpcs:
            assert xpc.shape == x.shape
        assert success.shape == (K, N)

        if i == 0:
            best_xps = xps
            best_xpcs = xpcs
            best_success = success
            continue

        # TODO: test if stacking the list to a single tensor and
        # getting rid of the loop is faster

        for k, epsilon in enumerate(epsilons):
            first = best_success[k].logical_not()
            assert first.shape == (N,)
            if epsilon is None:
                # if epsilon is None, we need the minimum

                # TODO: maybe cache some of these distances
                # and then remove the else part
                closer = self.distance(x, xps[k]) < self.distance(x, best_xps[k])
                assert closer.shape == (N,)
                new_best = ep.logical_and(success[k], ep.logical_or(closer, first))
            else:
                # for concrete epsilon, we just need a successful one
                new_best = ep.logical_and(success[k], first)
            new_best = atleast_kd(new_best, x.ndim)
            best_xps[k] = ep.where(new_best, xps[k], best_xps[k])
            best_xpcs[k] = ep.where(new_best, xpcs[k], best_xpcs[k])

        best_success = ep.logical_or(success, best_success)

    best_xps_ = [restore_type(xp) for xp in best_xps]
    best_xpcs_ = [restore_type(xpc) for xpc in best_xpcs]
    if was_iterable:
        return best_xps_, best_xpcs_, restore_type(best_success)
    else:
        assert len(best_xps_) == 1
        assert len(best_xpcs_) == 1
        return (
            best_xps_[0],
            best_xpcs_[0],
            restore_type(best_success.squeeze(axis=0)),
        )
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Misclassification,
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x0, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    is_adversarial = get_is_adversarial(criterion_, model)

    N = len(x0)
    shape = list(x0.shape)
    if self.across_channels and x0.ndim > 2:
        if self.channel_axis is None:
            channel_axis = get_channel_axis(model, x0.ndim)
        else:
            channel_axis = self.channel_axis % x0.ndim
        if channel_axis is not None:
            shape[channel_axis] = 1

    min_, max_ = model.bounds
    r = max_ - min_

    result = x0
    is_adv = is_adversarial(result)
    best_advs_norms = ep.where(is_adv, ep.zeros(x0, N), ep.full(x0, N, ep.inf))

    min_probability = ep.zeros(x0, N)
    max_probability = ep.ones(x0, N)
    stepsizes = max_probability / self.steps
    p = stepsizes

    for step in range(self.steps):
        # add salt and pepper
        u = ep.uniform(x0, tuple(shape))
        p_ = atleast_kd(p, x0.ndim)
        salt = (u >= 1 - p_ / 2).astype(x0.dtype) * r
        pepper = -(u < p_ / 2).astype(x0.dtype) * r
        x = x0 + salt + pepper
        x = ep.clip(x, min_, max_)

        # check if we found new best adversarials
        norms = flatten(x).norms.l2(axis=-1)
        closer = norms < best_advs_norms
        is_adv = is_adversarial(x)  # TODO: ignore those that are not closer anyway
        is_best_adv = ep.logical_and(is_adv, closer)

        # update results and search space
        result = ep.where(atleast_kd(is_best_adv, x.ndim), x, result)
        best_advs_norms = ep.where(is_best_adv, norms, best_advs_norms)

        min_probability = ep.where(is_best_adv, 0.5 * p, min_probability)
        # we set max_probability a bit higher than p because the relationship
        # between p and norms is not strictly monotonic
        max_probability = ep.where(
            is_best_adv, ep.minimum(p * 1.2, 1.0), max_probability
        )
        remaining = self.steps - step
        stepsizes = ep.where(
            is_best_adv, (max_probability - min_probability) / remaining, stepsizes
        )
        reset = p == max_probability
        p = ep.where(ep.logical_or(is_best_adv, reset), min_probability, p)
        p = ep.minimum(p + stepsizes, max_probability)

    return restore_type(result)
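A hypothetical usage sketch for the attack above, assuming a Foolbox-style wrapped model; fmodel, images, and labels are placeholders, not defined in the source:

import foolbox as fb

# `fmodel` would be e.g. fb.PyTorchModel(net, bounds=(0, 1));
# `images` and `labels` are batches within the model's bounds.
attack = fb.attacks.SaltAndPepperNoiseAttack(steps=1000)
criterion = fb.criteria.Misclassification(labels)
best_advs = attack.run(fmodel, images, criterion)  # L2-minimizing salt & pepper advs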
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    if starting_points is None:
        raise ValueError("BinarizationRefinementAttack requires starting_points")
    (o, x), restore_type = ep.astensors_(inputs, starting_points)
    del inputs, starting_points, kwargs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if self.threshold is None:
        min_, max_ = model.bounds
        threshold = (min_ + max_) / 2.0
    else:
        threshold = self.threshold

    assert o.dtype == x.dtype

    nptype = o.reshape(-1)[0].numpy().dtype.type
    if nptype not in [np.float16, np.float32, np.float64]:
        raise ValueError(  # pragma: no cover
            f"expected dtype to be float16, float32 or float64, found '{nptype}'"
        )

    threshold = nptype(threshold)
    offset = nptype(1.0)

    if self.included_in == "lower":
        lower_ = threshold
        upper_ = np.nextafter(threshold, threshold + offset)
    elif self.included_in == "upper":
        lower_ = np.nextafter(threshold, threshold - offset)
        upper_ = threshold
    else:
        raise ValueError(
            f"expected included_in to be 'lower' or 'upper', found '{self.included_in}'"
        )

    assert lower_ < upper_

    p = ep.full_like(o, ep.nan)

    lower = ep.ones_like(o) * lower_
    upper = ep.ones_like(o) * upper_

    indices = ep.logical_and(o <= lower, x <= lower)
    p = ep.where(indices, o, p)

    indices = ep.logical_and(o <= lower, x >= upper)
    p = ep.where(indices, upper, p)

    indices = ep.logical_and(o >= upper, x <= lower)
    p = ep.where(indices, lower, p)

    indices = ep.logical_and(o >= upper, x >= upper)
    p = ep.where(indices, o, p)

    assert not ep.any(ep.isnan(p))

    is_adv1 = is_adversarial(x)
    is_adv2 = is_adversarial(p)
    if (is_adv1 != is_adv2).any():
        raise ValueError(
            "The specified threshold does not match what is done by the model."
        )

    return restore_type(p)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Misclassification, TargetedMisclassification, T],
    *,
    starting_points: Optional[ep.Tensor] = None,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    criterion_ = get_criterion(criterion)

    if isinstance(criterion_, Misclassification):
        targeted = False
        classes = criterion_.labels
    elif isinstance(criterion_, TargetedMisclassification):
        targeted = True
        classes = criterion_.target_classes
    else:
        raise ValueError("unsupported criterion")

    def loss_fn(
        inputs: ep.Tensor, labels: ep.Tensor
    ) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
        logits = model(inputs)

        if targeted:
            c_minimize = best_other_classes(logits, labels)
            c_maximize = labels  # target_classes
        else:
            c_minimize = labels  # labels
            c_maximize = best_other_classes(logits, labels)

        loss = logits[rows, c_minimize] - logits[rows, c_maximize]

        return -loss.sum(), (logits, loss)

    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion, kwargs

    N = len(x)

    # start from initialization points/attack
    if starting_points is not None:
        x1 = starting_points
    else:
        if self.init_attack is not None:
            x1 = self.init_attack.run(model, x, criterion_)
        else:
            x1 = None

    # if initial points or initialization attacks are provided,
    # search for the boundary
    if x1 is not None:
        is_adv = get_is_adversarial(criterion_, model)
        assert is_adv(x1).all()
        lower_bound = ep.zeros(x, shape=(N,))
        upper_bound = ep.ones(x, shape=(N,))
        for _ in range(self.binary_search_steps):
            epsilons = (lower_bound + upper_bound) / 2
            mid_points = self.mid_points(x, x1, epsilons, model.bounds)
            is_advs = is_adv(mid_points)
            lower_bound = ep.where(is_advs, lower_bound, epsilons)
            upper_bound = ep.where(is_advs, epsilons, upper_bound)
        starting_points = self.mid_points(x, x1, upper_bound, model.bounds)
        delta = starting_points - x
    else:
        # start from x0
        delta = ep.zeros_like(x)

    if classes.shape != (N,):
        name = "target_classes" if targeted else "labels"
        raise ValueError(f"expected {name} to have shape ({N},), got {classes.shape}")

    min_, max_ = model.bounds
    rows = range(N)
    grad_and_logits = ep.value_and_grad_fn(x, loss_fn, has_aux=True)

    if self.p != 0:
        epsilon = ep.inf * ep.ones(x, len(x))
    else:
        epsilon = (
            ep.ones(x, len(x))
            if x1 is None
            else ep.norms.l0(flatten(delta), axis=-1)
        )
    if self.p != 0:
        worst_norm = ep.norms.lp(
            flatten(ep.maximum(x - min_, max_ - x)), p=self.p, axis=-1
        )
    else:
        worst_norm = flatten(ep.ones_like(x)).bool().sum(axis=1).float32()

    best_lp = worst_norm
    best_delta = delta
    adv_found = ep.zeros(x, len(x)).bool()

    for i in range(self.steps):
        # perform cosine annealing of learning rates
        stepsize = (
            self.min_stepsize
            + (self.max_stepsize - self.min_stepsize)
            * (1 + math.cos(math.pi * i / self.steps))
            / 2
        )
        gamma = (
            0.001
            + (self.gamma - 0.001) * (1 + math.cos(math.pi * (i / self.steps))) / 2
        )

        x_adv = x + delta

        loss, (logits, loss_batch), gradients = grad_and_logits(x_adv, classes)
        is_adversarial = criterion_(x_adv, logits)

        lp = ep.norms.lp(flatten(delta), p=self.p, axis=-1)
        is_smaller = lp <= best_lp

        is_both = ep.logical_and(is_adversarial, is_smaller)
        adv_found = ep.logical_or(adv_found, is_adversarial)
        best_lp = ep.where(is_both, lp, best_lp)
        best_delta = ep.where(atleast_kd(is_both, x.ndim), delta, best_delta)

        # update epsilon
        if self.p != 0:
            distance_to_boundary = abs(loss_batch) / ep.norms.lp(
                flatten(gradients), p=self.dual, axis=-1
            )
            epsilon = ep.where(
                is_adversarial,
                ep.minimum(
                    epsilon * (1 - gamma),
                    ep.norms.lp(flatten(best_delta), p=self.p, axis=-1),
                ),
                ep.where(
                    adv_found,
                    epsilon * (1 + gamma),
                    ep.norms.lp(flatten(delta), p=self.p, axis=-1)
                    + distance_to_boundary,
                ),
            )
        else:
            epsilon = ep.where(
                is_adversarial,
                ep.minimum(
                    ep.minimum(
                        epsilon - 1,
                        (epsilon * (1 - gamma)).astype(int).astype(epsilon.dtype),
                    ),
                    ep.norms.lp(flatten(best_delta), p=self.p, axis=-1),
                ),
                ep.maximum(
                    epsilon + 1,
                    (epsilon * (1 + gamma)).astype(int).astype(epsilon.dtype),
                ),
            )
            epsilon = ep.maximum(0, epsilon).astype(epsilon.dtype)

        # clip epsilon
        epsilon = ep.minimum(epsilon, worst_norm)

        # computes normalized gradient update
        grad_ = self.normalize(gradients, x=x, bounds=model.bounds) * stepsize

        # do step
        delta = delta + grad_

        # project according to the given norm
        delta = self.project(x=x + delta, x0=x, epsilon=epsilon) - x

        # clip to valid bounds
        delta = ep.clip(x + delta, *model.bounds) - x

    x_adv = x + best_delta

    return restore_type(x_adv)
def __call__(
    self,
    inputs,
    labels,
    *,
    adversarials,
    criterion,
    threshold=None,
    included_in="upper",
):
    """For models that preprocess their inputs by binarizing the inputs, this
    attack can improve adversarials found by other attacks. It does this by
    utilizing information about the binarization and mapping values to the
    corresponding value in the clean input or to the right side of the
    threshold.

    Parameters
    ----------
    threshold : float
        The threshold used by the model's binarization. If None, defaults
        to the midpoint of the model's bounds,
        (model.bounds()[0] + model.bounds()[1]) / 2.
    included_in : str
        Whether the threshold value itself belongs to the lower or upper
        interval.
    """
    originals = ep.astensor(inputs)
    labels = ep.astensor(labels)

    def is_adversarial(p: ep.Tensor) -> ep.Tensor:
        """For each input in x, returns true if it is an adversarial for
        the given model and criterion"""
        logits = ep.astensor(self.model.forward(p.tensor))
        return criterion(originals, labels, p, logits)

    o = ep.astensor(inputs)
    x = ep.astensor(adversarials)

    min_, max_ = self.model.bounds()
    if threshold is None:
        threshold = (min_ + max_) / 2.0

    assert o.dtype == x.dtype
    dtype = o.dtype

    if dtype == o.backend.float16:
        nptype = np.float16
    elif dtype == o.backend.float32:
        nptype = np.float32
    elif dtype == o.backend.float64:
        nptype = np.float64
    else:
        raise ValueError(
            f"expected dtype to be float16, float32 or float64, found '{dtype}'"
        )

    threshold = nptype(threshold)
    offset = nptype(1.0)

    if included_in == "lower":
        lower = threshold
        upper = np.nextafter(threshold, threshold + offset)
    elif included_in == "upper":
        lower = np.nextafter(threshold, threshold - offset)
        upper = threshold
    else:
        raise ValueError(
            f"expected included_in to be 'lower' or 'upper', found '{included_in}'"
        )

    assert lower < upper

    p = ep.full_like(o, ep.nan)

    lower = ep.ones_like(o) * lower
    upper = ep.ones_like(o) * upper

    indices = ep.logical_and(o <= lower, x <= lower)
    p = ep.where(indices, o, p)

    indices = ep.logical_and(o <= lower, x >= upper)
    p = ep.where(indices, upper, p)

    indices = ep.logical_and(o >= upper, x <= lower)
    p = ep.where(indices, lower, p)

    indices = ep.logical_and(o >= upper, x >= upper)
    p = ep.where(indices, o, p)

    assert not ep.any(ep.isnan(p))

    is_adv1 = is_adversarial(x)
    is_adv2 = is_adversarial(p)
    assert (
        is_adv1 == is_adv2
    ).all(), "The specified threshold does not match what is done by the model."

    return p.tensor
def _binary_search_on_alpha(
    self,
    function_evolution: Callable[[ep.Tensor], ep.Tensor],
    lower: ep.Tensor,
) -> ep.Tensor:
    # Upper --> not adversarial / Lower --> adversarial
    v_type = function_evolution(lower)

    def get_alpha(theta: ep.Tensor) -> ep.Tensor:
        return 1 - ep.astensor(self._cos(theta.raw * np.pi / 180))

    check_opposite = lower > 0  # if param < 0: abs(param) doesn't work

    # Get the upper range
    upper = ep.where(
        abs(lower) != self.theta_max,
        lower + ep.sign(lower) * self.theta_max / self.T,
        ep.zeros_like(lower),
    )

    mask_upper = upper == 0
    while mask_upper.any():
        # Find the correct lower/upper range
        # if True in mask_upper, the range hasn't been found yet
        new_upper = lower + ep.sign(lower) * self.theta_max / self.T
        potential_x = function_evolution(new_upper)
        x = ep.where(
            atleast_kd(mask_upper, potential_x.ndim),
            potential_x,
            ep.zeros_like(potential_x),
        )
        is_advs = self._is_adversarial(x)
        lower = ep.where(ep.logical_and(mask_upper, is_advs), new_upper, lower)
        upper = ep.where(
            ep.logical_and(mask_upper, is_advs.logical_not()), new_upper, upper
        )
        mask_upper = mask_upper * is_advs

    step = 0
    over_gamma = abs(get_alpha(upper) - get_alpha(lower)) > self._BS_gamma
    while step < self._BS_max_iteration and over_gamma.any():
        mid_bound = (upper + lower) / 2
        mid = ep.where(
            atleast_kd(ep.logical_and(mid_bound != 0, over_gamma), v_type.ndim),
            function_evolution(mid_bound),
            ep.zeros_like(v_type),
        )
        is_adv = self._is_adversarial(mid)

        mid_opp = ep.where(
            atleast_kd(
                ep.logical_and(ep.astensor(check_opposite), over_gamma), mid.ndim
            ),
            function_evolution(-mid_bound),
            ep.zeros_like(mid),
        )
        is_adv_opp = self._is_adversarial(mid_opp)

        lower = ep.where(over_gamma * is_adv, mid_bound, lower)
        lower = ep.where(
            over_gamma * is_adv.logical_not() * check_opposite * is_adv_opp,
            -mid_bound,
            lower,
        )
        upper = ep.where(
            over_gamma * is_adv.logical_not() * check_opposite * is_adv_opp,
            -upper,
            upper,
        )
        upper = ep.where(
            over_gamma * (abs(lower) != abs(mid_bound)), mid_bound, upper
        )

        check_opposite = over_gamma * check_opposite * is_adv_opp * (lower > 0)
        over_gamma = abs(get_alpha(upper) - get_alpha(lower)) > self._BS_gamma

        step += 1

    return ep.astensor(lower)
def __call__(self, inputs, labels, *, criterion, steps=1000):
    originals = ep.astensor(inputs)
    labels = ep.astensor(labels)

    def is_adversarial(p: ep.Tensor) -> ep.Tensor:
        """For each input in x, returns true if it is an adversarial for
        the given model and criterion"""
        logits = ep.astensor(self.model.forward(p.tensor))
        return criterion(originals, labels, p, logits)

    x0 = ep.astensor(inputs)

    N = len(x0)
    shape = list(x0.shape)
    if self.channel_axis is not None:
        shape[self.channel_axis] = 1

    min_, max_ = self.model.bounds()
    r = max_ - min_

    result = x0
    is_adv = is_adversarial(result)
    best_advs_norms = ep.where(is_adv, ep.zeros(x0, N), ep.full(x0, N, ep.inf))

    min_probability = ep.zeros(x0, N)
    max_probability = ep.ones(x0, N)
    stepsizes = max_probability / steps
    p = stepsizes

    for step in range(steps):
        # add salt and pepper
        u = ep.uniform(x0, shape)
        p_ = atleast_kd(p, x0.ndim)
        salt = (u >= 1 - p_ / 2).astype(x0.dtype) * r
        pepper = -(u < p_ / 2).astype(x0.dtype) * r
        x = x0 + salt + pepper
        x = ep.clip(x, min_, max_)

        # check if we found new best adversarials
        norms = flatten(x).square().sum(axis=-1).sqrt()
        closer = norms < best_advs_norms
        is_adv = is_adversarial(x)  # TODO: ignore those that are not closer anyway
        is_best_adv = ep.logical_and(is_adv, closer)

        # update results and search space
        result = ep.where(atleast_kd(is_best_adv, x.ndim), x, result)
        best_advs_norms = ep.where(is_best_adv, norms, best_advs_norms)

        min_probability = ep.where(is_best_adv, 0.5 * p, min_probability)
        # we set max_probability a bit higher than p because the relationship
        # between p and norms is not strictly monotonic
        max_probability = ep.where(
            is_best_adv, ep.minimum(p * 1.2, 1.0), max_probability
        )
        remaining = steps - step
        stepsizes = ep.where(
            is_best_adv, (max_probability - min_probability) / remaining, stepsizes
        )
        reset = p == max_probability
        p = ep.where(ep.logical_or(is_best_adv, reset), min_probability, p)
        p = ep.minimum(p + stepsizes, max_probability)

    return result.tensor
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    epsilons: Sequence[float],
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    originals, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    if self.eps_early_stop and len(epsilons) != 1:
        raise ValueError(
            "epsilon-based early stopping only possible for one epsilon value"
        )

    verify_input_bounds(originals, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if starting_points is None:
        init_attack: MinimizationAttack
        if self.init_attack is None:
            init_attack = LinearSearchBlendedUniformNoiseAttack(steps=50)
            logging.info(
                f"Neither starting_points nor init_attack given. Falling"
                f" back to {init_attack!r} for initialization."
            )
        else:
            init_attack = self.init_attack
        # TODO: use call and support all types of attacks (once early_stop is
        # possible in __call__)
        best_advs = init_attack.run(model, originals, criterion, early_stop=early_stop)
    else:
        best_advs = ep.astensor(starting_points)

    is_adv = is_adversarial(best_advs)
    if not is_adv.all():
        failed = is_adv.logical_not().float32().sum()
        if starting_points is None:
            raise ValueError(
                f"init_attack failed for {failed} of {len(is_adv)} inputs"
            )
        else:
            raise ValueError(
                f"{failed} of {len(is_adv)} starting_points are not adversarial"
            )
    del starting_points

    tb = TensorBoard(logdir=self.tensorboard)

    N = len(originals)
    epsilon = ep.astensor(epsilons[0] * ep.ones(originals, (N,)))
    ndim = originals.ndim
    spherical_steps = ep.ones(originals, N) * self.spherical_step
    source_steps = ep.ones(originals, N) * self.source_step

    tb.scalar("batchsize", N, 0)

    # create two queues for each sample to track success rates
    # (used to update the hyper parameters)
    stats_spherical_adversarial = ArrayQueue(maxlen=100, N=N)
    stats_step_adversarial = ArrayQueue(maxlen=30, N=N)

    bounds = model.bounds

    for step in range(1, self.steps + 1):
        converged = source_steps < self.source_step_convergance
        if converged.all():
            break  # pragma: no cover
        converged = atleast_kd(converged, ndim)

        # TODO: performance: ignore those that have converged
        # (we could select the non-converged ones, but we currently
        # cannot easily invert this in the end using EagerPy)

        unnormalized_source_directions = originals - best_advs
        source_norms = ep.norms.l2(flatten(unnormalized_source_directions), axis=-1)
        source_directions = unnormalized_source_directions / atleast_kd(
            source_norms, ndim
        )

        # only check spherical candidates every k steps
        check_spherical_and_update_stats = step % self.update_stats_every_k == 0

        candidates, spherical_candidates = draw_proposals(
            bounds,
            originals,
            best_advs,
            unnormalized_source_directions,
            source_directions,
            source_norms,
            spherical_steps,
            source_steps,
        )
        assert candidates.dtype == originals.dtype
        assert spherical_candidates.dtype == originals.dtype

        is_adv = is_adversarial(candidates)

        spherical_is_adv: Optional[ep.Tensor]
        if check_spherical_and_update_stats:
            spherical_is_adv = is_adversarial(spherical_candidates)
            stats_spherical_adversarial.append(spherical_is_adv)
            # TODO: algorithm: the original implementation ignores those samples
            # for which spherical is not adversarial and continues with the
            # next iteration -> we estimate different probabilities
            # (conditional vs. unconditional)
            # TODO: thoughts: should we always track this because we compute it anyway
            stats_step_adversarial.append(is_adv)
        else:
            spherical_is_adv = None

        # in theory, we are closer per construction
        # but limited numerical precision might break this
        distances = ep.norms.l2(flatten(originals - candidates), axis=-1)
        closer = distances < source_norms
        is_best_adv = ep.logical_and(is_adv, closer)
        is_best_adv = atleast_kd(is_best_adv, ndim)

        cond = converged.logical_not().logical_and(is_best_adv)
        best_advs = ep.where(cond, candidates, best_advs)

        tb.probability("converged", converged, step)
        tb.scalar("updated_stats", check_spherical_and_update_stats, step)
        tb.histogram("norms", source_norms, step)
        tb.probability("is_adv", is_adv, step)
        if spherical_is_adv is not None:
            tb.probability("spherical_is_adv", spherical_is_adv, step)
        tb.histogram("candidates/distances", distances, step)
        tb.probability("candidates/closer", closer, step)
        tb.probability("candidates/is_best_adv", is_best_adv, step)
        tb.probability("new_best_adv_including_converged", is_best_adv, step)
        tb.probability("new_best_adv", cond, step)

        if check_spherical_and_update_stats:
            full = stats_spherical_adversarial.isfull()
            tb.probability("spherical_stats/full", full, step)
            if full.any():
                probs = stats_spherical_adversarial.mean()
                cond1 = ep.logical_and(probs > 0.5, full)
                spherical_steps = ep.where(
                    cond1, spherical_steps * self.step_adaptation, spherical_steps
                )
                source_steps = ep.where(
                    cond1, source_steps * self.step_adaptation, source_steps
                )
                cond2 = ep.logical_and(probs < 0.2, full)
                spherical_steps = ep.where(
                    cond2, spherical_steps / self.step_adaptation, spherical_steps
                )
                source_steps = ep.where(
                    cond2, source_steps / self.step_adaptation, source_steps
                )
                stats_spherical_adversarial.clear(ep.logical_or(cond1, cond2))
                tb.conditional_mean(
                    "spherical_stats/isfull/success_rate/mean", probs, full, step
                )
                tb.probability_ratio(
                    "spherical_stats/isfull/too_linear", cond1, full, step
                )
                tb.probability_ratio(
                    "spherical_stats/isfull/too_nonlinear", cond2, full, step
                )

            full = stats_step_adversarial.isfull()
            tb.probability("step_stats/full", full, step)
            if full.any():
                probs = stats_step_adversarial.mean()
                # TODO: algorithm: changed the two values because we are
                # currently tracking p(source_step_success) instead of
                # p(source_step_success | spherical_step_success) that was
                # tracked before
                cond1 = ep.logical_and(probs > 0.25, full)
                source_steps = ep.where(
                    cond1, source_steps * self.step_adaptation, source_steps
                )
                cond2 = ep.logical_and(probs < 0.1, full)
                source_steps = ep.where(
                    cond2, source_steps / self.step_adaptation, source_steps
                )
                stats_step_adversarial.clear(ep.logical_or(cond1, cond2))
                tb.conditional_mean(
                    "step_stats/isfull/success_rate/mean", probs, full, step
                )
                tb.probability_ratio(
                    "step_stats/isfull/success_rate_too_high", cond1, full, step
                )
                tb.probability_ratio(
                    "step_stats/isfull/success_rate_too_low", cond2, full, step
                )

        tb.histogram("spherical_step", spherical_steps, step)
        tb.histogram("source_step", source_steps, step)

        best_advs_norms = flatten(originals - best_advs).norms.l2(axis=-1)
        if self.eps_early_stop and (
            ep.maximum(best_advs_norms, epsilon) == epsilon
        ).all():
            print("early stopped because epsilon condition satisfied")
            break

    tb.close()
    return restore_type(best_advs)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if x.ndim != 4:
        raise NotImplementedError(
            "only implemented for inputs with two spatial dimensions"
            " (and one channel and one batch dimension)"
        )

    if self.channel_axis is None:
        channel_axis = get_channel_axis(model, x.ndim)
    else:
        channel_axis = self.channel_axis % x.ndim

    if channel_axis is None:
        raise ValueError(
            "cannot infer the data_format from the model, please specify"
            " channel_axis when initializing the attack"
        )

    max_sigma: float
    if self.max_sigma is None:
        if channel_axis == 1:
            h, w = x.shape[2:4]
        elif channel_axis == 3:
            h, w = x.shape[1:3]
        else:
            raise ValueError(
                f"expected 'channel_axis' to be 1 or 3, got {channel_axis}"
            )
        max_sigma = max(h, w)
    else:
        max_sigma = self.max_sigma

    min_, max_ = model.bounds

    x0 = x
    x0_ = x0.numpy()

    result = x0
    found = is_adversarial(x0)

    epsilon = 0.0
    stepsize = 1.0 / self.steps
    for _ in range(self.steps):
        # TODO: reduce the batch size to the ones that haven't been successful
        epsilon += stepsize

        sigmas = [epsilon * max_sigma] * x0.ndim
        sigmas[0] = 0
        sigmas[channel_axis] = 0

        # TODO: once we can implement gaussian_filter in eagerpy, avoid converting from numpy
        x_ = gaussian_filter(x0_, sigmas)
        x_ = np.clip(x_, min_, max_)
        x = ep.from_numpy(x0, x_)

        is_adv = is_adversarial(x)
        new_adv = ep.logical_and(is_adv, found.logical_not())
        result = ep.where(atleast_kd(new_adv, x.ndim), x, result)
        found = ep.logical_or(new_adv, found)

        if found.all():
            break

    return restore_type(result)
def test_logical_and_scalar(t: Tensor) -> Tensor:
    return ep.logical_and(True, t < 3)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Misclassification, TargetedMisclassification, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion, kwargs

    N = len(x)

    if isinstance(criterion_, Misclassification):
        targeted = False
        classes = criterion_.labels
        change_classes_logits = self.confidence
    elif isinstance(criterion_, TargetedMisclassification):
        targeted = True
        classes = criterion_.target_classes
        change_classes_logits = -self.confidence
    else:
        raise ValueError("unsupported criterion")

    def is_adversarial(perturbed: ep.Tensor, logits: ep.Tensor) -> ep.Tensor:
        if change_classes_logits != 0:
            logits += ep.onehot_like(logits, classes, value=change_classes_logits)
        return criterion_(perturbed, logits)

    if classes.shape != (N,):
        name = "target_classes" if targeted else "labels"
        raise ValueError(f"expected {name} to have shape ({N},), got {classes.shape}")

    bounds = model.bounds
    to_attack_space = partial(_to_attack_space, bounds=bounds)
    to_model_space = partial(_to_model_space, bounds=bounds)

    x_attack = to_attack_space(x)
    reconstructed_x = to_model_space(x_attack)

    rows = range(N)

    def loss_fun(
        delta: ep.Tensor, consts: ep.Tensor
    ) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
        assert delta.shape == x_attack.shape
        assert consts.shape == (N,)

        x = to_model_space(x_attack + delta)
        logits = model(x)

        if targeted:
            c_minimize = best_other_classes(logits, classes)
            c_maximize = classes  # target_classes
        else:
            c_minimize = classes  # labels
            c_maximize = best_other_classes(logits, classes)

        is_adv_loss = logits[rows, c_minimize] - logits[rows, c_maximize]
        assert is_adv_loss.shape == (N,)

        is_adv_loss = is_adv_loss + self.confidence
        is_adv_loss = ep.maximum(0, is_adv_loss)
        is_adv_loss = is_adv_loss * consts

        squared_norms = flatten(x - reconstructed_x).square().sum(axis=-1)
        loss = is_adv_loss.sum() + squared_norms.sum()
        return loss, (x, logits)

    loss_aux_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=True)

    consts = self.initial_const * np.ones((N,))
    lower_bounds = np.zeros((N,))
    upper_bounds = np.inf * np.ones((N,))

    best_advs = ep.zeros_like(x)
    best_advs_norms = ep.full(x, (N,), ep.inf)

    # the binary search searches for the smallest consts that produce adversarials
    for binary_search_step in range(self.binary_search_steps):
        if (
            binary_search_step == self.binary_search_steps - 1
            and self.binary_search_steps >= 10
        ):
            # in the last binary search step, repeat the search once
            consts = np.minimum(upper_bounds, 1e10)

        # create a new optimizer to find the delta that minimizes the loss
        delta = ep.zeros_like(x_attack)
        optimizer = AdamOptimizer(delta)

        # tracks whether adv with the current consts was found
        found_advs = np.full((N,), fill_value=False)
        loss_at_previous_check = np.inf

        consts_ = ep.from_numpy(x, consts.astype(np.float32))

        for step in range(self.steps):
            loss, (perturbed, logits), gradient = loss_aux_and_grad(delta, consts_)
            delta += optimizer(gradient, self.stepsize)

            if self.abort_early and step % (np.ceil(self.steps / 10)) == 0:
                # after each tenth of the overall steps, check progress
                if not (loss <= 0.9999 * loss_at_previous_check):
                    break  # stop Adam if there has been no progress
                loss_at_previous_check = loss

            found_advs_iter = is_adversarial(perturbed, logits)
            found_advs = np.logical_or(found_advs, found_advs_iter.numpy())

            norms = flatten(perturbed - x).norms.l2(axis=-1)
            closer = norms < best_advs_norms
            new_best = ep.logical_and(closer, found_advs_iter)

            new_best_ = atleast_kd(new_best, best_advs.ndim)
            best_advs = ep.where(new_best_, perturbed, best_advs)
            best_advs_norms = ep.where(new_best, norms, best_advs_norms)

        upper_bounds = np.where(found_advs, consts, upper_bounds)
        lower_bounds = np.where(found_advs, lower_bounds, consts)

        consts_exponential_search = consts * 10
        consts_binary_search = (lower_bounds + upper_bounds) / 2
        consts = np.where(
            np.isinf(upper_bounds), consts_exponential_search, consts_binary_search
        )

    return restore_type(best_advs)
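A toy sketch (pure numpy, hypothetical outcomes) of the const schedule at the end of the outer loop above: while no adversarial has been found for a sample, upper_bounds stays infinite and the const grows exponentially; once one is found, the search bisects between the bounds.

import numpy as np

consts, lower, upper = np.array([1e-2]), np.array([0.0]), np.array([np.inf])
found = [False, False, True, True]  # hypothetical per-round outcomes
for f in found:
    upper = np.where(f, consts, upper)            # found -> tighten upper bound
    lower = np.where(f, lower, consts)            # not found -> raise lower bound
    consts = np.where(np.isinf(upper), consts * 10, (lower + upper) / 2)
print(consts)  # [0.325]; consts went 0.01 -> 0.1 -> 1.0 -> 0.55 -> 0.325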
def test_logical_and_manual(t: Tensor) -> None:
    assert (ep.logical_and(t < 3, ep.ones_like(t).bool()) == (t < 3)).all()
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    **kwargs: Any,
) -> T:
    """For models that preprocess their inputs by binarizing the inputs, this
    attack can improve adversarials found by other attacks. It does this by
    utilizing information about the binarization and mapping values to the
    corresponding value in the clean input or to the right side of the
    threshold.

    Parameters
    ----------
    threshold : float
        The threshold used by the model's binarization. If None, defaults
        to the midpoint of the model's bounds, (min_ + max_) / 2.
    included_in : str
        Whether the threshold value itself belongs to the lower or upper
        interval.
    """
    raise_if_kwargs(kwargs)
    if starting_points is None:
        raise ValueError("BinarizationRefinementAttack requires starting_points")
    (o, x), restore_type = ep.astensors_(inputs, starting_points)
    del inputs, starting_points, kwargs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if self.threshold is None:
        min_, max_ = model.bounds
        threshold = (min_ + max_) / 2.0
    else:
        threshold = self.threshold

    assert o.dtype == x.dtype

    nptype = o.reshape(-1)[0].numpy().dtype.type
    if nptype not in [np.float16, np.float32, np.float64]:
        raise ValueError(  # pragma: no cover
            f"expected dtype to be float16, float32 or float64, found '{nptype}'"
        )

    threshold = nptype(threshold)
    offset = nptype(1.0)

    if self.included_in == "lower":
        lower_ = threshold
        upper_ = np.nextafter(threshold, threshold + offset)
    elif self.included_in == "upper":
        lower_ = np.nextafter(threshold, threshold - offset)
        upper_ = threshold
    else:
        raise ValueError(
            f"expected included_in to be 'lower' or 'upper', found '{self.included_in}'"
        )

    assert lower_ < upper_

    p = ep.full_like(o, ep.nan)

    lower = ep.ones_like(o) * lower_
    upper = ep.ones_like(o) * upper_

    indices = ep.logical_and(o <= lower, x <= lower)
    p = ep.where(indices, o, p)

    indices = ep.logical_and(o <= lower, x >= upper)
    p = ep.where(indices, upper, p)

    indices = ep.logical_and(o >= upper, x <= lower)
    p = ep.where(indices, lower, p)

    indices = ep.logical_and(o >= upper, x >= upper)
    p = ep.where(indices, o, p)

    assert not ep.any(ep.isnan(p))

    is_adv1 = is_adversarial(x)
    is_adv2 = is_adversarial(p)
    if (is_adv1 != is_adv2).any():
        raise ValueError(
            "The specified threshold does not match what is done by the model."
        )

    return restore_type(p)
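For reference, the four ep.logical_and cases above encode this small mapping table (my summary of the code, with o the clean value and x the adversarial value, each relative to the binarization threshold):

# o side   x side   refined p
# lower    lower    o      (same bin as clean: copy the clean value, zero distance)
# lower    upper    upper  (keep the bit flip, but move to the closest upper value)
# upper    lower    lower  (keep the bit flip, but move to the closest lower value)
# upper    upper    o      (same bin as clean: copy the clean value, zero distance)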
def __call__(
    self,
    inputs,
    labels,
    *,
    starting_points=None,
    init_attack=None,
    criterion: Callable = misclassification,
    steps=25000,
    spherical_step=1e-2,
    source_step=1e-2,
    source_step_convergance=1e-7,
    step_adaptation=1.5,
    tensorboard=False,
    update_stats_every_k=10,
):
    """Boundary Attack

    Differences to the original reference implementation:
    * We do not perform internal operations with float64
    * The samples within a batch can currently influence each other a bit
    * We don't perform the additional convergence confirmation
    * The success rate tracking changed a bit
    * Some other changes due to batching and merged loops

    Parameters
    ----------
    criterion : Callable
        A callable that returns true if the given logits of perturbed
        inputs should be considered adversarial w.r.t. to the given labels
        and unperturbed inputs.
    tensorboard : str
        The log directory for TensorBoard summaries. If False, TensorBoard
        summaries will be disabled (default). If None, the logdir will be
        runs/CURRENT_DATETIME_HOSTNAME.
    """
    tb = TensorBoard(logdir=tensorboard)

    originals = ep.astensor(inputs)
    labels = ep.astensor(labels)

    def is_adversarial(p: ep.Tensor) -> ep.Tensor:
        """For each input in x, returns true if it is an adversarial for
        the given model and criterion"""
        logits = self.model.forward(p)
        return criterion(originals, labels, p, logits)

    if starting_points is None:
        if init_attack is None:
            init_attack = LinearSearchBlendedUniformNoiseAttack
            logging.info(
                f"Neither starting_points nor init_attack given. Falling"
                f" back to {init_attack.__name__} for initialization."
            )
        starting_points = init_attack(self.model)(inputs, labels)

    best_advs = ep.astensor(starting_points)
    assert is_adversarial(best_advs).all()

    N = len(originals)
    ndim = originals.ndim
    spherical_steps = ep.ones(originals, N) * spherical_step
    source_steps = ep.ones(originals, N) * source_step

    tb.scalar("batchsize", N, 0)

    # create two queues for each sample to track success rates
    # (used to update the hyper parameters)
    stats_spherical_adversarial = ArrayQueue(maxlen=100, N=N)
    stats_step_adversarial = ArrayQueue(maxlen=30, N=N)

    bounds = self.model.bounds()

    for step in range(1, steps + 1):
        converged = source_steps < source_step_convergance
        if converged.all():
            break
        converged = atleast_kd(converged, ndim)

        # TODO: performance: ignore those that have converged
        # (we could select the non-converged ones, but we currently
        # cannot easily invert this in the end using EagerPy)

        unnormalized_source_directions = originals - best_advs
        source_norms = l2norms(unnormalized_source_directions)
        source_directions = unnormalized_source_directions / atleast_kd(
            source_norms, ndim
        )

        # only check spherical candidates every k steps
        check_spherical_and_update_stats = step % update_stats_every_k == 0

        candidates, spherical_candidates = draw_proposals(
            bounds,
            originals,
            best_advs,
            unnormalized_source_directions,
            source_directions,
            source_norms,
            spherical_steps,
            source_steps,
        )
        assert candidates.dtype == originals.dtype
        assert spherical_candidates.dtype == originals.dtype

        is_adv = is_adversarial(candidates)

        if check_spherical_and_update_stats:
            spherical_is_adv = is_adversarial(spherical_candidates)
            stats_spherical_adversarial.append(spherical_is_adv)
            # TODO: algorithm: the original implementation ignores those samples
            # for which spherical is not adversarial and continues with the
            # next iteration -> we estimate different probabilities
            # (conditional vs. unconditional)
            # TODO: thoughts: should we always track this because we compute it anyway
            stats_step_adversarial.append(is_adv)
        else:
            spherical_is_adv = None

        # in theory, we are closer per construction
        # but limited numerical precision might break this
        distances = l2norms(originals - candidates)
        closer = distances < source_norms
        is_best_adv = ep.logical_and(is_adv, closer)
        is_best_adv = atleast_kd(is_best_adv, ndim)

        cond = converged.logical_not().logical_and(is_best_adv)
        best_advs = ep.where(cond, candidates, best_advs)

        tb.probability("converged", converged, step)
        tb.scalar("updated_stats", check_spherical_and_update_stats, step)
        tb.histogram("norms", source_norms, step)
        tb.probability("is_adv", is_adv, step)
        if spherical_is_adv is not None:
            tb.probability("spherical_is_adv", spherical_is_adv, step)
        tb.histogram("candidates/distances", distances, step)
        tb.probability("candidates/closer", closer, step)
        tb.probability("candidates/is_best_adv", is_best_adv, step)
        tb.probability("new_best_adv_including_converged", is_best_adv, step)
        tb.probability("new_best_adv", cond, step)

        if check_spherical_and_update_stats:
            full = stats_spherical_adversarial.isfull()
            tb.probability("spherical_stats/full", full, step)
            if full.any():
                probs = stats_spherical_adversarial.mean()
                cond1 = ep.logical_and(probs > 0.5, full)
                spherical_steps = ep.where(
                    cond1, spherical_steps * step_adaptation, spherical_steps
                )
                source_steps = ep.where(
                    cond1, source_steps * step_adaptation, source_steps
                )
                cond2 = ep.logical_and(probs < 0.2, full)
                spherical_steps = ep.where(
                    cond2, spherical_steps / step_adaptation, spherical_steps
                )
                source_steps = ep.where(
                    cond2, source_steps / step_adaptation, source_steps
                )
                stats_spherical_adversarial.clear(ep.logical_or(cond1, cond2))
                tb.conditional_mean(
                    "spherical_stats/isfull/success_rate/mean", probs, full, step
                )
                tb.probability_ratio(
                    "spherical_stats/isfull/too_linear", cond1, full, step
                )
                tb.probability_ratio(
                    "spherical_stats/isfull/too_nonlinear", cond2, full, step
                )

            full = stats_step_adversarial.isfull()
            tb.probability("step_stats/full", full, step)
            if full.any():
                probs = stats_step_adversarial.mean()
                # TODO: algorithm: changed the two values because we are
                # currently tracking p(source_step_success) instead of
                # p(source_step_success | spherical_step_success) that was
                # tracked before
                cond1 = ep.logical_and(probs > 0.25, full)
                source_steps = ep.where(
                    cond1, source_steps * step_adaptation, source_steps
                )
                cond2 = ep.logical_and(probs < 0.1, full)
                source_steps = ep.where(
                    cond2, source_steps / step_adaptation, source_steps
                )
                stats_step_adversarial.clear(ep.logical_or(cond1, cond2))
                tb.conditional_mean(
                    "step_stats/isfull/success_rate/mean", probs, full, step
                )
                tb.probability_ratio(
                    "step_stats/isfull/success_rate_too_high", cond1, full, step
                )
                tb.probability_ratio(
                    "step_stats/isfull/success_rate_too_low", cond2, full, step
                )

        tb.histogram("spherical_step", spherical_steps, step)
        tb.histogram("source_step", source_steps, step)

    tb.close()
    return best_advs.tensor
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    originals, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if starting_points is None:
        init_attack: MinimizationAttack
        if self.init_attack is None:
            init_attack = LinearSearchBlendedUniformNoiseAttack(steps=50)
            logging.info(
                f"Neither starting_points nor init_attack given. Falling"
                f" back to {init_attack!r} for initialization."
            )
        else:
            init_attack = self.init_attack
        # TODO: use call and support all types of attacks (once early_stop is
        # possible in __call__)
        best_advs = init_attack.run(model, originals, criterion, early_stop=early_stop)
    else:
        best_advs = ep.astensor(starting_points)

    is_adv = is_adversarial(best_advs)
    if not is_adv.all():
        failed = is_adv.logical_not().float32().sum()
        if starting_points is None:
            raise ValueError(
                f"init_attack failed for {failed} of {len(is_adv)} inputs"
            )
        else:
            raise ValueError(
                f"{failed} of {len(is_adv)} starting_points are not adversarial"
            )
    del starting_points

    tb = TensorBoard(logdir=self.tensorboard)

    N = len(originals)
    ndim = originals.ndim
    spherical_steps = ep.ones(originals, N) * self.spherical_step
    source_steps = ep.ones(originals, N) * self.source_step

    tb.scalar("batchsize", N, 0)

    # create two queues for each sample to track success rates
    # (used to update the hyper parameters)
    stats_spherical_adversarial = ArrayQueue(maxlen=100, N=N)
    stats_step_adversarial = ArrayQueue(maxlen=30, N=N)

    bounds = model.bounds

    self.class_1 = []
    self.class_2 = []
    self.surrogate_model = None
    device = model.device
    train_step = 500

    for step in tqdm(range(1, self.steps + 1)):
        converged = source_steps < self.source_step_convergance
        if converged.all():
            break  # pragma: no cover
        converged = atleast_kd(converged, ndim)

        # TODO: performance: ignore those that have converged
        # (we could select the non-converged ones, but we currently
        # cannot easily invert this in the end using EagerPy)

        unnormalized_source_directions = originals - best_advs
        source_norms = ep.norms.l2(flatten(unnormalized_source_directions), axis=-1)
        source_directions = unnormalized_source_directions / atleast_kd(
            source_norms, ndim
        )

        # only check spherical candidates every k steps
        check_spherical_and_update_stats = step % self.update_stats_every_k == 0

        candidates, spherical_candidates = draw_proposals(
            bounds,
            originals,
            best_advs,
            unnormalized_source_directions,
            source_directions,
            source_norms,
            spherical_steps,
            source_steps,
            self.surrogate_model,
        )
        assert candidates.dtype == originals.dtype
        assert spherical_candidates.dtype == originals.dtype

        is_adv = is_adversarial(candidates)

        is_adv_spherical_candidates = is_adversarial(spherical_candidates)
        if is_adv.item():
            self.class_1.append(candidates)
        if not is_adv_spherical_candidates.item():
            self.class_2.append(spherical_candidates)

        if (step % train_step == 0) and (step > 0):
            start_time = time()
            class_1 = self.class_1
            class_2 = self.class_2
            class_1 = np.array([image.numpy()[0] for image in class_1])
            class_2 = np.array([image.numpy()[0] for image in class_2])
            class_2 = class_2[: len(class_1)]
            data = np.concatenate([class_1, class_2])
            labels = np.append(np.ones(len(class_1)), np.zeros(len(class_2)))

            X = torch.tensor(data).to(device)
            y = torch.tensor(labels, dtype=torch.long).to(device)

            if self.surrogate_model is None:
                model_sur = torchvision.models.resnet18(pretrained=True)
                # model.features[0] = torch.nn.Conv2d(3, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
                model_sur.fc = torch.nn.Linear(
                    in_features=512, out_features=2, bias=True
                )
                model_sur = model_sur.to(device)
            else:
                model_sur = model_surrogate

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )
            optimizer = torch.optim.Adam(model_sur.parameters(), lr=3e-4)
            loss = torch.nn.CrossEntropyLoss()
            model_surrogate, accuracy_history_test, accuracy_history_train = train(
                model_sur, optimizer, loss, X_train, y_train, X_test, y_test
            )
            model_surrogate = model_surrogate.eval()
            self.surrogate_model = fb.PyTorchModel(
                model_surrogate, bounds=(0, 1), device=device
            )
            end_time = time()
            # print('Time for train: ', np.round(end_time - start_time, 2))
            # print('\n')

        spherical_is_adv: Optional[ep.Tensor]
        if check_spherical_and_update_stats:
            spherical_is_adv = is_adversarial(spherical_candidates)
            stats_spherical_adversarial.append(spherical_is_adv)
            # TODO: algorithm: the original implementation ignores those samples
            # for which spherical is not adversarial and continues with the
            # next iteration -> we estimate different probabilities
            # (conditional vs. unconditional)
            # TODO: thoughts: should we always track this because we compute it anyway
            stats_step_adversarial.append(is_adv)
        else:
            spherical_is_adv = None

        # in theory, we are closer per construction
        # but limited numerical precision might break this
        distances = ep.norms.l2(flatten(originals - candidates), axis=-1)
        closer = distances < source_norms
        is_best_adv = ep.logical_and(is_adv, closer)
        is_best_adv = atleast_kd(is_best_adv, ndim)

        cond = converged.logical_not().logical_and(is_best_adv)
        best_advs = ep.where(cond, candidates, best_advs)

        tb.probability("converged", converged, step)
        tb.scalar("updated_stats", check_spherical_and_update_stats, step)
        tb.histogram("norms", source_norms, step)
        tb.probability("is_adv", is_adv, step)
        if spherical_is_adv is not None:
            tb.probability("spherical_is_adv", spherical_is_adv, step)
        tb.histogram("candidates/distances", distances, step)
        tb.probability("candidates/closer", closer, step)
        tb.probability("candidates/is_best_adv", is_best_adv, step)
        tb.probability("new_best_adv_including_converged", is_best_adv, step)
        tb.probability("new_best_adv", cond, step)

        if check_spherical_and_update_stats:
            full = stats_spherical_adversarial.isfull()
            tb.probability("spherical_stats/full", full, step)
            if full.any():
                probs = stats_spherical_adversarial.mean()
                cond1 = ep.logical_and(probs > 0.5, full)
                spherical_steps = ep.where(
                    cond1, spherical_steps * self.step_adaptation, spherical_steps
                )
                source_steps = ep.where(
                    cond1, source_steps * self.step_adaptation, source_steps
                )
                cond2 = ep.logical_and(probs < 0.2, full)
                spherical_steps = ep.where(
                    cond2, spherical_steps / self.step_adaptation, spherical_steps
                )
                source_steps = ep.where(
                    cond2, source_steps / self.step_adaptation, source_steps
                )
                stats_spherical_adversarial.clear(ep.logical_or(cond1, cond2))
                tb.conditional_mean(
                    "spherical_stats/isfull/success_rate/mean", probs, full, step
                )
                tb.probability_ratio(
                    "spherical_stats/isfull/too_linear", cond1, full, step
                )
                tb.probability_ratio(
                    "spherical_stats/isfull/too_nonlinear", cond2, full, step
                )

            full = stats_step_adversarial.isfull()
            tb.probability("step_stats/full", full, step)
            if full.any():
                probs = stats_step_adversarial.mean()
                # TODO: algorithm: changed the two values because we are
                # currently tracking p(source_step_success) instead of
                # p(source_step_success | spherical_step_success) that was
                # tracked before
                cond1 = ep.logical_and(probs > 0.25, full)
                source_steps = ep.where(
                    cond1, source_steps * self.step_adaptation, source_steps
                )
                cond2 = ep.logical_and(probs < 0.1, full)
                source_steps = ep.where(
                    cond2, source_steps / self.step_adaptation, source_steps
                )
                stats_step_adversarial.clear(ep.logical_or(cond1, cond2))
                tb.conditional_mean(
                    "step_stats/isfull/success_rate/mean", probs, full, step
                )
                tb.probability_ratio(
                    "step_stats/isfull/success_rate_too_high", cond1, full, step
                )
                tb.probability_ratio(
                    "step_stats/isfull/success_rate_too_low", cond2, full, step
                )

        tb.histogram("spherical_step", spherical_steps, step)
        tb.histogram("source_step", source_steps, step)

    tb.close()
    return restore_type(best_advs)