def l2_clipping_aware_rescaling(x, delta, eps: float, a: float = 0.0, b: float = 1.0):  # type: ignore
    """Calculates eta such that norm(clip(x + eta * delta, a, b) - x) == eps.

    Assumes x and delta have a batch dimension and eps, a, b, and p are
    scalars. If the equation cannot be solved because eps is too large, the
    left hand side is maximized.

    Args:
        x: A batch of inputs (PyTorch Tensor, TensorFlow Eager Tensor,
            NumPy Array, JAX Array, or EagerPy Tensor).
        delta: A batch of perturbation directions (same shape and type as x).
        eps: The target norm (non-negative float).
        a: The lower bound of the data domain (float).
        b: The upper bound of the data domain (float).

    Returns:
        eta: A batch of scales with the same number of dimensions as x but
            all axis == 1 except for the batch dimension.
    """
    # Convert any supported framework tensor to EagerPy; restore_fn converts
    # the result back to the caller's framework at the end.
    (x, delta), restore_fn = ep.astensors_(x, delta)
    N = x.shape[0]
    assert delta.shape[0] == N
    # row indices used for per-row advanced indexing below
    rows = ep.arange(x, N)

    # per-element squared perturbation, flattened to (N, D)
    delta2 = delta.square().reshape((N, -1))
    # per-element headroom before hitting the clip bound in delta's direction
    space = ep.where(delta >= 0, b - x, x - a).reshape((N, -1))
    # f2[i, k] is the eta^2 at which element k of row i saturates (gets clipped);
    # the 1e-20 floor guards against division by zero where delta is (near) zero
    f2 = space.square() / ep.maximum(delta2, 1e-20)
    # process elements in order of increasing saturation threshold
    ks = ep.argsort(f2, axis=-1)
    f2_sorted = f2[rows[:, ep.newaxis], ks]
    # m[i, k]: total delta^2 mass of the elements not yet saturated at step k
    # (reverse cumulative sum over the sorted order)
    m = ep.cumsum(delta2[rows[:, ep.newaxis], ks.flip(axis=1)], axis=-1).flip(axis=1)
    # widths of the eta^2 intervals between consecutive saturation thresholds
    dx = f2_sorted[:, 1:] - f2_sorted[:, :-1]
    dx = ep.concatenate((f2_sorted[:, :1], dx), axis=-1)
    # y[i, k]: squared L2 norm of clip(x + eta * delta) - x at eta^2 = f2_sorted[i, k];
    # the norm^2 is piecewise linear in eta^2 with slope m on each interval
    dy = m * dx
    y = ep.cumsum(dy, axis=-1)
    # c marks the breakpoints at/after which the target norm eps is reached
    c = y >= eps**2

    # work-around to get first nonzero element in each row
    f = ep.arange(x, c.shape[-1], 0, -1)
    j = ep.argmax(c.astype(f.dtype) * f, axis=-1)

    # linear interpolation backwards from breakpoint j to the exact eta^2
    # where y == eps^2
    eta2 = f2_sorted[rows, j] - (y[rows, j] - eps**2) / m[rows, j]
    # it can happen that for certain rows even the largest j is not large enough
    # (i.e. c[:, -1] is False), then we will just use it (without any correction) as it's
    # the best we can do (this should also be the only cases where m[j] can be
    # 0 and they are thus not a problem)
    eta2 = ep.where(c[:, -1], eta2, f2_sorted[:, -1])
    eta = ep.sqrt(eta2)
    # reshape to (N, 1, ..., 1) so eta broadcasts against x
    eta = eta.reshape((-1, ) + (1, ) * (x.ndim - 1))

    # xp = ep.clip(x + eta * delta, a, b)
    # l2 = (xp - x).reshape((N, -1)).square().sum(axis=-1).sqrt()
    return restore_fn(eta)
def __call__(self, model: Model, inputs: T, criterion: Union[Misclassification, T]) -> T:
    """Run the virtual-adversarial-style attack and return adversarial inputs.

    Iteratively refines a search direction `d` by power iteration on the
    KL divergence between the model's clean logits and its logits on the
    perturbed input, then takes a single step of L2 size `self.epsilon`
    along the final direction and clips to the model's bounds.

    Args:
        model: The model to attack (provides `bounds` and a forward pass).
        inputs: A batch of inputs (any framework tensor supported by eagerpy).
        criterion: A `Misclassification` criterion carrying the labels
            (labels are only validated here; the loss is label-free KL).

    Returns:
        Adversarial examples in the same tensor type as `inputs`.

    Raises:
        ValueError: If the criterion is unsupported or labels have the
            wrong shape.
        RuntimeError: If the KL gradient vanishes (xi too small).
    """
    x, restore_type = ep.astensor_(inputs)
    criterion_ = get_criterion(criterion)
    del inputs, criterion

    N = len(x)

    if isinstance(criterion_, Misclassification):
        classes = criterion_.labels
    else:
        raise ValueError("unsupported criterion")

    if classes.shape != (N, ):
        raise ValueError(
            f"expected labels to have shape ({N},), got {classes.shape}")

    bounds = model.bounds

    def loss_fun(delta: ep.Tensor, logits: ep.Tensor) -> ep.Tensor:
        # KL(p_clean || p_perturbed), summed over the batch; label-free
        assert x.shape[0] == logits.shape[0]
        assert delta.shape == x.shape

        x_hat = x + delta
        logits_hat = model(x_hat)
        loss = ep.kl_div_with_logits(logits, logits_hat).sum()
        return loss

    value_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=False)

    clean_logits = model(x)

    # start with random vector as search vector
    d = ep.normal(x, shape=x.shape, mean=0, stddev=1)
    for _ in range(self.iterations):
        # normalize proposal to be unit vector
        d = d * self.xi / atleast_kd(ep.norms.l2(flatten(d), axis=-1), x.ndim)

        # use gradient of KL divergence as new search vector
        _, grad = value_and_grad(d, clean_logits)
        d = grad

        # rescale search vector
        d = (bounds[1] - bounds[0]) * d

        if ep.any(ep.norms.l2(flatten(d), axis=-1) < 1e-64):
            raise RuntimeError(
                "Gradient vanished; this can happen if xi is too small.")

    # scale the final search direction to per-sample L2 norm epsilon;
    # flatten + atleast_kd works for inputs of any rank, whereas the
    # previous hard-coded sum over axis=(1, 2, 3) required 4-D inputs
    final_delta = self.epsilon * d / atleast_kd(
        ep.norms.l2(flatten(d), axis=-1), x.ndim)
    x_adv = ep.clip(x + final_delta, *bounds)
    return restore_type(x_adv)
def __call__(self, gradient: ep.Tensor) -> ep.Tensor:
    """Apply one Adam update and return the step tensor.

    Maintains exponential moving averages of the gradient (`self.m`) and
    its elementwise square (`self.v`) with decay rates `self.beta1` and
    `self.beta2`, applies the standard bias correction for the zero
    initialization, and scales the result by `self.stepsize`.

    Args:
        gradient: The current gradient tensor.

    Returns:
        The update step (same shape as `gradient`).
    """
    self.t += 1

    b1 = self.beta1
    b2 = self.beta2

    # running first and second moments of the gradient
    self.m = b1 * self.m + (1 - b1) * gradient
    self.v = b2 * self.v + (1 - b2) * gradient**2

    # correct the bias introduced by initializing the moments at zero
    m_corrected = self.m / (1 - b1**self.t)
    v_corrected = self.v / (1 - b2**self.t)

    return self.stepsize * m_corrected / (ep.sqrt(v_corrected) + self.epsilon)
def __call__(self, gradient, learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """Perform one Adam step and return the (descent-direction) update.

    Updates the running moment estimates stored on `self` and returns the
    bias-corrected step scaled by `-learning_rate` (i.e. pointing against
    the gradient).

    Args:
        gradient: The current gradient tensor.
        learning_rate: Step size multiplier.
        beta1: Decay rate for the first-moment average.
        beta2: Decay rate for the second-moment average.
        epsilon: Small constant added to the denominator for stability.

    Returns:
        The update to add to the current iterate.
    """
    self.t += 1

    # exponential moving averages of the gradient and its square
    self.m = beta1 * self.m + (1 - beta1) * gradient
    self.v = beta2 * self.v + (1 - beta2) * gradient**2

    # bias-corrected moment estimates (moments start at zero)
    m_hat = self.m / (1 - beta1**self.t)
    v_hat = self.v / (1 - beta2**self.t)

    return -learning_rate * m_hat / (ep.sqrt(v_hat) + epsilon)
def __call__(
    self,
    gradient: ep.Tensor,
    stepsize: float,
    beta1: float = 0.9,
    beta2: float = 0.999,
    epsilon: float = 1e-8,
) -> ep.Tensor:
    """Perform one Adam step and return the (descent-direction) update.

    Updates the first- and second-moment running averages stored on
    `self`, applies the usual zero-initialization bias correction, and
    returns the step scaled by `-stepsize` (against the gradient).

    Args:
        gradient: The current gradient tensor.
        stepsize: Step size multiplier.
        beta1: Decay rate for the first-moment average.
        beta2: Decay rate for the second-moment average.
        epsilon: Small constant added to the denominator for stability.

    Returns:
        The update to add to the current iterate.
    """
    self.t += 1

    # running averages of the gradient and of its elementwise square
    self.m = beta1 * self.m + (1 - beta1) * gradient
    self.v = beta2 * self.v + (1 - beta2) * gradient**2

    # bias-corrected estimates (the averages are initialized at zero)
    first_moment_hat = self.m / (1 - beta1**self.t)
    second_moment_hat = self.v / (1 - beta2**self.t)

    return -stepsize * first_moment_hat / (ep.sqrt(second_moment_hat) + epsilon)
def test_sqrt(t: Tensor) -> Tensor:
    """Return the elementwise square root of `t` via eagerpy."""
    result = ep.sqrt(t)
    return result