import eagerpy as ep


def l2_clipping_aware_rescaling(x,
                                delta,
                                eps: float,
                                a: float = 0.0,
                                b: float = 1.0):  # type: ignore
    """Calculates eta such that norm(clip(x + eta * delta, a, b) - x) == eps.

    Assumes x and delta have a batch dimension and that eps, a, and b are
    scalars. If the equation cannot be solved because eps is too large, the
    left-hand side is maximized instead.

    Args:
        x: A batch of inputs (PyTorch Tensor, TensorFlow Eager Tensor, NumPy
            Array, JAX Array, or EagerPy Tensor).
        delta: A batch of perturbation directions (same shape and type as x).
        eps: The target norm (non-negative float).
        a: The lower bound of the data domain (float).
        b: The upper bound of the data domain (float).

    Returns:
        eta: A batch of scales with the same number of dimensions as x, where
            every axis except the batch dimension has size 1.
    """
    (x, delta), restore_fn = ep.astensors_(x, delta)
    N = x.shape[0]
    assert delta.shape[0] == N
    rows = ep.arange(x, N)

    delta2 = delta.square().reshape((N, -1))
    space = ep.where(delta >= 0, b - x, x - a).reshape((N, -1))
    f2 = space.square() / ep.maximum(delta2, 1e-20)
    ks = ep.argsort(f2, axis=-1)
    f2_sorted = f2[rows[:, ep.newaxis], ks]
    m = ep.cumsum(delta2[rows[:, ep.newaxis],
                         ks.flip(axis=1)], axis=-1).flip(axis=1)
    dx = f2_sorted[:, 1:] - f2_sorted[:, :-1]
    dx = ep.concatenate((f2_sorted[:, :1], dx), axis=-1)
    dy = m * dx
    y = ep.cumsum(dy, axis=-1)
    c = y >= eps**2

    # work-around to get the index of the first True element in each row
    f = ep.arange(x, c.shape[-1], 0, -1)
    j = ep.argmax(c.astype(f.dtype) * f, axis=-1)

    eta2 = f2_sorted[rows, j] - (y[rows, j] - eps**2) / m[rows, j]
    # For some rows even the largest j may not be large enough (i.e. c[:, -1]
    # is False); in that case we fall back to the last entry without any
    # correction, as that is the best we can do. These should also be the only
    # cases where m[j] can be 0, so they are not a problem.
    eta2 = ep.where(c[:, -1], eta2, f2_sorted[:, -1])
    eta = ep.sqrt(eta2)
    eta = eta.reshape((-1, ) + (1, ) * (x.ndim - 1))

    # xp = ep.clip(x + eta * delta, a, b)
    # l2 = (xp - x).reshape((N, -1)).square().sum(axis=-1).sqrt()
    return restore_fn(eta)
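The following is a minimal usage sketch (not from the source): it assumes NumPy inputs, a data domain of [0, 1], and an illustrative eps of 1.0, and checks that the clipped perturbed batch ends up at L2 distance eps from x.

import numpy as np

rng = np.random.default_rng(0)
x = rng.uniform(0.0, 1.0, size=(4, 3, 8, 8)).astype(np.float32)  # batch of inputs in [0, 1]
delta = rng.normal(size=x.shape).astype(np.float32)  # unnormalized perturbation directions

eta = l2_clipping_aware_rescaling(x, delta, eps=1.0)  # per-sample scales, shape (4, 1, 1, 1)
x_adv = np.clip(x + eta * delta, 0.0, 1.0)

# each row should now be (approximately) eps away from x in L2 norm
print(np.linalg.norm((x_adv - x).reshape(4, -1), axis=-1))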
Example #2
    def __call__(self, model: Model, inputs: T,
                 criterion: Union[Misclassification, T]) -> T:
        x, restore_type = ep.astensor_(inputs)
        criterion_ = get_criterion(criterion)
        del inputs, criterion

        N = len(x)

        if isinstance(criterion_, Misclassification):
            classes = criterion_.labels
        else:
            raise ValueError("unsupported criterion")

        if classes.shape != (N, ):
            raise ValueError(
                f"expected labels to have shape ({N},), got {classes.shape}")

        bounds = model.bounds

        def loss_fun(delta: ep.Tensor, logits: ep.Tensor) -> ep.Tensor:
            assert x.shape[0] == logits.shape[0]
            assert delta.shape == x.shape

            x_hat = x + delta
            logits_hat = model(x_hat)
            loss = ep.kl_div_with_logits(logits, logits_hat).sum()

            return loss

        value_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=False)

        clean_logits = model(x)

        # start with random vector as search vector
        d = ep.normal(x, shape=x.shape, mean=0, stddev=1)
        for it in range(self.iterations):
            # rescale the search vector to have L2 norm xi
            d = d * self.xi / atleast_kd(ep.norms.l2(flatten(d), axis=-1),
                                         x.ndim)

            # use gradient of KL divergence as new search vector
            _, grad = value_and_grad(d, clean_logits)
            d = grad

            # rescale search vector
            d = (bounds[1] - bounds[0]) * d

            if ep.any(ep.norms.l2(flatten(d), axis=-1) < 1e-64):
                raise RuntimeError(
                    "Gradient vanished; this can happen if xi is too small.")

        final_delta = (self.epsilon / ep.sqrt(
            (d**2).sum(keepdims=True, axis=(1, 2, 3))) * d)
        x_adv = ep.clip(x + final_delta, *bounds)

        return restore_type(x_adv)
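The loss/gradient helper above is built with eagerpy's ep.value_and_grad_fn. A minimal sketch of that pattern in isolation (assuming PyTorch is installed as the differentiable backend; the toy loss is illustrative only):

import torch
import eagerpy as ep

x = ep.astensor(torch.tensor([1.0, 2.0, 3.0]))

def loss_fun(x: ep.Tensor) -> ep.Tensor:
    # simple scalar loss; its gradient with respect to x is 2 * x
    return (x ** 2).sum()

# build the function once, then call it to get the loss and the gradient
# with respect to the first argument
value_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=False)
loss, grad = value_and_grad(x)
print(loss.item(), grad.numpy())  # 14.0 [2. 4. 6.]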
Example #3
    def __call__(self, gradient: ep.Tensor) -> ep.Tensor:
        self.t += 1

        self.m = self.beta1 * self.m + (1 - self.beta1) * gradient
        self.v = self.beta2 * self.v + (1 - self.beta2) * gradient**2

        bias_correction_1 = 1 - self.beta1**self.t
        bias_correction_2 = 1 - self.beta2**self.t

        m_hat = self.m / bias_correction_1
        v_hat = self.v / bias_correction_2

        return self.stepsize * m_hat / (ep.sqrt(v_hat) + self.epsilon)
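For context, a runnable sketch of how an optimizer with this __call__ could be driven. The __init__ below is a hypothetical reconstruction based only on the attributes referenced above (m, v, t, stepsize, beta1, beta2, epsilon), not the original class, and the quadratic toy problem is illustrative:

import numpy as np
import eagerpy as ep


class AdamOptimizer:
    # hypothetical __init__, reconstructed from the attributes used in __call__
    def __init__(self, x: ep.Tensor, stepsize: float, beta1: float = 0.9,
                 beta2: float = 0.999, epsilon: float = 1e-8):
        self.m = ep.zeros_like(x)
        self.v = ep.zeros_like(x)
        self.t = 0
        self.stepsize = stepsize
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

    def __call__(self, gradient: ep.Tensor) -> ep.Tensor:
        self.t += 1
        self.m = self.beta1 * self.m + (1 - self.beta1) * gradient
        self.v = self.beta2 * self.v + (1 - self.beta2) * gradient**2
        m_hat = self.m / (1 - self.beta1**self.t)  # bias-corrected first moment
        v_hat = self.v / (1 - self.beta2**self.t)  # bias-corrected second moment
        return self.stepsize * m_hat / (ep.sqrt(v_hat) + self.epsilon)


# minimize f(x) = sum(x**2); the analytic gradient is 2 * x
x = ep.astensor(np.array([3.0, -2.0]))
optimizer = AdamOptimizer(x, stepsize=0.1)
for _ in range(200):
    x = x - optimizer(2 * x)  # this __call__ returns a positive step, so subtract it
print(x.numpy())  # approaches [0. 0.]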
Example #4
    def __call__(self,
                 gradient,
                 learning_rate,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8):
        self.t += 1

        self.m = beta1 * self.m + (1 - beta1) * gradient
        self.v = beta2 * self.v + (1 - beta2) * gradient**2

        bias_correction_1 = 1 - beta1**self.t
        bias_correction_2 = 1 - beta2**self.t

        m_hat = self.m / bias_correction_1
        v_hat = self.v / bias_correction_2

        return -learning_rate * m_hat / (ep.sqrt(v_hat) + epsilon)
Example #5
    def __call__(
        self,
        gradient: ep.Tensor,
        stepsize: float,
        beta1: float = 0.9,
        beta2: float = 0.999,
        epsilon: float = 1e-8,
    ) -> ep.Tensor:
        self.t += 1

        self.m = beta1 * self.m + (1 - beta1) * gradient
        self.v = beta2 * self.v + (1 - beta2) * gradient**2

        bias_correction_1 = 1 - beta1**self.t
        bias_correction_2 = 1 - beta2**self.t

        m_hat = self.m / bias_correction_1
        v_hat = self.v / bias_correction_2

        return -stepsize * m_hat / (ep.sqrt(v_hat) + epsilon)
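Note the two conventions above: the optimizer in Example #3 stores stepsize, beta1, beta2, and epsilon on the instance and returns a positive step (how the caller applies it is not shown here), whereas the variants in Examples #4 and #5 take the hyperparameters per call and already return the negative (descent) step.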
Example #6
import eagerpy as ep
from eagerpy import Tensor


def test_sqrt(t: Tensor) -> Tensor:
    return ep.sqrt(t)
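A minimal invocation sketch (assuming eagerpy and NumPy are installed; eagerpy dispatches to whichever backend the input tensor belongs to):

import numpy as np

t = ep.astensor(np.array([1.0, 4.0, 9.0]))
print(test_sqrt(t).numpy())  # [1. 2. 3.]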