Example #1
def macer_train(sigma, lbd, gauss_num, beta, gamma, num_classes, model,
                trainloader, optimizer, device):
    m = Normal(torch.tensor([0.0]).to(device), torch.tensor([1.0]).to(device))

    cl_total = 0.0
    rl_total = 0.0
    input_total = 0

    for _, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        input_size = len(inputs)
        input_total += input_size

        new_shape = [input_size * gauss_num]
        new_shape.extend(inputs[0].shape)
        inputs = inputs.repeat((1, gauss_num, 1, 1)).view(new_shape)
        noise = torch.randn_like(inputs, device=device) * sigma
        noisy_inputs = inputs + noise

        outputs = model(noisy_inputs)
        outputs = outputs.reshape((input_size, gauss_num, num_classes))

        # Classification loss
        outputs_softmax = F.softmax(outputs, dim=2).mean(1)
        outputs_logsoftmax = torch.log(outputs_softmax + 1e-10)  # avoid nan
        classification_loss = F.nll_loss(outputs_logsoftmax,
                                         targets,
                                         reduction='sum')
        cl_total += classification_loss.item()

        # Robustness loss
        beta_outputs = outputs * beta  # only apply beta to the robustness loss
        beta_outputs_softmax = F.softmax(beta_outputs, dim=2).mean(1)
        top2 = torch.topk(beta_outputs_softmax, 2)
        top2_score = top2[0]
        top2_idx = top2[1]
        indices_correct = (top2_idx[:, 0] == targets)  # G_theta

        out0, out1 = top2_score[indices_correct,
                                0], top2_score[indices_correct, 1]
        robustness_loss = m.icdf(out1) - m.icdf(out0)
        indices = ~torch.isnan(robustness_loss) & ~torch.isinf(
            robustness_loss) & (torch.abs(robustness_loss) <= gamma)  # hinge
        out0, out1 = out0[indices], out1[indices]
        robustness_loss = m.icdf(out1) - m.icdf(out0) + gamma
        robustness_loss = robustness_loss.sum() * sigma / 2
        rl_total += robustness_loss.item()

        # Final objective function
        loss = classification_loss + lbd * robustness_loss
        loss /= input_size
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    cl_total /= input_total
    rl_total /= input_total
    print('Classification Loss: {}  Robustness Loss: {}'.format(
        cl_total, rl_total))
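A minimal toy sketch of the hinged robustness term computed above, assuming pA and pB are the averaged top-2 softmax scores of correctly classified samples (the values below are made up):

import torch
from torch.distributions.normal import Normal

m = Normal(torch.tensor(0.0), torch.tensor(1.0))
pA = torch.tensor([0.90, 0.60])            # top-1 averaged softmax scores
pB = torch.tensor([0.05, 0.35])            # runner-up scores
gap = m.icdf(pB) - m.icdf(pA)              # negative certified gap, as in the code above
gamma, sigma = 8.0, 0.25
hinge = ~torch.isnan(gap) & ~torch.isinf(gap) & (gap.abs() <= gamma)
robustness_loss = (gap[hinge] + gamma).sum() * sigma / 2
print(robustness_loss)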
Example #2
class GaussianModel(nn.Module):
    r"""
    Model to learn a univariate Gaussian distribution.

    Arguments
    ----------
    mu: Mean of the Gaussian distribution
    sigma: Standard deviation of the Gaussian distribution
    device: The torch.device to use, typically cpu or gpu id
    """

    def __init__(self, mu, sigma, device=None):
        super().__init__()
        if device is not None:
            self.device = device
            mu = mu.to(device)
            sigma = sigma.to(device)
        self.mu = mu
        self.sigma = sigma
        self.distr = Normal(self.mu, self.sigma)

    def to_device(self, device):
        """
        Moves members to a specified torch.device.
        """
        self.device = device
        self.mu = self.mu.to(device)
        self.sigma = self.sigma.to(device)
        self.distr = Normal(self.mu, self.sigma)

    def forward(self, x):
        """
        Takes input x as new distribution parameters.
        """
        # If mini-batching
        if len(x.shape) > 1:
            self.mu_batch = x[:, 0]
            self.sigma_batch = F.softplus(x[:, 1])

        # If not mini-batching
        else:
            self.mu = x[0]
            self.distr = Normal(self.mu, self.sigma)

        return self.distr

    def log_prob(self, x):
        x = x.view(x.shape.numel())
        if x.shape[0] == 1:
            return self.distr.log_prob(x[0]).view(1)

        log_like_arr = torch.ones_like(x)
        for i in range(len(x)):
            self.mu = self.mu_batch[i]
            self.distr = Normal(self.mu, self.sigma)
            lpxx = self.distr.log_prob(x[i]).view(1)
            log_like_arr[i] = lpxx

        lpx = log_like_arr
        return lpx

    def icdf(self, value):
        return self.distr.icdf(value)
Example #3
def OptimzeSigma(model, batch, alpha, sig_0, K, n):
    device = 'cuda:0'
    batch_size = batch.shape[0]

    sig = Variable(sig_0, requires_grad=True).view(batch_size, 1, 1, 1)
    m = Normal(
        torch.zeros(batch_size).to(device),
        torch.ones(batch_size).to(device))

    for param in model.parameters():
        param.requires_grad_(False)

    # Reshape the batch so that n noise samples can be drawn per input
    new_shape = [batch_size * n]
    new_shape.extend(batch[0].shape)
    new_batch = batch.repeat((1, n, 1, 1)).view(new_shape)

    for _ in range(K):
        sigma_repeated = sig.repeat((1, n, 1, 1)).view(-1, 1, 1, 1)
        eps = torch.randn_like(
            new_batch) * sigma_repeated  # Reparametrization trick
        out = model(new_batch + eps).reshape(batch_size, n, -1).mean(
            1)  # This is \psi in the algorithm

        vals, _ = torch.topk(out, 2)
        vals.transpose_(0, 1)
        gap = m.icdf(vals[0].clamp_(0.02, 0.98)) - m.icdf(vals[1].clamp_(
            0.02, 0.98))
        radius = sig.reshape(-1) / 2 * gap  # The radius formula
        grad = torch.autograd.grad(radius.sum(), sig)

        sig.data += alpha * grad[0]  # Gradient Ascent step

    # Re-enable parameter gradients for training once sigma has been found
    for param in model.parameters():
        param.requires_grad_(True)

    return sig.reshape(-1)
Example #4
class L2Certificate(Certificate):
    norm = "l2"

    def __init__(self, batch_size: int, device: str = "cuda:0"):
        self.m = Normal(
            torch.zeros(batch_size).to(device),
            torch.ones(batch_size).to(device))
        self.device = device

    def compute_proxy_gap(self, logits: torch.Tensor) -> torch.Tensor:
        return self.m.icdf(logits[:, 0].clamp_(0.001, 0.999)) - \
            self.m.icdf(logits[:, 1].clamp_(0.001, 0.999))

    def sample_noise(self, batch: torch.Tensor,
                     repeated_theta: torch.Tensor) -> torch.Tensor:
        return torch.randn_like(batch, device=self.device) * repeated_theta

    def compute_gap(self, pABar: float) -> float:
        return norm.ppf(pABar)

    def compute_radius_estimate(self, logits: torch.Tensor,
                                theta: torch.Tensor) -> torch.Tensor:
        return theta / 2 * self.compute_proxy_gap(logits)
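A small sketch of the radius estimate above, assuming the Certificate base class is importable and that logits already holds the top-2 class probabilities sorted in descending order (toy numbers):

import torch

cert = L2Certificate(batch_size=2, device="cpu")
top2 = torch.tensor([[0.90, 0.05],
                     [0.70, 0.20]])
theta = torch.tensor([0.25, 0.25])                 # per-sample smoothing sigmas
print(cert.compute_radius_estimate(top2, theta))   # theta / 2 * (icdf(p1) - icdf(p2))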
Example #5
    def pdf_param(self, x):
        # self._check_dimension(x)
        '''
        :param x: data tensor of shape n*d
        :return: Gaussian copula density at x; the correlation matrix R comes from
                 self.get_R(), whose free parameters are the (d*(d-1))/2
                 off-diagonal values
        '''

        # print('R:',R)
        # print('x:',x)
        norm = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
        u = norm.icdf(x)

        # print('shape u:', u.shape)

        cov = self.get_R().cuda()

        if self.dim == 2:
            RDet = cov[0, 0] * cov[1, 1] - cov[0, 1]**2
            RInv = 1. / RDet * torch.from_numpy(
                np.asarray([[cov[1, 1], -cov[0, 1]], [-cov[0, 1], cov[0, 0]]]))
        else:
            RDet = torch.det(cov)
            RInv = torch.inverse(cov)
        u = u.unsqueeze(0).cuda()

        # print('u shape', u.shape)  #d
        I = torch.eye(self.dim).cuda()
        # print('u cuda:', u.is_cuda)
        # print('cov cuda:', cov.is_cuda)
        # print('I cuda:', I.is_cuda)
        res = RDet**(-0.5) * torch.exp(
            -0.5 * torch.mm(torch.mm(u, (RInv - I)), u.permute(1, 0))).cuda()
        # print('res:', res)
        if res.data == 0.0:
            print('RDet:', RDet)
            print('RInv shape', RInv.shape)
        if math.isnan(res.data):
            print('self.diagonal:', self.diagonal_val)
            print('self.non_diagonal:', self.off_diagonal_val)
            print('RDet:', RDet)

            print('RInv:', RInv)
            print('cov:', cov)
            print('u:', u)
            return

        return res
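A compact, self-contained restatement of the Gaussian-copula density computed by pdf_param above (a CPU-only sketch with a made-up 2x2 correlation matrix R):

import torch
from torch.distributions.normal import Normal

def gaussian_copula_pdf(u, R):
    z = Normal(torch.tensor(0.0), torch.tensor(1.0)).icdf(u).unsqueeze(0)  # normal scores
    RInv, RDet = torch.inverse(R), torch.det(R)
    I = torch.eye(R.shape[0])
    return RDet ** -0.5 * torch.exp(-0.5 * z @ (RInv - I) @ z.T)

R = torch.tensor([[1.0, 0.3], [0.3, 1.0]])
print(gaussian_copula_pdf(torch.tensor([0.3, 0.7]), R))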
Example #6
def test():
    relative_error = 0
    for i in range(100):
        x = -1 + i * (10 - (-1)) / 100
        my_erfcx = erfcx(torch.FloatTensor([x]))
        relative_error = relative_error + np.abs(
            my_erfcx.item() - special.erfcx(x)) / special.erfcx(x)

    average_error = relative_error / 100
    print(average_error)
    normal = Normal(loc=torch.Tensor([0.0]), scale=torch.Tensor([1.0]))

    # standard normal cdf / icdf sanity check: cdf(1.6449) ~ 0.95 and icdf(0.95) ~ 1.6449
    print(normal.cdf(1.6449))
    print(normal.icdf(torch.Tensor([0.95])))
Example #7
    def certify(self, x: torch.tensor, n0: int, n: int, alpha: float,
                batch_size: int) -> (int, float):
        """ Monte Carlo algorithm for certifying that g's prediction around x is constant within some L2 radius.
        With probability at least 1 - alpha, the class returned by this method will equal g(x), and g's prediction will
        robust within a L2 ball of radius R around x.
        :param x: the input [channel x height x width]
        :param n0: the number of Monte Carlo samples to use for selection
        :param n: the number of Monte Carlo samples to use for estimation
        :param alpha: the failure probability
        :param batch_size: batch size to use when evaluating the base classifier
        :return: (predicted class, certified radius)
                 in the case of abstention, the class will be ABSTAIN and the radius 0.
        """
        device = x.device
        self.cvae.eval()
        self.base_classifier.eval()
        # draw samples of f(x+ epsilon)
        counts_selection = self._sample_noise(x, n0, batch_size)
        # use these samples to take a guess at the top class
        # cAHat = counts_selection.argmax().item()
        cAHat = counts_selection.max(0)[1]
        # draw more samples of f(x + epsilon)
        counts_estimation = self._sample_noise(x, n, batch_size)
        # use these samples to estimate a lower bound on pA
        # nA = counts_estimation[cAHat].item()
        nA = counts_estimation.gather(0, cAHat.unsqueeze(0)).squeeze(0)

        # now all on CPU
        pABar = self._lower_confidence_bound(nA, n, alpha)

        std_normal = Normal(0, 1)
        radius = self.sigma * std_normal.icdf(pABar)

        if cAHat.ndim == 0:
            if pABar < 0.5:
                return torch.Tensor([
                    Smooth.ABSTAIN
                ]).long().to(device), torch.Tensor([0]).to(device)
            else:
                # radius = self.sigma * norm.ppf(pABar)
                return cAHat.to(device), radius.to(device)

        else:
            I = pABar < 0.5
            radius[I] = 0.0
            cAHat[I] = Smooth.ABSTAIN

            return cAHat.to(device), radius.to(device)
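A scalar sketch of the final certification step above (toy numbers; assumes pABar already came from a one-sided lower confidence bound on the top-class probability):

import torch
from torch.distributions.normal import Normal

sigma, pABar = 0.25, torch.tensor(0.82)
std_normal = Normal(torch.tensor(0.0), torch.tensor(1.0))
radius = sigma * std_normal.icdf(pABar) if pABar > 0.5 else torch.tensor(0.0)
print(radius)   # certified L2 radius; 0 means abstain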
Example #8
def morph():
    model = load_model()
    from torch.distributions.normal import Normal
    normal = Normal(0., 1.)

    images = []
    z = torch.randn(NUM_HIDDEN)
    for x in range(10+1):
        for y in range(10+1):
            x_coord = min(max(x / 10., .01), .99)
            y_coord = min(max(y / 10., .01), .99)

            z[0:2] = normal.icdf(torch.tensor([x_coord, y_coord]))
            recon = model.decode(z)
            images.append(recon)

    images_joined = torch.cat(images).view(-1, 1, IMG_WIDTH, IMG_HEIGHT)
    save_image(images_joined.cpu(),
               'data/reconstruction/morph.png', nrow=11)
Example #9
class NormalInvCDF(Transform):
    domain = constraints.real
    codomain = constraints.interval(0, 1)
    bijective = True
    
    def __init__(self):
        super(NormalInvCDF, self).__init__()
        self.normal_dist = Normal(0., 1.)

    def __eq__(self, other):
        return isinstance(other, NormalInvCDF)

    def _call(self, x):
        return self.normal_dist.cdf(x)

    def _inverse(self, y):
        return self.normal_dist.icdf(y)

    def log_abs_det_jacobian(self, x, y):
        # forward map is y = Phi(x), so log|dy/dx| = log phi(x)
        return self.normal_dist.log_prob(x)
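A hedged usage sketch: the transform above maps real values into (0, 1) through the standard normal CDF, and its inverse maps them back (assumes the class and its torch.distributions imports are in scope):

import torch

t = NormalInvCDF()
x = torch.tensor([-1.0, 0.0, 2.0])
y = t(x)           # Phi(x), values in (0, 1)
print(t.inv(y))    # recovers ~x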
Example #10
class labels_transformer():
    def __init__(self):
        self.labels_mean = torch.FloatTensor([
            0.4761464174454829, 0.5202864583333333, 0.5481813186813186,
            0.5227313915857604, 0.5037803738317757, 0.5662814814814815
        ])
        self.labels_std = torch.FloatTensor([
            0.15228452985134602, 0.15353347248058757, 0.13637365282783034,
            0.15520650375390665, 0.15013557786759546, 0.14697755975897248
        ])
        self.dist = Normal(0, 1)

    def transform_labels(self, true_labels):
        pseudo_labels = (true_labels - self.labels_mean) / self.labels_std
        pseudo_labels = self.dist.cdf(pseudo_labels)
        return pseudo_labels

    def inverse_transform_labels(self, pseudo_labels):
        true_labels = self.dist.icdf(pseudo_labels)
        true_labels = true_labels * self.labels_std + self.labels_mean
        return true_labels
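A round-trip sketch for the transformer above (made-up label vector; assumes torch and Normal are in scope):

import torch

lt = labels_transformer()
true = torch.FloatTensor([0.45, 0.52, 0.60, 0.50, 0.48, 0.55])
pseudo = lt.transform_labels(true)            # standardized labels mapped into (0, 1)
print(lt.inverse_transform_labels(pseudo))    # ~original labels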
Example #11
    def fit_transform(self, x):
        all_latent = []
        # 1. PCA transform
        pca = PCA(random_state=0)
        pca.fit(x.detach().numpy())
        # assert np.isclose(np.abs(np.linalg.det(pca.components_)), 1), 'Should be close to one'
        Q_pca = torch.from_numpy(pca.components_)
        x = torch.mm(x, Q_pca.T)

        # 2. Independent normal cdf transform
        scale, loc = torch.std_mean(x, dim=0)
        ind_normal = Normal(loc, torch.sqrt(scale * scale + self.lam_variance))
        x = ind_normal.cdf(x)
        x = torch.clamp(x, 1e-10, 1 - 1e-10)

        # 3. Independent histogram transform
        if True:
            histograms = [
                TorchUnitHistogram(n_bins=self.n_bins,
                                   alpha=self.alpha).fit(x_col)
                for x_col in x.detach().T
            ]
            x = torch.cat(tuple(
                hist.cdf(x_col).reshape(-1, 1)
                for x_col, hist in zip(x.T, histograms)),
                          dim=1)
            # all_latent.append(x.detach().numpy())
            self.histograms_ = histograms

        # 4. Independent inverse standard normal transform
        if True:
            standard_normal = Normal(loc=torch.zeros_like(loc),
                                     scale=torch.ones_like(scale))
            x = standard_normal.icdf(x)
            self.standard_normal_ = standard_normal

        self.Q_pca_ = Q_pca
        self.ind_normal_ = ind_normal
        self._latent = x  # Just for debugging purposes
        return x
Example #12
class Wang_distortion():
    """Sample quantile levels for the Wang risk measure.
    Wang 2000

    Parameters
    ----------
    eta: float. Default: -0.75
        eta < 0 produces risk-averse behavior.
    """
    def __init__(self, eta=-0.75):
        self.eta = eta
        self.normal = Normal(loc=torch.Tensor([0]), scale=torch.Tensor([1]))

    def sample(self, num_samples):
        """
        Parameters
        ----------
        num_samples: tuple. (num_samples,)

        """
        taus_uniform = uniform.Uniform(0., 1.).sample(num_samples)
        wang_tau = self.normal.cdf(value=self.normal.icdf(value=taus_uniform) +
                                   self.eta)
        return wang_tau
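A usage sketch for the distortion above (assumes the torch.distributions imports the class relies on, uniform and Normal, are in scope):

import torch
from torch.distributions import uniform

wang = Wang_distortion(eta=-0.75)
taus = wang.sample((5,))    # five quantile levels in (0, 1), shifted toward the lower tail
print(taus)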
Example #13
class MQF2Distribution(torch.distributions.Distribution):
    r"""
    Distribution class for the model MQF2 proposed in the paper
    ``Multivariate Quantile Function Forecaster``
    by Kan, Aubet, Januschowski, Park, Benidis, Ruthotto, Gasthaus

    Parameters
    ----------
    picnn
        A SequentialNet instance of a
        partially input convex neural network (picnn)
    hidden_state
        hidden_state obtained by unrolling the RNN encoder
        shape = (batch_size, context_length, hidden_size) in training
        shape = (batch_size, hidden_size) in inference
    prediction_length
        Length of the prediction horizon
    is_energy_score
        If True, use energy score as objective function
        otherwise use maximum likelihood as
        objective function (normalizing flows)
    es_num_samples
        Number of samples drawn to approximate the energy score
    beta
        Hyperparameter of the energy score (power of the two terms)
    threshold_input
        Clamping threshold of the (scaled) input when maximum
        likelihood is used as objective function
        this is used to make the forecaster more robust
        to outliers in training samples
    validate_args
        Sets whether validation is enabled or disabled
        For more details, refer to the descriptions in
        torch.distributions.distribution.Distribution
    """
    def __init__(
        self,
        picnn: torch.nn.Module,
        hidden_state: torch.Tensor,
        prediction_length: int,
        is_energy_score: bool = True,
        es_num_samples: int = 50,
        beta: float = 1.0,
        threshold_input: float = 100.0,
        validate_args: bool = False,
    ) -> None:

        self.picnn = picnn
        self.hidden_state = hidden_state
        self.prediction_length = prediction_length
        self.is_energy_score = is_energy_score
        self.es_num_samples = es_num_samples
        self.beta = beta
        self.threshold_input = threshold_input

        super().__init__(batch_shape=self.batch_shape,
                         validate_args=validate_args)

        self.context_length = (self.hidden_state.shape[-2]
                               if len(self.hidden_state.shape) > 2 else 1)
        self.numel_batch = MQF2Distribution.get_numel(self.batch_shape)

        # mean zero and std one
        mu = torch.tensor(0,
                          dtype=hidden_state.dtype,
                          device=hidden_state.device)
        sigma = torch.ones_like(mu)
        self.standard_normal = Normal(mu, sigma)

    def stack_sliding_view(self, z: torch.Tensor) -> torch.Tensor:
        """
        Auxiliary function for loss computation.

        Unfolds the observations by sliding a window of size prediction_length
        over the observations z
        Then, reshapes the observations into a 2-dimensional tensor for
        further computation

        Parameters
        ----------
        z
            A batch of time series with shape
            (batch_size, context_length + prediction_length - 1)

        Returns
        -------
        Tensor
            Unfolded time series with shape
            (batch_size * context_length, prediction_length)
        """

        z = z.unfold(dimension=-1, size=self.prediction_length, step=1)
        z = z.reshape(-1, z.shape[-1])

        return z

    def loss(self, z: torch.Tensor) -> torch.Tensor:
        if self.is_energy_score:
            return self.energy_score(z)
        else:
            return -self.log_prob(z)

    def log_prob(self, z: torch.Tensor) -> torch.Tensor:
        """
        Computes the log likelihood  log(g(z)) + logdet(dg(z)/dz), where g is
        the gradient of the picnn.

        Parameters
        ----------
        z
            A batch of time series with shape
            (batch_size, context_length + prediction_length - 1)

        Returns
        -------
        loss
            Tensor of shape (batch_size * context_length,)
        """

        z = torch.clamp(z, min=-self.threshold_input, max=self.threshold_input)
        z = self.stack_sliding_view(z)

        loss = self.picnn.logp(
            z, self.hidden_state.reshape(-1, self.hidden_state.shape[-1]))

        return loss

    def energy_score(self, z: torch.Tensor) -> torch.Tensor:
        """
        Computes the (approximated) energy score sum_i ES(g,z_i), where
        ES(g,z_i) =

        -1/(2*es_num_samples^2) * sum_{w,w'} ||w-w'||_2^beta
        + 1/es_num_samples * sum_{w''} ||w''-z_i||_2^beta,
        w's are samples drawn from the
        quantile function g(., h_i) (gradient of picnn),
        h_i is the hidden state associated with z_i,
        and es_num_samples is the number of samples drawn
        for each of w, w', w'' in energy score approximation

        Parameters
        ----------
        z
            A batch of time series with shape
            (batch_size, context_length + prediction_length - 1)

        Returns
        -------
        loss
            Tensor of shape (batch_size * context_length,)
        """

        es_num_samples = self.es_num_samples
        beta = self.beta

        z = self.stack_sliding_view(z)
        reshaped_hidden_state = self.hidden_state.reshape(
            -1, self.hidden_state.shape[-1])

        loss = self.picnn.energy_score(z,
                                       reshaped_hidden_state,
                                       es_num_samples=es_num_samples,
                                       beta=beta)

        return loss

    def rsample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor:
        """
        Generates the sample paths.

        Parameters
        ----------
        sample_shape
            Shape of the samples

        Returns
        -------
        sample_paths
            Tensor of shape (batch_size, *sample_shape, prediction_length)
        """

        numel_batch = self.numel_batch
        prediction_length = self.prediction_length

        num_samples_per_batch = MQF2Distribution.get_numel(sample_shape)
        num_samples = num_samples_per_batch * numel_batch

        hidden_state_repeat = self.hidden_state.repeat_interleave(
            repeats=num_samples_per_batch, dim=0)

        alpha = torch.rand(
            (num_samples, prediction_length),
            dtype=self.hidden_state.dtype,
            device=self.hidden_state.device,
            layout=self.hidden_state.layout,
        )

        return self.quantile(
            alpha,
            hidden_state_repeat).reshape((numel_batch, ) + sample_shape +
                                         (prediction_length, ))

    def quantile(self,
                 alpha: torch.Tensor,
                 hidden_state: Optional[torch.Tensor] = None) -> torch.Tensor:
        """
        Generates the predicted paths associated with the quantile levels
        alpha.

        Parameters
        ----------
        alpha
            quantile levels,
            shape = (batch_shape, prediction_length)
        hidden_state
            hidden_state, shape = (batch_shape, hidden_size)

        Returns
        -------
        results
            predicted paths of shape = (batch_shape, prediction_length)
        """

        if hidden_state is None:
            hidden_state = self.hidden_state

        normal_quantile = self.standard_normal.icdf(alpha)

        # In the energy score approach, we directly draw samples from picnn
        # In the MLE (Normalizing flows) approach, we need to invert the picnn
        # (go backward through the flow) to draw samples
        if self.is_energy_score:
            result = self.picnn(normal_quantile, context=hidden_state)
        else:
            result = self.picnn.reverse(normal_quantile, context=hidden_state)

        return result

    @staticmethod
    def get_numel(tensor_shape: torch.Size) -> int:
        # Auxiliary function
        # compute number of elements specified in a torch.Size()
        return torch.prod(torch.tensor(tensor_shape)).item()

    @property
    def batch_shape(self) -> torch.Size:
        # last dimension is the hidden state size
        return self.hidden_state.shape[:-1]

    @property
    def event_shape(self) -> Tuple:
        return ()

    @property
    def event_dim(self) -> int:
        return 0
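A small standalone sketch of the sliding-window reshape performed by stack_sliding_view above (toy tensor, prediction_length = 3):

import torch

z = torch.arange(6.0).reshape(1, 6)   # (batch_size, context_length + prediction_length - 1)
windows = z.unfold(dimension=-1, size=3, step=1).reshape(-1, 3)
print(windows)                        # rows: [0,1,2], [1,2,3], [2,3,4], [3,4,5]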
Example #14
def squeeze_values(mu, x):
    norm1 = Normal(mu, 1)
    norm2 = Normal(-mu, 1)
    return norm2.cdf(norm1.icdf(x.clamp(0,1)))
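A quick check of squeeze_values: with mu = 0 the two Normals coincide, so the function acts as the identity on [0, 1] (up to clamping):

import torch

print(squeeze_values(torch.tensor(0.0), torch.tensor([0.1, 0.5, 0.9])))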
Example #15
def phi_inv(x):
    normal = Normal(loc=torch.cuda.FloatTensor([0.0]),
                    scale=torch.cuda.FloatTensor([1.0]))
    return normal.icdf(x)
Example #16
def inverse_transform_labels(pseudo_labels):
    dist = Normal(0, 1)
    true_labels = dist.icdf(pseudo_labels)
    true_labels = true_labels * labels_std + labels_mean
    return true_labels
Example #17
    def forward(self, query, key, value):

        batch_size = query.shape[0]
        maxlen = query.shape[1]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        # Q = [batch size, query len, hid dim]
        # K = [batch size, key len, hid dim]
        # V = [batch size, value len, hid dim]

        Q = Q.view(batch_size, maxlen, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, maxlen, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, maxlen, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)
        # Q = [batch size, n heads, query len, head dim]
        # K = [batch size, n heads, key len, head dim]
        # V = [batch size, n heads, value len, head dim]

        KLD = torch.tensor(0.0)
        if self.args.att_type == 'dot':
            energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        elif self.args.att_type == 'ikandirect':
            w1_proj = self.attsharedw.w1
            w2_proj = self.attsharedw.w2
            scores, norm = ika_ns(Q, K, self.args, self.scale, w1_proj,
                                  w2_proj, 2 * np.pi, self.training)
            energy = torch.log(scores + (1e-5)) + norm
        elif self.args.att_type == 'mikan':
            ''' copula augmented estimation '''
            mu, logvar, L = self.attsharedw.copulanet(Q, K)
            mu = mu.squeeze(-1)
            logvar = logvar.squeeze(-1)
            var = torch.exp(logvar)

            dim_batch_size, num_head, num_head = L.size()
            dim = int(dim_batch_size / batch_size)

            pos_eps = torch.randn([dim, num_head,
                                   self.args.M // 2]).cuda()  # [64,8,128(M/2)]
            X_pos = torch.einsum('ijk,ijl->ijl', L, pos_eps)  # [64,8,128(M/2)]
            X_pos = torch.clamp(X_pos, min=-2.0, max=2.0)
            U_pos = self.standard_normal_dist.cdf(
                X_pos)  # [64,num_head,128(M/2)]

            neg_eps = torch.randn([dim, num_head,
                                   self.args.M // 2]).cuda()  # [64,8,128(M/2)]
            X_neg = torch.einsum('ijk,ijl->ijl', L, neg_eps)  # [64,8,128(M/2)]
            X_neg = torch.clamp(X_neg, min=-2.0, max=2.0)
            U_neg = self.standard_normal_dist.cdf(
                X_neg)  # [64,num_head,128(M/2)]

            marginal_pos = Normal(
                mu.unsqueeze(-1),
                var.unsqueeze(-1))  # mu : [64,num_head] / var : [64,num_head]
            marginal_neg = Normal(
                -1 * mu.unsqueeze(-1),
                var.unsqueeze(-1))  # mu : [64,num_head] / var : [64,num_head]
            Y_pos = marginal_pos.icdf(U_pos)  # [32,4,64]
            Y_neg = marginal_neg.icdf(U_neg)
            U = torch.cat([U_pos, U_neg])
            ent_copula = -1 * torch.sum(torch.mul(U, torch.log(U + (1e-5))))
            ''' kernel and norm calculation '''
            z = torch.cat([Y_pos, Y_neg], -1)  # torch.Size([1, 64, 4, 256])
            w1_proj = self.attsharedw.wnet1(z)
            w2_proj = self.attsharedw.wnet2(z)
            scores, norm = ika_ns(Q, K, self.args, self.scale, w1_proj,
                                  w2_proj, 2 * np.pi, self.training)
            energy = torch.log(scores + (1e-5)) + norm
            # energy = [batch size, n heads, query len, key len]

            q_dist = tdist.Normal(mu, logvar.exp())
            KLD = torch.distributions.kl_divergence(q_dist, self.p_dist)
            KLD = self.args.kl_lambda * torch.sum(
                KLD) + self.args.copula_lambda * ent_copula

        attention = torch.softmax(energy, dim=-1)
        # attention = [batch size, n heads, query len, key len]

        x = torch.matmul(self.dropout(attention), V)
        # x = [batch size, n heads, query len, head dim]

        x = x.permute(0, 2, 1, 3).contiguous()
        # x = [batch size, query len, n heads, head dim]

        x = x.view(batch_size, -1, self.args.KEY_DIM)
        # x = [batch size, query len, hid dim]

        x = self.fc_o(x)
        # x = [batch size, query len, hid dim]

        return x, attention, KLD
Example #18
def macer_train(method,
                sigma,
                lbd,
                gauss_num,
                beta,
                gamma,
                num_classes,
                model,
                trainloader,
                optimizer,
                device,
                label_smooth='True'):
    m = Normal(torch.tensor([0.0]).to(device), torch.tensor([1.0]).to(device))

    cl_total = 0.0
    rl_total = 0.0
    data_size = 0
    correct = 0

    if method == 'macer':
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)

            batch_size = len(inputs)
            data_size += targets.size(0)

            new_shape = [batch_size * gauss_num]
            new_shape.extend(inputs[0].shape)
            inputs = inputs.repeat((1, gauss_num, 1, 1)).view(new_shape)
            noise = torch.randn_like(inputs, device=device) * sigma

            noisy_inputs = inputs + noise

            outputs = model(noisy_inputs)

            # noise = noise.reshape([batch_size, gauss_num] + list(inputs[0].size()))
            outputs = outputs.reshape((batch_size, gauss_num, num_classes))

            # Classification loss
            if label_smooth == 'True':
                labels = label_smoothing(inputs, targets, noise, gauss_num,
                                         num_classes, device)
                criterion = nn.KLDivLoss(reduction='sum')
                outputs_logsoftmax = F.log_softmax(outputs, dim=2).mean(
                    1)  # log_softmax
                smoothing_label = labels.mean(1)
                classification_loss = criterion.forward(
                    outputs_logsoftmax, smoothing_label)

            else:
                outputs_softmax = F.softmax(outputs, dim=2).mean(1)
                outputs_logsoftmax = torch.log(outputs_softmax +
                                               1e-10)  # avoid nan
                classification_loss = F.nll_loss(outputs_logsoftmax,
                                                 targets,
                                                 reduction='sum')

            cl_total += classification_loss.item()

            # Robustness loss
            beta_outputs = outputs * beta  # only apply beta to the robustness loss
            beta_outputs_softmax = F.softmax(beta_outputs, dim=2).mean(1)
            _, predicted = beta_outputs_softmax.max(1)
            correct += predicted.eq(targets).sum().item()

            top2 = torch.topk(beta_outputs_softmax, 2)
            top2_score = top2[0]
            top2_idx = top2[1]
            indices_correct = (top2_idx[:, 0] == targets)  # G_theta

            #cut off large pA and pB to avoid nan
            out0_correct, out1_correct = top2_score[
                indices_correct, 0], top2_score[indices_correct, 1]
            out0_correct, out1_correct = torch.clamp(out0_correct, 0,
                                                     0.9999999), torch.clamp(
                                                         out1_correct, 1e-7, 1)

            #phi^{-1}(pA) - phi^{-1}(pB)
            robustness_loss_correct = m.icdf(out0_correct) - m.icdf(
                out1_correct)

            # hinge factor: only include samples with small certified robustness
            indice_1 = robustness_loss_correct <= gamma
            # indice_2 = ~(robustness_loss_correct <= gamma)

            radius_loss = (robustness_loss_correct[indice_1] * sigma).sum() / 2

            # maximizing gradient norm for robust data
            # gradient_loss = 0
            # if len(noise[indices_correct][indice_2]) > 0:
            #     sub_noise = noise[indices_correct][indice_2]
            #     sub_outputs = F.softmax(outputs, dim=2)[indices_correct][indice_2]
            #
            #     sub_noise = sub_noise.view(sub_noise.size()[0] * gauss_num, -1)
            #     sub_outputs = sub_outputs.view(sub_outputs.size()[0] * gauss_num, -1)
            #
            #     for i in range(num_classes):
            #         gradient_loss_tmp = sub_outputs[:, i] * sub_noise[:, i] / (gauss_num * sigma ** 2)
            #         gradient_loss_tmp = (gradient_loss_tmp ** 2).sum()
            #         gradient_loss += gradient_loss_tmp

            robustness_loss = radius_loss  #+ gradient_loss
            rl_total += lbd * robustness_loss.item()

            # Final objective function
            loss = classification_loss - lbd * robustness_loss
            loss /= batch_size
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

        cl_total /= data_size
        rl_total /= data_size
        acc = 100 * correct / data_size

        return cl_total, rl_total, acc

    else:
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model.forward(inputs)
            loss = nn.CrossEntropyLoss()(outputs, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            cl_total += loss.item() * len(inputs)
            _, predicted = outputs.max(1)
            data_size += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        cl_total /= data_size
        acc = 100 * correct / data_size

        return cl_total, rl_total, acc