def macer_train(sigma, lbd, gauss_num, beta, gamma, num_classes, model,
                trainloader, optimizer, device):
    m = Normal(torch.tensor([0.0]).to(device),
               torch.tensor([1.0]).to(device))
    cl_total = 0.0
    rl_total = 0.0
    input_total = 0

    for _, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        input_size = len(inputs)
        input_total += input_size

        new_shape = [input_size * gauss_num]
        new_shape.extend(inputs[0].shape)
        inputs = inputs.repeat((1, gauss_num, 1, 1)).view(new_shape)
        noise = torch.randn_like(inputs, device=device) * sigma
        noisy_inputs = inputs + noise

        outputs = model(noisy_inputs)
        outputs = outputs.reshape((input_size, gauss_num, num_classes))

        # Classification loss
        outputs_softmax = F.softmax(outputs, dim=2).mean(1)
        outputs_logsoftmax = torch.log(outputs_softmax + 1e-10)  # avoid nan
        classification_loss = F.nll_loss(outputs_logsoftmax, targets,
                                         reduction='sum')
        cl_total += classification_loss.item()

        # Robustness loss
        beta_outputs = outputs * beta  # only apply beta to the robustness loss
        beta_outputs_softmax = F.softmax(beta_outputs, dim=2).mean(1)
        top2 = torch.topk(beta_outputs_softmax, 2)
        top2_score = top2[0]
        top2_idx = top2[1]
        indices_correct = (top2_idx[:, 0] == targets)  # G_theta

        out0, out1 = top2_score[indices_correct, 0], top2_score[indices_correct, 1]
        robustness_loss = m.icdf(out1) - m.icdf(out0)
        indices = ~torch.isnan(robustness_loss) & ~torch.isinf(robustness_loss) & (
            torch.abs(robustness_loss) <= gamma)  # hinge
        out0, out1 = out0[indices], out1[indices]
        robustness_loss = m.icdf(out1) - m.icdf(out0) + gamma
        robustness_loss = robustness_loss.sum() * sigma / 2
        rl_total += robustness_loss.item()

        # Final objective function
        loss = classification_loss + lbd * robustness_loss
        loss /= input_size
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    cl_total /= input_total
    rl_total /= input_total
    print('Classification Loss: {}  Robustness Loss: {}'.format(cl_total, rl_total))
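# Minimal usage sketch for macer_train above, assuming its module already
# imports torch, torch.nn.functional as F, and Normal. The model, dataset,
# and hyperparameter values below are illustrative assumptions, not part of
# the original code.
import torch
import torchvision
import torchvision.transforms as transforms

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = torchvision.models.resnet18(num_classes=10).to(device)
trainloader = torch.utils.data.DataLoader(
    torchvision.datasets.CIFAR10('./data', train=True, download=True,
                                 transform=transforms.ToTensor()),
    batch_size=64, shuffle=True)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

for epoch in range(10):
    macer_train(sigma=0.25, lbd=12.0, gauss_num=16, beta=16.0, gamma=8.0,
                num_classes=10, model=model, trainloader=trainloader,
                optimizer=optimizer, device=device)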
class GaussianModel(nn.Module):
    r"""Model to learn a univariate Gaussian distribution.

    Arguments
    ----------
    mu: Mean of the Gaussian distribution
    sigma: Standard deviation of the Gaussian distribution
    device: The torch.device to use, typically cpu or gpu id
    """

    def __init__(self, mu, sigma, device=None):
        super().__init__()
        if device is not None:
            self.device = device
            mu = mu.to(device)
            sigma = sigma.to(device)
        self.mu = mu
        self.sigma = sigma
        self.distr = Normal(self.mu, self.sigma)

    def to_device(self, device):
        """Moves members to a specified torch.device."""
        self.device = device

    def forward(self, x):
        """Takes input x as new distribution parameters."""
        # If mini-batching
        if len(x.shape) > 1:
            self.mu_batch = x[:, 0]
            self.sigma_batch = F.softplus(x[:, 1])
        # If not mini-batching
        else:
            self.mu = x[0]
            self.distr = Normal(self.mu, self.sigma)
        return self.distr

    def log_prob(self, x):
        x = x.view(x.shape.numel())
        if x.shape[0] == 1:
            return self.distr.log_prob(x[0]).view(1)
        # Evaluate each element under its own per-sample mean
        log_like_arr = torch.ones_like(x)
        for i in range(len(x)):
            self.mu = self.mu_batch[i]
            self.distr = Normal(self.mu, self.sigma)
            log_like_arr[i] = self.distr.log_prob(x[i]).view(1)
        return log_like_arr

    def icdf(self, value):
        return self.distr.icdf(value)
def OptimzeSigma(model, batch, alpha, sig_0, K, n):
    device = 'cuda:0'
    batch_size = batch.shape[0]

    sig = Variable(sig_0, requires_grad=True).view(batch_size, 1, 1, 1)
    m = Normal(torch.zeros(batch_size).to(device),
               torch.ones(batch_size).to(device))

    # Freeze the classifier while optimizing sigma
    for param in model.parameters():
        param.requires_grad_(False)

    # Reshape so that each input is repeated n times (for n > 1)
    new_shape = [batch_size * n]
    new_shape.extend(batch[0].shape)
    new_batch = batch.repeat((1, n, 1, 1)).view(new_shape)

    for _ in range(K):
        sigma_repeated = sig.repeat((1, n, 1, 1)).view(-1, 1, 1, 1)
        eps = torch.randn_like(new_batch) * sigma_repeated  # reparametrization trick
        out = model(new_batch + eps).reshape(batch_size, n, -1).mean(1)  # this is \psi in the algorithm

        vals, _ = torch.topk(out, 2)
        vals.transpose_(0, 1)
        gap = m.icdf(vals[0].clamp_(0.02, 0.98)) - m.icdf(vals[1].clamp_(0.02, 0.98))
        radius = sig.reshape(-1) / 2 * gap  # the certified radius formula

        grad = torch.autograd.grad(radius.sum(), sig)
        sig.data += alpha * grad[0]  # gradient ascent step

    # Unfreeze the classifier for subsequent training
    for param in model.parameters():
        param.requires_grad_(True)

    return sig.reshape(-1)
class L2Certificate(Certificate):
    norm = "l2"

    def __init__(self, batch_size: int, device: str = "cuda:0"):
        self.m = Normal(torch.zeros(batch_size).to(device),
                        torch.ones(batch_size).to(device))
        self.device = device

    def compute_proxy_gap(self, logits: torch.Tensor) -> torch.Tensor:
        return self.m.icdf(logits[:, 0].clamp_(0.001, 0.999)) - \
            self.m.icdf(logits[:, 1].clamp_(0.001, 0.999))

    def sample_noise(self, batch: torch.Tensor,
                     repeated_theta: torch.Tensor) -> torch.Tensor:
        return torch.randn_like(batch, device=self.device) * repeated_theta

    def compute_gap(self, pABar: float) -> float:
        return norm.ppf(pABar)

    def compute_radius_estimate(self, logits: torch.Tensor,
                                theta: torch.Tensor) -> torch.Tensor:
        return theta / 2 * self.compute_proxy_gap(logits)
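# Hedged usage sketch for L2Certificate, assuming the abstract Certificate
# base class and `from scipy.stats import norm` are available from the
# surrounding module. The probability values and sigma below are made up.
import torch

batch_size = 4
cert = L2Certificate(batch_size, device="cpu")

# Top-2 smoothed class probabilities per example (illustrative values)
top2_probs = torch.tensor([[0.90, 0.05],
                           [0.70, 0.20],
                           [0.55, 0.40],
                           [0.99, 0.01]])
theta = torch.full((batch_size,), 0.25)  # per-example smoothing sigma
radius = cert.compute_radius_estimate(top2_probs, theta)
print(radius)  # sigma / 2 * (Phi^-1(p1) - Phi^-1(p2)) per example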
def pdf_param(self, x):
    # self._check_dimension(x)
    '''
    :param x: data numpy n*d
    :param R: flattened correlation values, excluding the redundant and
        diagonal entries; shape: (d*(d-1))/2
    :return:
    '''
    norm = Normal(torch.tensor([0.0]), torch.tensor([1.0]))
    u = norm.icdf(x)
    cov = self.get_R().cuda()

    if self.dim == 2:
        RDet = cov[0, 0] * cov[1, 1] - cov[0, 1]**2
        RInv = 1. / RDet * torch.from_numpy(
            np.asarray([[cov[1, 1], -cov[0, 1]], [-cov[0, 1], cov[0, 0]]]))
    else:
        RDet = torch.det(cov)
        RInv = torch.inverse(cov)

    u = u.unsqueeze(0).cuda()
    I = torch.eye(self.dim).cuda()
    res = RDet**(-0.5) * torch.exp(
        -0.5 * torch.mm(torch.mm(u, (RInv - I)), u.permute(1, 0))).cuda()

    if res.data == 0.0:
        print('RDet:', RDet)
        print('RInv shape', RInv.shape)
    if math.isnan(res.data):
        print('self.diagonal:', self.diagonal_val)
        print('self.non_diagonal:', self.off_diagonal_val)
        print('RDet:', RDet)
        print('RInv:', RInv)
        print('cov:', cov)
        print('u:', u)
        return
    return res
def test():
    relative_error = 0
    for i in range(100):
        x = -1 + i * (10 - (-1)) / 100
        my_erfcx = erfcx(torch.FloatTensor([x]))
        relative_error = relative_error + np.abs(
            my_erfcx.item() - special.erfcx(x)) / special.erfcx(x)
    average_error = relative_error / 100
    print(average_error)

    normal = Normal(loc=torch.Tensor([0.0]), scale=torch.Tensor([1.0]))
    # cdf(1.6449) should be close to 0.95, and icdf(0.95) should recover 1.6449
    print(normal.cdf(1.6449))
    print(normal.icdf(torch.Tensor([0.95])))
def certify(self, x: torch.tensor, n0: int, n: int, alpha: float,
            batch_size: int) -> (int, float):
    """Monte Carlo algorithm for certifying that g's prediction around x is
    constant within some L2 radius. With probability at least 1 - alpha, the
    class returned by this method will equal g(x), and g's prediction will be
    robust within an L2 ball of radius R around x.

    :param x: the input [channel x height x width]
    :param n0: the number of Monte Carlo samples to use for selection
    :param n: the number of Monte Carlo samples to use for estimation
    :param alpha: the failure probability
    :param batch_size: batch size to use when evaluating the base classifier
    :return: (predicted class, certified radius)
        in the case of abstention, the class will be ABSTAIN and the radius 0.
    """
    device = x.device
    self.cvae.eval()
    self.base_classifier.eval()

    # draw samples of f(x + epsilon)
    counts_selection = self._sample_noise(x, n0, batch_size)
    # use these samples to take a guess at the top class
    cAHat = counts_selection.max(0)[1]
    # draw more samples of f(x + epsilon)
    counts_estimation = self._sample_noise(x, n, batch_size)
    # use these samples to estimate a lower bound on pA
    nA = counts_estimation.gather(0, cAHat.unsqueeze(0)).squeeze(0)
    pABar = self._lower_confidence_bound(nA, n, alpha)

    std_normal = Normal(0, 1)
    radius = self.sigma * std_normal.icdf(pABar)
    if cAHat.ndim == 0:
        if pABar < 0.5:
            return torch.Tensor([Smooth.ABSTAIN]).long().to(device), \
                torch.Tensor([0]).to(device)
        else:
            return cAHat.to(device), radius.to(device)
    else:
        I = pABar < 0.5
        radius[I] = Smooth.ABSTAIN
        cAHat[I] = 0.0
        return cAHat.to(device), radius.to(device)
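# Hedged usage sketch for certify above. The names `smoothed`, `test_loader`,
# and `device` are assumptions (a smoothed-classifier instance exposing this
# method, a DataLoader over the test set, and a torch device); the sample
# sizes follow the usual randomized-smoothing certification settings.
x, label = next(iter(test_loader))
x = x[0].to(device)  # a single [C, H, W] image
pred, radius = smoothed.certify(x, n0=100, n=100_000,
                                alpha=0.001, batch_size=400)
print(f"label={label[0].item()} pred={pred.item()} "
      f"certified L2 radius={radius.item():.3f}")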
def morph():
    model = load_model()

    from torch.distributions.normal import Normal
    normal = Normal(0., 1.)

    images = []
    z = torch.randn(NUM_HIDDEN)
    for x in range(10 + 1):
        for y in range(10 + 1):
            x_coord = min(max(x / 10., .01), .99)
            y_coord = min(max(y / 10., .01), .99)
            z[0:2] = normal.icdf(torch.tensor([x_coord, y_coord]))
            recon = model.decode(z)
            images.append(recon)

    images_joined = torch.cat(images).view(-1, 1, IMG_WIDTH, IMG_HEIGHT)
    save_image(images_joined.cpu(), 'data/reconstruction/morph.png', nrow=11)
class NormalInvCDF(Transform):
    # Note: despite the name, the forward map `_call` applies the standard
    # normal CDF; `_inverse` applies the inverse CDF.
    domain = constraints.real
    codomain = constraints.interval(0, 1)
    bijective = True

    def __init__(self):
        super(NormalInvCDF, self).__init__()
        self.normal_dist = Normal(0., 1.)

    def __eq__(self, other):
        return isinstance(other, NormalInvCDF)

    def _call(self, x):
        return self.normal_dist.cdf(x)

    def _inverse(self, y):
        return self.normal_dist.icdf(y)

    def log_abs_det_jacobian(self, x, y):
        # The forward map is y = Phi(x), so |dy/dx| = phi(x) and the
        # log-determinant is the standard normal log density at x.
        return self.normal_dist.log_prob(x)
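# Hedged sketch of the probability integral transform, assuming NormalInvCDF
# above is in scope. Pushing a standard normal through its own CDF yields a
# distribution that behaves like Uniform(0, 1), so log densities are near 0.
import torch
from torch.distributions import Normal, TransformedDistribution

base = Normal(0., 1.)
uniform_like = TransformedDistribution(base, [NormalInvCDF()])

u = uniform_like.sample((5,))
print(u)                         # values in (0, 1)
print(uniform_like.log_prob(u))  # approximately zero (density of U(0, 1) is 1)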
class labels_transformer():

    def __init__(self):
        # Per-dimension mean and std of the raw labels (training-set statistics)
        self.labels_mean = torch.FloatTensor([
            0.4761464174454829, 0.5202864583333333, 0.5481813186813186,
            0.5227313915857604, 0.5037803738317757, 0.5662814814814815
        ])
        self.labels_std = torch.FloatTensor([
            0.15228452985134602, 0.15353347248058757, 0.13637365282783034,
            0.15520650375390665, 0.15013557786759546, 0.14697755975897248
        ])
        self.dist = Normal(0, 1)

    def transform_labels(self, true_labels):
        # Standardize, then map through the normal CDF into (0, 1)
        pseudo_labels = (true_labels - self.labels_mean) / self.labels_std
        pseudo_labels = self.dist.cdf(pseudo_labels)
        return pseudo_labels

    def inverse_transform_labels(self, pseudo_labels):
        # Invert: normal icdf, then undo the standardization
        true_labels = self.dist.icdf(pseudo_labels)
        true_labels = true_labels * self.labels_std + self.labels_mean
        return true_labels
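# Hedged usage sketch: a round trip through labels_transformer, using made-up
# label values of the expected 6-dimensional shape.
import torch

tf = labels_transformer()
true = torch.FloatTensor([0.45, 0.52, 0.60, 0.50, 0.48, 0.55])

pseudo = tf.transform_labels(true)                 # values in (0, 1)
recovered = tf.inverse_transform_labels(pseudo)
print(torch.allclose(recovered, true, atol=1e-5))  # True up to numerical error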
def fit_transform(self, x):
    # 1. PCA transform
    pca = PCA(random_state=0)
    pca.fit(x.detach().numpy())
    Q_pca = torch.from_numpy(pca.components_)
    x = torch.mm(x, Q_pca.T)

    # 2. Independent normal cdf transform
    scale, loc = torch.std_mean(x, dim=0)
    ind_normal = Normal(loc, torch.sqrt(scale * scale + self.lam_variance))
    x = ind_normal.cdf(x)
    x = torch.clamp(x, 1e-10, 1 - 1e-10)

    # 3. Independent histogram transform
    histograms = [
        TorchUnitHistogram(n_bins=self.n_bins, alpha=self.alpha).fit(x_col)
        for x_col in x.detach().T
    ]
    x = torch.cat(tuple(
        hist.cdf(x_col).reshape(-1, 1)
        for x_col, hist in zip(x.T, histograms)), dim=1)
    self.histograms_ = histograms

    # 4. Independent inverse standard normal transform
    standard_normal = Normal(loc=torch.zeros_like(loc),
                             scale=torch.ones_like(scale))
    x = standard_normal.icdf(x)
    self.standard_normal_ = standard_normal

    self.Q_pca_ = Q_pca
    self.ind_normal_ = ind_normal
    self._latent = x  # kept only for debugging purposes
    return x
class Wang_distortion():
    """Sample quantile levels for the Wang risk measure (Wang, 2000).

    Parameters
    ----------
    eta: float. Default: -0.75
        eta < 0 produces risk-averse quantile levels.
    """

    def __init__(self, eta=-0.75):
        self.eta = eta
        self.normal = Normal(loc=torch.Tensor([0]), scale=torch.Tensor([1]))

    def sample(self, num_samples):
        """
        Parameters
        ----------
        num_samples: tuple, e.g. (num_samples,)
        """
        taus_uniform = uniform.Uniform(0., 1.).sample(num_samples)
        # Wang distortion: g(tau) = Phi(Phi^{-1}(tau) + eta)
        wang_tau = self.normal.cdf(value=self.normal.icdf(value=taus_uniform) +
                                   self.eta)
        return wang_tau
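# Hedged usage sketch: draw distorted quantile levels from Wang_distortion and
# compare their mean to the uniform baseline of 0.5 (risk-averse eta < 0
# shifts mass toward lower quantiles). Assumes torch.distributions.uniform is
# imported as `uniform` in the class's module.
import torch

torch.manual_seed(0)
wang = Wang_distortion(eta=-0.75)
taus = wang.sample((10000,))
print(taus.min().item(), taus.max().item())  # still within (0, 1)
print(taus.mean().item())                    # noticeably below 0.5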
class MQF2Distribution(torch.distributions.Distribution):
    r"""Distribution class for the model MQF2 proposed in the paper
    ``Multivariate Quantile Function Forecaster``
    by Kan, Aubet, Januschowski, Park, Benidis, Ruthotto, Gasthaus.

    Parameters
    ----------
    picnn
        A SequentialNet instance of a partially input convex neural network (picnn)
    hidden_state
        hidden_state obtained by unrolling the RNN encoder
        shape = (batch_size, context_length, hidden_size) in training
        shape = (batch_size, hidden_size) in inference
    prediction_length
        Length of the prediction horizon
    is_energy_score
        If True, use energy score as objective function,
        otherwise use maximum likelihood as objective function (normalizing flows)
    es_num_samples
        Number of samples drawn to approximate the energy score
    beta
        Hyperparameter of the energy score (power of the two terms)
    threshold_input
        Clamping threshold of the (scaled) input when maximum likelihood is
        used as objective function; this is used to make the forecaster more
        robust to outliers in training samples
    validate_args
        Sets whether validation is enabled or disabled.
        For more details, refer to the descriptions in
        torch.distributions.distribution.Distribution
    """

    def __init__(
        self,
        picnn: torch.nn.Module,
        hidden_state: torch.Tensor,
        prediction_length: int,
        is_energy_score: bool = True,
        es_num_samples: int = 50,
        beta: float = 1.0,
        threshold_input: float = 100.0,
        validate_args: bool = False,
    ) -> None:
        self.picnn = picnn
        self.hidden_state = hidden_state
        self.prediction_length = prediction_length
        self.is_energy_score = is_energy_score
        self.es_num_samples = es_num_samples
        self.beta = beta
        self.threshold_input = threshold_input

        super().__init__(batch_shape=self.batch_shape,
                         validate_args=validate_args)

        self.context_length = (self.hidden_state.shape[-2]
                               if len(self.hidden_state.shape) > 2 else 1)
        self.numel_batch = MQF2Distribution.get_numel(self.batch_shape)

        # standard normal with mean zero and std one
        mu = torch.tensor(0,
                          dtype=hidden_state.dtype,
                          device=hidden_state.device)
        sigma = torch.ones_like(mu)
        self.standard_normal = Normal(mu, sigma)

    def stack_sliding_view(self, z: torch.Tensor) -> torch.Tensor:
        """Auxiliary function for loss computation.

        Unfolds the observations by sliding a window of size prediction_length
        over the observations z, then reshapes the observations into a
        2-dimensional tensor for further computation.

        Parameters
        ----------
        z
            A batch of time series with shape
            (batch_size, context_length + prediction_length - 1)

        Returns
        -------
        Tensor
            Unfolded time series with shape
            (batch_size * context_length, prediction_length)
        """
        z = z.unfold(dimension=-1, size=self.prediction_length, step=1)
        z = z.reshape(-1, z.shape[-1])
        return z

    def loss(self, z: torch.Tensor) -> torch.Tensor:
        if self.is_energy_score:
            return self.energy_score(z)
        else:
            return -self.log_prob(z)

    def log_prob(self, z: torch.Tensor) -> torch.Tensor:
        """Computes the log likelihood log(g(z)) + logdet(dg(z)/dz),
        where g is the gradient of the picnn.

        Parameters
        ----------
        z
            A batch of time series with shape
            (batch_size, context_length + prediction_length - 1)

        Returns
        -------
        loss
            Tensor of shape (batch_size * context_length,)
        """
        z = torch.clamp(z, min=-self.threshold_input, max=self.threshold_input)
        z = self.stack_sliding_view(z)

        loss = self.picnn.logp(
            z, self.hidden_state.reshape(-1, self.hidden_state.shape[-1]))

        return loss

    def energy_score(self, z: torch.Tensor) -> torch.Tensor:
        """Computes the (approximated) energy score sum_i ES(g, z_i), where

        ES(g, z_i) = -1/(2 * es_num_samples^2) * sum_{w, w'} ||w - w'||_2^beta
                     + 1/es_num_samples * sum_{w''} ||w'' - z_i||_2^beta,

        w's are samples drawn from the quantile function g(., h_i)
        (gradient of picnn), h_i is the hidden state associated with z_i,
        and es_num_samples is the number of samples drawn for each of
        w, w', w'' in the energy score approximation.

        Parameters
        ----------
        z
            A batch of time series with shape
            (batch_size, context_length + prediction_length - 1)

        Returns
        -------
        loss
            Tensor of shape (batch_size * context_length,)
        """
        es_num_samples = self.es_num_samples
        beta = self.beta

        z = self.stack_sliding_view(z)
        reshaped_hidden_state = self.hidden_state.reshape(
            -1, self.hidden_state.shape[-1])

        loss = self.picnn.energy_score(z,
                                       reshaped_hidden_state,
                                       es_num_samples=es_num_samples,
                                       beta=beta)

        return loss

    def rsample(self, sample_shape: torch.Size = torch.Size()) -> torch.Tensor:
        """Generates the sample paths.

        Parameters
        ----------
        sample_shape
            Shape of the samples

        Returns
        -------
        sample_paths
            Tensor of shape (batch_size, *sample_shape, prediction_length)
        """
        numel_batch = self.numel_batch
        prediction_length = self.prediction_length

        num_samples_per_batch = MQF2Distribution.get_numel(sample_shape)
        num_samples = num_samples_per_batch * numel_batch

        hidden_state_repeat = self.hidden_state.repeat_interleave(
            repeats=num_samples_per_batch, dim=0)

        alpha = torch.rand(
            (num_samples, prediction_length),
            dtype=self.hidden_state.dtype,
            device=self.hidden_state.device,
            layout=self.hidden_state.layout,
        )

        return self.quantile(alpha, hidden_state_repeat).reshape(
            (numel_batch, ) + sample_shape + (prediction_length, ))

    def quantile(self,
                 alpha: torch.Tensor,
                 hidden_state: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Generates the predicted paths associated with the quantile levels alpha.

        Parameters
        ----------
        alpha
            quantile levels, shape = (batch_shape, prediction_length)
        hidden_state
            hidden_state, shape = (batch_shape, hidden_size)

        Returns
        -------
        results
            predicted paths of shape = (batch_shape, prediction_length)
        """
        if hidden_state is None:
            hidden_state = self.hidden_state

        normal_quantile = self.standard_normal.icdf(alpha)

        # In the energy score approach, we directly draw samples from the picnn.
        # In the MLE (normalizing flows) approach, we need to invert the picnn
        # (go backward through the flow) to draw samples.
        if self.is_energy_score:
            result = self.picnn(normal_quantile, context=hidden_state)
        else:
            result = self.picnn.reverse(normal_quantile, context=hidden_state)

        return result

    @staticmethod
    def get_numel(tensor_shape: torch.Size) -> int:
        # Auxiliary function to compute the number of elements
        # specified in a torch.Size()
        return torch.prod(torch.tensor(tensor_shape)).item()

    @property
    def batch_shape(self) -> torch.Size:
        # last dimension is the hidden state size
        return self.hidden_state.shape[:-1]

    @property
    def event_shape(self) -> Tuple:
        return ()

    @property
    def event_dim(self) -> int:
        return 0
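# Hedged sketch exercising MQF2Distribution's sampling path with a stand-in
# "picnn". A real MQF2 model uses a partially input convex network; TinyPICNN
# below is only a placeholder with the call signature the distribution expects
# in the energy-score branch. Assumes the class's module imports (torch,
# Normal, Optional, Tuple) are in scope.
import torch
import torch.nn as nn

class TinyPICNN(nn.Module):
    def forward(self, quantile_noise, context):
        # maps standard-normal quantiles plus a context shift to "paths"
        return quantile_noise + context[:, :1]

batch_size, hidden_size, pred_len = 3, 8, 4
hidden_state = torch.randn(batch_size, hidden_size)

dist = MQF2Distribution(TinyPICNN(), hidden_state, prediction_length=pred_len,
                        is_energy_score=True)
paths = dist.rsample(torch.Size((5,)))
print(paths.shape)  # torch.Size([3, 5, 4])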
def squeeze_values(mu, x):
    # Map probabilities x through the icdf of N(mu, 1) and back through the
    # cdf of N(-mu, 1); equivalent to Phi(Phi^{-1}(x) + 2*mu).
    norm1 = Normal(mu, 1)
    norm2 = Normal(-mu, 1)
    return norm2.cdf(norm1.icdf(x.clamp(0, 1)))
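# Hedged usage sketch for squeeze_values: with mu > 0 every probability is
# pushed toward 1, e.g. squeeze_values(1.0, 0.5) = Phi(2) ~ 0.977.
import torch

x = torch.tensor([0.1, 0.5, 0.9])
print(squeeze_values(torch.tensor(1.0), x))  # all entries larger than x
print(squeeze_values(torch.tensor(0.0), x))  # mu = 0 leaves x unchanged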
def phi_inv(x):
    # Standard normal inverse CDF, evaluated on the GPU
    normal = Normal(loc=torch.cuda.FloatTensor([0.0]),
                    scale=torch.cuda.FloatTensor([1.0]))
    return normal.icdf(x)
def inverse_transform_labels(pseudo_labels):
    # labels_std and labels_mean are module-level statistics of the raw labels
    dist = Normal(0, 1)
    true_labels = dist.icdf(pseudo_labels)
    true_labels = true_labels * labels_std + labels_mean
    return true_labels
def forward(self, query, key, value):
    batch_size = query.shape[0]
    maxlen = query.shape[1]

    Q = self.fc_q(query)
    K = self.fc_k(key)
    V = self.fc_v(value)
    # Q = [batch size, query len, hid dim]
    # K = [batch size, key len, hid dim]
    # V = [batch size, value len, hid dim]

    Q = Q.view(batch_size, maxlen, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    K = K.view(batch_size, maxlen, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    V = V.view(batch_size, maxlen, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
    # Q = [batch size, n heads, query len, head dim]
    # K = [batch size, n heads, key len, head dim]
    # V = [batch size, n heads, value len, head dim]

    KLD = torch.tensor(0.0)
    if self.args.att_type == 'dot':
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

    elif self.args.att_type == 'ikandirect':
        w1_proj = self.attsharedw.w1
        w2_proj = self.attsharedw.w2
        scores, norm = ika_ns(Q, K, self.args, self.scale, w1_proj, w2_proj,
                              2 * np.pi, self.training)
        energy = torch.log(scores + (1e-5)) + norm

    elif self.args.att_type == 'mikan':
        # Copula-augmented estimation
        mu, logvar, L = self.attsharedw.copulanet(Q, K)
        mu = mu.squeeze(-1)
        logvar = logvar.squeeze(-1)
        var = torch.exp(logvar)

        dim_batch_size, num_head, num_head = L.size()
        dim = int(dim_batch_size / batch_size)

        pos_eps = torch.randn([dim, num_head, self.args.M // 2]).cuda()  # [64, 8, 128 (M/2)]
        X_pos = torch.einsum('ijk,ijl->ijl', L, pos_eps)                 # [64, 8, 128 (M/2)]
        X_pos = torch.clamp(X_pos, min=-2.0, max=2.0)
        U_pos = self.standard_normal_dist.cdf(X_pos)                     # [64, num_head, 128 (M/2)]

        neg_eps = torch.randn([dim, num_head, self.args.M // 2]).cuda()  # [64, 8, 128 (M/2)]
        X_neg = torch.einsum('ijk,ijl->ijl', L, neg_eps)                 # [64, 8, 128 (M/2)]
        X_neg = torch.clamp(X_neg, min=-2.0, max=2.0)
        U_neg = self.standard_normal_dist.cdf(X_neg)                     # [64, num_head, 128 (M/2)]

        marginal_pos = Normal(mu.unsqueeze(-1), var.unsqueeze(-1))       # mu: [64, num_head] / var: [64, num_head]
        marginal_neg = Normal(-1 * mu.unsqueeze(-1), var.unsqueeze(-1))  # mu: [64, num_head] / var: [64, num_head]
        Y_pos = marginal_pos.icdf(U_pos)                                 # [32, 4, 64]
        Y_neg = marginal_neg.icdf(U_neg)

        U = torch.cat([U_pos, U_neg])
        ent_copula = -1 * torch.sum(torch.mul(U, torch.log(U + (1e-5))))

        # Kernel and norm calculation
        z = torch.cat([Y_pos, Y_neg], -1)                                # torch.Size([1, 64, 4, 256])
        w1_proj = self.attsharedw.wnet1(z)
        w2_proj = self.attsharedw.wnet2(z)
        scores, norm = ika_ns(Q, K, self.args, self.scale, w1_proj, w2_proj,
                              2 * np.pi, self.training)
        energy = torch.log(scores + (1e-5)) + norm
        # energy = [batch size, n heads, query len, key len]

        q_dist = tdist.Normal(mu, logvar.exp())
        KLD = torch.distributions.kl_divergence(q_dist, self.p_dist)
        KLD = self.args.kl_lambda * torch.sum(KLD) + \
            self.args.copula_lambda * ent_copula

    attention = torch.softmax(energy, dim=-1)
    # attention = [batch size, n heads, query len, key len]

    x = torch.matmul(self.dropout(attention), V)
    # x = [batch size, n heads, query len, head dim]

    x = x.permute(0, 2, 1, 3).contiguous()
    # x = [batch size, query len, n heads, head dim]

    x = x.view(batch_size, -1, self.args.KEY_DIM)
    # x = [batch size, query len, hid dim]

    x = self.fc_o(x)
    # x = [batch size, query len, hid dim]

    return x, attention, KLD
def macer_train(method, sigma, lbd, gauss_num, beta, gamma, num_classes, model,
                trainloader, optimizer, device, label_smooth='True'):
    m = Normal(torch.tensor([0.0]).to(device),
               torch.tensor([1.0]).to(device))
    cl_total = 0.0
    rl_total = 0.0
    data_size = 0
    correct = 0

    if method == 'macer':
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)
            batch_size = len(inputs)
            data_size += targets.size(0)

            new_shape = [batch_size * gauss_num]
            new_shape.extend(inputs[0].shape)
            inputs = inputs.repeat((1, gauss_num, 1, 1)).view(new_shape)
            noise = torch.randn_like(inputs, device=device) * sigma
            noisy_inputs = inputs + noise

            outputs = model(noisy_inputs)
            outputs = outputs.reshape((batch_size, gauss_num, num_classes))

            # Classification loss
            if label_smooth == 'True':
                labels = label_smoothing(inputs, targets, noise, gauss_num,
                                         num_classes, device)
                criterion = nn.KLDivLoss(size_average=False)
                outputs_logsoftmax = F.log_softmax(outputs, dim=2).mean(1)
                smoothing_label = labels.mean(1)
                classification_loss = criterion(outputs_logsoftmax,
                                                smoothing_label)
            else:
                outputs_softmax = F.softmax(outputs, dim=2).mean(1)
                outputs_logsoftmax = torch.log(outputs_softmax + 1e-10)  # avoid nan
                classification_loss = F.nll_loss(outputs_logsoftmax, targets,
                                                 reduction='sum')
            cl_total += classification_loss.item()

            # Robustness loss
            beta_outputs = outputs * beta  # only apply beta to the robustness loss
            beta_outputs_softmax = F.softmax(beta_outputs, dim=2).mean(1)
            _, predicted = beta_outputs_softmax.max(1)
            correct += predicted.eq(targets).sum().item()

            top2 = torch.topk(beta_outputs_softmax, 2)
            top2_score = top2[0]
            top2_idx = top2[1]
            indices_correct = (top2_idx[:, 0] == targets)  # G_theta

            # Clamp large pA and small pB to avoid nan in the inverse CDF
            out0_correct = top2_score[indices_correct, 0]
            out1_correct = top2_score[indices_correct, 1]
            out0_correct = torch.clamp(out0_correct, 0, 0.9999999)
            out1_correct = torch.clamp(out1_correct, 1e-7, 1)

            # phi^{-1}(pA) - phi^{-1}(pB)
            robustness_loss_correct = m.icdf(out0_correct) - m.icdf(out1_correct)

            # Hinge factor: only keep examples with a small certified margin
            indice_1 = robustness_loss_correct <= gamma
            radius_loss = (robustness_loss_correct[indice_1] * sigma).sum() / 2

            robustness_loss = radius_loss
            rl_total += lbd * robustness_loss.item()

            # Final objective function
            loss = classification_loss - lbd * robustness_loss
            loss /= batch_size
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        cl_total /= data_size
        rl_total /= data_size
        acc = 100 * correct / data_size

        return cl_total, rl_total, acc
    else:
        # Standard (non-MACER) cross-entropy training
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model.forward(inputs)
            loss = nn.CrossEntropyLoss()(outputs, targets)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            cl_total += loss.item() * len(inputs)
            _, predicted = outputs.max(1)
            data_size += targets.size(0)
            correct += predicted.eq(targets).sum().item()

        cl_total /= data_size
        acc = 100 * correct / data_size
        return cl_total, rl_total, acc