import time

import torch
from torch.nn import functional as F

# `record_time`, `Params`, and `Model` are project-local helpers assumed to be
# importable from the surrounding repository.


def compute_backdoor_loss(params, model, criterion, inputs_back,
                          labels_back, grads=None):
    t = time.perf_counter()
    outputs = model(inputs_back)
    record_time(params, t, 'forward')

    if params.task == 'pipa':
        # PIPA: down-weight the per-sample loss on class 0 and zero out the
        # whole batch when no backdoor labels are present. This indexing
        # requires a criterion constructed with reduction='none'.
        loss = criterion(outputs, labels_back)
        loss[labels_back == 0] *= 0.001
        if labels_back.sum().item() == 0.0:
            loss[:] = 0.0
        loss = loss.mean()
    else:
        loss = criterion(outputs, labels_back)
        if not params.dp:
            loss = loss.mean()

    if grads:
        grads = get_grads(params, model, loss)

    return loss, grads

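# A minimal usage sketch (hypothetical names; assumes `params` carries `task`
# and `dp`, and that the criterion uses reduction='none' so the per-sample
# masking above can index into the loss tensor):
#
#   criterion = torch.nn.CrossEntropyLoss(reduction='none')
#   loss, grads = compute_backdoor_loss(params, model, criterion,
#                                       inputs_back, labels_back, grads=True)
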
def get_grads(params, model, loss):
    t = time.perf_counter()
    grads = list(torch.autograd.grad(
        loss.mean(),
        [x for x in model.parameters() if x.requires_grad],
        retain_graph=True))
    record_time(params, t, 'backward')

    return grads

def compute_latent_cosine_similarity(params: Params, model: Model,
                                     fixed_model: Model,
                                     inputs, grads=None):
    if not fixed_model:
        return torch.tensor(0.0), None

    t = time.perf_counter()
    with torch.no_grad():
        _, fixed_latent = fixed_model(inputs, latent=True)
    # The current model's latents must stay in the autograd graph, so this
    # call is made outside the no_grad block.
    _, latent = model(inputs, latent=True)
    record_time(params, t, 'forward')

    # Cosine similarity in [-1, 1] mapped to a loss in [0, 2]: 0 when the
    # latents are identical, 2 when they point in opposite directions.
    loss = -torch.cosine_similarity(latent, fixed_latent).mean() + 1

    if grads:
        grads = get_grads(params, model, loss)

    return loss, grads

def compute_normal_loss(params, model, criterion, inputs, labels, grads):
    t = time.perf_counter()
    outputs = model(inputs)
    record_time(params, t, 'forward')

    loss = criterion(outputs, labels)
    if not params.dp:
        loss = loss.mean()

    if grads:
        # Identical to the inlined autograd call in the original; reuse
        # get_grads to avoid duplication.
        grads = get_grads(params, model, loss)

    return loss, grads

def get_latent_grads(params, model, inputs, labels):
    model.eval()
    model.zero_grad()

    t = time.perf_counter()
    pred, _ = model(inputs)
    record_time(params, t, 'forward')

    # One-hot mask that keeps only the logit of each sample's target label.
    z = torch.zeros_like(pred)
    z[list(range(labels.shape[0])), labels] = 1
    pred = pred * z

    t = time.perf_counter()
    pred.sum().backward(retain_graph=True)
    record_time(params, t, 'backward')

    # `model.get_gradient()` is a project-specific hook that exposes the
    # gradients captured at the last convolutional block (Grad-CAM style);
    # keep only the samples carrying the backdoor label.
    gradients = model.get_gradient()[labels == params.backdoor_label]
    pooled_gradients = torch.mean(gradients, dim=[0, 2, 3]).detach()
    model.zero_grad()

    return pooled_gradients

def compute_spectral_evasion_loss(params: Params, model: Model,
                                  fixed_model: Model,
                                  inputs, grads=None):
    """
    Evades spectral analysis defenses by preserving the latent representation
    on non-backdoored inputs.

    Uses a checkpointed, non-backdoored `fixed_model` as a reference and
    penalizes the distance between the two latents, using either the
    Euclidean norm or cosine similarity.

    :param params: training parameters
    :param model: current model
    :param fixed_model: saved non-backdoored model used as a reference
    :param inputs: training data inputs
    :param grads: whether to compute gradients
    :return: loss and (optionally) gradients
    """
    if not fixed_model:
        return torch.tensor(0.0), None

    t = time.perf_counter()
    with torch.no_grad():
        _, fixed_latent = fixed_model(inputs, latent=True)
    # Keep the current model's latents in the autograd graph.
    _, latent = model(inputs, latent=True)
    record_time(params, t, 'latent_fixed')

    if params.spectral_similarity == 'norm':
        loss = torch.norm(latent - fixed_latent, dim=1).mean()
    elif params.spectral_similarity == 'cosine':
        loss = -torch.cosine_similarity(latent, fixed_latent).mean() + 1
    else:
        raise ValueError('Specify a correct similarity metric for '
                         'spectral evasion: [norm, cosine].')

    if grads:
        grads = get_grads(params, model, loss)

    return loss, grads

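# A minimal usage sketch (hypothetical weighting; assumes `fixed_model` is a
# frozen checkpoint saved before backdoor training and that
# `params.spectral_similarity` is set to 'norm' or 'cosine'):
#
#   evasion_loss, _ = compute_spectral_evasion_loss(params, model,
#                                                   fixed_model, inputs)
#   total_loss = normal_loss + 0.5 * evasion_loss
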
def compute_sentinet_evasion(params, model, inputs, inputs_back,
                             labels_back, grads=None):
    """
    Evades the SentiNet defense by matching the Grad-CAM heatmaps of clean
    and backdoored inputs. The Grad-CAM design is taken from:
    https://medium.com/@stepanulyanin/implementing-grad-cam-in-pytorch-ea0937c31e82

    :param params: training parameters
    :param model: current model
    :param inputs: clean training inputs
    :param inputs_back: backdoored training inputs
    :param labels_back: labels of the backdoored inputs
    :param grads: whether to compute gradients
    :return: loss and (optionally) gradients
    """
    # Grad-CAM heatmap for the clean inputs.
    pooled = get_latent_grads(params, model, inputs, labels_back)
    t = time.perf_counter()
    features = model.features(inputs)
    record_time(params, t, 'forward')
    # Weight the feature maps by the pooled gradients (assumes 512 channels
    # in the final convolutional block).
    features = features * pooled.view(1, 512, 1, 1)

    # Grad-CAM heatmap for the backdoored inputs.
    pooled_back = get_latent_grads(params, model, inputs_back, labels_back)
    t = time.perf_counter()
    back_features = model.features(inputs_back)
    record_time(params, t, 'forward')
    back_features = back_features * pooled_back.view(1, 512, 1, 1)

    # Collapse to normalized single-channel heatmaps.
    features = torch.mean(features, dim=[0, 1], keepdim=True)
    features = F.relu(features) / features.max()
    back_features = torch.mean(back_features, dim=[0, 1], keepdim=True)
    back_features = F.relu(back_features) / back_features.max()

    # Penalize regions where the backdoored heatmap exceeds the clean one.
    loss = F.relu(back_features - features).max() * 10

    if grads:
        t = time.perf_counter()
        loss.backward(retain_graph=True)
        record_time(params, t, 'backward')
        grads = params.copy_grad(model)

    return loss, grads

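# A minimal usage sketch (hypothetical; assumes `model.features` exposes the
# final 512-channel convolutional block used by the Grad-CAM computation and
# that `params.copy_grad` snapshots the model's gradients):
#
#   loss, grads = compute_sentinet_evasion(params, model, inputs,
#                                          inputs_back, labels_back,
#                                          grads=True)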