# Imports needed by this snippet (the method presumably lives inside an attack wrapper class).
import copy

import numpy as np
import torch


def create_adv_input(self, x, y, model):
    # Attack a copy of the model so the original's state is left untouched.
    model = copy.deepcopy(model)
    # Prepare a single input and its corresponding label as tensors.
    data = torch.from_numpy(np.expand_dims(x, axis=0).astype(np.float32))
    target = torch.from_numpy(np.array([y]).astype(np.int64))
    data.requires_grad = True

    # L-inf Basic Iterative Method (BIM) from advertorch.
    from advertorch.attacks import LinfBasicIterativeAttack
    adversary = LinfBasicIterativeAttack(
        model.forward, eps=self.eps, nb_iter=self.nb_iter)
    perturbed_data = adversary.perturb(data, target)

    # The perturbed prediction has to differ from the label for the attack to succeed.
    output = model.forward(perturbed_data)
    # Get the index of the max log-probability.
    final_pred = output.max(1, keepdim=True)[1]

    if final_pred.item() == target.item():
        return perturbed_data, 0
    else:
        return perturbed_data, 1
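# --- Hypothetical usage sketch (not from the original code) ---
# Assumes create_adv_input is attached to a small wrapper class exposing `eps` and
# `nb_iter`; the names BIMWrapper, sample_x, sample_y and trained_model are illustrative.
class BIMWrapper:
    def __init__(self, eps=0.25, nb_iter=100):
        self.eps = eps          # L-inf perturbation budget
        self.nb_iter = nb_iter  # number of BIM iterations

    create_adv_input = create_adv_input  # reuse the function defined above as a method

# sample_x: one image as a (C, H, W) numpy array in [0, 1]; sample_y: its integer label;
# trained_model: a trained torch.nn.Module.
# perturbed, flipped = BIMWrapper().create_adv_input(sample_x, sample_y, trained_model)
# `flipped` is 1 when the perturbed input changes the model's prediction, 0 otherwise.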
def test_adver(net, tar_net, attack, target):
    net.eval()
    tar_net.eval()
    # BIM
    if attack == 'BIM':
        adversary = LinfBasicIterativeAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            nb_iter=120, eps_iter=0.02,
            clip_min=0.0, clip_max=1.0,
            targeted=opt.target)
    # PGD
    elif attack == 'PGD':
        if opt.target:
            adversary = PGDAttack(
                net,
                loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                eps=0.25,
                nb_iter=11, eps_iter=0.03,
                clip_min=0.0, clip_max=1.0,
                targeted=opt.target)
        else:
            adversary = PGDAttack(
                net,
                loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                eps=0.25,
                nb_iter=6, eps_iter=0.03,
                clip_min=0.0, clip_max=1.0,
                targeted=opt.target)
    # FGSM
    elif attack == 'FGSM':
        adversary = GradientSignAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.26,
            targeted=opt.target)
    # C&W (L2)
    elif attack == 'CW':
        adversary = CarliniWagnerL2Attack(
            net,
            num_classes=10,
            learning_rate=0.45,
            # loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            binary_search_steps=10,
            max_iterations=12,
            targeted=opt.target)

    # ----------------------------------
    # Obtain the accuracy of the model
    # ----------------------------------
    with torch.no_grad():
        correct_netD = 0.0
        total = 0.0
        net.eval()
        for data in testloader:
            inputs, labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            outputs = net(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct_netD += (predicted == labels).sum()
        print('Accuracy of the network on netD: %.2f %%' %
              (100. * correct_netD.float() / total))

    # ----------------------------------
    # Obtain the attack success rate of the model
    # ----------------------------------
    correct = 0.0
    total = 0.0
    tar_net.eval()
    total_L2_distance = 0.0
    for data in testloader:
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = tar_net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        if target:
            # Randomly choose the target label for the targeted attack.
            labels = torch.randint(0, 9, (1,)).to(device)
            # Only attack images that are not already classified as the target label.
            if predicted != labels:
                adv_inputs_ori = adversary.perturb(inputs, labels)
                L2_distance = (torch.norm(adv_inputs_ori - inputs)).item()
                total_L2_distance += L2_distance
                with torch.no_grad():
                    outputs = tar_net(adv_inputs_ori)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum()
        else:
            # Only attack images that are classified correctly.
            if predicted == labels:
                adv_inputs_ori = adversary.perturb(inputs, labels)
                L2_distance = (torch.norm(adv_inputs_ori - inputs)).item()
                total_L2_distance += L2_distance
                with torch.no_grad():
                    outputs = tar_net(adv_inputs_ori)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum()
    if target:
        print('Attack success rate: %.2f %%' %
              (100. * correct.float() / total))
    else:
        print('Attack success rate: %.2f %%' %
              (100.0 - 100. * correct.float() / total))
    print('l2 distance: %.4f ' % (total_L2_distance / total))
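# --- Hypothetical usage sketch (not from the original code) ---
# test_adver relies on module-level globals (testloader, device, opt, nn, and the
# advertorch attack classes). A minimal driver might look like the commented lines
# below; substitute_net and target_net are illustrative names for the trained
# substitute model and the victim model.
#
# from advertorch.attacks import (LinfBasicIterativeAttack, PGDAttack,
#                                 GradientSignAttack, CarliniWagnerL2Attack)
# import torch.nn as nn
#
# for attack_name in ['BIM', 'PGD', 'FGSM', 'CW']:
#     # opt.target selects targeted (True) vs. untargeted (False) evaluation
#     test_adver(substitute_net, target_net, attack_name, opt.target)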
    correct_netD += (predicted == labels).sum()
print('Accuracy of the network on netD: %.2f %%' %
      (100. * correct_netD.float() / total))

################################################
# estimate the attack success rate of initial D:
################################################
correct_ghost = 0.0
total = 0.0
netD.eval()
for data in testloader:
    inputs, labels = data
    inputs = inputs.cuda()
    labels = labels.cuda()
    adv_inputs_ghost = adversary_ghost.perturb(inputs, labels)
    with torch.no_grad():
        if opt.dataset == 'azure':
            # Query the remote Azure model for predictions on the adversarial batch.
            predicted = cal_azure(clf, adv_inputs_ghost)
        else:
            outputs = original_net(adv_inputs_ghost)
            _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct_ghost += (predicted == labels).sum()
print('Attack success rate: %.2f %%' %
      (100 - 100. * correct_ghost.float() / total))
del inputs, labels, adv_inputs_ghost
torch.cuda.empty_cache()
gc.collect()

batch_num = 1000
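# --- Assumed construction of adversary_ghost (not shown in this excerpt) ---
# adversary_ghost is presumably a white-box attack built on the substitute model netD,
# whose adversarial examples are then transferred to the black-box original_net above.
# A plausible sketch using advertorch's BIM (hyperparameters are illustrative only):
#
# from advertorch.attacks import LinfBasicIterativeAttack
# adversary_ghost = LinfBasicIterativeAttack(
#     netD, loss_fn=nn.CrossEntropyLoss(reduction="sum"),
#     eps=0.25, nb_iter=100, eps_iter=0.02,
#     clip_min=0.0, clip_max=1.0, targeted=False)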