import copy

import numpy as np
import torch
from advertorch.attacks import LinfBasicIterativeAttack


def create_adv_input(self, x, y, model):
    # Attack a deep copy so the caller's model is never mutated.
    model = copy.deepcopy(model)

    # Wrap the single example and its label as 1-element batches.
    data = torch.from_numpy(np.expand_dims(x, axis=0).astype(np.float32))
    target = torch.from_numpy(np.array([y]).astype(np.int64))
    data.requires_grad = True

    adversary = LinfBasicIterativeAttack(
        model.forward, eps=self.eps, nb_iter=self.nb_iter)
    perturbed_data = adversary.perturb(data, target)

    # The attack succeeds only if the adversarial prediction differs
    # from the true label.
    output = model.forward(perturbed_data)
    # Index of the max log-probability is the predicted class.
    final_pred = output.max(1, keepdim=True)[1]

    if final_pred.item() == target.item():
        return perturbed_data, 0  # attack failed
    else:
        return perturbed_data, 1  # attack succeeded
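# --- Usage sketch (not from the original) ----------------------------------
# create_adv_input only reads self.eps and self.nb_iter, so a SimpleNamespace
# can stand in for the unseen enclosing class; the one-layer net and random
# input below are likewise hypothetical stand-ins, not a trained model.
from types import SimpleNamespace

import numpy as np
import torch.nn as nn

cfg = SimpleNamespace(eps=0.3, nb_iter=40)
net = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))  # stand-in model
x = np.random.rand(1, 28, 28).astype(np.float32)           # MNIST-shaped dummy

adv_x, success = create_adv_input(cfg, x, y=3, model=net)
print('attack succeeded' if success else 'attack failed')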
def make_adversary_dict(model, model_name, targeted=False):
    if model_name == "capsnet":
        model_for_adversary = Model_for_Adversary_Caps(model)
    else:
        model_for_adversary = Model_for_Adversary_CNN(model)

    # Attack budgets in raw [0, 1] pixel space; dividing by the MNIST std
    # (0.3081) below converts them into the normalized input space.
    linf_eps = 0.3
    fgsm_step = 0.05
    bim_pgd_step = 0.01

    # clip_min/clip_max are the normalized MNIST pixel bounds:
    # (0 - 0.1307) / 0.3081 = -0.4242 and (1 - 0.1307) / 0.3081 = 2.8215.
    adversary_dict = {}
    adversary_dict['Clean'] = CleanAttack(clip_min=-0.4242, clip_max=2.8215)
    adversary_dict['PGD'] = LinfPGDAttack(
        model_for_adversary,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=(linf_eps / 0.3081),
        nb_iter=100,
        eps_iter=(bim_pgd_step / 0.3081),
        rand_init=True,
        clip_min=-0.4242,
        clip_max=2.8215,
        targeted=targeted)
    adversary_dict['FGSM'] = GradientSignAttack(
        model_for_adversary,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=(fgsm_step / 0.3081),
        clip_min=-0.4242,
        clip_max=2.8215,
        targeted=targeted)
    adversary_dict['BIM'] = LinfBasicIterativeAttack(
        model_for_adversary,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=(linf_eps / 0.3081),
        nb_iter=100,
        eps_iter=(bim_pgd_step / 0.3081),
        clip_min=-0.4242,
        clip_max=2.8215,
        targeted=targeted)
    return adversary_dict
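# --- Consumption sketch (not from the original) -----------------------------
# Assumes the surrounding code provides `cnn`, `device`, and a `test_loader`
# whose transforms include Normalize((0.1307,), (0.3081,)), matching the
# clip bounds used above.
import torch

adversaries = make_adversary_dict(cnn, model_name="cnn", targeted=False)

for name, adversary in adversaries.items():
    correct, total = 0, 0
    for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        x_adv = adversary.perturb(x, y)  # every advertorch attack has perturb()
        with torch.no_grad():
            correct += (cnn(x_adv).argmax(dim=1) == y).sum().item()
        total += y.size(0)
    print('%s: %.2f%% accuracy under attack' % (name, 100. * correct / total))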
def test_adver(net, tar_net, attack, target):
    net.eval()
    tar_net.eval()

    # Build the requested adversary against the substitute network `net`.
    if attack == 'BIM':
        adversary = LinfBasicIterativeAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            nb_iter=120,
            eps_iter=0.02,
            clip_min=0.0,
            clip_max=1.0,
            targeted=target)
    elif attack == 'PGD':
        if target:
            adversary = PGDAttack(
                net,
                loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                eps=0.25,
                nb_iter=11,
                eps_iter=0.03,
                clip_min=0.0,
                clip_max=1.0,
                targeted=target)
        else:
            adversary = PGDAttack(
                net,
                loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                eps=0.25,
                nb_iter=6,
                eps_iter=0.03,
                clip_min=0.0,
                clip_max=1.0,
                targeted=target)
    elif attack == 'FGSM':
        adversary = GradientSignAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.26,
            targeted=target)
    elif attack == 'CW':
        adversary = CarliniWagnerL2Attack(
            net,
            num_classes=10,
            learning_rate=0.45,
            binary_search_steps=10,
            max_iterations=12,
            targeted=target)

    # ----------------------------------
    # Accuracy of the substitute model
    # ----------------------------------
    with torch.no_grad():
        correct_netD = 0.0
        total = 0.0
        for data in testloader:
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct_netD += (predicted == labels).sum()
    print('Accuracy of the network on netD: %.2f %%' %
          (100. * correct_netD.float() / total))

    # ----------------------------------
    # Attack success rate against the target model
    # ----------------------------------
    correct = 0.0
    total = 0.0
    total_L2_distance = 0.0
    for data in testloader:
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = tar_net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        if target:
            # Draw a random target class in [0, 9]; torch.randint's upper
            # bound is exclusive, hence 10.
            labels = torch.randint(0, 10, (1, )).to(device)
            # Only attack images not already classified as the target label.
            if predicted != labels:
                adv_inputs_ori = adversary.perturb(inputs, labels)
                L2_distance = (torch.norm(adv_inputs_ori - inputs)).item()
                total_L2_distance += L2_distance
                with torch.no_grad():
                    outputs = tar_net(adv_inputs_ori)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum()
        else:
            # Only attack images the target model classifies correctly.
            if predicted == labels:
                adv_inputs_ori = adversary.perturb(inputs, labels)
                L2_distance = (torch.norm(adv_inputs_ori - inputs)).item()
                total_L2_distance += L2_distance
                with torch.no_grad():
                    outputs = tar_net(adv_inputs_ori)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum()

    if target:
        # Targeted: success means the target model predicts the chosen label.
        print('Attack success rate: %.2f %%' %
              (100. * correct.float() / total))
    else:
        # Untargeted: success means the prediction no longer matches the label.
        print('Attack success rate: %.2f %%' %
              (100.0 - 100. * correct.float() / total))
    print('l2 distance: %.4f ' % (total_L2_distance / total))
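# --- Example invocations (models and globals assumed set up elsewhere) ------
# test_adver reads `testloader` and `device` as globals, so a call only
# selects the attack and whether it is targeted.
test_adver(net, tar_net, attack='PGD', target=False)  # untargeted PGD
test_adver(net, tar_net, attack='CW', target=True)    # targeted Carlini-Wagner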
        transform=transforms.Compose([
            # transforms.Pad(2, padding_mode="symmetric"),
            transforms.ToTensor(),
            # transforms.RandomCrop(32, 4),
            # normalize,
        ]))
    netD = Net_l().cuda()
    netD = nn.DataParallel(netD)
    clf = joblib.load('pretrained/sklearn_mnist_model.pkl')
    adversary_ghost = LinfBasicIterativeAttack(
        netD,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=0.25,
        nb_iter=100,
        eps_iter=0.01,
        clip_min=0.0,
        clip_max=1.0,
        targeted=False)
    nc = 1
elif opt.dataset == 'mnist':
    testset = torchvision.datasets.MNIST(
        root='dataset/',
        train=False,
        download=True,
        transform=transforms.Compose([
            # transforms.Pad(2, padding_mode="symmetric"),
            transforms.ToTensor(),
            # transforms.RandomCrop(32, 4),
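# --- Likely consumption (sketch; the fragment above cuts off before use) ----
# Assumes a `testloader` built from `testset`; feeding BIM-perturbed images
# to the sklearn classifier `clf` loaded above is an assumption, not shown
# in the original fragment.
for inputs, labels in testloader:
    inputs, labels = inputs.cuda(), labels.cuda()
    adv_inputs = adversary_ghost.perturb(inputs, labels)  # untargeted BIM
    # sklearn models expect flat CPU numpy arrays.
    flat = adv_inputs.detach().cpu().numpy().reshape(adv_inputs.size(0), -1)
    preds = clf.predict(flat)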
model = SmallCNN().cuda()
model.load_state_dict(torch.load('./models/trades.pt'))
model.eval()

sub = SmallCNN().cuda()
sub.load_state_dict(torch.load('./substitute_models/mnist_trades.pt'))
sub.eval()

# White-box attacks are crafted on `model` itself, transfer attacks on the
# substitute `sub`. nn.CrossEntropyLoss(size_average=False) is deprecated;
# reduction="sum" is the equivalent modern spelling.
adversaries = [
    GradientSignAttack(model, nn.CrossEntropyLoss(reduction="sum"), eps=0.3),
    GradientSignAttack(sub, nn.CrossEntropyLoss(reduction="sum"), eps=0.3),
    LinfBasicIterativeAttack(model, nn.CrossEntropyLoss(reduction="sum"),
                             eps=0.3, nb_iter=40, eps_iter=0.01),
    LinfBasicIterativeAttack(sub, nn.CrossEntropyLoss(reduction="sum"),
                             eps=0.3, nb_iter=40, eps_iter=0.01),
]

_, _, test_loader = get_mnist_data_loaders()

for adversary in adversaries:
    correct_adv = 0
    for i, (x_batch, y_batch) in enumerate(test_loader):
        x_batch, y_batch = x_batch.cuda(), y_batch.cuda()
        adv_x_batch = adversary.perturb(x_batch, y_batch)
        logits = model(adv_x_batch)
        # Tally how many adversarial examples the defended model still
        # classifies correctly.
        correct_adv += (logits.argmax(dim=1) == y_batch).sum().item()
    print('robust accuracy: %.2f %%' %
          (100. * correct_adv / len(test_loader.dataset)))
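# --- Clean-accuracy baseline (sketch, not in the original) ------------------
# For context when reading the adversarial numbers above: the same loop on
# unperturbed test data.
correct_clean = 0
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        x_batch, y_batch = x_batch.cuda(), y_batch.cuda()
        correct_clean += (model(x_batch).argmax(dim=1) == y_batch).sum().item()
print('clean accuracy: %.2f %%' %
      (100. * correct_clean / len(test_loader.dataset)))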