Example #1
0
    def create_adv_input(self, x, y, model):
        """Craft an L-inf basic-iterative adversarial example for one sample.

        Args:
            x: A single input sample as an array; it is given a leading
                axis of size 1 and cast to float32.
            y: The label of ``x``; it is wrapped in a length-1 int64 tensor.
            model: The network to attack. The attack and the final
                prediction both run against a deep copy of it.

        Returns:
            A tuple ``(perturbed_data, flag)``. ``flag`` is ``1`` when the
            copied model's prediction on the perturbed input differs from
            ``y`` and ``0`` when it is unchanged.
        """
        # All work below happens on a private copy of the network.
        attacked_model = copy.deepcopy(model)

        # Single-sample batch and its label, as tensors.
        label = torch.from_numpy(np.array([y]).astype(np.int64))
        batch = torch.from_numpy(np.expand_dims(x, axis=0).astype(np.float32))
        batch.requires_grad = True

        from advertorch.attacks import LinfBasicIterativeAttack
        attack = LinfBasicIterativeAttack(
            attacked_model.forward, eps=self.eps, nb_iter=self.nb_iter)
        perturbed_data = attack.perturb(batch, label)

        # Index of the highest score along dim 1 is the predicted class.
        scores = attacked_model.forward(perturbed_data)
        _, predicted = scores.max(1, keepdim=True)

        # 1 -> prediction changed, 0 -> prediction still equals the label.
        prediction_changed = predicted.item() != label.item()
        return perturbed_data, int(prediction_changed)
Example #2
0
def test_adver(net, tar_net, attack, target, num_classes=10):
    """Print the clean accuracy of ``net`` and the attack success on ``tar_net``.

    Adversarial examples are crafted against ``net`` and then classified by
    ``tar_net``. Data comes from the module-level ``testloader`` and tensors
    are moved to the module-level ``device``.

    Args:
        net: Model the adversary is built on; its accuracy on the test
            loader is printed first.
        tar_net: Model whose predictions on the adversarial examples decide
            whether an attack counts as successful.
        attack: One of ``'BIM'``, ``'PGD'``, ``'FGSM'`` or ``'CW'``.
        target: Truthy for a targeted attack (a random target class is drawn
            per sample), falsy for an untargeted attack. The same flag is
            used to configure the adversary and to score the results.
        num_classes: Number of classes; bounds the random target labels and
            is passed to the CW attack. Defaults to 10.

    Raises:
        ValueError: If ``attack`` is not one of the supported names.
    """
    net.eval()
    tar_net.eval()
    targeted = bool(target)

    # ----------------------------------
    # Build the adversary
    # ----------------------------------
    if attack == 'BIM':
        adversary = LinfBasicIterativeAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            nb_iter=120,
            eps_iter=0.02,
            clip_min=0.0,
            clip_max=1.0,
            targeted=targeted)
    elif attack == 'PGD':
        adversary = PGDAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            # The targeted variant runs more iterations than the untargeted.
            nb_iter=11 if targeted else 6,
            eps_iter=0.03,
            clip_min=0.0,
            clip_max=1.0,
            targeted=targeted)
    elif attack == 'FGSM':
        adversary = GradientSignAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.26,
            targeted=targeted)
    elif attack == 'CW':
        adversary = CarliniWagnerL2Attack(
            net,
            num_classes=num_classes,
            learning_rate=0.45,
            binary_search_steps=10,
            max_iterations=12,
            targeted=targeted)
    else:
        # Fail here instead of with a NameError on `adversary` further down.
        raise ValueError(
            "unknown attack %r; expected 'BIM', 'PGD', 'FGSM' or 'CW'"
            % (attack,))

    # ----------------------------------
    # Obtain the accuracy of the model
    # ----------------------------------
    clean_correct = 0
    clean_total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            _, predicted = torch.max(net(inputs), 1)
            clean_total += labels.size(0)
            clean_correct += (predicted == labels).sum().item()
    # An empty loader yields nan rather than a division error.
    clean_accuracy = (100. * clean_correct / clean_total
                      if clean_total else float('nan'))
    print('Accuracy of the network on netD: %.2f %%' % clean_accuracy)

    # ----------------------------------
    # Obtain the attack success rate of the model
    # ----------------------------------
    matched = 0      # adversarial predictions equal to the attack label
    attacked = 0     # samples that were actually perturbed
    total_L2_distance = 0.0
    for inputs, labels in testloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            _, predicted = torch.max(tar_net(inputs), 1)

        if targeted:
            # Randomly choose a target label per sample. The upper bound of
            # torch.randint is exclusive, so num_classes covers every class.
            labels = torch.randint(0, num_classes, labels.shape).to(device)
            # Only attack images not already classified as the target.
            eligible = predicted != labels
        else:
            # Only attack images the target model classifies correctly.
            eligible = predicted == labels

        if not eligible.any():
            continue

        clean_inputs = inputs[eligible]
        attack_labels = labels[eligible]
        adv_inputs_ori = adversary.perturb(clean_inputs, attack_labels)

        # Sum of per-sample L2 norms; for a batch of one this equals the
        # norm of the whole difference tensor.
        delta = (adv_inputs_ori - clean_inputs).reshape(
            clean_inputs.size(0), -1)
        total_L2_distance += delta.norm(dim=1).sum().item()

        with torch.no_grad():
            _, adv_predicted = torch.max(tar_net(adv_inputs_ori), 1)
        attacked += attack_labels.size(0)
        matched += (adv_predicted == attack_labels).sum().item()

    if attacked == 0:
        # Nothing was eligible, so the rates below would divide by zero.
        print('No test samples were eligible for the attack.')
        return

    match_rate = 100. * matched / attacked
    if targeted:
        # Success means the target model outputs the chosen target label.
        print('Attack success rate: %.2f %%' % match_rate)
    else:
        # Success means the target model no longer outputs the true label.
        print('Attack success rate: %.2f %%' % (100.0 - match_rate))
    print('l2 distance:  %.4f ' % (total_L2_distance / attacked))
Example #3
0
        correct_netD += (predicted == labels).sum()
    print('Accuracy of the network on netD: %.2f %%' %
          (100. * correct_netD.float() / total))

################################################
# Estimate the attack success rate of initial D:
# perturb every test batch with adversary_ghost, classify the result, and
# report 100 minus the percentage still predicted as the original label.
################################################
# correct_ghost counts adversarial samples whose prediction still equals the
# label; total counts all samples seen.
correct_ghost = 0.0
total = 0.0
netD.eval()
for data in testloader:
    inputs, labels = data
    inputs = inputs.cuda()
    labels = labels.cuda()

    # The adversarial batch is crafted outside the no_grad block below.
    # NOTE(review): adversary_ghost is presumably built on netD — confirm.
    adv_inputs_ghost = adversary_ghost.perturb(inputs, labels)
    with torch.no_grad():
        if opt.dataset == 'azure':
            # Azure dataset: predictions come from cal_azure(clf, ...).
            predicted = cal_azure(clf, adv_inputs_ghost)
        else:
            # Otherwise classify with original_net and take the index of the
            # highest score along dim 1.
            outputs = original_net(adv_inputs_ghost)
            _, predicted = torch.max(outputs.data, 1)
    total += labels.size(0)
    correct_ghost += (predicted == labels).sum()
# NOTE(review): .float() requires correct_ghost to have become a tensor, i.e.
# at least one batch must have run — confirm testloader is never empty.
print('Attack success rate: %.2f %%' %
      (100 - 100. * correct_ghost.float() / total))
# Drop the references to the last batch, then empty the CUDA cache and run
# the garbage collector.
del inputs, labels, adv_inputs_ghost
torch.cuda.empty_cache()
gc.collect()

# NOTE(review): the consumer of batch_num is not visible in this excerpt —
# presumably a batch count for the code that follows; confirm.
batch_num = 1000