Example #1
    def create_adv_input(self, x, y, model):
        # Prepare copied model
        model = copy.deepcopy(model)

        # Prepare input and corresponding label
        data = torch.from_numpy(np.expand_dims(x, axis=0).astype(np.float32))
        target = torch.from_numpy(np.array([y]).astype(np.int64))
        data.requires_grad = True

        from advertorch.attacks import LinfBasicIterativeAttack
        adversary = LinfBasicIterativeAttack(model.forward,
                                             eps=self.eps,
                                             nb_iter=self.nb_iter)
        perturbed_data = adversary.perturb(data, target)

        # The attack succeeds only if the prediction on the perturbed input differs from the true label
        output = model.forward(perturbed_data)
        final_pred = output.max(
            1, keepdim=True)[1]  # get the index of the max log-probability

        if final_pred.item() == target.item():
            return perturbed_data, 0  # attack failed: prediction unchanged
        else:
            return perturbed_data, 1  # attack succeeded: prediction flipped
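The method above is a thin wrapper around advertorch. For reference, a minimal standalone sketch of the same attack on a toy classifier; the model, input shape, and hyperparameters here are illustrative assumptions, not taken from the example:

import torch
import torch.nn as nn
from advertorch.attacks import LinfBasicIterativeAttack

# Toy stand-in for the wrapped model: flattens a 28x28 single-channel image to 10 logits.
model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
model.eval()

adversary = LinfBasicIterativeAttack(
    model,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    eps=0.3,        # maximum L-inf perturbation
    nb_iter=40,     # number of BIM iterations
    eps_iter=0.01,  # step size per iteration
    clip_min=0.0,
    clip_max=1.0,
    targeted=False)

data = torch.rand(1, 1, 28, 28)   # dummy input in [0, 1]
target = torch.tensor([3])        # dummy label
perturbed = adversary.perturb(data, target)
print((perturbed - data).abs().max().item())  # bounded by eps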
Example #2
def make_adversary_dict(model, model_name, targetted=False):
    if (model_name == "capsnet"):
        model_for_adversary = Model_for_Adversary_Caps(model)
    else:
        model_for_adversary = Model_for_Adversary_CNN(model)

    # Attack budgets expressed in pixel space; below they are divided by 0.3081,
    # which appears to be the MNIST normalization std, since the model is assumed
    # to take normalized inputs.
    linf_eps = 0.3
    fgsm_step = 0.05
    bim_pgd_step = 0.01

    adversary_dict = {}
    adversary_dict['Clean'] = CleanAttack(clip_min=-0.4242, clip_max=2.8215)
    adversary_dict['PGD'] = LinfPGDAttack(
        model_for_adversary,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=(linf_eps / 0.3081),
        nb_iter=100,
        eps_iter=(bim_pgd_step / 0.3081),
        rand_init=True,
        clip_min=-0.4242,
        clip_max=2.8215,
        targeted=targetted)

    adversary_dict['FGSM'] = GradientSignAttack(
        model_for_adversary,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=(fgsm_step / 0.3081),
        clip_min=-0.4242,
        clip_max=2.8215,
        targeted=targetted)
    adversary_dict['BIM'] = LinfBasicIterativeAttack(
        model_for_adversary,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=(linf_eps / 0.3081),
        nb_iter=100,
        eps_iter=(bim_pgd_step / 0.3081),
        clip_min=-0.4242,
        clip_max=2.8215,
        targeted=targetted)

    return adversary_dict
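The eps values and clip bounds in this example look unusual because the model is assumed to consume normalized MNIST inputs. A small sketch of that conversion; the mean/std values are inferred from the constants above, not stated in the source:

# Converting pixel-space budgets to normalized-input space for MNIST.
mean, std = 0.1307, 0.3081           # assumed MNIST normalization
linf_eps = 0.3
eps_normalized = linf_eps / std      # ~0.9737, matches eps=(linf_eps / 0.3081)
clip_min = (0.0 - mean) / std        # ~-0.4242
clip_max = (1.0 - mean) / std        # ~2.8215
print(eps_normalized, clip_min, clip_max)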
Example #3
def test_adver(net, tar_net, attack, target):
    net.eval()
    tar_net.eval()
    # BIM
    if attack == 'BIM':
        adversary = LinfBasicIterativeAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            nb_iter=120,
            eps_iter=0.02,
            clip_min=0.0,
            clip_max=1.0,
            targeted=opt.target)
    # PGD
    elif attack == 'PGD':
        if opt.target:
            adversary = PGDAttack(net,
                                  loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                                  eps=0.25,
                                  nb_iter=11,
                                  eps_iter=0.03,
                                  clip_min=0.0,
                                  clip_max=1.0,
                                  targeted=opt.target)
        else:
            adversary = PGDAttack(net,
                                  loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                                  eps=0.25,
                                  nb_iter=6,
                                  eps_iter=0.03,
                                  clip_min=0.0,
                                  clip_max=1.0,
                                  targeted=opt.target)
    # FGSM
    elif attack == 'FGSM':
        adversary = GradientSignAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.26,
            targeted=opt.target)
    elif attack == 'CW':
        adversary = CarliniWagnerL2Attack(
            net,
            num_classes=10,
            learning_rate=0.45,
            # loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            binary_search_steps=10,
            max_iterations=12,
            targeted=opt.target)

    # ----------------------------------
    # Obtain the accuracy of the model
    # ----------------------------------

    with torch.no_grad():
        correct_netD = 0.0
        total = 0.0
        net.eval()
        for data in testloader:
            inputs, labels = data
            inputs = inputs.cuda()
            labels = labels.cuda()
            outputs = net(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct_netD += (predicted == labels).sum()
        print('Accuracy of the network on netD: %.2f %%' %
              (100. * correct_netD.float() / total))

    # ----------------------------------
    # Obtain the attack success rate of the model
    # ----------------------------------

    correct = 0.0
    total = 0.0
    tar_net.eval()
    total_L2_distance = 0.0
    for data in testloader:
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = tar_net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        if target:
            # randomly choose a target label for the targeted attack
            # (torch.randint's upper bound is exclusive, so class 9 is never drawn here)
            labels = torch.randint(0, 9, (1, )).to(device)
            # only attack images that are not already classified as the target label
            if predicted != labels:
                # print(total)
                adv_inputs_ori = adversary.perturb(inputs, labels)
                L2_distance = (torch.norm(adv_inputs_ori - inputs)).item()
                total_L2_distance += L2_distance
                with torch.no_grad():
                    outputs = tar_net(adv_inputs_ori)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum()
        else:
            # only attack images that the target model already classifies correctly
            if predicted == labels:
                # print(total)
                adv_inputs_ori = adversary.perturb(inputs, labels)
                L2_distance = (torch.norm(adv_inputs_ori - inputs)).item()
                total_L2_distance += L2_distance
                with torch.no_grad():
                    outputs = tar_net(adv_inputs_ori)
                    _, predicted = torch.max(outputs.data, 1)

                    total += labels.size(0)
                    correct += (predicted == labels).sum()

    if target:
        print('Attack success rate: %.2f %%' %
              (100. * correct.float() / total))
    else:
        print('Attack success rate: %.2f %%' %
              (100.0 - 100. * correct.float() / total))
    print('l2 distance:  %.4f ' % (total_L2_distance / total))
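The bookkeeping in the two loops above amounts to: an untargeted attack succeeds when the prediction moves away from the true label, a targeted attack when it lands on the chosen target. A self-contained sketch of that logic; the helper name and example values are mine, not from the source:

import torch

def attack_success_rate(preds, labels, targeted):
    preds, labels = torch.as_tensor(preds), torch.as_tensor(labels)
    if targeted:
        # targeted: success means the model outputs the chosen target label
        return 100.0 * (preds == labels).float().mean().item()
    # untargeted: success means the prediction no longer matches the true label
    return 100.0 * (preds != labels).float().mean().item()

print(attack_success_rate([1, 2, 3], [1, 0, 3], targeted=False))  # ~33.33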
Example #4
File: dast.py  Project: zhoumingyi/DaST
        transform=transforms.Compose([
            # transforms.Pad(2, padding_mode="symmetric"),
            transforms.ToTensor(),
            # transforms.RandomCrop(32, 4),
            # normalize,
        ]))
    netD = Net_l().cuda()
    netD = nn.DataParallel(netD)

    clf = joblib.load('pretrained/sklearn_mnist_model.pkl')

    adversary_ghost = LinfBasicIterativeAttack(
        netD,
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        eps=0.25,
        nb_iter=100,
        eps_iter=0.01,
        clip_min=0.0,
        clip_max=1.0,
        targeted=False)
    nc = 1

elif opt.dataset == 'mnist':
    testset = torchvision.datasets.MNIST(
        root='dataset/',
        train=False,
        download=True,
        transform=transforms.Compose([
            # transforms.Pad(2, padding_mode="symmetric"),
            transforms.ToTensor(),
            # transforms.RandomCrop(32, 4),
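The snippet is cut off here (the Compose and dataset call are left unclosed in the source). Separately, a sketch under assumptions of how such a testset is typically wrapped for the evaluation loops shown in Example #3; the batch size and worker count are mine:

import torch

testloader = torch.utils.data.DataLoader(
    testset, batch_size=1, shuffle=False, num_workers=2)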
Example #5
model = SmallCNN().cuda()
model.load_state_dict(torch.load('./models/trades.pt'))
model.eval()

sub = SmallCNN().cuda()
sub.load_state_dict(torch.load('./substitute_models/mnist_trades.pt'))
sub.eval()

# White-box (model) and transfer (substitute-model) versions of FGSM and BIM
adversaries = [
    GradientSignAttack(model, nn.CrossEntropyLoss(reduction="sum"), eps=0.3),
    GradientSignAttack(sub, nn.CrossEntropyLoss(reduction="sum"), eps=0.3),
    LinfBasicIterativeAttack(model,
                             nn.CrossEntropyLoss(reduction="sum"),
                             eps=0.3,
                             nb_iter=40,
                             eps_iter=0.01),
    LinfBasicIterativeAttack(sub,
                             nn.CrossEntropyLoss(reduction="sum"),
                             eps=0.3,
                             nb_iter=40,
                             eps_iter=0.01)
]
_, _, test_loader = get_mnist_data_loaders()
for adversary in adversaries:
    correct_adv = 0
    for i, (x_batch, y_batch) in enumerate(test_loader):
        x_batch, y_batch = x_batch.cuda(), y_batch.cuda()
        adv_x_batch = adversary.perturb(x_batch, y_batch)
        logits = model(adv_x_batch)
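        # --- sketch only (not from the source): the example stops after computing
        # --- the logits; a typical continuation tallies how often the defended
        # --- model still predicts correctly, and the print below is an assumption.
        preds = logits.argmax(dim=1)
        correct_adv += (preds == y_batch).sum().item()
    print('adversarial accuracy: %.2f %%' %
          (100.0 * correct_adv / len(test_loader.dataset)))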