def cw_l2_attack(model, hps):
    """Build a Carlini-Wagner L2 adversary for ``model`` and evaluate it.

    The attack confidence is read from ``hps.cw_confidence``; inputs are
    clipped to ``[0, 1]``, ten classes are assumed and the attack runs for at
    most 1000 iterations. The adversary is then handed, together with
    ``model`` and ``hps``, to ``attack_run_rejection_policy``.

    Args:
        model: Network under attack.
        hps: Hyper-parameter object; must expose ``cw_confidence``.
    """
    banner = '============== CW_l2 Summary ==============='
    print(banner)

    cw_confidence = hps.cw_confidence
    attack_kwargs = dict(
        num_classes=10,
        confidence=cw_confidence,
        clip_min=0.,
        clip_max=1.,
        max_iterations=1000,
    )
    adversary = CarliniWagnerL2Attack(model, **attack_kwargs)
    print('confidence = {}'.format(cw_confidence))
    attack_run_rejection_policy(model, adversary, hps)

    # The same banner closes the report.
    print(banner)
Example #2
0
    def create_adv_input(self, x, y, model):
        """Craft a Carlini-Wagner L2 adversarial example for a single sample.

        Args:
            x: One input sample (array-like); a leading batch axis of size 1
                is added and the values are cast to float32.
            y: Label of ``x``; wrapped into a one-element int64 tensor.
            model: Network to attack. A deep copy is used for both the attack
                and the final prediction.

        Returns:
            Tuple ``(perturbed_data, flag)`` where ``flag`` is ``1`` when the
            copied model's prediction on the perturbed input differs from
            ``y`` and ``0`` when it still equals ``y``.
        """
        from advertorch.attacks import CarliniWagnerL2Attack

        # Attack a private copy of the network.
        attacked_model = copy.deepcopy(model)

        # Batch of one sample plus its label, both as tensors.
        batch = torch.from_numpy(np.expand_dims(x, axis=0).astype(np.float32))
        batch.requires_grad = True
        label = torch.from_numpy(np.array([y]).astype(np.int64))

        attacker = CarliniWagnerL2Attack(attacked_model.forward,
                                         self.num_classes,
                                         max_iterations=self.max_iterations)
        perturbed_data = attacker.perturb(batch, label)

        # Index of the largest output on the perturbed input.
        scores = attacked_model.forward(perturbed_data)
        predicted_class = scores.max(1, keepdim=True)[1]

        prediction_changed = predicted_class.item() != label.item()
        return perturbed_data, (1 if prediction_changed else 0)
Example #3
0
def test_adver(net, tar_net, attack, target):
    """Report clean accuracy of ``net`` and attack success against ``tar_net``.

    Adversarial examples are crafted with an adversary built on ``net`` and
    then classified by ``tar_net``. The function prints the clean accuracy of
    ``net``, the attack success rate and the mean L2 norm of the perturbation.

    Relies on the module-level names ``opt``, ``testloader`` and ``device``.
    The attack loop compares ``predicted`` and ``labels`` as a single boolean,
    so ``testloader`` must yield batches of exactly one sample.

    Args:
        net: Model the adversary is built on.
        tar_net: Model on which the adversarial examples are evaluated.
        attack: One of ``'BIM'``, ``'PGD'``, ``'FGSM'`` or ``'CW'``.
        target: Truthy for a targeted evaluation (a random target class is
            drawn per sample and only samples not already predicted as that
            class are attacked); falsy for an untargeted evaluation (only
            samples ``tar_net`` classifies correctly are attacked).

    Raises:
        ValueError: If ``attack`` is not one of the supported names.
    """
    net.eval()
    tar_net.eval()

    # NOTE(review): the adversary's `targeted` flag comes from the global
    # `opt.target` while the evaluation below branches on the `target`
    # argument; callers presumably pass the same value - confirm.
    if attack == 'BIM':
        adversary = LinfBasicIterativeAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            nb_iter=120,
            eps_iter=0.02,
            clip_min=0.0,
            clip_max=1.0,
            targeted=opt.target)
    elif attack == 'PGD':
        adversary = PGDAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.25,
            # Targeted PGD gets more iterations than the untargeted variant.
            nb_iter=11 if opt.target else 6,
            eps_iter=0.03,
            clip_min=0.0,
            clip_max=1.0,
            targeted=opt.target)
    elif attack == 'FGSM':
        adversary = GradientSignAttack(
            net,
            loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.26,
            targeted=opt.target)
    elif attack == 'CW':
        adversary = CarliniWagnerL2Attack(
            net,
            num_classes=10,
            learning_rate=0.45,
            binary_search_steps=10,
            max_iterations=12,
            targeted=opt.target)
    else:
        # Previously an unknown name left `adversary` unbound and only failed
        # later with a NameError inside the attack loop.
        raise ValueError('Unknown attack: {!r}'.format(attack))

    # ----------------------------------
    # Obtain the accuracy of the model
    # ----------------------------------
    correct_netD = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            _, predicted = torch.max(net(inputs), 1)
            total += labels.size(0)
            correct_netD += (predicted == labels).sum().item()
    if total:
        print('Accuracy of the network on netD: %.2f %%' %
              (100. * correct_netD / total))
    else:
        print('Accuracy of the network on netD: n/a (empty test loader)')

    # ----------------------------------
    # Obtain the attack success rate of the model
    # ----------------------------------
    correct = 0
    total = 0
    total_L2_distance = 0.0
    for inputs, labels in testloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            _, predicted = torch.max(tar_net(inputs), 1)

        if target:
            # Randomly choose the label of the targeted attack. The upper
            # bound of torch.randint is exclusive, so 10 is needed to cover
            # all ten classes (0..9).
            labels = torch.randint(0, 10, (1, )).to(device)
            # Only attack images not already classified as the target label.
            attackable = bool(predicted != labels)
        else:
            # Only attack images that are classified correctly.
            attackable = bool(predicted == labels)
        if not attackable:
            continue

        adv_inputs_ori = adversary.perturb(inputs, labels)
        total_L2_distance += torch.norm(adv_inputs_ori - inputs).item()
        with torch.no_grad():
            _, adv_predicted = torch.max(tar_net(adv_inputs_ori), 1)
        total += labels.size(0)
        correct += (adv_predicted == labels).sum().item()

    if total == 0:
        # Nothing qualified for the attack; avoid dividing by zero.
        print('Attack success rate: n/a (no sample was attacked)')
        return

    matched_rate = 100. * correct / total
    if target:
        # Targeted: success means the prediction equals the chosen label.
        print('Attack success rate: %.2f %%' % matched_rate)
    else:
        # Untargeted: success means the prediction left the true label.
        print('Attack success rate: %.2f %%' % (100.0 - matched_rate))
    print('l2 distance:  %.4f ' % (total_L2_distance / total))
                                         num_workers=0)

# Human-readable class names; their count is passed to both attacks below.
classes = (
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
)

# In[4]:

####################################
# Construct an adversary instance
####################################
# Untargeted Carlini-Wagner L2 attack on `model`, inputs clipped to [0, 1].
adversary_CW = CarliniWagnerL2Attack(
    model,
    num_classes=len(classes),
    targeted=False,
    confidence=0,
    initial_const=0.001,
    binary_search_steps=9,
    max_iterations=10000,
    learning_rate=0.01,
    abort_early=True,
    clip_min=0.0,
    clip_max=1.0,
    loss_fn=None,
)

# Jacobian saliency map attack on the same model and input range.
adversary_Jacobian = JacobianSaliencyMapAttack(
    model,
    num_classes=len(classes),
    theta=1.0,
    gamma=1.0,
    clip_min=0.0,
    clip_max=1.0,
    loss_fn=None,
    comply_cleverhans=False,
)
Example #5
0
def adv_train_loop(model,
                   params,
                   ds,
                   min_y,
                   base_data,
                   model_id,
                   attack_type,
                   device,
                   batch_size,
                   max_epochs=5):
    """Adversarially train a copy of ``model`` with ignite engines.

    Every training batch is extended with adversarial versions of its samples
    (generated against the model being trained) before the SGD step. Training
    and validation metrics are likewise computed on clean plus adversarial
    inputs. Scalars are written to the summary writer and a checkpoint of the
    trained copy is saved at every validation interval.

    Args:
        model: Network to start from. It is deep-copied, so the caller's
            instance is not trained.
        params: Mapping with the keys ``'lr'``, ``'momentum'`` and
            ``'l2_wd'`` used to configure SGD.
        ds: Pair ``(train_data, valid_data)`` handed to the ignite engines.
        min_y: Pair ``(min_y_train, min_y_val)``; the respective value is
            subtracted from the labels of every batch.
        base_data: Root directory for checkpoints; also forwarded to
            ``create_summary_writer``.
        model_id: Run identifier; ``_<attack_type>`` is appended to it.
        attack_type: One of ``'fgsm'``, ``'ffa'``, ``'carlini'`` or
            ``'lbfgs'``.
        device: Device the trained copy and the batches are moved to.
        batch_size: Used to derive the validation interval and to wrap the
            current batch in a ``DataLoader`` for per-batch metrics.
        max_epochs: Number of epochs passed to the trainer.

    Returns:
        None.

    Raises:
        ValueError: If ``attack_type`` is not supported.
    """
    print('training adversarial:', attack_type)
    ds_train, ds_valid = ds
    min_y_train, min_y_val = min_y
    # Untrained reference copy; wrapped by the PyTorchClassifier below.
    original_model = copy.deepcopy(model)
    original_model.eval()
    # Train a separate copy so that the caller's model is not changed.
    model = copy.deepcopy(model)
    model = model.to(device)
    model_id = f'{model_id}_{attack_type}'
    # Validation, checkpointing and their logging share this interval.
    validation_every = 200 * 5000 // batch_size // 10

    with create_summary_writer(model,
                               ds_train,
                               base_data,
                               model_id,
                               device=device) as writer:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=params['lr'],
                                    momentum=params['momentum'],
                                    weight_decay=params['l2_wd'])
        sched = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
        loss = F.cross_entropy

        acc_metric = Accuracy(device=device)
        loss_metric = Loss(F.cross_entropy, device=device)

        acc_val_metric = Accuracy(device=device)
        loss_val_metric = Loss(F.cross_entropy, device=device)

        classifier = PyTorchClassifier(
            model=original_model,
            clip_values=(0, 1),
            loss=nn.CrossEntropyLoss(),
            optimizer=optimizer,
            input_shape=(3, 64, 64),
            nb_classes=200,
        )

        if attack_type == "fgsm":
            attack = GradientSignAttack(model, loss_fn=loss, eps=0.2)
        elif attack_type == "ffa":
            attack = FastFeatureAttack(model, loss_fn=loss, eps=0.3)
        elif attack_type == "carlini":
            attack = CarliniWagnerL2Attack(model, 200, max_iterations=1000)
        elif attack_type == "lbfgs":
            # NOTE(review): "lbfgs" maps to DeepFool built on the classifier
            # wrapper; verify that this object exposes the `perturb(x, y)`
            # method the steps below call.
            attack = DeepFool(classifier=classifier)
        else:
            # Previously `attack` stayed None and training crashed on the
            # first batch with an AttributeError.
            raise ValueError(
                'Unsupported attack_type: {!r}'.format(attack_type))

        def train_step(engine, batch):
            """Run one SGD step on a batch plus its adversarial copies."""
            model.train()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            with ctx_noparamgrad_and_eval(model):
                x_adv = attack.perturb(x, y)
            x = torch.cat((x, x_adv))
            y = torch.cat((y, y))
            optimizer.zero_grad()
            logits = model.forward(x)
            batch_loss = loss(logits, y)
            batch_loss.backward()
            optimizer.step()
            return batch_loss.item()

        trainer = Engine(train_step)

        def train_eval_step(engine, batch):
            """Return predictions and labels for clean + adversarial inputs."""
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_train
            # The attack needs gradients, so it runs outside no_grad.
            x_adv = attack.perturb(x, y)
            x = torch.cat((x, x_adv))
            y = torch.cat((y, y))
            with torch.no_grad():
                logits = model.forward(x)
            return logits, y

        train_evaluator = Engine(train_eval_step)
        acc_metric.attach(train_evaluator, "accuracy")
        loss_metric.attach(train_evaluator, 'loss')

        def validation_step(engine, batch):
            """Same as ``train_eval_step`` but with the validation offset."""
            model.eval()
            x, y = batch
            x = x.to(device)
            y = y.to(device) - min_y_val
            x_adv = attack.perturb(x, y)
            x = torch.cat((x, x_adv))
            y = torch.cat((y, y))
            with torch.no_grad():
                logits = model.forward(x)
            return logits, y

        valid_evaluator = Engine(validation_step)
        acc_val_metric.attach(valid_evaluator, "accuracy")
        loss_val_metric.attach(valid_evaluator, 'loss')

        @trainer.on(Events.ITERATION_COMPLETED(every=validation_every))
        def log_validation_results(engine):
            valid_evaluator.run(ds_valid)
            metrics = valid_evaluator.state.metrics
            valid_avg_accuracy = metrics['accuracy']
            avg_nll = metrics['loss']
            print(
                "Validation Results - Epoch: {}  Avg accuracy: {:.2f} Avg loss: {:.2f}"
                .format(engine.state.epoch, valid_avg_accuracy, avg_nll))
            writer.add_scalar("validation/avg_loss", avg_nll,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_accuracy", valid_avg_accuracy,
                              engine.state.epoch)
            writer.add_scalar("validation/avg_error", 1. - valid_avg_accuracy,
                              engine.state.epoch)

        @trainer.on(Events.EPOCH_COMPLETED)
        def lr_scheduler(engine):
            # The scheduler is created with its default mode, which lowers the
            # learning rate when the monitored value stops decreasing, so it
            # must be fed the validation loss rather than the accuracy.
            avg_nll = valid_evaluator.state.metrics.get('loss')
            # Validation may not have run yet when the first epoch ends.
            if avg_nll is not None:
                sched.step(avg_nll)

        @trainer.on(Events.ITERATION_COMPLETED(every=50))
        def log_training_loss(engine):
            # Re-evaluate the batch that was just trained on to get metrics.
            batch_loader = DataLoader(TensorDataset(*engine.state.batch),
                                      batch_size=batch_size)
            train_evaluator.run(batch_loader)
            metrics = train_evaluator.state.metrics
            accuracy = metrics['accuracy']
            nll = metrics['loss']
            iter_in_epoch = (engine.state.iteration - 1) % len(ds_train) + 1
            if (iter_in_epoch % 50) == 0:
                print("Epoch[{}] Iter[{}/{}] Accuracy: {:.2f} Loss: {:.2f}".
                      format(engine.state.epoch, iter_in_epoch, len(ds_train),
                             accuracy, nll))
            # All per-batch scalars share the iteration as their step.
            writer.add_scalar("batchtraining/detloss", nll,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/accuracy", accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/error", 1. - accuracy,
                              engine.state.iteration)
            writer.add_scalar("batchtraining/loss", engine.state.output,
                              engine.state.iteration)

        @trainer.on(Events.EPOCH_COMPLETED)
        def log_lr(engine):
            writer.add_scalar("lr", optimizer.param_groups[0]['lr'],
                              engine.state.epoch)

        def validation_value(engine):
            """Checkpoint score: the latest validation accuracy."""
            # Only used as the Checkpoint score function; it has no effect as
            # an event handler, so it is not registered as one.
            return valid_evaluator.state.metrics['accuracy']

        to_save = {'model': model}
        handler = Checkpoint(
            to_save,
            DiskSaver(os.path.join(base_data, model_id), create_dir=True),
            score_function=validation_value,
            score_name="val_acc",
            global_step_transform=global_step_from_engine(trainer),
            n_saved=None)

        # Registered after log_validation_results, which uses the same
        # interval, so the validation metrics are fresh when the score is
        # read.
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED(every=validation_every), handler)

        # kick everything off
        trainer.run(ds_train, max_epochs=max_epochs)
# load classifier: build it from the config, restore the checkpointed weights
# and disable gradients for all of its parameters
predict = get_classifier(cfg, cfg.classifier)
state_dict = torch.load(cfg.classifier.ckpt)
predict.load_state_dict(state_dict)
for p in predict.parameters():
    p.requires_grad_(False)
# chain proj_fn and the classifier preprocessing layer in front of the
# classifier, move the whole pipeline to the GPU and switch it to eval mode
predict = torch.nn.Sequential(proj_fn, transform.classifier_preprocess_layer,
                              predict).cuda()
predict.eval()

# create attacker: Carlini-Wagner L2 attack against the full pipeline above
attacker = CarliniWagnerL2Attack(predict=predict,
                                 num_classes=cfg.dataset.num_classes,
                                 learning_rate=0.2,
                                 initial_const=10,
                                 binary_search_steps=4,
                                 max_iterations=100,
                                 abort_early=True)

# counters initialised to zero ahead of the batch loop that follows
total = 0
correct_adv = 0
for i, (images, labels) in enumerate(progress_bar):
    if i < start_ind or i >= end_ind:
        continue

    images, labels = images.cuda(), labels.cuda()
    result_path = os.path.join(result_dir, 'batch_{:04d}.pt'.format(i))
    if os.path.isfile(result_path):
        result_dict = torch.load(result_path)
        images_adv = result_dict['input'].cuda()
Example #7
0
                          ord=norm,
                          rand_init=True)
elif args.attack == 'MIFGSM':
    adversary = MomentumIterativeAttack(
        lambda x: wrapper(normalize(x), pcl=pcl),
        eps=epsilon,
        eps_iter=epsilon / 10,
        ord=norm,
        nb_iter=10)
elif args.attack == 'FGSM':
    adversary = GradientSignAttack(lambda x: wrapper(x, pcl=pcl), eps=epsilon)
    # adversary = PGDAttack(lambda x: wrapper(x, pcl=pcl), eps=epsilon, eps_iter=epsilon, nb_iter=1, ord=norm, rand_init=False)
elif args.attack == 'CW':
    adversary = CarliniWagnerL2Attack(lambda x: wrapper(x, pcl=pcl),
                                      10,
                                      binary_search_steps=2,
                                      max_iterations=500,
                                      initial_const=1e-1)
elif args.attack == 'DDN':
    adversary = DDN(steps=100, device=device)
    ddn = True
else:
    adversary = None

# Cross-entropy criterion handed to adv_test below; the network is switched to
# evaluation mode before that call.
criterion = torch.nn.CrossEntropyLoss()
net.eval()

test_acc_adv, test_loss_adv, dist_l2, dist_linf = adv_test(
    lambda x: wrapper(x, pcl=pcl),
    test_loader,
    criterion,
def CW(model, X, y, num_class=10, num_iter=10):
    """Return the Carlini-Wagner L2 perturbation for ``X``.

    An untargeted attack with inputs clipped to ``[0, 1]`` is run against
    ``model`` and the difference between the adversarial examples and ``X``
    is returned.

    Args:
        model: Network under attack.
        X: Clean inputs.
        y: Labels passed to the attack alongside ``X``.
        num_class: Number of classes forwarded to the attack.
        num_iter: Accepted but not used; the attack always runs with
            ``max_iterations=20``.

    Returns:
        ``adversary.perturb(X, y) - X``.
    """
    attack_settings = dict(
        loss_fn=nn.CrossEntropyLoss(reduction="sum"),
        num_classes=num_class,
        confidence=0,
        targeted=False,
        learning_rate=0.01,
        binary_search_steps=5,
        max_iterations=20,
        abort_early=True,
        initial_const=0.001,
        clip_min=0.0,
        clip_max=1.0,
    )
    attacker = CarliniWagnerL2Attack(model, **attack_settings)
    adv_examples = attacker.perturb(X, y)
    return adv_examples - X
Example #9
0
def model_test(model, data_loader, output_file_path, attack='mia', eps=8/255, nb_iter=3):
    """Measure clean and adversarial accuracy of ``model`` under a targeted attack.

    For every batch the adversarial target is ``(label + 5) % 10``. Running
    statistics are printed per batch and a summary is appended to
    ``output_file_path``.

    Relies on the module-level names ``min_v``, ``max_v`` (clipping range)
    and ``args`` (``args.model_name`` is written to the report). Batches are
    moved to the GPU with ``.cuda()``.

    Args:
        model: Network under test.
        data_loader: Iterable of ``(data, target)`` batches.
        output_file_path: File the summary is appended to.
        attack: ``'cw'``, ``'mia'`` or ``'pgd'``. With ``'cw'`` only the
            first five batches are evaluated.
        eps: Perturbation budget for ``'mia'`` and ``'pgd'``.
        nb_iter: Number of iterations for ``'pgd'``.

    Returns:
        Tuple ``(acc, acc_adv)``: clean and adversarial accuracy in percent
        over the evaluated samples (both ``0.0`` if nothing was evaluated).

    Raises:
        NotImplementedError: If ``attack`` is not a supported name.
    """
    model.eval()

    # The adversary does not depend on the batch, so build it once.
    if attack == 'cw':
        adversary = CarliniWagnerL2Attack(
            predict=model, num_classes=10, targeted=True,
            clip_min=min_v, clip_max=max_v, max_iterations=50)
    elif attack == 'mia':
        adversary = MomentumIterativeAttack(
            predict=model, targeted=True, eps=eps, nb_iter=40,
            eps_iter=0.01 * (max_v - min_v),
            clip_min=min_v, clip_max=max_v)
    elif attack == 'pgd':
        adversary = LinfPGDAttack(
            predict=model, targeted=True, eps=eps, nb_iter=nb_iter,
            eps_iter=eps * 1.25 / nb_iter,
            clip_min=min_v, clip_max=max_v)
    else:
        # Raising a plain string is itself a TypeError in Python 3.
        raise NotImplementedError('unimplemented attack: {!r}'.format(attack))

    correct, correct_adv, nb_data = 0, 0, 0
    adv_l2dist, adv_linfdist = 0.0, 0.0
    # Defaults so the report can still be written for an empty loader.
    acc, acc_adv, time_consume = 0.0, 0.0, 0.0

    start_time = time.time()
    for i, (data, target) in enumerate(data_loader):
        # Only the first five batches are attacked with CW. Stop before the
        # batch is counted so the averages below use the right denominator.
        if attack == 'cw' and i >= 5:
            break
        print('i:', i)

        # CPU copy of the true labels for the accuracy bookkeeping.
        indx_target = target.clone()
        nb_data += data.shape[0]

        data, target = data.cuda(), target.cuda()

        with torch.no_grad():
            output = model(data)

        print('pred:', type(output), output.shape)
        print('target:', type(target), target.shape, target[0:20])
        # Targeted attack towards the class five positions away (mod 10).
        target_adv = (target + 5) % 10
        print('target_adv:', type(target_adv), target_adv.shape, target_adv[0:20])
        data_adv = adversary.perturb(data, target_adv)

        print('data_adv max:', torch.max(data_adv))
        print('data_adv min:', torch.min(data_adv))
        print('linf:', torch.max(torch.abs(data_adv-data)) )

        # Per-sample perturbation norms, summed over the batch.
        delta = (data - data_adv).view(data.size(0), -1)
        adv_l2dist += torch.norm(delta, p=2, dim=-1).sum().item()
        adv_linfdist += torch.max(delta.abs(), dim=-1)[0].sum().item()

        with torch.no_grad():
            output_adv = model(data_adv)

        # Both accuracies are measured against the true labels.
        pred_adv = output_adv.max(1)[1]
        correct_adv += pred_adv.cpu().eq(indx_target).sum().item()

        pred = output.max(1)[1]  # index of the largest output
        correct += pred.cpu().eq(indx_target).sum().item()

        time_consume = time.time() - start_time
        print('time_consume:', time_consume)

        acc = 100. * correct / nb_data
        print('\tTest set: Accuracy: {}/{}({:.2f}%)'.format(
            correct, nb_data, acc))

        acc_adv = 100. * correct_adv / nb_data
        print('\tAdv set: Accuracy : {}/{}({:.2f}%)'.format(
            correct_adv, nb_data, acc_adv
        ))

    if nb_data:
        adv_l2dist /= nb_data
        adv_linfdist /= nb_data
    print('\tAdv dist: L2: {:.8f} , Linf: {:.8f}'.format(adv_l2dist, adv_linfdist))

    with open(output_file_path, "a+") as output_file:
        output_file.write(args.model_name + '\n')
        info_string = 'attack: %s:\n acc: %.2f, acc_adv: %.2f, adv_l2dist: %.2f, adv_linfdist: %.2f, time_consume: %.2f' % (
            attack, acc, acc_adv, adv_l2dist, adv_linfdist, time_consume)
        # The file is opened in append mode: end the record with a newline so
        # the next report does not continue on the same line.
        output_file.write(info_string + '\n')

    return acc, acc_adv