def compare_attacks(key, item):
    """
    Cross-check one advertorch attack against its foolbox counterpart.

    `key` is the advertorch attack class; `item` carries the matching
    foolbox attack class plus shared / per-library kwargs and the
    numerical comparison thresholds.  Raises RuntimeError when foolbox
    never produces an adversarial example, because the comparison would
    then be vacuous.
    """
    attack_cls = key
    fmodel = foolbox.models.PyTorchModel(
        model, bounds=(0, 1), num_classes=NUM_CLASS)
    fb_adversary = item["fb_class"](fmodel)
    fb_kwargs = merge2dicts(item["kwargs"], item["fb_kwargs"])
    at_kwargs = merge2dicts(item["kwargs"], item["at_kwargs"])
    thresholds = item["thresholds"]

    at_adversary = attack_cls(model, **at_kwargs)
    x_at = at_adversary.perturb(img_batch, label_batch)
    y_pred = predict_from_logits(model(img_batch))
    y_at_pred = predict_from_logits(model(x_at))

    fb_succeeded_once = False
    for idx, (x_i, y_i) in enumerate(zip(img_batch, label_batch)):
        # Only compare on samples that are classified correctly AND that
        # the advertorch attack actually flipped; otherwise skip (this is
        # where a foolbox failure is tolerated per-sample).
        misclassified = y_i != y_pred[idx]
        at_unsuccessful = y_i == y_at_pred[idx]
        if misclassified or at_unsuccessful:
            continue
        np.random.seed(233333)  # make the stochastic foolbox run repeatable
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            x_fb = fb_adversary(x_i.cpu().numpy(), label=int(y_i), **fb_kwargs)
        if x_fb is not None:
            compare_at_fb(x_at[idx].cpu().numpy(), x_fb, **thresholds)
            fb_succeeded_once = True

    if not fb_succeeded_once:
        raise RuntimeError(
            "Foolbox never succeed, change your testing parameters!!!")
def multiple_mini_batch_attack(adversary, loader, device="cuda", norm=None):
    """
    Run `adversary` over every mini-batch of `loader` and collect results.

    :param adversary: attack object exposing ``perturb(data, label)`` and
        ``predict(data)`` (returning logits).
    :param loader: iterable yielding ``(data, label)`` mini-batches.
    :param device: device the batches are moved to before attacking.
    :param norm: ``None``, ``"inf"``, ``1`` or ``2`` — which distance between
        clean and adversarial inputs to report (``None``: skip distances).
    :return: ``(labels, clean_preds, adv_preds, distances-or-None)``, each
        concatenated across all batches.
    """
    lst_label = []
    lst_pred = []
    lst_advpred = []
    lst_dist = []

    if norm == "inf":
        def dist_func(x, y):
            # Bug fix: the Linf distance is the max of |x - y|; the previous
            # signed max under-reported the distance whenever the
            # largest-magnitude perturbation component was negative.
            return (x - y).abs().view(x.size(0), -1).max(dim=1)[0]
    elif norm == 1 or norm == 2:
        from advertorch.utils import _get_norm_batch

        def dist_func(x, y):
            return _get_norm_batch(x - y, norm)
    else:
        assert norm is None

    for data, label in loader:
        data, label = data.to(device), label.to(device)
        adv = adversary.perturb(data, label)
        advpred = predict_from_logits(adversary.predict(adv))
        pred = predict_from_logits(adversary.predict(data))
        lst_label.append(label)
        lst_pred.append(pred)
        lst_advpred.append(advpred)
        if norm is not None:
            lst_dist.append(dist_func(data, adv))

    return torch.cat(lst_label), torch.cat(lst_pred), torch.cat(lst_advpred), \
        torch.cat(lst_dist) if norm is not None else None
def get_metric_eval(self):
    """
    Attack ``self.phi`` with untargeted and targeted LinfPGD, record the
    raw misclassification counts in ``self.metric_score``, and save a
    3-row comparison figure (clean / untargeted / targeted) to
    ``self.save_path + '.png'``.
    """
    utr_score = []
    tr_score = []
    # Single iteration kept as a loop, presumably to allow repeated runs later.
    for i in range(1):
        ##TODO: Customise input parameters to methods like LinfPGDAttack
        adversary = LinfPGDAttack(
            self.phi, loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=0.10, nb_iter=40, eps_iter=0.01, rand_init=True,
            clip_min=0.0, clip_max=1.0, targeted=False)
        # NOTE(review): x_e and y_e are not defined anywhere in this method —
        # as written this raises NameError. Presumably they should come from
        # iterating a test loader (cf. the sibling implementation that loops
        # over self.test_dataset). TODO confirm and fix upstream.
        adv_untargeted = adversary.perturb(x_e, y_e)
        # Targeted variant: push every sample towards class 3.
        target = torch.ones_like(y_e) * 3
        adversary.targeted = True
        adv_targeted = adversary.perturb(x_e, target)
        pred_cln = predict_from_logits(self.phi(x_e))
        pred_untargeted_adv = predict_from_logits(self.phi(adv_untargeted))
        pred_targeted_adv = predict_from_logits(self.phi(adv_targeted))
        # Raw mismatch counts (tensors), not rates.
        utr_score.append(torch.sum(pred_cln != pred_untargeted_adv))
        tr_score.append(torch.sum(pred_cln != pred_targeted_adv))
    # Visualize the first 5 samples: clean row, untargeted row, targeted row.
    batch_size = 5
    plt.figure(figsize=(10, 8))
    for ii in range(batch_size):
        plt.subplot(3, batch_size, ii + 1)
        _imshow(x_e[ii])
        plt.title("clean \n pred: {}".format(pred_cln[ii]))
        plt.subplot(3, batch_size, ii + 1 + batch_size)
        _imshow(adv_untargeted[ii])
        plt.title("untargeted \n adv \n pred: {}".format(
            pred_untargeted_adv[ii]))
        plt.subplot(3, batch_size, ii + 1 + batch_size * 2)
        _imshow(adv_targeted[ii])
        plt.title("targeted to 3 \n adv \n pred: {}".format(
            pred_targeted_adv[ii]))
    plt.tight_layout()
    plt.savefig(self.save_path + '.png')
    utr_score = np.array(utr_score)
    tr_score = np.array(tr_score)
    print('MisClassifcation on Untargetted Attack ',
          np.mean(utr_score), np.std(utr_score))
    print('MisClassifcation on Targeted Atttack',
          np.mean(tr_score), np.std(tr_score))
    self.metric_score['Untargetted Method'] = np.mean(utr_score)
    self.metric_score['Targetted Method'] = np.mean(tr_score)
    return
def multiple_mini_batch_attack(adversary, loader, device="cuda",
                               save_adv=False, norm=None, num_batch=None):
    """
    Run `adversary` over mini-batches of `loader` and collect results.

    :param adversary: attack object exposing ``perturb(data, label)`` and
        ``predict(data)`` (returning logits).
    :param loader: iterable yielding ``(data, label)`` mini-batches.
    :param device: device the batches are moved to before attacking.
    :param save_adv: when True, also collect and return the adversarial
        examples as a fifth tuple element.  (Bug fix: this flag was
        previously accepted but silently ignored.)
    :param norm: ``None``, ``"inf"``/``"Linf"``, ``1``/``"L1"`` or
        ``2``/``"L2"`` — which distance between clean and adversarial
        inputs to report (``None``: skip distances).
    :param num_batch: stop after this many batches (``None``: whole loader).
    :return: ``(labels, clean_preds, adv_preds, distances-or-None)`` —
        plus the concatenated adversarial examples when ``save_adv`` is
        True, so existing 4-tuple callers are unaffected.
    """
    lst_label = []
    lst_pred = []
    lst_advpred = []
    lst_dist = []
    lst_adv = []

    # Accept the "Lp"-style spellings used elsewhere and normalize them.
    _norm_convert_dict = {"Linf": "inf", "L2": 2, "L1": 1}
    if norm in _norm_convert_dict:
        norm = _norm_convert_dict[norm]

    if norm == "inf":
        def dist_func(x, y):
            # Bug fix: the Linf distance is the max of |x - y|; the previous
            # signed max under-reported the distance whenever the
            # largest-magnitude perturbation component was negative.
            return (x - y).abs().view(x.size(0), -1).max(dim=1)[0]
    elif norm == 1 or norm == 2:
        from advertorch.utils import _get_norm_batch

        def dist_func(x, y):
            return _get_norm_batch(x - y, norm)
    else:
        assert norm is None

    idx_batch = 0
    for data, label in loader:
        data, label = data.to(device), label.to(device)
        adv = adversary.perturb(data, label)
        advpred = predict_from_logits(adversary.predict(adv))
        pred = predict_from_logits(adversary.predict(data))
        lst_label.append(label)
        lst_pred.append(pred)
        lst_advpred.append(advpred)
        if save_adv:
            lst_adv.append(adv)
        if norm is not None:
            lst_dist.append(dist_func(data, adv))
        idx_batch += 1
        if idx_batch == num_batch:
            break

    rval = (
        torch.cat(lst_label),
        torch.cat(lst_pred),
        torch.cat(lst_advpred),
        torch.cat(lst_dist) if norm is not None else None,
    )
    if save_adv:
        rval += (torch.cat(lst_adv),)
    return rval
def attack_one_model(model, nb_iter, eps_iter):
    """
    Attack one MNIST model with targeted adversarial samples and score it.

    Generates attack samples (against the global ``model_2`` when that is
    set, otherwise against ``model``), then evaluates ``model`` on every
    targeted adversarial example whose true class differs from the target.

    :return: ``(defense_cln_acc, defense_acc, defense_rate, attack_rate)``
        — clean accuracy, accuracy under attack, prediction-stability
        rate, and targeted-attack success rate.
    """
    batch_size = 100

    # Fetch a single shuffled test batch of MNIST.
    root = '../data'
    if not os.path.exists(root):
        os.mkdir(root)
    test_set = dset.MNIST(root=root, train=False,
                          transform=transforms.ToTensor(), download=True)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_set, batch_size=batch_size, shuffle=True)
    cln_data, true_labels = next(iter(test_loader))
    cln_data = cln_data.to(device)
    true_labels = true_labels.to(device)

    # Craft the adversarial samples on model_2 when available (transfer
    # setting), otherwise on the evaluated model itself.
    source_model = model if model_2 is None else model_2
    adv_targeted_results, adv_target_labels, adv_untargeted = \
        generate_attack_samples(
            source_model, cln_data, true_labels, nb_iter, eps_iter)

    defense_cln_acc = 0.0
    defense_acc = 0.0
    defense_rate = 0.0
    attack_rate = 0.0

    pred_cln = predict_from_logits(model(cln_data))
    for target_cls in range(len(adv_targeted_results)):
        # make sure label index equals to adv target label
        assert target_cls == adv_target_labels[target_cls][0]
        triples = zip(pred_cln, adv_targeted_results[target_cls], true_labels)
        for clean_pred, adv_sample, gold in triples:
            # Samples already belonging to the target class are excluded.
            if gold == target_cls:
                continue
            adv_pred = predict_from_logits(model(adv_sample.unsqueeze(0)))
            if clean_pred == gold:
                defense_cln_acc += 1
            if adv_pred == gold:
                defense_acc += 1
            if clean_pred == adv_pred:
                defense_rate += 1
            if adv_pred == target_cls:
                attack_rate += 1

    # 900 = 9 non-target classes x 100 samples per batch.
    defense_cln_acc /= 900
    defense_acc /= 900
    defense_rate /= 900
    attack_rate /= 900

    return defense_cln_acc, defense_acc, defense_rate, attack_rate
def multiple_mini_batch_attack(adversary, loader, device="cuda"):
    """
    Attack every mini-batch of `loader` with `adversary`.

    Returns three tensors concatenated over all batches: the true labels,
    the predictions on clean inputs, and the predictions on adversarial
    inputs (both obtained via ``adversary.predict``).
    """
    labels, clean_preds, adv_preds = [], [], []
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        perturbed = adversary.perturb(inputs, targets)
        adv_preds.append(predict_from_logits(adversary.predict(perturbed)))
        clean_preds.append(predict_from_logits(adversary.predict(inputs)))
        labels.append(targets)
    return torch.cat(labels), torch.cat(clean_preds), torch.cat(adv_preds)
def predict_then_update_loss_acc_meter(self, meter, data, target):
    """
    Forward `data` through the model in eval mode without gradients,
    fold the resulting loss/accuracy into `meter`, and return them.

    :return: ``(loss, acc)`` for this batch.
    """
    with torch.no_grad():
        with ctx_eval(self.model):
            logits = self.model(data)
    batch_acc = get_accuracy(predict_from_logits(logits), target)
    batch_loss = self.loss_fn(logits, target).item()
    update_loss_acc_meter(meter, batch_loss, batch_acc, len(data))
    return batch_loss, batch_acc
def attack_whole_dataset(adversary, loader, device="cuda"):
    """
    Attack every batch of `loader` and keep the adversarial examples.

    Returns four tensors concatenated over the whole dataset: the
    adversarial examples, the true labels, the clean-input predictions,
    and the adversarial-input predictions.
    """
    advs, labels, clean_preds, adv_preds = [], [], [], []
    for batch, target in loader:
        batch, target = batch.to(device), target.to(device)
        # Clean prediction first, then perturb, then adversarial prediction.
        clean_preds.append(predict_from_logits(adversary.predict(batch)))
        perturbed = adversary.perturb(batch, target)
        adv_preds.append(predict_from_logits(adversary.predict(perturbed)))
        labels.append(target)
        advs.append(perturbed)
    return (torch.cat(advs), torch.cat(labels),
            torch.cat(clean_preds), torch.cat(adv_preds))
def train_one_epoch(self):
    """
    Train the model for one full pass over ``self.loader``.

    Tracks per-epoch and display-interval meters, stops early when the
    global step budget (``self.max_steps``) is exhausted, adjusts the
    learning rate afterwards, and prints wall-clock timings.
    """
    _bgn_epoch = time.time()
    if self.verbose:
        print("Training epoch {}".format(self.epochs))

    self.model.train()
    self.model.to(self.device)
    self.reset_epoch_meters()
    self.reset_disp_meters()

    _train_time = 0.  # accumulates only the train_one_batch time, not data loading
    for batch_idx, (data, idx) in enumerate(self.loader):
        data, idx = data.to(self.device), idx.to(self.device)
        # Targets are looked up by sample index rather than yielded by the
        # loader; the loader yields (data, index) pairs.
        target = self.loader.targets[idx]
        _bgn_train = time.time()
        clnoutput, clnloss, eps = self.train_one_batch(data, idx, target)
        _train_time = _train_time + (time.time() - _bgn_train)
        clnacc = get_accuracy(predict_from_logits(clnoutput), target)
        update_loss_acc_meter(self.cln_meter, clnloss.item(), clnacc,
                              len(data))
        update_eps_meter(self.eps_meter, eps.mean().item(), len(data))

        # Periodic display: print then reset so each report covers only
        # the most recent interval.
        if self.disp_interval is not None and \
                batch_idx % self.disp_interval == 0:
            self.print_disp_meters(batch_idx)
            self.reset_disp_meters()

        # Early stop once the global step budget is used up.
        if self.steps == self.max_steps:
            self.stop_training()
            break

    # batch_idx here is the last index from the loop above.
    self.print_disp_meters(batch_idx)
    self.disp_eps_hist()
    self.epochs += 1
    self._adjust_lr_by_epochs()

    print("total epoch time", time.time() - _bgn_epoch)
    print("training total time", _train_time)
idx += 1 avg_distortion_rate /= 4 # 4 x 250 = 1000 adv_targeted_results = [torch.cat(result) for result in adv_targeted_results] adv_target_labels = [torch.cat(label) for label in adv_target_labels] defense_cln_acc = 0.0 defense_acc = 0.0 defense_rate = 0.0 attack_rate = 0.0 cln_data = torch.cat(cln_data, 0) # 100, 1, 28, 28 true_labels = torch.cat(true_labels, 0) # 100 pred_cln = predict_from_logits(model(cln_data.to(device))) with torch.no_grad(): loader = tqdm(range(len(adv_targeted_results)), total=len(range(len(adv_targeted_results)))) for targeted_label in loader: # make sure label index equals to adv target label assert targeted_label == adv_target_labels[targeted_label][0] for pred_label, adv_result, true_label in zip( pred_cln, adv_targeted_results[targeted_label], true_labels): if true_label == targeted_label: continue pred_targeted_adv = predict_from_logits( model(adv_result.unsqueeze(0))) if pred_label == true_label: defense_cln_acc += 1
adversary = LinfPGDAttack( model, loss_fn=nn.CrossEntropyLoss(reduction="sum"), eps=0.15, nb_iter=40, eps_iter=0.01, rand_init=True, clip_min=0.0, clip_max=1.0, targeted=False) ''' # generate untargeted adversarial samples adv_untargeted = adversary.perturb(cln_data, true_label) # generate targeted adversarial samples target = torch.ones_like(true_label) * 3 adversary.targeted = True adv_targeted = adversary.perturb(cln_data, target) # Test the model on these samples pred_cln = predict_from_logits(model(cln_data)) pred_untargeted_adv = predict_from_logits(model(adv_untargeted)) pred_targeted_adv = predict_from_logits(model(adv_targeted)) # Show the results # Model performacne on clean, untargeted and targeted images # ---------------------------------------------------------- plt.figure(figsize=(10, 8)) for ii in range(batch_size): plt.subplot(3, batch_size, ii + 1) _imshow(cln_data[ii]) plt.title("clean \n pred: {}".format(emotion_lst[pred_cln[ii]])) plt.subplot(3, batch_size, ii + 1 + batch_size) _imshow(adv_untargeted[ii]) plt.title("untargeted \n adv \n pred: {}".format( emotion_lst[pred_untargeted_adv[ii]]))
def w_dist(x, y):
    """Mean difference of the Wasserstein critic's scores on x and y."""
    return (wass_model(x) - wass_model(y)).detach().numpy().mean()


loader = get_mnist_test_loader(batch_size=args.batch_size, shuffle=True)

# Collect clean/adversarial samples and predictions over the test set.
clns = []
advs = []
trues = []
pred_clns = []
pred_advs = []
iter_counts = []
for batch_idx, (cln_data, true_label) in enumerate(loader):
    cln_data, true_label = cln_data.to(device), true_label.to(device)
    adv_untargeted, iter_count = adversary.perturb(cln_data, true_label)
    iter_counts.append(iter_count)
    pred_cln = predict_from_logits(model(cln_data))
    pred_untargeted_adv = predict_from_logits(model(adv_untargeted))
    # Bug fix: tensors moved to `device` cannot be converted with .numpy()
    # when the device is CUDA; .cpu() is required (and is a no-op on CPU).
    clns.extend(cln_data.cpu().numpy())
    advs.extend(adv_untargeted.cpu().numpy())
    trues.extend(true_label.cpu().numpy())
    pred_clns.extend(pred_cln.cpu().numpy())
    pred_advs.extend(pred_untargeted_adv.cpu().numpy())
    if (batch_idx == args.iters):
        break

np.save('adv_data/clns.npy', clns)
np.save('adv_data/advs.npy', advs)
np.save('adv_data/true_labels.npy', trues)
np.save('adv_data/pred_clns.npy', pred_clns)
np.save('adv_data/pred_advs.npy', pred_advs)
def get_metric_eval(self):
    """
    Evaluate ``self.phi`` under untargeted and targeted LinfPGD attacks on
    (up to) the first 5 batches of ``self.test_dataset`` and store the
    resulting misclassification rates in ``self.metric_score``.
    """
    utr_score = []
    tr_score = []
    # Single iteration kept as a loop, presumably to allow repeated runs later.
    for i in range(1):
        ##TODO: Customise input parameters to methods like LinfPGDAttack
        # clip_min/clip_max undo the MNIST normalization (mean 0.1307,
        # std 0.3081) so clipping happens at the true [0, 1] pixel bounds
        # in normalized space.
        adversary = LinfPGDAttack(
            self.phi, loss_fn=nn.CrossEntropyLoss(reduction="sum"),
            eps=self.args.adv_eps, nb_iter=70, eps_iter=0.01, rand_init=True,
            clip_min=(0.0 - 0.1307) / 0.3081,
            clip_max=(1.0 - 0.1307) / 0.3081, targeted=False)

        pred_cln = []
        pred_untargeted_adv = []
        pred_targeted_adv = []
        temp_counter = 0
        for batch_idx, (x_e, y_e, d_e, idx_e) in enumerate(self.test_dataset):
            x_e = x_e.to(self.cuda)
            print(torch.min(x_e), torch.max(x_e))
            # Labels arrive one-hot encoded; convert to class indices.
            y_e = torch.argmax(y_e, dim=1).to(self.cuda)

            # Untargeted attack, then targeted attack towards class 3.
            adversary.targeted = False
            adv_untargeted = adversary.perturb(x_e, y_e)
            target = torch.ones_like(y_e) * 3
            adversary.targeted = True
            adv_targeted = adversary.perturb(x_e, target)
            print(torch.min(adv_untargeted), torch.max(adv_untargeted))

            pred_cln.append(predict_from_logits(self.phi(x_e)))
            pred_untargeted_adv.append(
                predict_from_logits(self.phi(adv_untargeted)))
            pred_targeted_adv.append(
                predict_from_logits(self.phi(adv_targeted)))

            # Evaluate on at most 5 batches to keep the metric cheap.
            temp_counter += 1
            if temp_counter == 5:
                break

        pred_cln = torch.cat(pred_cln)
        pred_untargeted_adv = torch.cat(pred_untargeted_adv)
        pred_targeted_adv = torch.cat(pred_targeted_adv)
        # Fraction of samples whose prediction changed under each attack.
        utr_score.append(
            torch.sum(pred_cln != pred_untargeted_adv).detach().cpu().numpy()
            / pred_cln.shape[0])
        tr_score.append(
            torch.sum(pred_cln != pred_targeted_adv).detach().cpu().numpy()
            / pred_cln.shape[0])

        # batch_size=5
        # plt.figure(figsize=(10, 8))
        # for ii in range(batch_size):
        #     plt.subplot(3, batch_size, ii + 1)
        #     _imshow(x_e[ii])
        #     plt.title("clean \n pred: {}".format(pred_cln[ii]))
        #     plt.subplot(3, batch_size, ii + 1 + batch_size)
        #     _imshow(adv_untargeted[ii])
        #     plt.title("untargeted \n adv \n pred: {}".format(
        #         pred_untargeted_adv[ii]))
        #     plt.subplot(3, batch_size, ii + 1 + batch_size * 2)
        #     _imshow(adv_targeted[ii])
        #     plt.title("targeted to 3 \n adv \n pred: {}".format(
        #         pred_targeted_adv[ii]))
        # plt.tight_layout()
        # plt.savefig(self.save_path + '.png')

    utr_score = np.array(utr_score)
    tr_score = np.array(tr_score)
    print('MisClassifcation on Untargetted Attack ',
          np.mean(utr_score), np.std(utr_score), self.args.adv_eps)
    print('MisClassifcation on Targeted Atttack',
          np.mean(tr_score), np.std(tr_score), self.args.adv_eps)
    self.metric_score['Untargetted Method'] = np.mean(utr_score)
    self.metric_score['Targetted Method'] = np.mean(tr_score)
    return