def test_attack_convergence(bn_model, bn_criterion, bn_images, bn_labels):
    attack = BoundaryAttack(bn_model, bn_criterion)
    advs = attack(bn_images, bn_labels, unpack=False, verbose=True)
    for adv in advs:
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf

    attack2 = BoundaryAttack(bn_model, bn_criterion)
    bn_images2 = np.array([adv.perturbed for adv in advs])
    advs2 = attack2(bn_images2, bn_labels, unpack=False, iterations=5000)
    for adv in advs2:
        # should converge
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf
def test_attack_convergence(bn_adversarial):
    adv = bn_adversarial
    attack1 = DeepFoolAttack()
    attack1(adv)
    attack2 = BoundaryAttack()
    attack2(adv, iterations=5000, verbose=True)
    # should converge
    assert adv.image is not None
    assert adv.distance.value < np.inf
def boundary_attack(model, img, target):
    img_01 = (img / 255).astype(np.float32)
    atk = BoundaryAttack(model, TargetClass(target))
    # binary classifier: the original label is the class other than the target
    label = 1 - target
    adv = atk(img_01, label, iterations=1000, verbose=False,
              log_every_n_steps=100)
    if adv is not None:
        adv = np.clip(adv * 255, 0, 255)
    return adv
def test_attack_continue(bn_adversarial):
    adv = bn_adversarial
    attack1 = BlendedUniformNoiseAttack()
    attack1(adv)
    d1 = adv.distance.value
    attack2 = BoundaryAttack()
    attack2(adv, iterations=200, verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
    assert adv.distance.value < d1
def test_attack(bn_model, bn_criterion, bn_images, bn_labels):
    attack = BoundaryAttack(bn_model, bn_criterion)
    advs = attack(bn_images, bn_labels, unpack=False, iterations=200, verbose=True)
    for adv in advs:
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf
def test_attack_parameters3(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    o = adv.original_image
    starting_point = np.random.uniform(0, 1, size=o.shape).astype(o.dtype)
    attack(adv,
           iterations=200,
           starting_point=starting_point,
           log_every_n_steps=2,
           tune_batch_size=30,
           threaded_rnd=False,
           threaded_gen=False,
           verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
def test_attack_parameters(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    o = adv.unperturbed
    np.random.seed(2)
    starting_point = np.random.uniform(0, 1, size=o.shape).astype(o.dtype)
    attack(adv,
           iterations=200,
           starting_point=starting_point,
           log_every_n_steps=2,
           tune_batch_size=False,
           threaded_rnd=False,
           threaded_gen=False,
           alternative_generator=True,
           verbose=True)
    assert adv.perturbed is not None
    assert adv.distance.value < np.inf
def test_attack_parameters3(bn_model, bn_criterion, bn_images, bn_labels):
    attack = BoundaryAttack(bn_model, bn_criterion)
    np.random.seed(2)
    starting_point = np.random.uniform(0, 1, size=bn_images[0].shape).astype(
        bn_images.dtype)
    advs = attack(
        bn_images,
        bn_labels,
        unpack=False,
        iterations=200,
        starting_point=starting_point,
        log_every_n_steps=2,
        tune_batch_size=30,
        threaded_rnd=False,
        threaded_gen=False,
        verbose=True,
    )
    for adv in advs:
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf
elif params.get('metric') == 'mahalanobis':
    psd_matrix = np.loadtxt(params.get('psd_matrix_path'))
    psd_matrix = torch.tensor(psd_matrix, dtype=torch.float,
                              device=params.get('device'))
else:
    raise Exception('unsupported metric')

knn_module = MahalanobisKnnModule(X_train, y_train, params.getint('k'), psd_matrix)
knn_module.to(params.get('device'))
knn_module.eval()

fmodel = PyTorchModel(knn_module, bounds=(0, 1), device=params.get('device'))
attack = BoundaryAttack()

n_eval = params.getint('n_eval')
perturbations_list = []
for i, (X_eval, y_eval) in enumerate(
        zip(
            torch.split(X_test[:n_eval], params.getint('attack_batch_size')),
            torch.split(y_test[:n_eval], params.getint('attack_batch_size')),
        )):
    print(i)
    _, advs, successful = attack(
        fmodel,
        X_eval,
        y_eval,
        epsilons=None,
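# A minimal, self-contained sketch of the foolbox 3.x calling convention used
# in the (truncated) call above. The tiny untrained model and the random data
# are placeholders, not the real setup from this script.
import torch
import torch.nn as nn
import foolbox as fb

net = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10)).eval()
fmodel = fb.PyTorchModel(net, bounds=(0, 1))

images = torch.rand(4, 1, 28, 28)
labels = net(images).argmax(dim=1)  # use the model's own predictions as labels

attack = fb.attacks.BoundaryAttack(steps=500)
# With epsilons=None the attack returns the raw adversarials, the clipped
# adversarials, and a boolean success mask for the whole batch.
raw, clipped, success = attack(fmodel, images, labels, epsilons=None)
perturbation_sizes = (clipped - images).flatten(1).norm(dim=1)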
fmodel = KerasModel(kmodel, bounds=(-1, 1))

# label of the target class
preds = kmodel.predict(dog_x)
dog_label = np.argmax(preds)

# label of the original class
preds = kmodel.predict(cat_x)
cat_label = np.argmax(preds)

criterion_1 = TopKMisclassification(k=5)
criterion_2 = TargetClass(dog_label)
criterion_3 = TargetClassProbability(dog_label, p=0.5)
criterion = criterion_1 & criterion_2 & criterion_3

attack = BoundaryAttack(model=fmodel, criterion=criterion)

iteration_size = 1000
global_iterations = 0

# Run the boundary attack to generate an adversarial example
adversarial = attack(cat_img,
                     label=cat_label,
                     unpack=False,
                     iterations=iteration_size,
                     starting_point=dog_img,
                     log_every_n_steps=10,
                     verbose=True)
global_iterations += iteration_size
np.save('adversarial_image_{0}'.format(global_iterations), adversarial.image)
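# Hedged sketch (not part of the original snippet): assuming the foolbox 1.x
# behaviour where calling an attack on an existing Adversarial object resumes
# from its current best perturbation (compare test_attack_continue above), the
# search could be extended in further rounds, saving a checkpoint after each
# one. `n_extra_rounds` is a hypothetical name.
n_extra_rounds = 5
for _ in range(n_extra_rounds):
    attack(adversarial, unpack=False, iterations=iteration_size,
           log_every_n_steps=10, verbose=True)
    global_iterations += iteration_size
    np.save('adversarial_image_{0}'.format(global_iterations),
            adversarial.image)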
def main():
    X_train = np.load("./Data/sGrid/X_train.npy")
    X_test = np.load("./Data/sGrid/X_test.npy")
    X_vaild = np.load("./Data/sGrid/X_vaild.npy")
    Y_train = np.load("./Data/sGrid/Y_train.npy")
    Y_test = np.load("./Data/sGrid/Y_test.npy")
    Y_vaild = np.load("./Data/sGrid/Y_vaild.npy")

    torch.manual_seed(1)
    embedding = nn.Embedding(128, 5, max_norm=1)

    Y_train = torch.from_numpy(Y_train)
    Y_test = torch.from_numpy(Y_test)
    Y_vaild = torch.from_numpy(Y_vaild)

    input = Variable(torch.from_numpy(X_train * 128).long())
    X_train_embed = embedding(input)
    X_train_embed = X_train_embed.detach()
    input = Variable(torch.from_numpy(X_test * 128).long())
    X_test_embed = embedding(input)
    X_test_embed = X_test_embed.detach()
    input = Variable(torch.from_numpy(X_vaild * 128).long())
    X_vaild_embed = embedding(input)
    X_vaild_embed = X_vaild_embed.detach()

    # map every character seen in the training data to its embedding vector
    dic = {}
    count = 0
    for i in range(X_train.shape[0]):
        for j in range(400):
            if chr(int(X_train[i, j] * 128)) not in dic.keys():
                dic[chr(int(X_train[i, j] * 128))] = X_train_embed[i, j]
    symbol_dict = dic

    args = Args()
    net = CNN_Text_dropout(args).cuda()
    print(net)

    pretrained_dict = torch.load(
        'Parameters/cnn_text_kernel3.5.7.9_128_embed_dropout.pkl').state_dict()
    model_dict = net.state_dict()
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict
    }
    # update the existing model_dict
    model_dict.update(pretrained_dict)
    # load the state_dict we actually need
    net.load_state_dict(model_dict)

    batch_size = 500
    Train_data = Data.TensorDataset(X_train_embed, Y_train)
    Test_data = Data.TensorDataset(X_test_embed, Y_test)
    train_data = Data.DataLoader(dataset=Train_data,
                                 batch_size=batch_size,
                                 shuffle=False)
    test_data = Data.DataLoader(dataset=Test_data, batch_size=1, shuffle=False)
    optimizer = optim.Adam(net.parameters(), lr=0.0001, weight_decay=1e-9)
    loss_function = nn.CrossEntropyLoss()

    attack_log_list = None
    attack_log_string_list = []
    net.eval()

    # This is the beginning of the attack:
    # wrap the model and set up the boundary attack
    model = PyTorchModel(net, (-1, 1), 2)
    attack = BoundaryAttack(model)

    # find the nearest attack sample as the starting point
    X_test_string = find_string_from_tensor(X_test)
    dict_attack_string_tensor = {}
    for i in range(len(X_train)):
        x, label = X_train[i], int(Y_train[i].numpy()[0])
        # the prediction of an attack sample should be an attack
        if label == 1 and np.argmax(
                model.predictions(X_train_embed[i].numpy())) == 1:
            string = ""
            for v in x:
                string += chr(int(v * 128))
            '''
            duplication of attack
            if string in dict_attack_string_tensor:
                print(string)
            '''
            dict_attack_string_tensor[string] = X_train_embed[i]

    n_test = 100
    dict_nearest_str = find_nearest_adversial(
        X_test_string[:n_test], list(dict_attack_string_tensor.keys()),
        str_similarity)
    list_X_test_nearest_tensor = []
    for log in X_test_string[:n_test]:
        list_X_test_nearest_tensor.append(
            dict_attack_string_tensor[dict_nearest_str[log]])

    # begin the attack
    try_time = 1
    max_iteration = 50
    n_success = 0
    n_total = 0
    iterations = []
    file = open(
        f'./Data/boundary_attack_unfixed_iteration_nearest_starting_max_{max_iteration}_test_{n_test}.txt',
        "w")
    for i in tqdm.tqdm_notebook(range(n_test)):
        url, label = X_test_embed[i].numpy(), int(Y_test[i].numpy()[0])
        prediction = np.argmax(model.predictions(url))
        if label == 0 and prediction == 0:
            n_total += 1
            good_adversarial = None
            good_iteration = 0
            for iteration in range(max_iteration + 1):
                adversarial = attack(
                    url,
                    label,
                    starting_point=list_X_test_nearest_tensor[i].numpy(),
                    log_every_n_steps=20,
                    iterations=iteration)
                # adversarial log
                str_adversarial = Tensor_to_Log(symbol_dict,
                                                torch.from_numpy(adversarial))
                # map the adversarial string back to a tensor before re-checking the prediction
                prediction = np.argmax(
                    model.predictions(
                        Log_to_Tensor(symbol_dict, str_adversarial).numpy()))
                if prediction == 1:
                    good_iteration = iteration
                    good_adversarial = adversarial
            if good_adversarial is not None:
                n_success += 1
                iterations.append(good_iteration)
                # original log
                file.write(X_test_string[i])
                file.write("\n")
                # adversarial log
                file.write(
                    Tensor_to_Log(symbol_dict,
                                  torch.from_numpy(good_adversarial)))
                file.write("\n\n")
    file.close()
def test_attack_gl(gl_bn_adversarial):
    adv = gl_bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
def test_attack_parameters2(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, alternative_generator=True, verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
def test_attack_non_verbose(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=False)
    assert adv.image is not None
    assert adv.distance.value < np.inf
def test_attack(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=True)
    assert adv.perturbed is not None
    assert adv.distance.value < np.inf
def test_attack_impossible(bn_impossible):
    adv = bn_impossible
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=True)
    assert adv.image is None
    assert adv.distance.value == np.inf
def attack_run_rejection_policy(model, hps):
    """
    An attack run with a rejection policy.
    :param model: PyTorch model.
    :param hps: hyperparameters
    :return:
    """
    model.eval()

    # Get per-class rejection thresholds
    threshold_list1 = []
    threshold_list2 = []
    for label_id in range(hps.n_classes):
        # No data augmentation (crop_flip=False) when getting in-distribution thresholds
        dataset = get_dataset(data_name=hps.problem,
                              train=True,
                              label_id=label_id,
                              crop_flip=False)
        in_test_loader = DataLoader(dataset=dataset,
                                    batch_size=hps.n_batch_test,
                                    shuffle=False)

        print('Inference on {}, label_id {}'.format(hps.problem, label_id))
        in_ll_list = []
        for batch_id, (x, y) in enumerate(in_test_loader):
            x = x.to(hps.device)
            y = y.to(hps.device)
            ll = model(x)

            correct_idx = ll.argmax(dim=1) == y
            # keep only samples that are classified correctly
            ll_, y_ = ll[correct_idx], y[correct_idx]

            in_ll_list += list(ll_[:, label_id].detach().cpu().numpy())

        thresh_idx = int(0.01 * len(in_ll_list))
        thresh1 = sorted(in_ll_list)[thresh_idx]
        thresh_idx = int(0.02 * len(in_ll_list))
        thresh2 = sorted(in_ll_list)[thresh_idx]
        threshold_list1.append(thresh1)  # 1st percentile threshold for this class
        threshold_list2.append(thresh2)  # 2nd percentile threshold for this class
        print('1st & 2nd percentile thresholds: {:.3f}, {:.3f}'.format(
            thresh1, thresh2))

    # Evaluation
    n_eval = 0  # total number of samples classified correctly by the clean classifier
    n_successful_adv = 0  # total number of successful adversarial examples generated
    n_rejected_adv1 = 0  # total number of rejected (successful) adversarial examples, <= n_successful_adv
    n_rejected_adv2 = 0  # total number of rejected (successful) adversarial examples, <= n_successful_adv

    attack_path = os.path.join(hps.attack_dir, hps.attack)
    if not os.path.exists(attack_path):
        os.mkdir(attack_path)

    thresholds1 = torch.tensor(threshold_list1).to(hps.device)
    thresholds2 = torch.tensor(threshold_list2).to(hps.device)

    l2_distortion_list = []
    fmodel = foolbox.models.PyTorchModel(model, bounds=(0, 1.), num_classes=10)

    hps.n_batch_test = 1
    dataset = get_dataset(data_name=hps.problem, train=False)
    test_loader = DataLoader(dataset=dataset,
                             batch_size=hps.n_batch_test,
                             shuffle=False)

    for batch_id, (x, y) in enumerate(test_loader):
        # Note that images are scaled to [0., 1.0]
        x, y = x.to(hps.device), y.to(hps.device)
        with torch.no_grad():
            output = model(x)

        pred = output.argmax(dim=1)
        if pred != y:
            continue

        n_eval += 1
        img, label = x[0], y[0]

        if hps.attack == 'boundary':
            attack = BoundaryAttack(fmodel)
            adv_x = attack(img.cpu().numpy(),
                           label.cpu().numpy(),
                           log_every_n_steps=10000)
        elif hps.attack == 'deepfool':
            attack = DeepFoolL2Attack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        elif hps.attack == 'local':
            attack = LocalSearchAttack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        elif hps.attack == 'spatial':
            attack = SpatialAttack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        elif hps.attack == 'jsma':
            # saliency-map (JSMA) attack
            attack = foolbox.attacks.SaliencyMapAttack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        else:
            raise ValueError('param attack {} not available.'.format(hps.attack))

        adv_x = torch.tensor(adv_x).unsqueeze(dim=0).to(hps.device)
        with torch.no_grad():
            output = model(adv_x)

        logit, pred = output.max(dim=1)
        if pred != label:
            n_successful_adv += 1
            diff = adv_x - x
            l2_distortion = diff.norm(p=2, dim=-1).mean().item()  # mean l2 distortion
            l2_distortion_list.append(l2_distortion)
            if logit < thresholds1[pred]:
                n_rejected_adv1 += 1
            if logit < thresholds2[pred]:
                n_rejected_adv2 += 1

        if batch_id == 100:
            print('Evaluating on {}-th batch ...'.format(batch_id))
            break  # evaluate only the first 100 batches

    reject_rate1 = n_rejected_adv1 / n_successful_adv
    reject_rate2 = n_rejected_adv2 / n_successful_adv
    success_adv_rate = n_successful_adv / n_eval
    print('success rate of adv examples generation: {}/{}={:.4f}'.format(
        n_successful_adv, n_eval, success_adv_rate))
    print('Mean L2 distortion of Adv Examples: {:.4f}'.format(
        np.mean(l2_distortion_list)))
    print('1st percentile, reject success rate: {}/{}={:.4f}'.format(
        n_rejected_adv1, n_successful_adv, reject_rate1))
    print('2nd percentile, reject success rate: {}/{}={:.4f}'.format(
        n_rejected_adv2, n_successful_adv, reject_rate2))