def get_cw(dataset='mnist'):
    eps, normalizer = _get_settings(dataset)
    delta_threat = ap.ThreatModel(
        ap.DeltaAddition,
        ap.PerturbationParameters(lp_style='inf', lp_bound=eps,
                                  manual_gpu=True))
    return aa.CarliniWagner(classifier_net=None,
                            normalizer=normalizer,
                            threat_model=delta_threat,
                            distance_fxn=lf.SoftLInfRegularization,
                            carlini_loss=lf.CWLossF6,
                            manual_gpu=True)
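# Hypothetical usage sketch: get_cw() deliberately leaves classifier_net=None,
# so a caller attaches its own model before attacking. The names below
# (model, inputs, labels) are placeholders, and the num_optim_steps kwarg
# mirrors the cw*_attack_kwargs used later in these scripts; treat this as a
# sketch under those assumptions, not the canonical API.
def _example_run_cw(model, inputs, labels):
    cw_attack = get_cw('mnist')
    cw_attack.classifier_net = model  # attach the real classifier
    return cw_attack.attack(inputs, labels, num_optim_steps=100)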
def main(config):
    model = Classifier(200,
                       classifier_name='resnet18',
                       dataset='tinyimagenet',
                       pretrained=False)

    # Load the checkpoint and massage its keys into the format our model expects
    data_classifier_state = torch.load(os.path.join(config.path,
                                                    'Classifier.pth'),
                                       map_location=None)
    if 'state_dict' in data_classifier_state:
        data_classifier_state = data_classifier_state['state_dict']

    # Strip a leading '1.' prefix if present
    bad_classifier_state = {}
    for k, v in data_classifier_state.items():
        if k.startswith('1.'):
            bad_classifier_state[k[2:]] = v
        else:
            bad_classifier_state[k] = v

    # Strip a leading 'module.' prefix (left over from DataParallel)
    starts_with_module = any(key.startswith('module.')
                             for key in bad_classifier_state)
    if starts_with_module:
        correct_classifier_state = {
            k[7:]: v
            for k, v in bad_classifier_state.items()
        }
    else:
        correct_classifier_state = bad_classifier_state

    # Ensure every key carries the 'feature_extractor.' prefix our model uses
    starts_with_feature_extractor = any(
        k.startswith('feature_extractor.')
        for k in correct_classifier_state)
    if not starts_with_feature_extractor:
        correct_classifier_state = {
            'feature_extractor.' + k: v
            for k, v in correct_classifier_state.items()
        }

    # Fit the cleaned state dict into our model
    model.load_state_dict(correct_classifier_state)
    normalizer = utils.IdentityNormalize()

    # Put this into the AdversarialEvaluation object
    adv_eval_object = adveval.AdversarialEvaluation(model, normalizer)
    surrogate = model
    normalizer_surr = normalizer

    # First let's build the attack parameters for each attack.
    # We'll reuse the loss function:
    attack_loss = plf.VanillaXentropy(surrogate, normalizer_surr)
    linf_8_threat = ap.ThreatModel(ap.DeltaAddition, {
        'lp_style': 'inf',
        'lp_bound': 8.0 / 255.0
    })

    # ------ FGSM block
    fgsm_attack = aa.FGSM(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    fgsm_attack_kwargs = {'step_size': 8.0 / 255.0, 'verbose': False}
    fgsm_attack_params = advtrain.AdversarialAttackParameters(
        fgsm_attack,
        attack_specific_params={'attack_kwargs': fgsm_attack_kwargs})

    # ------ PGD-10 block
    pgd10_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    pgd10_attack_kwargs = {
        'step_size': 8.0 / 255.0 / 4.0,
        'num_iterations': 10,
        'keep_best': True,
        'random_init': True,
        'verbose': False
    }
    pgd10_attack_params = advtrain.AdversarialAttackParameters(
        pgd10_attack,
        attack_specific_params={'attack_kwargs': pgd10_attack_kwargs})

    # ------ PGD-100 block
    pgd100_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                           attack_loss)
    pgd100_attack_kwargs = {
        'step_size': 8.0 / 255.0 / 12.0,
        'num_iterations': 100,
        'keep_best': True,
        'random_init': True,
        'verbose': False
    }
    pgd100_attack_params = advtrain.AdversarialAttackParameters(
        pgd100_attack,
        attack_specific_params={'attack_kwargs': pgd100_attack_kwargs})

    # ------ Carlini-Wagner (100 optimization steps) block
    cwloss6 = lf.CWLossF6
    distance_fxn = lf.SoftLInfRegularization
    cw100_attack = aa.CarliniWagner(surrogate, normalizer_surr, linf_8_threat,
                                    distance_fxn, cwloss6)
    cw100_attack_kwargs = {'num_optim_steps': 100, 'verbose': False}
    cw100_attack_params = advtrain.AdversarialAttackParameters(
        cw100_attack,
        attack_specific_params={'attack_kwargs': cw100_attack_kwargs})

    # ------ Carlini-Wagner (1000 optimization steps) block
    cw1000_attack = aa.CarliniWagner(surrogate, normalizer_surr,
                                     linf_8_threat, distance_fxn, cwloss6)
    cw1000_attack_kwargs = {'num_optim_steps': 1000, 'verbose': False}
    cw1000_attack_params = advtrain.AdversarialAttackParameters(
        cw1000_attack,
        attack_specific_params={'attack_kwargs': cw1000_attack_kwargs})

    to_eval_dict = {
        'top1': 'top1',
        'avg_loss_value': 'avg_loss_value',
        'avg_successful_ssim': 'avg_successful_ssim'
    }
    fgsm_eval = adveval.EvaluationResult(fgsm_attack_params,
                                         to_eval=to_eval_dict)
    pgd10_eval = adveval.EvaluationResult(pgd10_attack_params,
                                          to_eval=to_eval_dict)
    pgd100_eval = adveval.EvaluationResult(pgd100_attack_params,
                                           to_eval=to_eval_dict)
    cw100_eval = adveval.EvaluationResult(cw100_attack_params,
                                          to_eval=to_eval_dict)
    cw1000_eval = adveval.EvaluationResult(cw1000_attack_params,
                                           to_eval=to_eval_dict)

    attack_ensemble = {
        'fgsm': fgsm_eval,
        'pgd10': pgd10_eval,
        'pgd100': pgd100_eval,
        'cw100': cw100_eval,
        'cw1000': cw1000_eval
    }
    # NOTE: test_dataloader is assumed to be constructed elsewhere in this
    # script (cf. cifar_valset in the CIFAR variant below).
    ensemble_out = adv_eval_object.evaluate_ensemble(test_dataloader,
                                                     attack_ensemble,
                                                     verbose=True,
                                                     num_minibatches=None)

    sort_order = {
        'ground': 1,
        'fgsm': 2,
        'pgd10': 3,
        'pgd100': 4,
        'cw100': 5,
        'cw1000': 6
    }
    # sort_order = {'ground': 1, 'pgd10': 2, 'pgd100': 3}

    def pretty_printer(fd, eval_ensemble, result_type):
        print('~' * 10, result_type, '~' * 10)
        fd.write('~' * 10 + result_type + '~' * 10 + '\n')
        for key in sorted(eval_ensemble.keys(), key=lambda k: sort_order[k]):
            eval_result = eval_ensemble[key]
            pad = 6 - len(key)
            if result_type not in eval_result.results:
                continue
            avg_result = eval_result.results[result_type].avg
            print(key, pad * ' ', ': ', avg_result)
            fd.write(key + pad * ' ' + ': ' + str(avg_result) + '\n')

    with open(os.path.join(config.path, 'base_eval_result.txt'), 'w') as fd:
        fd.write('Result for {}'.format(config.path) + '\n')
        fd.write('\n')
        pretty_printer(fd, ensemble_out, 'top1')
        # We can examine the loss (noting that adversarial examples seek to
        # *maximize* the loss).
        pretty_printer(fd, ensemble_out, 'avg_loss_value')
        # This is actually 1 - SSIM, which serves as a makeshift similarity
        # index: a meterstick for how similar the perturbed images are to
        # the originals.
        pretty_printer(fd, ensemble_out, 'avg_successful_ssim')
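# A minimal entry point, assuming this script is invoked directly. The only
# attribute main() reads from `config` is `path` (the directory holding
# Classifier.pth), so a plain argparse namespace suffices; the flag name is
# an assumption.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', type=str, required=True,
                        help='directory containing Classifier.pth')
    main(parser.parse_args())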
def main(config):
    defence_method = config.defence
    flavor = config.architecture
    blackbox = config.blackbox
    flavor_blackbox = config.flavor_blackbox
    epoch = config.epoch
    # assert defence_method in ['PLAIN', 'FGSM', 'PGD', 'CW'], \
    #     "INVALID ATTACK: %s" % defence_method
    assert flavor in ['20', '56', 'wide'], "INVALID ARCHITECTURE: %s" % flavor

    # Load the trained model and normalizer
    if flavor in ['20', '56']:
        model, normalizer = cifar_loader.load_pretrained_cifar_resnet(
            flavor=int(flavor), return_normalizer=True)
    elif flavor == 'wide':
        model, normalizer = cifar_loader.load_pretrained_cifar_wide_resnet(
            return_normalizer=True)

    if defence_method in ['FGSM', 'PGD', 'CW', 'PGD40', 'PGD100']:
        model = checkpoints.load_state_dict(defence_method + 'ResNet' + flavor,
                                            'resnet' + flavor, epoch, model)
    elif defence_method != 'PLAIN':
        bad_state_dict = torch.load('./pretrained_models/' + defence_method +
                                    '.pth')
        correct_state_dict = {
            re.sub(r'^.*feature_extractor\.', '', k): v
            for k, v in bad_state_dict.items()
        }
        model.load_state_dict(correct_state_dict)

    # Load the evaluation dataset
    cifar_valset = cifar_loader.load_cifar_data('val',
                                                no_transform=True,
                                                shuffle=False,
                                                batch_size=100)

    # Put this into the AdversarialEvaluation object
    adv_eval_object = adveval.AdversarialEvaluation(model, normalizer)

    # In the blackbox setting, attacks are crafted on a separately trained
    # surrogate rather than on the defended model itself
    if blackbox:
        surrogate, normalizer_surr = cifar_loader.load_pretrained_cifar_resnet(
            flavor=int(flavor_blackbox), return_normalizer=True)
        surrogate.cuda()
    else:
        surrogate = model
        normalizer_surr = normalizer

    # First let's build the attack parameters for each attack.
    # We'll reuse the loss function:
    attack_loss = plf.VanillaXentropy(surrogate, normalizer_surr)
    linf_8_threat = ap.ThreatModel(ap.DeltaAddition, {
        'lp_style': 'inf',
        'lp_bound': 8.0 / 255.0
    })

    # ------ FGSM block
    fgsm_attack = aa.FGSM(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    fgsm_attack_kwargs = {'step_size': 8.0 / 255.0, 'verbose': False}
    fgsm_attack_params = advtrain.AdversarialAttackParameters(
        fgsm_attack,
        attack_specific_params={'attack_kwargs': fgsm_attack_kwargs})

    # ------ PGD-10 block
    pgd10_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    pgd10_attack_kwargs = {
        'step_size': 8.0 / 255.0 / 4.0,
        'num_iterations': 10,
        'keep_best': True,
        'verbose': False
    }
    pgd10_attack_params = advtrain.AdversarialAttackParameters(
        pgd10_attack,
        attack_specific_params={'attack_kwargs': pgd10_attack_kwargs})

    # ------ PGD-100 block
    pgd100_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                           attack_loss)
    pgd100_attack_kwargs = {
        'step_size': 8.0 / 255.0 / 12.0,
        'num_iterations': 100,
        'keep_best': True,
        'verbose': False
    }
    pgd100_attack_params = advtrain.AdversarialAttackParameters(
        pgd100_attack,
        attack_specific_params={'attack_kwargs': pgd100_attack_kwargs})

    # ------ Carlini-Wagner (100 optimization steps) block
    cwloss6 = lf.CWLossF6
    distance_fxn = lf.SoftLInfRegularization
    cw100_attack = aa.CarliniWagner(surrogate, normalizer_surr, linf_8_threat,
                                    distance_fxn, cwloss6)
    cw100_attack_kwargs = {'num_optim_steps': 100, 'verbose': False}
    cw100_attack_params = advtrain.AdversarialAttackParameters(
        cw100_attack,
        attack_specific_params={'attack_kwargs': cw100_attack_kwargs})

    # ------ Carlini-Wagner (1000 optimization steps) block
    cw1000_attack = aa.CarliniWagner(surrogate, normalizer_surr,
                                     linf_8_threat, distance_fxn, cwloss6)
    cw1000_attack_kwargs = {'num_optim_steps': 1000, 'verbose': False}
    cw1000_attack_params = advtrain.AdversarialAttackParameters(
        cw1000_attack,
        attack_specific_params={'attack_kwargs': cw1000_attack_kwargs})

    '''
    Next we'll build the EvaluationResult objects that wrap these attack
    parameters. We'll evaluate:
    - top-1 accuracy
    - average loss
    - average SSIM distance of successful perturbations
      [don't worry too much about this one]
    The 'to_eval' dict passed to the constructor has structure
    {key: <shorthand fxn>}, where key is a human-readable handle for what's
    being evaluated and <shorthand fxn> is either a string naming a prebuilt
    evaluator or a general function you supply yourself.
    '''
    to_eval_dict = {
        'top1': 'top1',
        'avg_loss_value': 'avg_loss_value',
        'avg_successful_ssim': 'avg_successful_ssim'
    }
    fgsm_eval = adveval.EvaluationResult(fgsm_attack_params,
                                         to_eval=to_eval_dict)
    pgd10_eval = adveval.EvaluationResult(pgd10_attack_params,
                                          to_eval=to_eval_dict)
    pgd100_eval = adveval.EvaluationResult(pgd100_attack_params,
                                           to_eval=to_eval_dict)
    cw100_eval = adveval.EvaluationResult(cw100_attack_params,
                                          to_eval=to_eval_dict)
    cw1000_eval = adveval.EvaluationResult(cw1000_attack_params,
                                           to_eval=to_eval_dict)

    attack_ensemble = {
        'fgsm': fgsm_eval,
        'pgd10': pgd10_eval,
        'pgd100': pgd100_eval,
        'cw100': cw100_eval,
        'cw1000': cw1000_eval
    }
    if blackbox:
        # Only FGSM/PGD are evaluated in the blackbox (transfer) setting
        attack_ensemble = {
            'fgsm': fgsm_eval,
            'pgd10': pgd10_eval,
            'pgd100': pgd100_eval
        }

    ensemble_out = adv_eval_object.evaluate_ensemble(cifar_valset,
                                                     attack_ensemble,
                                                     verbose=True,
                                                     num_minibatches=None)

    filename = 'result.txt'
    if blackbox:
        filename = 'result_blackbox.txt'

    # Now let's build a little helper to print things out cleanly:
    sort_order = {
        'ground': 1,
        'fgsm': 2,
        'pgd10': 3,
        'pgd100': 4,
        'cw100': 5,
        'cw1000': 6
    }
    if blackbox:
        sort_order = {'ground': 1, 'fgsm': 2, 'pgd10': 3, 'pgd100': 4}

    def pretty_printer(eval_ensemble, result_type):
        with open(filename, 'a') as f:
            print('~' * 10, result_type, '~' * 10)
            f.write('~' * 10 + result_type + '~' * 10 + '\n')
            for key in sorted(eval_ensemble.keys(),
                              key=lambda k: sort_order[k]):
                eval_result = eval_ensemble[key]
                pad = 6 - len(key)
                if result_type not in eval_result.results:
                    continue
                avg_result = eval_result.results[result_type].avg
                print(key, pad * ' ', ': ', avg_result)
                f.write(key + pad * ' ' + ': ' + str(avg_result) + '\n')

    '''
    And then we can print out and look at the results. This prints the
    accuracy; 'ground' is the unperturbed accuracy. If everything is wired
    correctly, iterative PGD should be a markedly stronger attack against
    undefended networks than single-step FGSM at the same l_inf bound of
    8/255.
    '''
    with open(filename, 'a') as f:
        f.write('Result for ' + defence_method + 'ResNet{}'.format(flavor) +
                '\n')
        if blackbox:
            f.write('Blackbox ' + flavor_blackbox + '\n')
    pretty_printer(ensemble_out, 'top1')
    # We can examine the loss (noting that adversarial examples seek to
    # *maximize* the loss).
    pretty_printer(ensemble_out, 'avg_loss_value')
    # This is actually 1 - SSIM, which serves as a makeshift similarity
    # index: a meterstick for how similar the perturbed images are to the
    # originals.
    pretty_printer(ensemble_out, 'avg_successful_ssim')
    with open(filename, 'a') as f:
        f.write('\n')
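# A minimal entry-point sketch, assuming direct invocation; the flag names
# are assumptions, but the attributes match exactly what main() reads from
# `config`: defence, architecture, blackbox, flavor_blackbox, epoch.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--defence', default='PLAIN')
    parser.add_argument('--architecture', default='20',
                        choices=['20', '56', 'wide'])
    parser.add_argument('--blackbox', action='store_true')
    parser.add_argument('--flavor_blackbox', default='56')
    parser.add_argument('--epoch', type=int, default=None)
    main(parser.parse_args())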
def main_attack_script(attack_examples=None, show_images=False):
    # Which attacks to run...
    attack_examples = attack_examples or [
        'FGSM', 'BIM', 'PGD', 'CWL2', 'CWLInf'
    ]

    ########################################################################
    #                            SHARED BLOCK                              #
    ########################################################################

    # Initialize CIFAR classifier
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(flavor=32)
    classifier_net.eval()

    # Collect one minibatch worth of data/targets
    val_loader = cifar_loader.load_cifar_data('val',
                                              normalize=False,
                                              batch_size=16)
    ex_minibatch, ex_targets = next(iter(val_loader))

    # Differentiable normalizer needed for classification
    cifar_normer = utils.DifferentiableNormalize(mean=config.CIFAR10_MEANS,
                                                 std=config.CIFAR10_STDS)

    ########################################################################
    #                         FGSM ATTACK BLOCK                            #
    ########################################################################
    if 'FGSM' in attack_examples:
        # Example FGSM attack on a single minibatch. Steps:
        #   0) initialize hyperparams
        #   1) set up loss object
        #   2) build attack object
        #   3) set up examples to attack
        #   4) perform attack
        #   5) evaluate attack (accuracy + display a few images)
        FGSM_L_INF = 8.0 / 255.0

        delta_threat = ap.ThreatModel(ap.DeltaAddition, {
            'lp_style': 'inf',
            'lp_bound': FGSM_L_INF
        })
        fgsm_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                 normalizer=cifar_normer)
        fgsm_attack_obj = aa.FGSM(classifier_net, cifar_normer, delta_threat,
                                  fgsm_xentropy_loss)

        fgsm_original_images = ex_minibatch
        fgsm_original_labels = ex_targets

        fgsm_adv_images = fgsm_attack_obj.attack(
            fgsm_original_images, fgsm_original_labels,
            FGSM_L_INF).adversarial_tensors()

        fgsm_accuracy = fgsm_attack_obj.eval(fgsm_original_images,
                                             fgsm_adv_images,
                                             fgsm_original_labels)
        print("FGSM ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % fgsm_accuracy[0])
        print("\t Adversarial %% correct: %s" % fgsm_accuracy[1])

        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               fgsm_original_images,
                                               fgsm_adv_images, 4)

    ########################################################################
    #                          BIM ATTACK BLOCK                            #
    ########################################################################
    if 'BIM' in attack_examples:
        # Example BIM attack on a single minibatch; same steps as above.
        BIM_L_INF = 8.0 / 255.0
        BIM_STEP_SIZE = 1.0 / 255.0
        BIM_NUM_ITER = 16

        bim_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                normalizer=cifar_normer)
        bim_attack_obj = aa.BIM(classifier_net, cifar_normer,
                                bim_xentropy_loss)

        bim_original_images = ex_minibatch
        bim_original_labels = ex_targets

        bim_adv_images = bim_attack_obj.attack(bim_original_images,
                                               bim_original_labels,
                                               l_inf_bound=BIM_L_INF,
                                               step_size=BIM_STEP_SIZE,
                                               num_iterations=BIM_NUM_ITER)

        bim_accuracy = bim_attack_obj.eval(bim_original_images,
                                           bim_adv_images,
                                           bim_original_labels)
        print("BIM ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % bim_accuracy[0])
        print("\t Adversarial %% correct: %s" % bim_accuracy[1])

        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               bim_original_images,
                                               bim_adv_images, 4)

    ########################################################################
    #                          PGD ATTACK BLOCK                            #
    ########################################################################
    if 'PGD' in attack_examples:
        # Example PGD attack on a single minibatch; same steps as above.
        PGD_L_INF = 8.0 / 255.0
        PGD_STEP_SIZE = 1.0 / 255.0
        PGD_NUM_ITER = 16

        pgd_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                normalizer=cifar_normer)
        delta_threat = ap.ThreatModel(ap.DeltaAddition, {
            'lp_style': 'inf',
            'lp_bound': PGD_L_INF
        })
        pgd_attack_obj = aa.PGD(classifier_net, cifar_normer, delta_threat,
                                pgd_xentropy_loss)

        pgd_original_images = ex_minibatch
        pgd_original_labels = ex_targets

        pgd_adv_images = pgd_attack_obj.attack(
            pgd_original_images,
            pgd_original_labels,
            step_size=PGD_STEP_SIZE,
            num_iterations=PGD_NUM_ITER).adversarial_tensors()

        pgd_accuracy = pgd_attack_obj.eval(pgd_original_images,
                                           pgd_adv_images,
                                           pgd_original_labels)
        print("PGD ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % pgd_accuracy[0])
        print("\t Adversarial %% correct: %s" % pgd_accuracy[1])

        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               pgd_original_images,
                                               pgd_adv_images, 4)

    ########################################################################
    #                           CW L2 ATTACK                               #
    ########################################################################
    if 'CWL2' in attack_examples:
        # Example Carlini-Wagner L2 attack on a single minibatch; same steps
        # as above.
        CW_INITIAL_SCALE_CONSTANT = 0.1  # unused below; kept for reference
        CW_NUM_BIN_SEARCH_STEPS = 5
        CW_NUM_OPTIM_STEPS = 1000
        CW_DISTANCE_METRIC = 'l2'  # unused below; kept for reference
        CW_CONFIDENCE = 0.0  # unused below; kept for reference

        cw_f6loss = lf.CWLossF6
        # An lp_bound of 3072.0 (= 3 * 32 * 32) far exceeds the largest l2
        # norm a CIFAR-sized [0, 1] perturbation can attain (sqrt(3072) is
        # about 55.4), so the threat model is effectively unconstrained and
        # the L2Regularization distance term does the real work.
        delta_threat = ap.ThreatModel(ap.DeltaAddition, {
            'lp_style': 2,
            'lp_bound': 3072.0
        })
        cwl2_obj = aa.CarliniWagner(classifier_net, cifar_normer,
                                    delta_threat, lf.L2Regularization,
                                    cw_f6loss)

        cwl2_original_images = ex_minibatch
        cwl2_original_labels = ex_targets

        cwl2_output = cwl2_obj.attack(
            ex_minibatch,
            ex_targets,
            num_bin_search_steps=CW_NUM_BIN_SEARCH_STEPS,
            num_optim_steps=CW_NUM_OPTIM_STEPS,
            verbose=True)
        print(cwl2_output['best_dist'])
        cwl2_adv_images = cwl2_output['best_adv_images']

        cwl2_accuracy = cwl2_obj.eval(cwl2_original_images, cwl2_adv_images,
                                      cwl2_original_labels)
        print("CWL2 ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % cwl2_accuracy[0])
        print("\t Adversarial %% correct: %s" % cwl2_accuracy[1])

        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               cwl2_original_images,
                                               cwl2_adv_images, 4)
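# A minimal runner, assuming this script is invoked directly; the attack
# names here match the blocks defined above.
if __name__ == '__main__':
    main_attack_script(attack_examples=['FGSM', 'PGD', 'CWL2'],
                       show_images=False)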