def main_defense_script():
    ########################################################################
    #   SHARED BLOCK                                                       #
    ########################################################################

    # Initialize CIFAR classifier
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(flavor=32,
                                                               use_gpu=False)
    classifier_net.eval()

    # Differentiable normalizer needed for classification
    cifar_normer = utils.DifferentiableNormalize(mean=config.CIFAR10_MEANS,
                                                 std=config.CIFAR10_STDS)

    ######################################################################
    #   SIMPLE FGSM TRAINING EXAMPLE                                     #
    ######################################################################

    if True:
        # Steps
        # 0) initialize hyperparams for attack/training
        # 1) setup attack loss object
        # 2) build attack and parameters for attack
        # 3) build training object, training loss, data loader
        # 4) train

        # 0
        FGSM_L_INF = 8.0 / 255.0
        FGSM_TRAINING_ATTACK_PROPORTION = 0.5
        FGSM_TRAINING_EPOCHS = 10

        # 1
        fgsm_attack_loss = plf.VanillaXentropy(classifier_net, cifar_normer)

        # 2
        fgsm_xentropy_attack_obj = aa.FGSM(classifier_net, cifar_normer,
                                           fgsm_attack_loss)
        fgsm_xentropy_attack_params = advtrain.AdversarialAttackParameters(
            fgsm_xentropy_attack_obj,
            FGSM_TRAINING_ATTACK_PROPORTION,
            {'attack_kwargs': {'l_inf_bound': FGSM_L_INF}})

        # 3
        half_fgsm_cifar = advtrain.AdversarialTraining(classifier_net,
                                                       cifar_normer,
                                                       'half_fgsm_cifar',
                                                       'cifar_resnet32')
        train_loss = nn.CrossEntropyLoss()
        train_loader = cifar_loader.load_cifar_data('train', normalize=False)

        # 4
        half_fgsm_cifar.train(train_loader, FGSM_TRAINING_EPOCHS, train_loss,
                              attack_parameters=fgsm_xentropy_attack_params,
                              verbosity='snoop')
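# A follow-up sketch (not part of the original script): reloading the weights
# that the training run above checkpoints under its experiment/architecture
# names. This assumes the `checkpoints` module used by the evaluation scripts
# further down, and that a checkpoint for the requested epoch exists on disk.
def reload_half_fgsm_cifar(epoch=10):
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(flavor=32,
                                                               use_gpu=False)
    # 'half_fgsm_cifar' / 'cifar_resnet32' match the names passed to
    # advtrain.AdversarialTraining in main_defense_script above
    return checkpoints.load_state_dict('half_fgsm_cifar', 'cifar_resnet32',
                                       epoch, classifier_net)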
def main_evaluation_script():
    """ Here's a little script to show how to evaluate a trained model
        against varying attacks (on the fly, without saving adv examples)
    """

    # Steps
    # 0) Initialize a classifier/normalizer/evaluation loader
    # 1) Build some attack objects to try
    # 2) Run the evaluation and print results

    # 0
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(flavor=32,
                                                               use_gpu=False)
    cifar_normer = utils.DifferentiableNormalize(mean=config.CIFAR10_MEANS,
                                                 std=config.CIFAR10_STDS)
    val_loader = cifar_loader.load_cifar_data('val', normalize=False)

    # 1
    L_INF_BOUND = 8.0 / 255.0

    # --- FGSM attack
    fgsm_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                             normalizer=cifar_normer)
    fgsm_attack_obj = aa.FGSM(classifier_net, cifar_normer,
                              fgsm_xentropy_loss)
    fgsm_spec_params = {'attack_kwargs': {'l_inf_bound': L_INF_BOUND}}
    fgsm_attack_params = advtrain.AdversarialAttackParameters(
        fgsm_attack_obj, 0.5, fgsm_spec_params)

    # --- BIM attack
    BIM_L_INF = 8.0 / 255.0
    BIM_STEP_SIZE = 1.0 / 255.0
    BIM_NUM_ITER = 16
    bim_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                            normalizer=cifar_normer)
    bim_attack_obj = aa.BIM(classifier_net, cifar_normer, bim_xentropy_loss)
    bim_spec_params = {'attack_kwargs': {'l_inf_bound': BIM_L_INF,
                                         'step_size': BIM_STEP_SIZE,
                                         'num_iterations': BIM_NUM_ITER}}
    bim_attack_params = advtrain.AdversarialAttackParameters(
        bim_attack_obj, 0.5, bim_spec_params)

    attack_ensemble = {'fgsm': fgsm_attack_params,
                       'bim': bim_attack_params}

    # 2
    eval_obj = advtrain.AdversarialEvaluation(classifier_net, cifar_normer)
    eval_out = eval_obj.evaluate(val_loader, attack_ensemble,
                                 num_minibatches=5)
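# The attack ensemble above is just a dict, so extending the evaluation with
# another attack is one more AdversarialAttackParameters entry. A hedged
# sketch using the old-style LInfPGD constructor that appears in the legacy
# attack script further down; the kwargs mirror the BIM block above.
def add_pgd_to_ensemble(classifier_net, cifar_normer, attack_ensemble):
    pgd_loss = plf.VanillaXentropy(classifier_net, normalizer=cifar_normer)
    pgd_attack_obj = aa.LInfPGD(classifier_net, cifar_normer, pgd_loss)
    pgd_spec_params = {'attack_kwargs': {'l_inf_bound': 8.0 / 255.0,
                                         'step_size': 1.0 / 255.0,
                                         'num_iterations': 16}}
    attack_ensemble['pgd'] = advtrain.AdversarialAttackParameters(
        pgd_attack_obj, 0.5, pgd_spec_params)
    return attack_ensemble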
def get_fgsm(dataset='mnist'):
    eps, normalizer = _get_settings(dataset)
    delta_threat = ap.ThreatModel(ap.DeltaAddition,
                                  ap.PerturbationParameters(lp_style='inf',
                                                            lp_bound=eps,
                                                            manual_gpu=True))
    return aa.FGSM(classifier_net=None, normalizer=normalizer,
                   threat_model=delta_threat, loss_fxn=None,
                   manual_gpu=True)
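# Usage sketch for the factory above: the attack comes back with
# classifier_net=None and loss_fxn=None, so the caller has to supply both
# before attacking. Setting them as attributes, as below, is an assumption
# about the attack object's fields, not a documented API.
def finish_fgsm(mnist_model):
    fgsm = get_fgsm('mnist')
    fgsm.classifier_net = mnist_model  # assumed attribute name
    # fgsm.normalizer holding the normalizer is also an assumption
    fgsm.loss_fxn = plf.VanillaXentropy(mnist_model, fgsm.normalizer)
    return fgsm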
def build_fgsm_attack(classifier_net, normalizer):
    delta_threat = ap.ThreatModel(ap.DeltaAddition,
                                  ap.PerturbationParameters(
                                      lp_style='inf',
                                      lp_bound=L_INF_BOUND))
    attack_loss = plf.VanillaXentropy(classifier_net, normalizer)
    fgsm_attack = aa.FGSM(classifier_net, normalizer, delta_threat,
                          attack_loss)
    attack_kwargs = {'verbose': GLOBAL_ATK_KWARGS['verbose']}
    params = advtrain.AdversarialAttackParameters(
        fgsm_attack, 1.0,
        attack_specific_params={'attack_kwargs': attack_kwargs})
    return params
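# A usage sketch wiring the parameters built above into the adversarial
# training loop from main_defense_script. L_INF_BOUND and GLOBAL_ATK_KWARGS
# are assumed to be module-level constants, as build_fgsm_attack implies;
# with proportion 1.0 every training example gets attacked.
def train_with_fgsm_params():
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(flavor=32)
    cifar_normer = utils.DifferentiableNormalize(mean=config.CIFAR10_MEANS,
                                                 std=config.CIFAR10_STDS)
    fgsm_params = build_fgsm_attack(classifier_net, cifar_normer)
    train_obj = advtrain.AdversarialTraining(classifier_net, cifar_normer,
                                             'fgsm_cifar', 'cifar_resnet32')
    train_loader = cifar_loader.load_cifar_data('train', normalize=False)
    train_obj.train(train_loader, 10, nn.CrossEntropyLoss(),
                    attack_parameters=fgsm_params, verbosity='snoop')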
def build_delta_fgsm(model, normalizer, linf_bound=L_INF_BOUND,
                     verbose=False, adv_loss='xentropy', output='attack',
                     manual_gpu=None):
    # Build threat
    delta_threat = ap.ThreatModel(ap.DeltaAddition,
                                  ap.PerturbationParameters(
                                      lp_style='inf',
                                      lp_bound=linf_bound,
                                      manual_gpu=manual_gpu))

    # Build loss
    assert adv_loss in ['xentropy', 'cw']
    if adv_loss == 'xentropy':
        attack_loss = plf.VanillaXentropy(model, normalizer)
    else:
        cw_loss = lf.CWLossF6(model, normalizer)
        attack_loss = lf.RegularizedLoss({'adv': cw_loss}, {'adv': 1.0})

    # Build attack
    fgsm_attack = aa.FGSM(model, normalizer, delta_threat, attack_loss,
                          manual_gpu=manual_gpu)

    # Return based on output arg
    assert output in ['attack', 'params', 'eval']
    if output == 'attack':
        return fgsm_attack

    attack_kwargs = {'verbose': verbose}
    params = advtrain.AdversarialAttackParameters(
        fgsm_attack, 1.0,
        attack_specific_params={'attack_kwargs': attack_kwargs})
    if output == 'params':
        return params

    to_eval = {'top1': 'top1', 'lpips': 'avg_successful_lpips'}
    eval_result = adveval.EvaluationResult(params, to_eval=to_eval,
                                           manual_gpu=manual_gpu)
    return eval_result
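# A quick sketch of the three output modes above: the raw attack object, an
# AdversarialAttackParameters wrapper (proportion 1.0, i.e. attack every
# example in a batch), or an EvaluationResult tracking top1 and LPIPS.
# Assumes the CIFAR loader's return_normalizer flag used elsewhere here and
# the module-level L_INF_BOUND default.
def build_delta_fgsm_examples():
    model, normalizer = cifar_loader.load_pretrained_cifar_resnet(
        flavor=32, return_normalizer=True)
    attack = build_delta_fgsm(model, normalizer, output='attack')
    params = build_delta_fgsm(model, normalizer, output='params')
    evaluation = build_delta_fgsm(model, normalizer, adv_loss='cw',
                                  output='eval')
    return attack, params, evaluation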
def main(config):
    model = Classifier(200, classifier_name='resnet18',
                       dataset="tinyimagenet", pretrained=False)

    # Format matching: massage the checkpoint's state dict so its keys line
    # up with our model's parameter names
    data_classifier_state = torch.load(os.path.join(config.path,
                                                    'Classifier.pth'),
                                       map_location=None)
    if 'state_dict' in data_classifier_state:
        data_classifier_state = data_classifier_state['state_dict']

    bad_classifier_state = {}
    for k, v in data_classifier_state.items():
        if k.startswith('1.'):
            bad_classifier_state[k[2:]] = v
        else:
            bad_classifier_state[k] = v

    # Strip a 'module.' prefix left over from nn.DataParallel, if present
    starts_with_module = False
    for key in bad_classifier_state.keys():
        if key.startswith('module.'):
            starts_with_module = True
            break
    if starts_with_module:
        correct_classifier_state = {k[7:]: v
                                    for k, v in bad_classifier_state.items()}
    else:
        correct_classifier_state = bad_classifier_state

    # Add the 'feature_extractor.' prefix our Classifier expects, if missing
    starts_with_feature_extractor = False
    for k in correct_classifier_state.keys():
        if k.startswith('feature_extractor.'):
            starts_with_feature_extractor = True
            break
    if not starts_with_feature_extractor:
        correct_classifier_state = {
            'feature_extractor.' + k: v
            for k, v in correct_classifier_state.items()
        }

    # Fit into our model
    model.load_state_dict(correct_classifier_state)
    normalizer = utils.IdentityNormalize()

    # Put this into the AdversarialEvaluation object
    adv_eval_object = adveval.AdversarialEvaluation(model, normalizer)
    surrogate = model
    normalizer_surr = normalizer

    # First let's build the attack parameters for each.
    # We'll reuse the loss function:
    attack_loss = plf.VanillaXentropy(surrogate, normalizer_surr)
    linf_8_threat = ap.ThreatModel(ap.DeltaAddition,
                                   {'lp_style': 'inf',
                                    'lp_bound': 8.0 / 255.0})

    # ------ FGSM block
    fgsm_attack = aa.FGSM(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    fgsm_attack_kwargs = {'step_size': 8.0 / 255.0, 'verbose': False}
    fgsm_attack_params = advtrain.AdversarialAttackParameters(
        fgsm_attack,
        attack_specific_params={'attack_kwargs': fgsm_attack_kwargs})

    # ------ PGD-10 block
    pgd10_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    pgd10_attack_kwargs = {'step_size': 8.0 / 255.0 / 4.0,
                           'num_iterations': 10,
                           'keep_best': True,
                           'random_init': True,
                           'verbose': False}
    pgd10_attack_params = advtrain.AdversarialAttackParameters(
        pgd10_attack,
        attack_specific_params={'attack_kwargs': pgd10_attack_kwargs})

    # ------ PGD-100 block
    pgd100_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                           attack_loss)
    pgd100_attack_kwargs = {'step_size': 8.0 / 255.0 / 12.0,
                            'num_iterations': 100,
                            'keep_best': True,
                            'random_init': True,
                            'verbose': False}
    pgd100_attack_params = advtrain.AdversarialAttackParameters(
        pgd100_attack,
        attack_specific_params={'attack_kwargs': pgd100_attack_kwargs})

    # ------ CarliniWagner-100 block
    cwloss6 = lf.CWLossF6
    distance_fxn = lf.SoftLInfRegularization
    cw100_attack = aa.CarliniWagner(surrogate, normalizer_surr,
                                    linf_8_threat, distance_fxn, cwloss6)
    cw100_attack_kwargs = {'num_optim_steps': 100, 'verbose': False}
    cw100_attack_params = advtrain.AdversarialAttackParameters(
        cw100_attack,
        attack_specific_params={'attack_kwargs': cw100_attack_kwargs})

    # ------ CarliniWagner-1000 block
    cwloss6 = lf.CWLossF6
    distance_fxn = lf.SoftLInfRegularization
    cw1000_attack = aa.CarliniWagner(surrogate, normalizer_surr,
                                     linf_8_threat, distance_fxn, cwloss6)
    cw1000_attack_kwargs = {'num_optim_steps': 1000, 'verbose': False}
    cw1000_attack_params = advtrain.AdversarialAttackParameters(
        cw1000_attack,
        attack_specific_params={'attack_kwargs': cw1000_attack_kwargs})

    to_eval_dict = {'top1': 'top1',
                    'avg_loss_value': 'avg_loss_value',
                    'avg_successful_ssim': 'avg_successful_ssim'}

    fgsm_eval = adveval.EvaluationResult(fgsm_attack_params,
                                         to_eval=to_eval_dict)
    pgd10_eval = adveval.EvaluationResult(pgd10_attack_params,
                                          to_eval=to_eval_dict)
    pgd100_eval = adveval.EvaluationResult(pgd100_attack_params,
                                           to_eval=to_eval_dict)
    cw100_eval = adveval.EvaluationResult(cw100_attack_params,
                                          to_eval=to_eval_dict)
    cw1000_eval = adveval.EvaluationResult(cw1000_attack_params,
                                           to_eval=to_eval_dict)

    attack_ensemble = {'fgsm': fgsm_eval,
                       'pgd10': pgd10_eval,
                       'pgd100': pgd100_eval,
                       'cw100': cw100_eval,
                       'cw1000': cw1000_eval}

    # test_dataloader is assumed to be defined at module scope
    ensemble_out = adv_eval_object.evaluate_ensemble(test_dataloader,
                                                     attack_ensemble,
                                                     verbose=True,
                                                     num_minibatches=None)

    sort_order = {'ground': 1, 'fgsm': 2, 'pgd10': 3,
                  'pgd100': 4, 'cw100': 5, 'cw1000': 6}
    # sort_order = {'ground': 1, 'pgd10': 2, 'pgd100': 3}

    def pretty_printer(fd, eval_ensemble, result_type):
        print('~' * 10, result_type, '~' * 10)
        fd.write('~' * 10 + result_type + '~' * 10 + "\n")
        for key in sorted(list(eval_ensemble.keys()),
                          key=lambda k: sort_order[k]):
            eval_result = eval_ensemble[key]
            pad = 6 - len(key)
            if result_type not in eval_result.results:
                continue
            avg_result = eval_result.results[result_type].avg
            print(key, pad * ' ', ': ', avg_result)
            fd.write(key + pad * ' ' + ': ' + str(avg_result) + "\n")

    with open(os.path.join(config.path, 'base_eval_result.txt'), "w") as fd:
        fd.write('Result for {}'.format(config.path) + "\n")
        fd.write("\n")
        pretty_printer(fd, ensemble_out, 'top1')

        # We can examine the loss (noting that we seek to 'maximize' loss
        # in the adversarial example domain)
        pretty_printer(fd, ensemble_out, 'avg_loss_value')

        # This is actually 1-SSIM, which serves as a makeshift similarity
        # index: a meterstick for how similar the perturbed images are to
        # the originals
        pretty_printer(fd, ensemble_out, 'avg_successful_ssim')
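# Entry-point sketch for the script above, assuming config only needs a
# `path` attribute pointing at the directory holding Classifier.pth (where
# base_eval_result.txt is also written), and that the module-level
# test_dataloader is defined. The flag name is an assumption.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', type=str, required=True)
    main(parser.parse_args())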
def main(config):
    defence_method = config.defence
    flavor = config.architecture
    blackbox = config.blackbox
    flavor_blackbox = config.flavor_blackbox
    epoch = config.epoch
    # assert defence_method in ['PLAIN', 'FGSM', 'PGD', 'CW'], \
    #     "INVALID ATTACK: %s" % defence_method
    assert flavor in ['20', '56', 'wide'], "INVALID ARCHITECTURE: %s" % flavor

    # Load the trained model and normalizer
    if flavor in ['20', '56']:
        model, normalizer = cifar_loader.load_pretrained_cifar_resnet(
            flavor=int(flavor), return_normalizer=True)
    elif flavor == 'wide':
        model, normalizer = cifar_loader.load_pretrained_cifar_wide_resnet(
            return_normalizer=True)

    if defence_method in ['FGSM', 'PGD', 'CW', 'PGD40', 'PGD100']:
        model = checkpoints.load_state_dict(defence_method + 'ResNet' + flavor,
                                            'resnet' + flavor, epoch, model)
    elif defence_method != 'PLAIN':
        bad_state_dict = torch.load('./pretrained_models/' +
                                    defence_method + '.pth')
        correct_state_dict = {re.sub(r'^.*feature_extractor\.', '', k): v
                              for k, v in bad_state_dict.items()}
        model.load_state_dict(correct_state_dict)

    # Load the evaluation dataset
    cifar_valset = cifar_loader.load_cifar_data('val', no_transform=True,
                                                shuffle=False, batch_size=100)

    # Put this into the AdversarialEvaluation object
    adv_eval_object = adveval.AdversarialEvaluation(model, normalizer)

    # For blackbox attacks, craft perturbations on a surrogate model;
    # otherwise attack the defended model directly
    if blackbox:
        surrogate, normalizer_surr = cifar_loader.load_pretrained_cifar_resnet(
            flavor=int(flavor_blackbox), return_normalizer=True)
        surrogate.cuda()
    else:
        surrogate = model
        normalizer_surr = normalizer

    # First let's build the attack parameters for each.
    # We'll reuse the loss function:
    attack_loss = plf.VanillaXentropy(surrogate, normalizer_surr)
    linf_8_threat = ap.ThreatModel(ap.DeltaAddition,
                                   {'lp_style': 'inf',
                                    'lp_bound': 8.0 / 255.0})

    # ------ FGSM block
    fgsm_attack = aa.FGSM(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    fgsm_attack_kwargs = {'step_size': 8.0 / 255.0, 'verbose': False}
    fgsm_attack_params = advtrain.AdversarialAttackParameters(
        fgsm_attack,
        attack_specific_params={'attack_kwargs': fgsm_attack_kwargs})

    # ------ PGD-10 block
    pgd10_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                          attack_loss)
    pgd10_attack_kwargs = {'step_size': 8.0 / 255.0 / 4.0,
                           'num_iterations': 10,
                           'keep_best': True,
                           'verbose': False}
    pgd10_attack_params = advtrain.AdversarialAttackParameters(
        pgd10_attack,
        attack_specific_params={'attack_kwargs': pgd10_attack_kwargs})

    # ------ PGD-100 block
    pgd100_attack = aa.PGD(surrogate, normalizer_surr, linf_8_threat,
                           attack_loss)
    pgd100_attack_kwargs = {'step_size': 8.0 / 255.0 / 12.0,
                            'num_iterations': 100,
                            'keep_best': True,
                            'verbose': False}
    pgd100_attack_params = advtrain.AdversarialAttackParameters(
        pgd100_attack,
        attack_specific_params={'attack_kwargs': pgd100_attack_kwargs})

    # ------ CarliniWagner-100 block
    cwloss6 = lf.CWLossF6
    distance_fxn = lf.SoftLInfRegularization
    cw100_attack = aa.CarliniWagner(surrogate, normalizer_surr,
                                    linf_8_threat, distance_fxn, cwloss6)
    cw100_attack_kwargs = {'num_optim_steps': 100, 'verbose': False}
    cw100_attack_params = advtrain.AdversarialAttackParameters(
        cw100_attack,
        attack_specific_params={'attack_kwargs': cw100_attack_kwargs})

    # ------ CarliniWagner-1000 block
    cwloss6 = lf.CWLossF6
    distance_fxn = lf.SoftLInfRegularization
    cw1000_attack = aa.CarliniWagner(surrogate, normalizer_surr,
                                     linf_8_threat, distance_fxn, cwloss6)
    cw1000_attack_kwargs = {'num_optim_steps': 1000, 'verbose': False}
    cw1000_attack_params = advtrain.AdversarialAttackParameters(
        cw1000_attack,
        attack_specific_params={'attack_kwargs': cw1000_attack_kwargs})

    '''
    Next we'll build the EvaluationResult objects that wrap these.
    And let's say we'll evaluate the:
    - top1 accuracy
    - average loss
    - average SSIM distance of successful perturbations
      [don't worry too much about this]
    The 'to_eval' dict passed to the constructor has structure
    {key: <shorthand fxn>}, where key is just a human-readable handle for
    what's being evaluated, and the shorthand fxn is either a string naming
    a prebuilt evaluator or a general function to evaluate.
    '''
    to_eval_dict = {'top1': 'top1',
                    'avg_loss_value': 'avg_loss_value',
                    'avg_successful_ssim': 'avg_successful_ssim'}

    fgsm_eval = adveval.EvaluationResult(fgsm_attack_params,
                                         to_eval=to_eval_dict)
    pgd10_eval = adveval.EvaluationResult(pgd10_attack_params,
                                          to_eval=to_eval_dict)
    pgd100_eval = adveval.EvaluationResult(pgd100_attack_params,
                                           to_eval=to_eval_dict)
    cw100_eval = adveval.EvaluationResult(cw100_attack_params,
                                          to_eval=to_eval_dict)
    cw1000_eval = adveval.EvaluationResult(cw1000_attack_params,
                                           to_eval=to_eval_dict)

    attack_ensemble = {'fgsm': fgsm_eval,
                       'pgd10': pgd10_eval,
                       'pgd100': pgd100_eval,
                       'cw100': cw100_eval,
                       'cw1000': cw1000_eval}
    if blackbox:
        attack_ensemble = {'fgsm': fgsm_eval,
                           'pgd10': pgd10_eval,
                           'pgd100': pgd100_eval}

    ensemble_out = adv_eval_object.evaluate_ensemble(cifar_valset,
                                                     attack_ensemble,
                                                     verbose=True,
                                                     num_minibatches=None)

    filename = "result.txt"
    if blackbox:
        filename = "result_blackbox.txt"

    # Now let's build a little helper to print things out cleanly:
    sort_order = {'ground': 1, 'fgsm': 2, 'pgd10': 3,
                  'pgd100': 4, 'cw100': 5, 'cw1000': 6}
    if blackbox:
        sort_order = {'ground': 1, 'fgsm': 2, 'pgd10': 3, 'pgd100': 4}

    def pretty_printer(eval_ensemble, result_type):
        with open(filename, "a") as f:
            print('~' * 10, result_type, '~' * 10)
            f.write('~' * 10 + result_type + '~' * 10 + "\n")
            for key in sorted(list(eval_ensemble.keys()),
                              key=lambda k: sort_order[k]):
                eval_result = eval_ensemble[key]
                pad = 6 - len(key)
                if result_type not in eval_result.results:
                    continue
                avg_result = eval_result.results[result_type].avg
                print(key, pad * ' ', ': ', avg_result)
                f.write(key + pad * ' ' + ': ' + str(avg_result) + "\n")

    '''And then we can print out and look at the results:
       This prints the accuracy. Ground is the unperturbed accuracy.
       If everything is done right, we should see that PGD (at the same
       l_inf bound of 8/255) is a much stronger attack against undefended
       networks than FGSM.
    '''
    with open(filename, "a") as f:
        f.write('Result for ' + defence_method + 'ResNet{}'.format(flavor) +
                "\n")
        if blackbox:
            f.write('Blackbox' + flavor_blackbox + "\n")

    pretty_printer(ensemble_out, 'top1')

    # We can examine the loss (noting that we seek to 'maximize' loss in
    # the adversarial example domain)
    pretty_printer(ensemble_out, 'avg_loss_value')

    # This is actually 1-SSIM, which serves as a makeshift similarity index:
    # a meterstick for how similar the perturbed images are to the originals
    pretty_printer(ensemble_out, 'avg_successful_ssim')

    with open(filename, "a") as f:
        f.write("\n")
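# Entry-point sketch for the CIFAR evaluation script above; the flag names
# mirror the attributes read from config at the top of main, while the types
# and defaults are assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--defence', type=str, default='PLAIN')
    parser.add_argument('--architecture', type=str, default='20',
                        choices=['20', '56', 'wide'])
    parser.add_argument('--blackbox', action='store_true')
    parser.add_argument('--flavor_blackbox', type=str, default='56')
    parser.add_argument('--epoch', type=int, default=None)
    main(parser.parse_args())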
def main_attack_script(attack_examples=None, show_images=False):
    # Which attacks to do...
    attack_examples = attack_examples or ['FGSM', 'BIM', 'PGD', 'CWL2']

    ########################################################################
    #   SHARED BLOCK                                                       #
    ########################################################################

    # Initialize CIFAR classifier
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(flavor=32)
    classifier_net.eval()

    # Collect one minibatch worth of data/targets
    val_loader = cifar_loader.load_cifar_data('val', normalize=False,
                                              batch_size=16)
    ex_minibatch, ex_targets = next(iter(val_loader))

    # Differentiable normalizer needed for classification
    cifar_normer = utils.DifferentiableNormalize(mean=config.CIFAR10_MEANS,
                                                 std=config.CIFAR10_STDS)

    #########################################################################
    #   FGSM ATTACK BLOCK                                                   #
    #########################################################################

    if 'FGSM' in attack_examples:
        # Example FGSM attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack (accuracy + display a few images)

        FGSM_L_INF = 8.0 / 255.0
        delta_threat = ap.ThreatModel(ap.DeltaAddition,
                                      {'lp_style': 'inf',
                                       'lp_bound': FGSM_L_INF})
        fgsm_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                 normalizer=cifar_normer)
        fgsm_attack_obj = aa.FGSM(classifier_net, cifar_normer, delta_threat,
                                  fgsm_xentropy_loss)

        fgsm_original_images = ex_minibatch
        fgsm_original_labels = ex_targets

        fgsm_adv_images = fgsm_attack_obj.attack(
            fgsm_original_images, fgsm_original_labels,
            FGSM_L_INF).adversarial_tensors()

        fgsm_accuracy = fgsm_attack_obj.eval(fgsm_original_images,
                                             fgsm_adv_images,
                                             fgsm_original_labels)
        print("FGSM ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % fgsm_accuracy[0])
        print("\t Adversarial %% correct: %s" % fgsm_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               fgsm_original_images,
                                               fgsm_adv_images, 4)

    ##########################################################################
    #   BIM ATTACK BLOCK                                                     #
    ##########################################################################

    if 'BIM' in attack_examples:
        # Example BIM attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        BIM_L_INF = 8.0 / 255.0
        BIM_STEP_SIZE = 1.0 / 255.0
        BIM_NUM_ITER = 16

        bim_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                normalizer=cifar_normer)
        bim_attack_obj = aa.BIM(classifier_net, cifar_normer,
                                bim_xentropy_loss)

        bim_original_images = ex_minibatch
        bim_original_labels = ex_targets

        bim_adv_images = bim_attack_obj.attack(bim_original_images,
                                               bim_original_labels,
                                               l_inf_bound=BIM_L_INF,
                                               step_size=BIM_STEP_SIZE,
                                               num_iterations=BIM_NUM_ITER)
        bim_accuracy = bim_attack_obj.eval(bim_original_images,
                                           bim_adv_images,
                                           bim_original_labels)
        print("BIM ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % bim_accuracy[0])
        print("\t Adversarial %% correct: %s" % bim_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               bim_original_images,
                                               bim_adv_images, 4)

    ##########################################################################
    #   PGD ATTACK BLOCK                                                     #
    ##########################################################################

    if 'PGD' in attack_examples:
        # Example PGD attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        PGD_L_INF = 8.0 / 255.0
        PGD_STEP_SIZE = 1.0 / 255.0
        PGD_NUM_ITER = 16

        pgd_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                normalizer=cifar_normer)
        delta_threat = ap.ThreatModel(ap.DeltaAddition,
                                      {'lp_style': 'inf',
                                       'lp_bound': PGD_L_INF})
        pgd_attack_obj = aa.PGD(classifier_net, cifar_normer, delta_threat,
                                pgd_xentropy_loss)

        pgd_original_images = ex_minibatch
        pgd_original_labels = ex_targets

        pgd_adv_images = pgd_attack_obj.attack(
            pgd_original_images, pgd_original_labels,
            step_size=PGD_STEP_SIZE,
            num_iterations=PGD_NUM_ITER).adversarial_tensors()

        pgd_accuracy = pgd_attack_obj.eval(pgd_original_images,
                                           pgd_adv_images,
                                           pgd_original_labels)
        print("PGD ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % pgd_accuracy[0])
        print("\t Adversarial %% correct: %s" % pgd_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               pgd_original_images,
                                               pgd_adv_images, 4)

    ##########################################################################
    #   CW L2 ATTACK                                                         #
    ##########################################################################

    if 'CWL2' in attack_examples:
        # Example Carlini-Wagner L2 attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        CW_INITIAL_SCALE_CONSTANT = 0.1
        CW_NUM_BIN_SEARCH_STEPS = 5
        CW_NUM_OPTIM_STEPS = 1000
        CW_DISTANCE_METRIC = 'l2'
        CW_CONFIDENCE = 0.0

        cw_f6loss = lf.CWLossF6
        delta_threat = ap.ThreatModel(ap.DeltaAddition,
                                      {'lp_style': 2,
                                       'lp_bound': 3072.0})
        cwl2_obj = aa.CarliniWagner(classifier_net, cifar_normer,
                                    delta_threat, lf.L2Regularization,
                                    cw_f6loss)

        cwl2_original_images = ex_minibatch
        cwl2_original_labels = ex_targets

        cwl2_output = cwl2_obj.attack(
            ex_minibatch, ex_targets,
            num_bin_search_steps=CW_NUM_BIN_SEARCH_STEPS,
            num_optim_steps=CW_NUM_OPTIM_STEPS,
            verbose=True)
        print(cwl2_output['best_dist'])
        cwl2_adv_images = cwl2_output['best_adv_images']

        cwl2_accuracy = cwl2_obj.eval(cwl2_original_images, cwl2_adv_images,
                                      cwl2_original_labels)
        print("CWL2 ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % cwl2_accuracy[0])
        print("\t Adversarial %% correct: %s" % cwl2_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               cwl2_original_images,
                                               cwl2_adv_images, 4)
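# Invocation sketch: run only the cheap one-step and iterative attacks,
# skipping the slower Carlini-Wagner optimization.
if __name__ == '__main__':
    main_attack_script(attack_examples=['FGSM', 'PGD'], show_images=False)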
def main_attack_script(attack_examples=None, show_images=False,
                       use_gpu=False):
    # Which attacks to do...
    attack_examples = attack_examples or ['FGSM', 'BIM', 'PGD', 'CWL2',
                                          'CWLInf']

    ########################################################################
    #   SHARED BLOCK                                                       #
    ########################################################################

    # Initialize CIFAR classifier
    classifier_net = cifar_loader.load_pretrained_cifar_resnet(
        flavor=32, use_gpu=use_gpu)
    classifier_net.eval()

    # Collect one minibatch worth of data/targets
    val_loader = cifar_loader.load_cifar_data('val', normalize=False,
                                              batch_size=16, use_gpu=use_gpu)
    ex_minibatch, ex_targets = next(iter(val_loader))

    # Differentiable normalizer needed for classification
    cifar_normer = utils.DifferentiableNormalize(mean=config.CIFAR10_MEANS,
                                                 std=config.CIFAR10_STDS)

    #########################################################################
    #   FGSM ATTACK BLOCK                                                   #
    #########################################################################

    if 'FGSM' in attack_examples:
        # Example FGSM attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack (accuracy + display a few images)

        FGSM_L_INF = 8.0 / 255.0

        fgsm_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                 normalizer=cifar_normer)
        fgsm_attack_obj = aa.FGSM(classifier_net, cifar_normer,
                                  fgsm_xentropy_loss)

        fgsm_original_images = ex_minibatch
        fgsm_original_labels = ex_targets

        fgsm_adv_images = fgsm_attack_obj.attack(fgsm_original_images,
                                                 fgsm_original_labels,
                                                 FGSM_L_INF)
        fgsm_accuracy = fgsm_attack_obj.eval(fgsm_original_images,
                                             fgsm_adv_images,
                                             fgsm_original_labels)
        print("FGSM ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % fgsm_accuracy[0])
        print("\t Adversarial %% correct: %s" % fgsm_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               fgsm_original_images,
                                               fgsm_adv_images, 4)

    ##########################################################################
    #   BIM ATTACK BLOCK                                                     #
    ##########################################################################

    if 'BIM' in attack_examples:
        # Example BIM attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        BIM_L_INF = 8.0 / 255.0
        BIM_STEP_SIZE = 1.0 / 255.0
        BIM_NUM_ITER = 16

        bim_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                normalizer=cifar_normer)
        bim_attack_obj = aa.BIM(classifier_net, cifar_normer,
                                bim_xentropy_loss)

        bim_original_images = ex_minibatch
        bim_original_labels = ex_targets

        bim_adv_images = bim_attack_obj.attack(bim_original_images,
                                               bim_original_labels,
                                               l_inf_bound=BIM_L_INF,
                                               step_size=BIM_STEP_SIZE,
                                               num_iterations=BIM_NUM_ITER)
        bim_accuracy = bim_attack_obj.eval(bim_original_images,
                                           bim_adv_images,
                                           bim_original_labels)
        print("BIM ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % bim_accuracy[0])
        print("\t Adversarial %% correct: %s" % bim_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               bim_original_images,
                                               bim_adv_images, 4)

    ##########################################################################
    #   PGD ATTACK BLOCK                                                     #
    ##########################################################################

    if 'PGD' in attack_examples:
        # Example PGD attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        PGD_L_INF = 8.0 / 255.0
        PGD_STEP_SIZE = 1.0 / 255.0
        PGD_NUM_ITER = 16

        pgd_xentropy_loss = plf.VanillaXentropy(classifier_net,
                                                normalizer=cifar_normer)
        pgd_attack_obj = aa.LInfPGD(classifier_net, cifar_normer,
                                    pgd_xentropy_loss)

        pgd_original_images = ex_minibatch
        pgd_original_labels = ex_targets

        pgd_adv_images = pgd_attack_obj.attack(pgd_original_images,
                                               pgd_original_labels,
                                               l_inf_bound=PGD_L_INF,
                                               step_size=PGD_STEP_SIZE,
                                               num_iterations=PGD_NUM_ITER)
        pgd_accuracy = pgd_attack_obj.eval(pgd_original_images,
                                           pgd_adv_images,
                                           pgd_original_labels)
        print("PGD ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % pgd_accuracy[0])
        print("\t Adversarial %% correct: %s" % pgd_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               pgd_original_images,
                                               pgd_adv_images, 4)

    ##########################################################################
    #   CW L2 ATTACK                                                         #
    ##########################################################################

    if 'CWL2' in attack_examples:
        # Example Carlini-Wagner L2 attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        CW_INITIAL_SCALE_CONSTANT = 0.1
        CW_NUM_BIN_SEARCH_STEPS = 5
        CW_NUM_OPTIM_STEPS = 1000
        CW_DISTANCE_METRIC = 'l2'
        CW_CONFIDENCE = 0.0

        cwl2_loss = plf.CWL2Loss(classifier_net, cifar_normer, kappa=0.0)
        cwl2_obj = aa.CW(classifier_net, cifar_normer, cwl2_loss,
                         CW_INITIAL_SCALE_CONSTANT,
                         num_bin_search_steps=CW_NUM_BIN_SEARCH_STEPS,
                         num_optim_steps=CW_NUM_OPTIM_STEPS,
                         distance_metric_type=CW_DISTANCE_METRIC,
                         confidence=CW_CONFIDENCE)

        cwl2_original_images = ex_minibatch
        cwl2_original_labels = ex_targets

        cwl2_output = cwl2_obj.attack(ex_minibatch, ex_targets, verbose=True)
        print(cwl2_output['best_dist'])
        cwl2_adv_images = cwl2_output['best_adv_images']

        cwl2_accuracy = cwl2_obj.eval(cwl2_original_images, cwl2_adv_images,
                                      cwl2_original_labels)
        print("CWL2 ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % cwl2_accuracy[0])
        print("\t Adversarial %% correct: %s" % cwl2_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               cwl2_original_images,
                                               cwl2_adv_images, 4)

    ##########################################################################
    #   CW LINF ATTACK                                                       #
    ##########################################################################

    if 'CWLInf' in attack_examples:
        # Example Carlini-Wagner LInf attack on a single minibatch
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        CW_INITIAL_SCALE_CONSTANT = 0.1
        CW_NUM_BIN_SEARCH_STEPS = 5
        CW_NUM_OPTIM_STEPS = 1000
        CW_DISTANCE_METRIC = 'linf'
        CW_CONFIDENCE = 0.0

        cwlinf_loss = plf.CWLInfLoss(classifier_net, cifar_normer, kappa=0.0)
        cwlinf_obj = aa.CW(classifier_net, cifar_normer, cwlinf_loss,
                           CW_INITIAL_SCALE_CONSTANT,
                           num_bin_search_steps=CW_NUM_BIN_SEARCH_STEPS,
                           num_optim_steps=CW_NUM_OPTIM_STEPS,
                           distance_metric_type=CW_DISTANCE_METRIC,
                           confidence=CW_CONFIDENCE)

        cwlinf_original_images = ex_minibatch
        cwlinf_original_labels = ex_targets

        cwlinf_output = cwlinf_obj.attack(ex_minibatch, ex_targets,
                                          verbose=True)
        print(cwlinf_output['best_dist'] * 255.0)
        cwlinf_adv_images = cwlinf_output['best_adv_images']

        cwlinf_accuracy = cwlinf_obj.eval(cwlinf_original_images,
                                          cwlinf_adv_images,
                                          cwlinf_original_labels)
        print("CWLinf ATTACK ACCURACY: ")
        print("\t Original %% correct: %s" % cwlinf_accuracy[0])
        print("\t Adversarial %% correct: %s" % cwlinf_accuracy[1])
        if show_images:
            img_utils.display_adversarial_2row(classifier_net, cifar_normer,
                                               cwlinf_original_images,
                                               cwlinf_adv_images, 4)

    ##########################################################################
    #   URM ATTACK                                                           #
    ##########################################################################

    if 'URM' in attack_examples:
        # Example Uniform Random Method
        # steps:
        # 0) initialize hyperparams
        # 1) setup loss object
        # 2) build attack object
        # 3) setup examples to attack
        # 4) perform attack
        # 5) evaluate attack

        URM_BOUND = 8.0 / 255.0
        URM_TRIES = 100

        urm_loss = lf.IncorrectIndicator(classifier_net,
                                         normalizer=cifar_normer)
        urm_attack = aa.URM(classifier_net, cifar_normer, urm_loss,
                            use_gpu=use_gpu)

        urm_original_images = ex_minibatch
        urm_original_labels = ex_targets

        urm_output = urm_attack.attack(ex_minibatch, ex_targets, URM_BOUND,
                                       num_tries=URM_TRIES)
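# Invocation sketch for this legacy variant; URM is opt-in since the default
# list above does not include it, and use_gpu stays False for a CPU-only run.
if __name__ == '__main__':
    main_attack_script(attack_examples=['FGSM', 'BIM', 'URM'],
                       show_images=False, use_gpu=False)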