def black_box_rejector(options):
    """
    Uses a black box attack to evade the rejector defense.

    Adversarial samples are generated to fool the defended model, which only
    provides the labels when queried.

    Note: Models with rejectors also have a special label 'reject', which does
    not represent a valid misclassification (i.e. the attack does not consider
    being rejected a success).
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    rejector = options['rejector']
    results_path = options['results_path']

    # The defended_model returns [y1, y2 ... yN, -inf] if it believes that the
    # sample is valid, otherwise it returns [0, 0 ... 0, 1]. This means that if
    # the top label is the last one, the sample was classified as adversarial.
    # On a genuine dataset, this should never happen (if the rejector is perfect).
    defended_model = rejectors.RejectorModel(foolbox_model, rejector)

    # rejectors.Unrejected() adds the condition that the top label must not be the last.
    # Note: (foolbox.Criterion and foolbox.Criterion) should give a combined criterion,
    # but apparently it doesn't work. The documentation recommends using "&".
    criterion = foolbox.criteria.CombinedCriteria(
        foolbox.criteria.Misclassification(), rejectors.Unrejected())

    # The attack will be against the defended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        defended_model, loader, attack, attack_p, cuda, attack_workers,
        name='Black-Box Rejector Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)
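# A minimal sketch of what a criterion such as rejectors.Unrejected() might
# look like, assuming the foolbox 1.x/2.x Criterion interface (is_adversarial
# receives the logits of a single sample and its original label) and the
# convention described above that the last logit is the 'reject' class.
# The class name is hypothetical; the actual implementation in the rejectors
# module may differ.
class UnrejectedSketch(foolbox.criteria.Criterion):
    def is_adversarial(self, predictions, label):
        # The sample satisfies this criterion only if the top label is not
        # the final 'reject' class.
        return np.argmax(predictions) != len(predictions) - 1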
def substitute_preprocessor(options):
    """
    Uses BPDA with a substitute model to evade the preprocessor defense.

    BPDA uses predictions from the defended model and gradients from the
    substitute model.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']
    substitute_foolbox_model = options['substitute_foolbox_model']

    defended_model = defenses.PreprocessorDefenseModel(
        foolbox_model, preprocessor)

    if substitute_foolbox_model.num_classes() != defended_model.num_classes():
        raise click.BadArgumentUsage(
            'The substitute model ({} classes) must have the same '
            'number of classes as the defended model ({} classes)'.format(
                substitute_foolbox_model.num_classes(),
                defended_model.num_classes()))

    composite_model = foolbox.models.CompositeModel(
        defended_model, substitute_foolbox_model)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the defended model with estimated gradients
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        composite_model, loader, attack, attack_p, cuda, attack_workers,
        name='Substitute Preprocessor Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)
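# A minimal sketch of what a wrapper such as defenses.PreprocessorDefenseModel
# might look like, assuming the foolbox 1.x Model interface (batch_predictions
# and num_classes) and a preprocessor that is callable on a batch of images.
# The class name is hypothetical; the actual implementation in the defenses
# module may differ.
class PreprocessorDefenseModelSketch(foolbox.models.Model):
    def __init__(self, wrapped_model, preprocessor):
        super().__init__(bounds=wrapped_model.bounds(),
                         channel_axis=wrapped_model.channel_axis())
        self._wrapped_model = wrapped_model
        self._preprocessor = preprocessor

    def num_classes(self):
        return self._wrapped_model.num_classes()

    def batch_predictions(self, images):
        # Apply the defensive preprocessing before querying the wrapped model.
        return self._wrapped_model.batch_predictions(self._preprocessor(images))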
def substitute_model(options):
    """
    Uses BPDA with a substitute model to attack the custom model.

    BPDA uses predictions from the defended model and gradients from the
    substitute model.

    Note: We could technically attack the custom model directly, since most
    models support gradient computation, but we are assuming that we do not
    have access to the gradients.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    custom_foolbox_model = options['custom_foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    substitute_foolbox_model = options['substitute_foolbox_model']

    if substitute_foolbox_model.num_classes() != custom_foolbox_model.num_classes():
        raise click.BadArgumentUsage(
            'The substitute model ({} classes) must have the same '
            'number of classes as the custom model ({} classes)'.format(
                substitute_foolbox_model.num_classes(),
                custom_foolbox_model.num_classes()))

    composite_model = foolbox.models.CompositeModel(
        custom_foolbox_model, substitute_foolbox_model)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the custom model, with gradients estimated
    # by the substitute model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        composite_model, loader, attack, attack_p, cuda, attack_workers,
        name='Substitute Model Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)
def adversarial_perturbation(options):
    """
    Compares the boundary distances estimated by the counter-attack distance
    tool with the distances of the adversarial samples found by the attack.
    """
    attack_name = options['attack_name']
    attack_p = options['attack_p']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    distance_tool = parsing.parse_distance_tool('counter-attack', options, np.inf)

    samples_count, correct_count, successful_count, \
        correct_estimate_count, boundary_distances, \
        adversarial_distances = tests.adversarial_perturbation_test(
            foolbox_model, loader, attack, distance_tool, cuda, attack_workers)

    correct_estimate_rate = correct_estimate_count / successful_count
    effective_correct_estimate_rate = correct_estimate_count / correct_count

    info = [
        ['Total Count', samples_count],
        ['Correctly Classified Count', correct_count],
        ['Successful Attack Count', successful_count],
        ['Correct Estimate Count', correct_estimate_count],
        [
            'Correct Estimate Rate (correct_estimate / successful_attack)',
            '{:2.2f}%'.format(correct_estimate_rate * 100.0)
        ],
        [
            'Effective Correct Estimate Rate (correct_estimate / correct_classification)',
            '{:2.2f}%'.format(effective_correct_estimate_rate * 100.0)
        ]
    ]

    header = ['Boundary Distances', 'Adversarial Distances']

    utils.save_results(results_path, table=[boundary_distances, adversarial_distances],
                       header=header, command=command, info=info)
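# Worked example for the two rates above (the numbers are hypothetical): with
# 100 samples of which 90 are correctly classified, 80 successfully attacked
# and 60 correctly estimated, the correct estimate rate is 60 / 80 = 75% and
# the effective correct estimate rate is 60 / 90 = 66.7%.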
def shallow_preprocessor(options):
    """
    Simply evaluates the effectiveness of the preprocessor defense, without
    additional attack strategies.

    Adversarial samples are generated to fool the undefended model.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the undefended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    defended_model = defenses.PreprocessorDefenseModel(
        foolbox_model, preprocessor)

    samples_count, correct_count, successful_attack_count, distances = tests.shallow_defense_test(
        foolbox_model, loader, attack, attack_p, defended_model,
        cuda, attack_workers, name='Shallow Preprocessor Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)
def black_box_model(options):
    """
    Uses a black box attack against the custom model.

    Adversarial samples are generated to fool the custom model, which only
    provides the labels when queried.

    Note: We could technically use the gradients, since most models support
    gradient computation, but we are assuming that we do not have access to
    them.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    custom_foolbox_model = options['custom_foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the custom model, queried as a black box
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        custom_foolbox_model, loader, attack, attack_p, cuda, attack_workers,
        name='Black-Box Model Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)
def black_box_preprocessor(options):
    """
    Uses a black box attack to evade the preprocessor defense.

    Adversarial samples are generated to fool the defended model, which only
    provides the labels when queried.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']

    defended_model = defenses.PreprocessorDefenseModel(
        foolbox_model, preprocessor)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the defended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        defended_model, loader, attack, attack_p, cuda, attack_workers,
        name='Black-Box Preprocessor Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)
def _parse_counter_attack_options(options, counter_attack,
                                  counter_attack_workers, *args, **kwargs):
    defense_p = options['defense_p']
    max_model_batch_size = options['max_model_batch_size']

    if counter_attack in definitions.parallelizable_attacks:
        logger.debug('Counter attack supports parallelization.')
    else:
        logger.debug('Counter attack does not support parallelization.')

        if counter_attack_workers is not None and counter_attack_workers > 0:
            raise click.BadOptionUsage(
                '--counter-attack-workers',
                'The chosen counter-attack \'{}\' does not support parallelization.'.format(
                    counter_attack))

        counter_attack_workers = 0

    logger.info('Counter attack workers: {}.'.format(counter_attack_workers))

    if (counter_attack_workers is not None and max_model_batch_size > 0
            and counter_attack_workers > max_model_batch_size):
        raise click.BadOptionUsage(
            '--counter-attack-workers',
            'The number of counter attack workers must be at most the maximum model batch size. '
            'Either increase the maximum model batch size, decrease the number of '
            'counter attack workers, or disable model batch limiting.')

    counter_attack = parsing.parse_attack(
        counter_attack, defense_p, foolbox.criteria.Misclassification())

    # Return a copy of the options, updated with the parsed counter attack settings.
    options = dict(options)
    options['counter_attack'] = counter_attack
    options['counter_attack_workers'] = counter_attack_workers

    return options
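# Behaviour sketch for the worker handling above (the attack names and worker
# counts are illustrative):
# - parallelizable counter-attack, workers=5        -> workers stay 5
# - non-parallelizable counter-attack, workers=None -> workers forced to 0
# - non-parallelizable counter-attack, workers=5    -> click.BadOptionUsage is raised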
def check_parallelization(options):
    """
    Compares parallelized attacks with standard ones.

    This is a sanity check to verify that attack parallelization does not
    seriously affect the results.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    results_path = options['results_path']
    loader = options['loader']

    criterion = foolbox.criteria.Misclassification()
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, standard_attack_count, parallel_attack_count, \
        standard_distances, parallel_distances = tests.parallelization_test(
            foolbox_model, loader, attack, attack_p, cuda, attack_workers)

    standard_failure_count = correct_count - standard_attack_count
    parallel_failure_count = correct_count - parallel_attack_count

    standard_average_distance, standard_median_distance, standard_adjusted_median_distance = utils.distance_statistics(
        standard_distances, standard_failure_count)
    parallel_average_distance, parallel_median_distance, parallel_adjusted_median_distance = utils.distance_statistics(
        parallel_distances, parallel_failure_count)

    standard_success_rate = standard_attack_count / correct_count
    parallel_success_rate = parallel_attack_count / correct_count

    average_distance_difference = (
        parallel_average_distance - standard_average_distance) / standard_average_distance
    median_distance_difference = (
        parallel_median_distance - standard_median_distance) / standard_median_distance
    success_rate_difference = (
        parallel_success_rate - standard_success_rate) / standard_success_rate
    adjusted_median_distance_difference = (
        parallel_adjusted_median_distance - standard_adjusted_median_distance) / standard_adjusted_median_distance

    info = [['Average Distance Relative Difference', average_distance_difference],
            ['Median Distance Relative Difference', median_distance_difference],
            ['Success Rate Difference', success_rate_difference],
            ['Adjusted Median Distance Difference', adjusted_median_distance_difference],
            ['Samples Count', str(samples_count)],
            ['Correct Count', str(correct_count)],
            ['Standard Attack Count', str(standard_attack_count)],
            ['Parallel Attack Count', str(parallel_attack_count)]]

    header = ['Standard Distances', 'Parallel Distances']

    utils.save_results(results_path, table=[standard_distances, parallel_distances],
                       command=command, info=info, header=header)
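# All the relative differences above follow the same pattern; a small helper
# (not used by the original code, shown only for clarity) would be:
def _relative_difference(parallel_value, standard_value):
    # Positive values mean the parallel attack produced the larger value.
    return (parallel_value - standard_value) / standard_value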
def attack(options, adversarial_dataset_path, no_test_warning):
    """
    Runs an attack against the model.

    \b
    Stores the following results:
    Success Rate: The success rate of the attack.
    Average Distance: The average L_p distance of the successful adversarial
    samples from their original samples.
    Median Distance: The median L_p distance of the successful adversarial
    samples from their original samples.
    Adjusted Median Distance: The median L_p distance of the adversarial
    samples from their original samples, treating failed attacks as samples
    with distance Infinity.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    dataset_type = options['dataset_type']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    save_adversarials = adversarial_dataset_path is not None

    if dataset_type == 'test' and save_adversarials and not no_test_warning:
        logger.warning(
            'Remember to use \'--dataset-type train\' if you plan to use the generated adversarials '
            'to train or calibrate an adversarial detector. You can disable this warning by passing '
            '\'--no-test-warning\'.')

    samples_count, correct_count, successful_attack_count, distances, adversarials, adversarial_ground_truths = tests.attack_test(
        foolbox_model, loader, attack, attack_p, cuda, attack_workers,
        save_adversarials=save_adversarials)

    accuracy = correct_count / samples_count
    success_rate = successful_attack_count / correct_count
    failure_count = correct_count - successful_attack_count

    average_distance, median_distance, adjusted_median_distance = utils.distance_statistics(
        distances, failure_count)

    info = [['Base Accuracy', '{:2.2f}%'.format(accuracy * 100.0)],
            ['Success Rate', '{:2.2f}%'.format(success_rate * 100.0)],
            ['Average Distance', '{:2.2e}'.format(average_distance)],
            ['Median Distance', '{:2.2e}'.format(median_distance)],
            ['Adjusted Median Distance', '{:2.2e}'.format(adjusted_median_distance)],
            ['Samples Count', str(samples_count)],
            ['Correct Count', str(correct_count)],
            ['Successful Attack Count', str(successful_attack_count)]]

    header = ['Distances']

    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

    if save_adversarials:
        # The saved dataset is a pair: the (adversarial, ground truth) samples
        # and the attack success rate.
        dataset = (list(zip(adversarials, adversarial_ground_truths)),
                   success_rate)
        utils.save_zip(dataset, adversarial_dataset_path)
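# Worked example for the adjusted median distance described in the docstring
# above (the numbers are hypothetical): with successful distances
# [0.1, 0.2, 0.4] and two failed attacks, the adjusted values are
# [0.1, 0.2, 0.4, inf, inf], so the adjusted median distance is 0.4 while the
# plain median distance is 0.2.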