def black_box_rejector(options):
    """
    Uses a black-box attack to evade the rejector defense.

    Adversarial samples are generated to fool the defended model, which only
    provides the labels when queried.

    Note: Models with rejectors also have a special label 'reject', which does
    not represent a valid misclassification (i.e. the attack does not consider
    being rejected a success).
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    rejector = options['rejector']
    results_path = options['results_path']

    # The defended model returns [y1, y2 ... yN, -inf] if it believes that the
    # sample is valid, otherwise it returns [0, 0 ... 0, 1]. This means that if
    # the top label is the last one, the sample was classified as adversarial.
    # On a genuine dataset, this should never happen (if the rejector is perfect).
    defended_model = rejectors.RejectorModel(foolbox_model, rejector)

    # rejectors.Unrejected() adds the condition that the top label must not be the last one.
    # Note: combining two foolbox criteria with "and" should give a combined criterion, but
    # apparently it doesn't work; the documentation recommends using "&".
    criterion = foolbox.criteria.CombinedCriteria(
        foolbox.criteria.Misclassification(), rejectors.Unrejected())

    # The attack will be against the defended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        defended_model, loader, attack, attack_p, cuda, attack_workers,
        name='Black-Box Rejector Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

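# Illustrative sketch (not part of the CLI): the comment in black_box_rejector describes the
# output convention of rejectors.RejectorModel. The helper below shows one way such outputs
# could be assembled with NumPy; the `valid_mask` argument and the helper itself are
# assumptions made for illustration, not the actual rejectors API.
def _sketch_rejector_outputs(logits, valid_mask):
    """Append a 'reject' entry: -inf for accepted samples, a one-hot 'reject' row otherwise."""
    batch_size, num_classes = logits.shape
    outputs = np.empty((batch_size, num_classes + 1), dtype=float)

    # Accepted samples keep their logits and get -inf for the extra 'reject' label
    outputs[:, :num_classes] = logits
    outputs[:, num_classes] = -np.inf

    # Rejected samples are replaced with [0, 0 ... 0, 1], so their top label is 'reject'
    rejected = ~valid_mask
    outputs[rejected, :num_classes] = 0
    outputs[rejected, num_classes] = 1

    return outputs
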
def substitute_preprocessor(options):
    """
    Uses BPDA with a substitute model to evade the preprocessor defense.

    BPDA uses predictions from the defended model and gradients from the
    substitute model.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']
    substitute_foolbox_model = options['substitute_foolbox_model']

    defended_model = defenses.PreprocessorDefenseModel(
        foolbox_model, preprocessor)

    if substitute_foolbox_model.num_classes() != defended_model.num_classes():
        raise click.BadArgumentUsage(
            'The substitute model ({} classes) must have the same '
            'number of classes as the defended model ({} classes)'.format(
                substitute_foolbox_model.num_classes(),
                defended_model.num_classes()))

    composite_model = foolbox.models.CompositeModel(
        defended_model, substitute_foolbox_model)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the defended model with estimated gradients
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        composite_model, loader, attack, attack_p, cuda, attack_workers,
        name='Substitute Preprocessor Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

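# Illustrative sketch (not part of the CLI): foolbox.models.CompositeModel is used above to
# run BPDA-style attacks, taking predictions from one model and gradients from another.
# The toy step below shows the idea with plain callables; `forward_fn`, `backward_grad_fn`
# and the FGSM-like update are illustrative assumptions, not the foolbox API.
def _sketch_bpda_step(x, label, forward_fn, backward_grad_fn, step_size=0.01):
    """One sign-gradient step on the attack loss, using substitute gradients."""
    # Predictions come from the (possibly non-differentiable) defended model...
    predictions = forward_fn(x)

    # ...while the gradient of the loss w.r.t. the input comes from the substitute model.
    gradient = backward_grad_fn(x, label)

    # Move the sample in the direction that increases the loss on the true label.
    return x + step_size * np.sign(gradient), predictions
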
def substitute_model(options):
    """
    Uses BPDA with a substitute model to attack the custom model.

    BPDA uses predictions from the custom model and gradients from the
    substitute model.

    Note: We could technically attack the custom model directly, since most
    models support gradient computation, but we are assuming that we do not
    have access to the gradients.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    custom_foolbox_model = options['custom_foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    substitute_foolbox_model = options['substitute_foolbox_model']

    if substitute_foolbox_model.num_classes() != custom_foolbox_model.num_classes():
        raise click.BadArgumentUsage(
            'The substitute model ({} classes) must have the same '
            'number of classes as the custom model ({} classes)'.format(
                substitute_foolbox_model.num_classes(),
                custom_foolbox_model.num_classes()))

    composite_model = foolbox.models.CompositeModel(
        custom_foolbox_model, substitute_foolbox_model)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the custom model, with gradients provided by
    # the substitute model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        composite_model, loader, attack, attack_p, cuda, attack_workers,
        name='Substitute Model Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

def adversarial_perturbation(options):
    attack_name = options['attack_name']
    attack_p = options['attack_p']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    distance_tool = parsing.parse_distance_tool('counter-attack', options, np.inf)

    samples_count, correct_count, successful_count, \
        correct_estimate_count, boundary_distances, \
        adversarial_distances = tests.adversarial_perturbation_test(
            foolbox_model, loader, attack, distance_tool, cuda, attack_workers)

    correct_estimate_rate = correct_estimate_count / successful_count
    effective_correct_estimate_rate = correct_estimate_count / correct_count

    info = [
        ['Total Count', samples_count],
        ['Correctly Classified Count', correct_count],
        ['Successful Attack Count', successful_count],
        ['Correct Estimate Count', correct_estimate_count],
        [
            'Correct Estimate Rate (correct_estimate / successful_attack)',
            '{:2.2f}%'.format(correct_estimate_rate * 100.0)
        ],
        [
            'Effective Correct Estimate Rate (correct_estimate / correct_classification)',
            '{:2.2f}%'.format(effective_correct_estimate_rate * 100.0)
        ]
    ]

    header = ['Boundary Distances', 'Adversarial Distances']

    utils.save_results(results_path,
                       table=[boundary_distances, adversarial_distances],
                       header=header, command=command, info=info)

def shallow_preprocessor(options):
    """
    Simply evaluates the effectiveness of the preprocessor defense, without
    additional attack strategies.

    Adversarial samples are generated to fool the undefended model.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the undefended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    defended_model = defenses.PreprocessorDefenseModel(
        foolbox_model, preprocessor)

    samples_count, correct_count, successful_attack_count, distances = tests.shallow_defense_test(
        foolbox_model, loader, attack, attack_p, defended_model, cuda,
        attack_workers, name='Shallow Preprocessor Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

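# Illustrative sketch (not part of the CLI): the "shallow" evaluation above generates
# adversarials against the undefended model and then checks whether they also fool the
# defended model (a transfer-style check). The helper and `defended_predict_fn` below are
# assumptions made for illustration; the actual logic lives in tests.shallow_defense_test.
def _sketch_transfer_success_rate(adversarials, labels, defended_predict_fn):
    """Fraction of adversarial samples that are still misclassified by the defended model."""
    predictions = np.argmax(defended_predict_fn(adversarials), axis=1)
    return np.mean(predictions != labels)
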
def radius(options, sampling_count):
    command = options['command']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    distance_tool = parsing.parse_distance_tool('counter-attack', options, np.inf)

    total_count, consistent_count, failures = tests.radius_test(
        foolbox_model, loader, distance_tool, sampling_count)

    consistency_rate = consistent_count / total_count

    info = [['Total Samples', total_count],
            ['Failures', failures],
            ['Consistent Samples', consistent_count],
            ['Consistency Rate', '{:.2f}%'.format(consistency_rate * 100.0)]]

    utils.save_results(results_path, command=command, info=info)

def black_box_model(options):
    """
    Uses a black-box attack against the custom model.

    Adversarial samples are generated to fool the custom model, which only
    provides the labels when queried.

    Note: We could technically use the gradients, since most models support
    gradient computation, but we are assuming that we do not have access to
    them.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    custom_foolbox_model = options['custom_foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the custom model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        custom_foolbox_model, loader, attack, attack_p, cuda, attack_workers,
        name='Black-Box Model Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

def black_box_preprocessor(options):
    """
    Uses a black-box attack to evade the preprocessor defense.

    Adversarial samples are generated to fool the defended model, which only
    provides the labels when queried.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']

    defended_model = defenses.PreprocessorDefenseModel(
        foolbox_model, preprocessor)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the defended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        defended_model, loader, attack, attack_p, cuda, attack_workers,
        name='Black-Box Preprocessor Attack')

    info = utils.attack_statistics_info(
        samples_count, correct_count, successful_attack_count, distances)

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

def boundary_distance(options, max_radius, generation_workers):
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    distance_tool = parsing.parse_distance_tool('counter-attack', options, np.inf)

    samples_count, consistent_count, failure_count, inconsistent_differences = tests.boundary_distance_test(
        foolbox_model, loader, distance_tool, max_radius, cuda,
        generation_workers, name='Boundary Distance Consistency Test')

    failure_rate = failure_count / samples_count
    consistency_rate = consistent_count / samples_count
    effective_consistency_rate = consistent_count / (samples_count - failure_count)

    average_difference = np.average(inconsistent_differences)
    median_difference = np.median(inconsistent_differences)

    info = [['Total Samples', samples_count],
            ['Failed Measure Samples', failure_count],
            ['Consistent Samples', consistent_count],
            ['Failure Rate', '{:.2f}%'.format(failure_rate * 100.0)],
            ['Consistency Rate', '{:.2f}%'.format(consistency_rate * 100.0)],
            [
                'Effective Consistency Rate',
                '{:.2f}%'.format(effective_consistency_rate * 100.0)
            ],
            ['Average Difference', '{:2.2e}'.format(average_difference)],
            ['Median Difference', '{:2.2e}'.format(median_difference)]]

    header = ['Inconsistent Differences']

    utils.save_results(results_path, table=[inconsistent_differences],
                       header=header, command=command, info=info)

def accuracy(options, top_ks):
    """
    Computes the accuracy of the model.

    \b
    Stores the following results:
        Top-K Accuracies: The top-k accuracies; the values of k are
            configurable with --top-ks.
    """
    command = options['command']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    accuracies = tests.accuracy_test(foolbox_model, loader, top_ks)

    info = [[
        'Top-{} Accuracy'.format(top_k), '{:2.2f}%'.format(accuracy * 100.0)
    ] for top_k, accuracy in zip(top_ks, accuracies)]

    utils.save_results(results_path, command=command, info=info)

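# Illustrative sketch (not part of the CLI): how a top-k accuracy such as the one reported
# above can be computed from raw class scores. `scores` and `labels` are illustrative
# inputs; the actual computation lives in tests.accuracy_test.
def _sketch_top_k_accuracy(scores, labels, top_k):
    """Fraction of samples whose true label is among the top_k highest-scoring classes."""
    # Indices of the top_k highest-scoring classes for each sample
    top_k_predictions = np.argsort(scores, axis=1)[:, -top_k:]

    # A sample counts as correct if its true label appears anywhere among those indices
    hits = np.any(top_k_predictions == np.asarray(labels)[:, np.newaxis], axis=1)
    return np.mean(hits)
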
def check_parallelization(options):
    """
    Compares parallelized attacks with standard ones.

    This is a sanity check to verify that attack parallelization does not
    seriously affect the results.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    results_path = options['results_path']
    loader = options['loader']

    criterion = foolbox.criteria.Misclassification()
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, standard_attack_count, parallel_attack_count, standard_distances, parallel_distances = tests.parallelization_test(
        foolbox_model, loader, attack, attack_p, cuda, attack_workers)

    standard_failure_count = correct_count - standard_attack_count
    parallel_failure_count = correct_count - parallel_attack_count

    standard_average_distance, standard_median_distance, standard_adjusted_median_distance = utils.distance_statistics(
        standard_distances, standard_failure_count)
    parallel_average_distance, parallel_median_distance, parallel_adjusted_median_distance = utils.distance_statistics(
        parallel_distances, parallel_failure_count)

    standard_success_rate = standard_attack_count / correct_count
    parallel_success_rate = parallel_attack_count / correct_count

    average_distance_difference = (
        parallel_average_distance - standard_average_distance) / standard_average_distance
    median_distance_difference = (
        parallel_median_distance - standard_median_distance) / standard_median_distance
    success_rate_difference = (
        parallel_success_rate - standard_success_rate) / standard_success_rate
    adjusted_median_distance_difference = (
        parallel_adjusted_median_distance - standard_adjusted_median_distance) / standard_adjusted_median_distance

    info = [['Average Distance Relative Difference', average_distance_difference],
            ['Median Distance Relative Difference', median_distance_difference],
            ['Success Rate Relative Difference', success_rate_difference],
            ['Adjusted Median Distance Relative Difference',
             adjusted_median_distance_difference],
            ['Samples Count', str(samples_count)],
            ['Correct Count', str(correct_count)],
            ['Standard Attack Count', str(standard_attack_count)],
            ['Parallel Attack Count', str(parallel_attack_count)]]

    header = ['Standard Distances', 'Parallel Distances']

    utils.save_results(results_path,
                       table=[standard_distances, parallel_distances],
                       command=command, info=info, header=header)

def detector_roc(options, score_dataset_path, no_test_warning):
    """
    Uses a detector to identify adversarial samples and computes the ROC curve.

    \b
    Stores the following results:
        ROC Area Under Curve (ROC-AUC)
        Best Threshold: The threshold with the best Youden Index (TPR - FPR)
        Best Threshold True Positive Rate: The TPR at the best threshold
        Best Threshold False Positive Rate: The FPR at the best threshold
        Genuine Scores: All the scores computed for the genuine samples
        Adversarial Scores: All the scores computed for the adversarial samples

    The last three columns contain the data to build the ROC curve. These are:
        Thresholds
        True Positive Rates
        False Positive Rates
    Each threshold has a corresponding TPR and FPR.
    """
    adversarial_loader = options['adversarial_loader']
    command = options['command']
    dataset_type = options['dataset_type']
    detector = options['detector']
    failure_value = options['failure_value']
    foolbox_model = options['foolbox_model']
    genuine_loader = options['loader']
    results_path = options['results_path']

    save_scores = score_dataset_path is not None

    if dataset_type == 'test' and not no_test_warning:
        logger.warning('Remember to use \'--dataset-type train\' if you plan to use the results '
                       'to pick a threshold for other tests. You can disable this warning by passing '
                       '\'--no-test-warning\'.')

    genuine_scores, adversarial_scores, genuine_samples, adversarial_samples = tests.roc_curve_test(
        foolbox_model, genuine_loader, adversarial_loader, detector, save_scores)

    false_positive_rates, true_positive_rates, thresholds = utils.roc_curve(
        genuine_scores, adversarial_scores)

    best_threshold, best_tpr, best_fpr = utils.get_best_threshold(
        true_positive_rates, false_positive_rates, thresholds)

    area_under_curve = sklearn.metrics.auc(
        false_positive_rates, true_positive_rates)

    info = [['ROC AUC', '{:2.2f}%'.format(area_under_curve * 100.0)],
            ['Best Threshold', '{:2.2e}'.format(best_threshold)],
            ['Best Threshold True Positive Rate', '{:2.2f}%'.format(best_tpr * 100.0)],
            ['Best Threshold False Positive Rate', '{:2.2f}%'.format(best_fpr * 100.0)]]

    header = ['Genuine Scores', 'Adversarial Scores', 'Thresholds',
              'True Positive Rates', 'False Positive Rates']

    true_positive_rates = ['{:2.2f}%'.format(true_positive_rate * 100.0)
                           for true_positive_rate in true_positive_rates]
    false_positive_rates = ['{:2.2f}%'.format(false_positive_rate * 100.0)
                            for false_positive_rate in false_positive_rates]

    columns = [genuine_scores, adversarial_scores, thresholds,
               true_positive_rates, false_positive_rates]

    utils.save_results(results_path, table=columns, command=command,
                       info=info, header=header)

    if save_scores:
        # Remove failures
        genuine_not_failed = np.not_equal(genuine_scores, failure_value)
        genuine_samples = genuine_samples[genuine_not_failed]
        genuine_scores = genuine_scores[genuine_not_failed]

        adversarial_not_failed = np.not_equal(adversarial_scores, failure_value)
        adversarial_samples = adversarial_samples[adversarial_not_failed]
        adversarial_scores = adversarial_scores[adversarial_not_failed]

        genuine_list = zip(genuine_samples, genuine_scores)
        adversarial_list = zip(adversarial_samples, adversarial_scores)

        dataset = (genuine_list, adversarial_list)
        utils.save_zip(dataset, score_dataset_path)

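# Illustrative sketch (not part of the CLI): the "Best Threshold" reported above is the one
# that maximises the Youden Index (TPR - FPR) over the ROC thresholds. A minimal version of
# that selection, assuming utils.get_best_threshold behaves along these lines:
def _sketch_best_threshold(true_positive_rates, false_positive_rates, thresholds):
    """Return the threshold with the largest TPR - FPR, together with its TPR and FPR."""
    youden_indices = np.asarray(true_positive_rates) - np.asarray(false_positive_rates)
    best = np.argmax(youden_indices)
    return thresholds[best], true_positive_rates[best], false_positive_rates[best]
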
def attack(options, adversarial_dataset_path, no_test_warning):
    """
    Runs an attack against the model.

    \b
    Stores the following results:
        Success Rate: The success rate of the attack.
        Average Distance: The average L_p distance of the successful
            adversarial samples from their original samples.
        Median Distance: The median L_p distance of the successful adversarial
            samples from their original samples.
        Adjusted Median Distance: The median L_p distance of the adversarial
            samples from their original samples, treating failed attacks as
            samples with distance Infinity.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    dataset_type = options['dataset_type']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    save_adversarials = adversarial_dataset_path is not None

    if dataset_type == 'test' and save_adversarials and not no_test_warning:
        logger.warning(
            'Remember to use \'--dataset-type train\' if you plan to use the generated adversarials '
            'to train or calibrate an adversarial detector. You can disable this warning by passing '
            '\'--no-test-warning\'.')

    samples_count, correct_count, successful_attack_count, distances, adversarials, adversarial_ground_truths = tests.attack_test(
        foolbox_model, loader, attack, attack_p, cuda, attack_workers,
        save_adversarials=save_adversarials)

    accuracy = correct_count / samples_count
    success_rate = successful_attack_count / correct_count
    failure_count = correct_count - successful_attack_count

    average_distance, median_distance, adjusted_median_distance = utils.distance_statistics(
        distances, failure_count)

    info = [['Base Accuracy', '{:2.2f}%'.format(accuracy * 100.0)],
            ['Success Rate', '{:2.2f}%'.format(success_rate * 100.0)],
            ['Average Distance', '{:2.2e}'.format(average_distance)],
            ['Median Distance', '{:2.2e}'.format(median_distance)],
            ['Adjusted Median Distance', '{:2.2e}'.format(adjusted_median_distance)],
            ['Samples Count', str(samples_count)],
            ['Correct Count', str(correct_count)],
            ['Successful Attack Count', str(successful_attack_count)]]

    header = ['Distances']
    utils.save_results(results_path, table=[distances], command=command,
                       info=info, header=header)

    if save_adversarials:
        dataset = (list(zip(adversarials, adversarial_ground_truths)),
                   success_rate)
        utils.save_zip(dataset, adversarial_dataset_path)

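# Illustrative sketch (not part of the CLI): the "Adjusted Median Distance" described in the
# docstring above treats each failed attack as an adversarial at infinite distance. A minimal
# version of that statistic, assuming utils.distance_statistics follows the same idea:
def _sketch_adjusted_median_distance(distances, failure_count):
    """Median over the successful distances padded with one +inf entry per failed attack."""
    padded = np.concatenate([np.asarray(distances, dtype=float),
                             np.full(failure_count, np.inf)])
    return np.median(padded)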