def test_digit(self):
    train_x, train_y, test_x, test_y = datasets.load("digit")
    self.assertEqual(train_x.shape[0], 1350)
    self.assertEqual(train_y.shape[0], 1350)
    self.assertEqual(test_x.shape[0], 150)
    self.assertEqual(test_y.shape[0], 150)
    for X in train_x, test_x:
        for x in X:
            self.assertTrue(1148 <= len(x) <= 18262)

def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(
        model_config["model_kwargs"], model_config["wrapper_kwargs"]
    )
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    clean_x, adv_x, labels = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    # Evaluate the ART classifier on benign test examples
    logger.info("Predicting on clean dataset...")
    predictions = classifier.predict(clean_x)
    benign_accuracy = np.sum(np.argmax(predictions, axis=1) == labels) / len(labels)
    logger.info("Accuracy on benign test examples: {}%".format(benign_accuracy * 100))

    # Evaluate the ART classifier on adversarial examples from transfer attack
    logger.info("Predicting on adversarial dataset...")
    predictions = classifier.predict(adv_x)
    adversarial_accuracy = np.sum(
        np.argmax(predictions, axis=1) == labels
    ) / len(labels)
    logger.info(
        "Accuracy on adversarial test examples: {}%".format(adversarial_accuracy * 100)
    )

    logger.info("Saving json output...")
    filepath = os.path.join(paths.OUTPUTS, "evaluation-results.json")
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": {
                "baseline_accuracy": str(benign_accuracy),
                "adversarial_accuracy": str(adversarial_accuracy),
            },
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    logger.info(f"Evaluation Results written to {filepath}")
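
# Illustrative only: a minimal config dict providing the fields that the function
# above reads ("model" and "dataset" with their sub-keys). The module path and
# entry point mirror the keras_resnet50 test elsewhere in these snippets; treat
# the values as an example, not the only valid ones.
EXAMPLE_TRANSFER_CONFIG = {
    "model": {
        "module": "armory.baseline_models.keras.keras_resnet50",
        "name": "get_art_model",
        "model_kwargs": {},
        "wrapper_kwargs": {},
    },
    "dataset": {"name": "imagenet_adversarial"},
}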

def test_keras_imagenet(self):
    classifier_module = import_module("armory.baseline_models.keras.keras_resnet50")
    classifier_fn = getattr(classifier_module, "get_art_model")
    classifier = classifier_fn(model_kwargs={}, wrapper_kwargs={})
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    clean_x, adv_x, labels = datasets.load("imagenet_adversarial", preprocessing_fn)

    predictions = classifier.predict(clean_x)
    accuracy = np.sum(np.argmax(predictions, axis=1) == labels) / len(labels)
    self.assertGreater(accuracy, 0.65)

    predictions = classifier.predict(adv_x)
    accuracy = np.sum(np.argmax(predictions, axis=1) == labels) / len(labels)
    print(accuracy)
    self.assertLess(accuracy, 0.02)


def test_keras_cifar10(self):
    batch_size = 64
    epochs = 2
    classifier_module = import_module("armory.baseline_models.keras.keras_cifar")
    classifier_fn = getattr(classifier_module, "get_art_model")
    classifier = classifier_fn(model_kwargs={}, wrapper_kwargs={})
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    train_x, train_y, test_x, test_y = datasets.load(
        "cifar10", preprocessing_fn=preprocessing_fn
    )

    classifier.fit(train_x, train_y, batch_size=batch_size, nb_epochs=epochs)

    predictions = classifier.predict(test_x)
    accuracy = np.sum(np.argmax(predictions, axis=1) == test_y) / len(test_y)
    self.assertGreater(accuracy, 0.4)

def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    x_clean_train, y_clean_train, x_clean_test, y_clean_test = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    batch_size = config["adhoc"]["batch_size"]
    epochs = config["adhoc"]["epochs"]
    n_trials = config["adhoc"]["n_trials"]
    poison_frac_min = config["adhoc"]["poison_frac_min"]
    poison_frac_max = config["adhoc"]["poison_frac_max"]
    poison_frac_steps = config["adhoc"]["poison_frac_steps"]
    source_class = config["adhoc"]["source_class"]
    target_class = config["adhoc"]["target_class"]

    fraction_poisons = np.linspace(poison_frac_min, poison_frac_max, poison_frac_steps)

    # Test clean model accuracy to provide a benchmark to poisoned model accuracy
    raw_metrics = {}
    raw_metrics["undefended_backdoor_success_rate"] = init_metrics(
        fraction_poisons, n_trials
    )
    raw_metrics["non_backdoored_accuracy"] = init_metrics(fraction_poisons, n_trials)
    raw_metrics["clean_model_accuracy"] = [None for _ in range(n_trials)]
    raw_metrics["defended_backdoor_success_rate"] = init_metrics(
        fraction_poisons, n_trials
    )
    raw_metrics["delta_accuracy"] = init_metrics(fraction_poisons, n_trials)

    for trial in range(n_trials):
        classifier = classifier_fn(
            model_config["model_kwargs"], model_config["wrapper_kwargs"]
        )
        logger.info(
            f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
        )
        classifier.fit(
            x_clean_train, y_clean_train, batch_size=batch_size, nb_epochs=epochs
        )
        raw_metrics["clean_model_accuracy"][trial] = eval_targeted_fit(
            classifier, x_clean_test, y_clean_test
        )

        for frac_poison in fraction_poisons:
            # Need to retrain from scratch for each frac_poison value
            classifier = classifier_fn(
                model_config["model_kwargs"], model_config["wrapper_kwargs"]
            )
            classifier_defended = classifier_fn(
                model_config["model_kwargs"], model_config["wrapper_kwargs"]
            )

            attack_config = config["attack"]
            attack_module = import_module(attack_config["module"])
            attack_fn = getattr(attack_module, attack_config["name"])

            attack = attack_fn(
                classifier=classifier,
                x_train=x_clean_train,
                y_train=y_clean_train,
                pct_poison=frac_poison,
                source_class=source_class,
                target_class=target_class,
            )

            is_poison, x_poison, y_poison = attack.generate(
                x_clean_train, y_clean_train
            )
            logger.info(f"Fitting poisoned model with poison fraction {frac_poison}...")
            classifier.fit(x_poison, y_poison, batch_size=batch_size, nb_epochs=epochs)

            x_test_targeted = x_clean_test[y_clean_test == source_class]
            x_poison_test = attack.generate_target_test(x_test_targeted)

            # Show targeted accuracy for poisoned classes is as expected
            raw_metrics["undefended_backdoor_success_rate"][frac_poison][
                trial
            ] = eval_targeted_fit(classifier, x_poison_test, target_class)

            raw_metrics["non_backdoored_accuracy"][frac_poison][
                trial
            ] = eval_targeted_fit(classifier, x_clean_test, y_clean_test)

            defense_config = config["defense"]
            defense_module = import_module(defense_config["module"])
            defense_fn = getattr(defense_module, defense_config["name"])

            defense = defense_fn(
                classifier,
                x_poison,
                y_poison,
                batch_size=batch_size,
                ub_pct_poison=frac_poison,
                **defense_config["kwargs"],
            )
            conf_matrix_json = defense.evaluate_defence(np.logical_not(is_poison))
            logger.info(
                f"Poison detection confusion matrix from defense {config['defense']['name']} "
                f"with poison fraction {frac_poison}:"
            )
            logger.info(conf_matrix_json)
            _, indices_to_keep = defense.detect_poison()

            logger.info(
                f"Fitting poisoned model with poisons filtered by defense {config['defense']['name']} "
                f"with poison fraction {frac_poison}..."
            )
            classifier_defended.fit(
                x_poison[indices_to_keep == 1],
                y_poison[indices_to_keep == 1],
                batch_size=batch_size,
                nb_epochs=epochs,
            )

            defended_backdoor_success_rate = eval_targeted_fit(
                classifier_defended, x_poison_test, target_class
            )
            raw_metrics["defended_backdoor_success_rate"][frac_poison][
                trial
            ] = defended_backdoor_success_rate
            logger.info(
                f"Trial {trial+1} defended backdoor success rate {defended_backdoor_success_rate} "
                f"with poisoning proportion of {frac_poison}"
            )

            defended_clean_accuracy = eval_targeted_fit(
                classifier_defended, x_clean_test, y_clean_test
            )
            delta_accuracy = (
                raw_metrics["non_backdoored_accuracy"][frac_poison][trial]
                - defended_clean_accuracy
            )
            raw_metrics["delta_accuracy"][frac_poison][trial] = delta_accuracy
            logger.info(
                f"Trial {trial+1} delta accuracy of {delta_accuracy} "
                f"with poisoning proportion of {frac_poison}"
            )

        logger.info(f"Trial {trial+1}/{n_trials} completed.")

    summarized_metrics = summarize_metrics(raw_metrics)
    logger.info("Saving json output...")
    filepath = os.path.join(
        paths.OUTPUTS, f"backdoor_performance_{int(time.time())}.json"
    )
    with open(filepath, "w") as f:
        output_dict = {"config": config, "results": summarized_metrics}
        json.dump(output_dict, f, sort_keys=True, indent=4)
    shutil.copyfile(filepath, os.path.join(paths.OUTPUTS, "latest.json"))
    classification_poisoning(filepath)
    classification_poisoning(os.path.join(paths.OUTPUTS, "latest.json"))
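
# Sketch (assumption): init_metrics() and summarize_metrics() are project helpers not
# shown in this snippet. Judging only from how the metrics are indexed above
# (raw_metrics[key][frac_poison][trial] = value), a compatible init_metrics could be:
def init_metrics(fraction_poisons, n_trials):
    # One result slot per trial for every poisoning fraction evaluated.
    return {frac: [None] * n_trials for frac in fraction_poisons}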

def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path) as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(
        model_config["model_kwargs"], model_config["wrapper_kwargs"]
    )
    batch_size = config["adhoc"]["batch_size"]
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    # Defense
    defense_config = config["defense"]
    defense_module = import_module(defense_config["module"])
    defense_fn = getattr(defense_module, defense_config["name"])
    transformer = defense_fn(**defense_config.get("kwargs", {}))
    if not isinstance(transformer, defences_ext.Transformer):
        raise ValueError(
            f'{defense_config["module"]}.{defense_config["name"]} is not an instance of '
            f"{defences_ext.Transformer}"
        )
    defended_classifier = transformer.transform(classifier)

    # retrofitted to work with existing code
    logger.info(f"Loading dataset {config['dataset']['name']}...")
    clean_x, adv_x, labels = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    logger.debug(f"Original model:\n{classifier}")
    logger.info("Predicting on clean dataset...")
    clean_y_pred = classifier.predict(clean_x, batch_size=batch_size)
    clean_accuracy = np.sum(np.argmax(clean_y_pred, axis=1) == labels) / len(labels)
    logger.info(f"Accuracy on benign test examples: {clean_accuracy * 100}%")

    # Evaluate the ART classifier on adversarial examples from transfer attack
    logger.info("Predicting on adversarial dataset...")
    adv_y_pred = classifier.predict(adv_x, batch_size=batch_size)
    adv_accuracy = np.sum(np.argmax(adv_y_pred, axis=1) == labels) / len(labels)
    logger.info(f"Accuracy on adversarial test examples: {adv_accuracy * 100}%")

    # Re-evaluate on the defended classifier
    logger.debug(f"Defended classifier:\n{defended_classifier}")
    logger.info(
        f'Classifier defended by {defense_config["module"]}.{defense_config["name"]} transform'
    )
    logger.info("Predicting on clean dataset...")
    def_clean_y_pred = defended_classifier.predict(clean_x, batch_size=batch_size)
    def_clean_accuracy = np.sum(np.argmax(def_clean_y_pred, axis=1) == labels) / len(
        labels
    )
    logger.info(f"Accuracy on benign test examples: {def_clean_accuracy * 100}%")

    # Evaluate the ART classifier on adversarial examples from transfer attack
    logger.info("Predicting on adversarial dataset...")
    def_adv_y_pred = defended_classifier.predict(adv_x, batch_size=batch_size)
    def_adv_accuracy = np.sum(np.argmax(def_adv_y_pred, axis=1) == labels) / len(labels)
    logger.info(f"Accuracy on adversarial test examples: {def_adv_accuracy * 100}%")
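
# Sketch (assumption): the minimal shape of the defences_ext.Transformer interface
# implied by the isinstance() check and the .transform() call above; the project's
# actual base class may define additional methods.
import abc


class Transformer(abc.ABC):
    @abc.abstractmethod
    def transform(self, classifier):
        """Return a defended wrapper around the given ART classifier."""
        raise NotImplementedError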

def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(
        model_config["model_kwargs"], model_config["wrapper_kwargs"]
    )
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    x_train, y_train, x_test, y_test = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    logger.info(
        f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
    )
    classifier.fit(
        x_train,
        y_train,
        batch_size=config["adhoc"]["batch_size"],
        nb_epochs=config["adhoc"]["epochs"],
    )

    # Generate adversarial test examples
    attack_config = config["attack"]
    attack_module = import_module(attack_config["module"])
    attack_fn = getattr(attack_module, attack_config["name"])
    attack = attack_fn(classifier=classifier, **attack_config["kwargs"])

    norm = attack_config["budget"]["norm"][0]
    if norm == "L2":
        lp_norm = 2
    elif norm == "Linf":
        lp_norm = np.inf
    else:
        raise ValueError(
            f"Adversarial budget must have a norm of L2 or Linf. Found {norm} in config"
        )

    y_target = (y_test + 1) % config["adhoc"]["num_classes"]

    np.random.seed(config["adhoc"]["seed"])
    indices = np.random.choice(x_test.shape[0], config["adhoc"]["num_attacked_pts"])
    x_test_sample = x_test[indices]
    y_test_sample = y_test[indices]
    y_target_sample = y_target[indices]

    logger.info("Generating adversarial examples...")
    x_test_adv = attack.generate(x=x_test_sample, y=y_target_sample)
    diff = (x_test_adv - x_test_sample).reshape(x_test_adv.shape[0], -1)
    epsilons = np.linalg.norm(diff, ord=lp_norm, axis=1)

    y_clean_pred = np.argmax(classifier.predict(x_test_sample), axis=1)
    y_adv_pred = np.argmax(classifier.predict(x_test_adv), axis=1)

    # Evaluate the ART classifier on adversarial test examples and clean test examples
    successful_attack_indices = (y_clean_pred != y_target_sample) & (
        y_adv_pred == y_target_sample
    )
    benign_misclassification_rate = np.sum(y_clean_pred == y_target_sample) / float(
        y_clean_pred.shape[0]
    )
    logger.info(
        f"Benign misclassification as targeted examples: {benign_misclassification_rate * 100}%"
    )
    targeted_attack_success_rate = np.sum(successful_attack_indices) / float(
        y_clean_pred.shape[0]
    )
    clean_accuracy = np.sum(y_clean_pred == y_test_sample) / float(
        y_clean_pred.shape[0]
    )
    logger.info(f"Accuracy on benign test examples: {clean_accuracy * 100}%")

    epsilons = epsilons.astype(object)
    epsilons[np.logical_not(successful_attack_indices)] = None
    unique_epsilons, targeted_attack_success = roc_targeted_epsilon(epsilons)

    results = {}
    results[norm] = {
        "epsilons": list(unique_epsilons),
        "metric": "Targeted attack success rate",
        "values": list(targeted_attack_success),
    }
    logger.info(
        f"Finished attacking on norm {norm}. Attack success: {targeted_attack_success_rate * 100}%"
    )

    logger.info("Saving json output...")
    filepath = os.path.join(
        paths.OUTPUTS, f"carlini_wagner_attack_{norm}_targeted_output.json"
    )
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": results,
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)

    logger.info("Plotting results...")
    plot.classification(filepath)
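
# Illustrative only: the config keys consumed by the targeted-attack evaluation above.
# The attack module/name and all numeric values are placeholders (assumptions), not
# copied from the project's packaged configs; "budget.norm" must be "L2" or "Linf".
EXAMPLE_TARGETED_CONFIG = {
    "model": {
        "module": "armory.baseline_models.keras.keras_cifar",  # assumed entry point
        "name": "get_art_model",
        "model_kwargs": {},
        "wrapper_kwargs": {},
    },
    "dataset": {"name": "cifar10"},
    "attack": {
        "module": "art.attacks",       # assumed; any targeted ART attack class works
        "name": "CarliniL2Method",     # assumed, matching the output filename above
        "kwargs": {"targeted": True},  # assumed attack kwargs
        "budget": {"norm": ["L2"]},
    },
    "adhoc": {
        "batch_size": 64,
        "epochs": 2,
        "num_classes": 10,
        "seed": 0,
        "num_attacked_pts": 100,
    },
}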

def test_imagenet_adv(self):
    clean_x, adv_x, labels = datasets.load("imagenet_adversarial")
    self.assertEqual(clean_x.shape[0], 1000)
    self.assertEqual(adv_x.shape[0], 1000)
    self.assertEqual(labels.shape[0], 1000)


def test_cifar10(self):
    train_x, train_y, test_x, test_y = datasets.load("cifar10")
    self.assertEqual(train_x.shape[0], 50000)
    self.assertEqual(train_y.shape[0], 50000)
    self.assertEqual(test_x.shape[0], 10000)
    self.assertEqual(test_y.shape[0], 10000)


def test_mnist(self):
    train_x, train_y, test_x, test_y = datasets.load("mnist")
    self.assertEqual(train_x.shape[0], 60000)
    self.assertEqual(train_y.shape[0], 60000)
    self.assertEqual(test_x.shape[0], 10000)
    self.assertEqual(test_y.shape[0], 10000)

def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path) as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(
        model_config["model_kwargs"], model_config["wrapper_kwargs"]
    )
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    x_train, y_train, x_test, y_test = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    logger.info(
        f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
    )
    classifier.fit(
        x_train,
        y_train,
        batch_size=config["adhoc"]["batch_size"],
        nb_epochs=config["adhoc"]["epochs"],
    )

    # Speeds up testing...
    subsample = 100
    x_test = x_test[::subsample]
    y_test = y_test[::subsample]

    # Evaluate the ART classifier on benign test examples
    y_pred = classifier.predict(x_test)
    benign_accuracy = np.sum(np.argmax(y_pred, axis=1) == y_test) / len(y_test)
    logger.info("Accuracy on benign test examples: {}%".format(benign_accuracy * 100))

    attack_config = config["attack"]
    attack_module = import_module(attack_config["module"])
    attack_fn = getattr(attack_module, attack_config["name"])

    budget = attack_config["budget"]
    norms = budget["norm"]
    results = {}

    # Assume min_value = 0
    max_value = 1.0
    input_dim = np.product(x_test.shape[1:])
    norm_map = {  # from norm name to (fgm_input, max_epsilon)
        "L0": (0, input_dim),
        "L1": (1, input_dim * max_value),
        "L2": (2, np.sqrt(input_dim) * max_value),
        "Linf": (np.inf, max_value),
    }
    for norm in norms:
        lp_norm, max_epsilon = norm_map[norm]
        # Currently looking at untargeted attacks,
        # where adversary accuracy ~ 1 - benign accuracy (except incorrect benign)
        attack = attack_fn(
            classifier=classifier,
            norm=lp_norm,
            eps=max_epsilon,
            **attack_config["kwargs"],
        )
        logger.info(f"Generating adversarial examples for norm {norm}...")
        x_test_adv = attack.generate(x=x_test)

        # Map into the original input space (bound and quantize) and back to float
        # NOTE: this step makes many of the attacks fail
        x_test_adv = project_to_mnist_input(x_test_adv, preprocessing_fn)

        diff = (x_test_adv - x_test).reshape(x_test.shape[0], -1)
        epsilons = np.linalg.norm(diff, ord=lp_norm, axis=1)
        if np.isnan(epsilons).any():
            raise ValueError(f"Epsilons have nan values in norm {norm}")
        min_epsilon = 0
        if (epsilons < min_epsilon).any() or (epsilons > max_epsilon).any():
            raise ValueError(f"Epsilons have values outside bounds in norm {norm}")

        y_pred_adv = classifier.predict(x_test_adv)

        # Ignore benign misclassifications - no perturbation needed
        epsilons[np.argmax(y_pred, axis=1) != y_test] = min_epsilon
        # When all attacks fail, set perturbation to None
        epsilons = epsilons.astype(object)
        epsilons[
            (np.argmax(y_pred_adv, axis=1) == y_test)
            & (np.argmax(y_pred, axis=1) == y_test)
        ] = None

        adv_acc = np.sum(np.argmax(y_pred_adv, axis=1) != y_test) / len(y_test)

        # generate curve
        unique_epsilons, accuracy = roc_epsilon(
            epsilons, min_epsilon=min_epsilon, max_epsilon=max_epsilon
        )

        results[norm] = {
            "epsilons": list(unique_epsilons),
            "metric": "Categorical Accuracy",
            "values": list(accuracy),
        }
        # Evaluate the ART classifier on adversarial test examples
        logger.info(
            f"Finished attacking on norm {norm}. Attack success: {adv_acc * 100}%"
        )

    logger.info("Saving json output...")
    filepath = os.path.join(
        paths.OUTPUTS, f"classifier_extended_{int(time.time())}.json"
    )
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": results,
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    shutil.copyfile(filepath, os.path.join(paths.OUTPUTS, "latest.json"))

    logger.info("Now plotting results...")
    plot.classification(filepath)
    plot.classification(os.path.join(paths.OUTPUTS, "latest.json"))
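
# Sketch (assumption): project_to_mnist_input() is a project helper not shown here.
# Following the comment above ("bound and quantize" back to the original input space),
# one plausible version for inputs already scaled to [0, 1] clips and rounds to 8-bit
# pixel levels; the real helper may instead invert and re-apply preprocessing_fn.
import numpy as np


def project_to_mnist_input(x, preprocessing_fn):
    # preprocessing_fn is unused in this simplified sketch, which assumes the
    # preprocessing already maps pixel values into [0, 1].
    x = np.clip(x, 0.0, 1.0)          # bound to the valid pixel range
    x = np.round(x * 255.0) / 255.0   # quantize to 8-bit levels, back to float
    return x.astype(np.float32)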

def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(
        model_config["model_kwargs"], model_config["wrapper_kwargs"]
    )
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    train_x, train_y, test_x, test_y = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    logger.info(
        f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
    )
    classifier.fit(
        train_x,
        train_y,
        batch_size=config["adhoc"]["batch_size"],
        nb_epochs=config["adhoc"]["epochs"],
    )

    # Evaluate the ART classifier on benign test examples
    logger.info("Running inference on benign examples...")
    predictions = classifier.predict(test_x)
    benign_accuracy = np.sum(np.argmax(predictions, axis=1) == test_y) / len(test_y)
    logger.info("Accuracy on benign test examples: {}%".format(benign_accuracy * 100))

    # Generate adversarial test examples
    attack_config = config["attack"]
    attack_module = import_module(attack_config["module"])
    attack_fn = getattr(attack_module, attack_config["name"])

    logger.info("Generating adversarial examples...")
    attack = attack_fn(classifier=classifier, **attack_config["kwargs"])
    test_x_adv = attack.generate(x=test_x)

    # Evaluate the ART classifier on adversarial test examples
    logger.info("Running inference on adversarial examples...")
    predictions = classifier.predict(test_x_adv)
    adversarial_accuracy = np.sum(
        np.argmax(predictions, axis=1) == test_y
    ) / len(test_y)
    logger.info(
        "Accuracy on adversarial test examples: {}%".format(adversarial_accuracy * 100)
    )

    logger.info("Saving json output...")
    filepath = os.path.join(paths.OUTPUTS, "evaluation-results.json")
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": {
                "baseline_accuracy": str(benign_accuracy),
                "adversarial_accuracy": str(adversarial_accuracy),
            },
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    logger.info(f"Evaluation Results written to {filepath}")
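
# Usage sketch (assumption): each evaluate_classifier() variant takes a path to a
# JSON config, so a hypothetical command-line wrapper could be as simple as:
if __name__ == "__main__":
    import sys

    evaluate_classifier(sys.argv[1])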