Example #1
    def test_digit(self):
        train_x, train_y, test_x, test_y = datasets.load("digit")
        self.assertEqual(train_x.shape[0], 1350)
        self.assertEqual(train_y.shape[0], 1350)
        self.assertEqual(test_x.shape[0], 150)
        self.assertEqual(test_y.shape[0], 150)
        for X in train_x, test_x:
            for x in X:
                self.assertTrue(1148 <= len(x) <= 18262)
Example #2
def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(model_config["model_kwargs"],
                               model_config["wrapper_kwargs"])

    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    clean_x, adv_x, labels = datasets.load(config["dataset"]["name"],
                                           preprocessing_fn=preprocessing_fn)

    # Evaluate the ART classifier on benign test examples
    logger.info("Predicting on clean dataset...")
    predictions = classifier.predict(clean_x)
    benign_accuracy = np.sum(
        np.argmax(predictions, axis=1) == labels) / len(labels)
    logger.info("Accuracy on benign test examples: {}%".format(
        benign_accuracy * 100))

    # Evaluate the ART classifier on adversarial examples from transfer attack
    logger.info("Predicting on adversarial dataset...")
    predictions = classifier.predict(adv_x)
    adversarial_accuracy = np.sum(
        np.argmax(predictions, axis=1) == labels) / len(labels)
    logger.info("Accuracy on adversarial test examples: {}%".format(
        adversarial_accuracy * 100))

    logger.info("Saving json output...")
    filepath = os.path.join(paths.OUTPUTS, "evaluation-results.json")
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": {
                "baseline_accuracy": str(benign_accuracy),
                "adversarial_accuracy": str(adversarial_accuracy),
            },
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    logger.info(f"Evaluation Results written to {filepath}")
Example #3
    def test_keras_imagenet(self):
        classifier_module = import_module(
            "armory.baseline_models.keras.keras_resnet50")
        classifier_fn = getattr(classifier_module, "get_art_model")
        classifier = classifier_fn(model_kwargs={}, wrapper_kwargs={})
        preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

        clean_x, adv_x, labels = datasets.load("imagenet_adversarial",
                                               preprocessing_fn)

        predictions = classifier.predict(clean_x)
        accuracy = np.sum(
            np.argmax(predictions, axis=1) == labels) / len(labels)
        self.assertGreater(accuracy, 0.65)

        predictions = classifier.predict(adv_x)
        accuracy = np.sum(
            np.argmax(predictions, axis=1) == labels) / len(labels)
        print(accuracy)
        self.assertLess(accuracy, 0.02)
Example #4
    def test_keras_cifar10(self):
        batch_size = 64
        epochs = 2

        classifier_module = import_module(
            "armory.baseline_models.keras.keras_cifar")
        classifier_fn = getattr(classifier_module, "get_art_model")
        classifier = classifier_fn(model_kwargs={}, wrapper_kwargs={})
        preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

        train_x, train_y, test_x, test_y = datasets.load(
            "cifar10", preprocessing_fn=preprocessing_fn)

        classifier.fit(train_x,
                       train_y,
                       batch_size=batch_size,
                       nb_epochs=epochs)

        predictions = classifier.predict(test_x)
        accuracy = np.sum(
            np.argmax(predictions, axis=1) == test_y) / len(test_y)
        self.assertGreater(accuracy, 0.4)
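Both test cases above fetch get_art_model(model_kwargs, wrapper_kwargs) and preprocessing_fn from a baseline model module. The following is only a rough sketch of what such a module could look like, not the actual armory baseline: the tiny architecture, the divide-by-255 preprocessing, and the ART import path are all assumptions.

import numpy as np
import tensorflow as tf
from art.estimators.classification import KerasClassifier  # import path assumed for recent ART releases

# ART's KerasClassifier expects graph-mode execution when wrapping tf.keras models.
tf.compat.v1.disable_eager_execution()


def preprocessing_fn(x: np.ndarray) -> np.ndarray:
    # Assumption: scale raw uint8 pixels into [0, 1] floats.
    return x.astype(np.float32) / 255.0


def get_art_model(model_kwargs: dict, wrapper_kwargs: dict) -> KerasClassifier:
    # Placeholder architecture; a real baseline defines a proper network and
    # forwards model_kwargs/wrapper_kwargs instead of ignoring them.
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(32, 32, 3)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(10, activation="softmax"),
    ])
    model.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return KerasClassifier(model=model, clip_values=(0.0, 1.0))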
Example #5
def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    # Load the evaluation configuration
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    x_clean_train, y_clean_train, x_clean_test, y_clean_test = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    batch_size = config["adhoc"]["batch_size"]
    epochs = config["adhoc"]["epochs"]
    n_trials = config["adhoc"]["n_trials"]
    poison_frac_min = config["adhoc"]["poison_frac_min"]
    poison_frac_max = config["adhoc"]["poison_frac_max"]
    poison_frac_steps = config["adhoc"]["poison_frac_steps"]
    source_class = config["adhoc"]["source_class"]
    target_class = config["adhoc"]["target_class"]

    fraction_poisons = np.linspace(poison_frac_min, poison_frac_max, poison_frac_steps)

    # Test clean model accuracy to provide a benchmark for poisoned model accuracy
    raw_metrics = {}
    raw_metrics["undefended_backdoor_success_rate"] = init_metrics(
        fraction_poisons, n_trials
    )
    raw_metrics["non_backdoored_accuracy"] = init_metrics(fraction_poisons, n_trials)
    raw_metrics["clean_model_accuracy"] = [None for _ in range(n_trials)]
    raw_metrics["defended_backdoor_success_rate"] = init_metrics(
        fraction_poisons, n_trials
    )
    raw_metrics["delta_accuracy"] = init_metrics(fraction_poisons, n_trials)

    for trial in range(n_trials):
        classifier = classifier_fn(
            model_config["model_kwargs"], model_config["wrapper_kwargs"]
        )
        logger.info(
            f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
        )
        classifier.fit(
            x_clean_train, y_clean_train, batch_size=batch_size, nb_epochs=epochs
        )
        raw_metrics["clean_model_accuracy"][trial] = eval_targeted_fit(
            classifier, x_clean_test, y_clean_test
        )

        for frac_poison in fraction_poisons:
            # Need to retrain from scratch for each frac_poison value
            classifier = classifier_fn(
                model_config["model_kwargs"], model_config["wrapper_kwargs"]
            )
            classifier_defended = classifier_fn(
                model_config["model_kwargs"], model_config["wrapper_kwargs"]
            )

            attack_config = config["attack"]
            attack_module = import_module(attack_config["module"])
            attack_fn = getattr(attack_module, attack_config["name"])

            attack = attack_fn(
                classifier=classifier,
                x_train=x_clean_train,
                y_train=y_clean_train,
                pct_poison=frac_poison,
                source_class=source_class,
                target_class=target_class,
            )

            is_poison, x_poison, y_poison = attack.generate(
                x_clean_train, y_clean_train
            )
            logger.info(f"Fitting poisoned model with poison fraction {frac_poison}...")
            classifier.fit(x_poison, y_poison, batch_size=batch_size, nb_epochs=epochs)

            x_test_targeted = x_clean_test[y_clean_test == source_class]
            x_poison_test = attack.generate_target_test(x_test_targeted)

            # Show targeted accuracy for poisoned classes is as expected
            raw_metrics["undefended_backdoor_success_rate"][frac_poison][
                trial
            ] = eval_targeted_fit(classifier, x_poison_test, target_class)

            raw_metrics["non_backdoored_accuracy"][frac_poison][
                trial
            ] = eval_targeted_fit(classifier, x_clean_test, y_clean_test)

            defense_config = config["defense"]
            defense_module = import_module(defense_config["module"])
            defense_fn = getattr(defense_module, defense_config["name"])

            defense = defense_fn(
                classifier,
                x_poison,
                y_poison,
                batch_size=batch_size,
                ub_pct_poison=frac_poison,
                **defense_config["kwargs"],
            )
            conf_matrix_json = defense.evaluate_defence(np.logical_not(is_poison))
            logger.info(
                f"Poison detection confusion matrix from defense {config['defense']['name']} "
                f"with poison fraction {frac_poison}:"
            )
            logger.info(conf_matrix_json)
            _, indices_to_keep = defense.detect_poison()

            logger.info(
                f"Fitting poisoned model with poisons filtered by defense {config['defense']['name']} "
                f"with poison fraction {frac_poison}..."
            )
            classifier_defended.fit(
                x_poison[indices_to_keep == 1],
                y_poison[indices_to_keep == 1],
                batch_size=batch_size,
                nb_epochs=epochs,
            )

            defended_backdoor_success_rate = eval_targeted_fit(
                classifier_defended, x_poison_test, target_class
            )
            raw_metrics["defended_backdoor_success_rate"][frac_poison][
                trial
            ] = defended_backdoor_success_rate
            logger.info(
                f"Trial {trial+1} defended backdoor success rate {defended_backdoor_success_rate} "
                f"with poisoning proportion of {frac_poison}"
            )

            defended_clean_accuracy = eval_targeted_fit(
                classifier_defended, x_clean_test, y_clean_test
            )

            delta_accuracy = (
                raw_metrics["non_backdoored_accuracy"][frac_poison][trial]
                - defended_clean_accuracy
            )
            raw_metrics["delta_accuracy"][frac_poison][trial] = delta_accuracy

            logger.info(
                f"Trial {trial+1} delta accuracy of {delta_accuracy} "
                f"with poisoning proportion of {frac_poison}"
            )
        logger.info(f"Trial {trial+1}/{n_trials} completed.")

    summarized_metrics = summarize_metrics(raw_metrics)
    logger.info("Saving json output...")
    filepath = os.path.join(
        paths.OUTPUTS, f"backdoor_performance_{int(time.time())}.json"
    )
    with open(filepath, "w") as f:
        output_dict = {"config": config, "results": summarized_metrics}
        json.dump(output_dict, f, sort_keys=True, indent=4)
    shutil.copyfile(filepath, os.path.join(paths.OUTPUTS, "latest.json"))
    classification_poisoning(filepath)
    classification_poisoning(os.path.join(paths.OUTPUTS, "latest.json"))
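Example #5 above depends on helpers (init_metrics, summarize_metrics, eval_targeted_fit) that are not shown on this page. Based purely on how the metrics dictionaries are indexed above (metric[frac_poison][trial]), a plausible hedged sketch of the first two might be:

import numpy as np


def init_metrics(fraction_poisons, n_trials):
    # One slot per (poison fraction, trial), filled in as the trials run.
    return {frac: [None for _ in range(n_trials)] for frac in fraction_poisons}


def summarize_metrics(raw_metrics):
    # Assumption: summarize each metric with its mean and standard deviation over trials.
    summary = {}
    for name, values in raw_metrics.items():
        if isinstance(values, dict):
            # Metrics keyed by poison fraction, then trial.
            summary[name] = {
                str(frac): {"mean": float(np.mean(trials)), "std": float(np.std(trials))}
                for frac, trials in values.items()
            }
        else:
            # Per-trial metrics such as clean_model_accuracy.
            summary[name] = {"mean": float(np.mean(values)), "std": float(np.std(values))}
    return summary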
Example #6
def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path) as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(
        model_config["model_kwargs"], model_config["wrapper_kwargs"]
    )

    batch_size = config["adhoc"]["batch_size"]

    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    # Defense
    defense_config = config["defense"]
    defense_module = import_module(defense_config["module"])
    defense_fn = getattr(defense_module, defense_config["name"])
    transformer = defense_fn(**defense_config.get("kwargs", {}))
    if not isinstance(transformer, defences_ext.Transformer):
        raise ValueError(
            f'{defense_config["module"]}.{defense_config["name"]} is not an instance of '
            f"{defences_ext.Transformer}"
        )
    defended_classifier = transformer.transform(classifier)

    # retrofitted to work with existing code
    logger.info(f"Loading dataset {config['dataset']['name']}...")
    clean_x, adv_x, labels = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn
    )

    logger.debug(f"Original model:\n{classifier}")
    logger.info("Predicting on clean dataset...")
    clean_y_pred = classifier.predict(clean_x, batch_size=batch_size)
    clean_accuracy = np.sum(np.argmax(clean_y_pred, axis=1) == labels) / len(labels)
    logger.info(f"Accuracy on benign test examples: {clean_accuracy * 100}%")

    # Evaluate the ART classifier on adversarial examples from transfer attack
    logger.info("Predicting on adversarial dataset...")
    adv_y_pred = classifier.predict(adv_x, batch_size=batch_size)
    adv_accuracy = np.sum(np.argmax(adv_y_pred, axis=1) == labels) / len(labels)
    logger.info(f"Accuracy on adversarial test examples: {adv_accuracy * 100}%")

    # Re-evaluate with the defended classifier
    logger.debug(f"Defended classifier:\n{defended_classifier}")
    logger.info(
        f'Classifier defended by {defense_config["module"]}.{defense_config["name"]} transform'
    )
    logger.info("Predicting on clean dataset...")
    def_clean_y_pred = defended_classifier.predict(clean_x, batch_size=batch_size)
    def_clean_accuracy = np.sum(np.argmax(def_clean_y_pred, axis=1) == labels) / len(
        labels
    )
    logger.info(f"Accuracy on benign test examples: {def_clean_accuracy * 100}%")

    # Evaluate the ART classifier on adversarial examples from transfer attack
    logger.info("Predicting on adversarial dataset...")
    def_adv_y_pred = defended_classifier.predict(adv_x, batch_size=batch_size)
    def_adv_accuracy = np.sum(np.argmax(def_adv_y_pred, axis=1) == labels) / len(labels)
    logger.info(f"Accuracy on adversarial test examples: {def_adv_accuracy * 100}%")
Example #7
def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(model_config["model_kwargs"],
                               model_config["wrapper_kwargs"])

    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    x_train, y_train, x_test, y_test = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn)

    logger.info(
        f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
    )
    classifier.fit(
        x_train,
        y_train,
        batch_size=config["adhoc"]["batch_size"],
        nb_epochs=config["adhoc"]["epochs"],
    )

    # Generate adversarial test examples
    attack_config = config["attack"]
    attack_module = import_module(attack_config["module"])
    attack_fn = getattr(attack_module, attack_config["name"])

    attack = attack_fn(classifier=classifier, **attack_config["kwargs"])
    norm = attack_config["budget"]["norm"][0]
    if norm == "L2":
        lp_norm = 2
    elif norm == "Linf":
        lp_norm = np.inf
    else:
        raise ValueError(
            f"Adversarial budget must have a norm of L2 or Linf. Found {norm} in config"
        )

    y_target = (y_test + 1) % config["adhoc"]["num_classes"]

    np.random.seed(config["adhoc"]["seed"])
    indices = np.random.choice(x_test.shape[0],
                               config["adhoc"]["num_attacked_pts"])

    x_test_sample = x_test[indices]
    y_test_sample = y_test[indices]
    y_target_sample = y_target[indices]

    logger.info("Generating adversarial examples...")
    x_test_adv = attack.generate(x=x_test_sample, y=y_target_sample)

    diff = (x_test_adv - x_test_sample).reshape(x_test_adv.shape[0], -1)
    epsilons = np.linalg.norm(diff, ord=lp_norm, axis=1)

    y_clean_pred = np.argmax(classifier.predict(x_test_sample), axis=1)
    y_adv_pred = np.argmax(classifier.predict(x_test_adv), axis=1)

    # Evaluate the ART classifier on adversarial test examples and clean test examples
    successful_attack_indices = (y_clean_pred != y_target_sample) & (
        y_adv_pred == y_target_sample)

    benign_misclassification_rate = np.sum(
        y_clean_pred == y_target_sample) / float(y_clean_pred.shape[0])

    logger.info(
        f"Benign misclassification as targeted examples: {benign_misclassification_rate * 100}%"
    )

    targeted_attack_success_rate = np.sum(successful_attack_indices) / float(
        y_clean_pred.shape[0])
    clean_accuracy = np.sum(y_clean_pred == y_test_sample) / float(
        y_clean_pred.shape[0])

    logger.info(f"Accuracy on benign test examples: {clean_accuracy * 100}%")

    epsilons = epsilons.astype(object)
    epsilons[np.logical_not(successful_attack_indices)] = None

    unique_epsilons, targeted_attack_success = roc_targeted_epsilon(epsilons)
    results = {}

    results[norm] = {
        "epsilons": list(unique_epsilons),
        "metric": "Targeted attack success rate",
        "values": list(targeted_attack_success),
    }

    logger.info(
        f"Finished attacking on norm {norm}. Attack success: {targeted_attack_success_rate * 100}%"
    )

    logger.info("Saving json output...")
    filepath = os.path.join(
        paths.OUTPUTS, f"carlini_wagner_attack_{norm}_targeted_output.json")
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": results,
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    logger.info("Plotting results...")
    plot.classification(filepath)
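roc_targeted_epsilon is not defined on this page. Given how it is called above (an object array holding the perturbation norm for each successful targeted attack and None where the attack failed), one hedged reconstruction is a cumulative success-rate curve over perturbation budgets:

import numpy as np


def roc_targeted_epsilon(epsilons):
    # epsilons: object array; None marks a failed targeted attack.
    total = len(epsilons)
    successful = np.array([eps for eps in epsilons if eps is not None], dtype=float)
    unique_epsilons = np.unique(successful)
    # Fraction of all attacked points fooled within each perturbation budget.
    success_rates = [float(np.sum(successful <= eps)) / total for eps in unique_epsilons]
    return list(unique_epsilons), success_rates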
Example #8
    def test_imagenet_adv(self):
        clean_x, adv_x, labels = datasets.load("imagenet_adversarial")
        self.assertEqual(clean_x.shape[0], 1000)
        self.assertEqual(adv_x.shape[0], 1000)
        self.assertEqual(labels.shape[0], 1000)
Example #9
    def test_cifar10(self):
        train_x, train_y, test_x, test_y = datasets.load("cifar10")
        self.assertEqual(train_x.shape[0], 50000)
        self.assertEqual(train_y.shape[0], 50000)
        self.assertEqual(test_x.shape[0], 10000)
        self.assertEqual(test_y.shape[0], 10000)
Example #10
    def test_mnist(self):
        train_x, train_y, test_x, test_y = datasets.load("mnist")
        self.assertEqual(train_x.shape[0], 60000)
        self.assertEqual(train_y.shape[0], 60000)
        self.assertEqual(test_x.shape[0], 10000)
        self.assertEqual(test_y.shape[0], 10000)
Example #11
def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path) as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(model_config["model_kwargs"],
                               model_config["wrapper_kwargs"])

    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    x_train, y_train, x_test, y_test = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn)

    logger.info(
        f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
    )
    classifier.fit(
        x_train,
        y_train,
        batch_size=config["adhoc"]["batch_size"],
        nb_epochs=config["adhoc"]["epochs"],
    )

    # Speeds up testing...
    subsample = 100
    x_test = x_test[::subsample]
    y_test = y_test[::subsample]

    # Evaluate the ART classifier on benign test examples
    y_pred = classifier.predict(x_test)
    benign_accuracy = np.sum(np.argmax(y_pred, axis=1) == y_test) / len(y_test)
    logger.info("Accuracy on benign test examples: {}%".format(
        benign_accuracy * 100))

    attack_config = config["attack"]
    attack_module = import_module(attack_config["module"])
    attack_fn = getattr(attack_module, attack_config["name"])
    budget = attack_config["budget"]
    norms = budget["norm"]

    results = {}
    # Assume min_value = 0
    max_value = 1.0
    input_dim = np.prod(x_test.shape[1:])
    norm_map = {  # from norm name to (fgm_input, max_epsilon)
        "L0": (0, input_dim),
        "L1": (1, input_dim * max_value),
        "L2": (2, np.sqrt(input_dim) * max_value),
        "Linf": (np.inf, max_value),
    }
    for norm in norms:
        lp_norm, max_epsilon = norm_map[norm]

        # Currently looking at untargeted attacks,
        # where adversary accuracy ~ 1 - benign accuracy (except incorrect benign)
        attack = attack_fn(
            classifier=classifier,
            norm=lp_norm,
            eps=max_epsilon,
            **attack_config["kwargs"],
        )
        logger.info(f"Generating adversarial examples for norm {norm}...")
        x_test_adv = attack.generate(x=x_test)

        # Map into the original input space (bound and quantize) and back to float
        # NOTE: this step makes many of the attacks fail
        x_test_adv = project_to_mnist_input(x_test_adv, preprocessing_fn)

        diff = (x_test_adv - x_test).reshape(x_test.shape[0], -1)
        epsilons = np.linalg.norm(diff, ord=lp_norm, axis=1)
        if np.isnan(epsilons).any():
            raise ValueError(f"Epsilons have nan values in norm {norm}")
        min_epsilon = 0
        if (epsilons < min_epsilon).any() or (epsilons > max_epsilon).any():
            raise ValueError(
                f"Epsilons have values outside bounds in norm {norm}")

        y_pred_adv = classifier.predict(x_test_adv)

        # Ignore benign misclassifications - no perturbation needed
        epsilons[np.argmax(y_pred, axis=1) != y_test] = min_epsilon

        # Where the attack fails on a correctly classified example, set its perturbation to None
        epsilons = epsilons.astype(object)
        epsilons[(np.argmax(y_pred_adv, axis=1) == y_test)
                 & (np.argmax(y_pred, axis=1) == y_test)] = None

        adv_acc = np.sum(np.argmax(y_pred_adv, axis=1) != y_test) / len(y_test)

        # generate curve
        unique_epsilons, accuracy = roc_epsilon(epsilons,
                                                min_epsilon=min_epsilon,
                                                max_epsilon=max_epsilon)

        results[norm] = {
            "epsilons": list(unique_epsilons),
            "metric": "Categorical Accuracy",
            "values": list(accuracy),
        }
        # Evaluate the ART classifier on adversarial test examples
        logger.info(
            f"Finished attacking on norm {norm}. Attack success: {adv_acc * 100}%"
        )

    logger.info("Saving json output...")
    filepath = os.path.join(paths.OUTPUTS,
                            f"classifier_extended_{int(time.time())}.json")
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": results,
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    shutil.copyfile(filepath, os.path.join(paths.OUTPUTS, "latest.json"))

    logger.info(f"Now plotting results...")
    plot.classification(filepath)
    plot.classification(os.path.join(paths.OUTPUTS, "latest.json"))
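project_to_mnist_input is referenced above but not shown. Per the inline comment (map into the original input space, bound and quantize, then back to float), and assuming preprocessing_fn simply scales uint8 pixels to [0, 1], a hedged sketch could be:

import numpy as np


def project_to_mnist_input(x: np.ndarray, preprocessing_fn) -> np.ndarray:
    # Assumption: preprocessing_fn maps raw uint8 MNIST pixels into the model's
    # float input space by dividing by 255. Invert that mapping, clip and
    # quantize to valid 8-bit pixel values, then preprocess again.
    raw = np.clip(x * 255.0, 0, 255)
    quantized = np.round(raw).astype(np.uint8)
    return preprocessing_fn(quantized)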
Example #12
def evaluate_classifier(config_path: str) -> None:
    """
    Evaluate a config file for classification robustness against attack.
    """
    with open(config_path, "r") as fp:
        config = json.load(fp)

    model_config = config["model"]
    classifier_module = import_module(model_config["module"])
    classifier_fn = getattr(classifier_module, model_config["name"])
    classifier = classifier_fn(model_config["model_kwargs"],
                               model_config["wrapper_kwargs"])

    preprocessing_fn = getattr(classifier_module, "preprocessing_fn")

    logger.info(f"Loading dataset {config['dataset']['name']}...")
    train_x, train_y, test_x, test_y = datasets.load(
        config["dataset"]["name"], preprocessing_fn=preprocessing_fn)

    logger.info(
        f"Fitting clean unpoisoned model of {model_config['module']}.{model_config['name']}..."
    )
    classifier.fit(
        train_x,
        train_y,
        batch_size=config["adhoc"]["batch_size"],
        nb_epochs=config["adhoc"]["epochs"],
    )

    # Evaluate the ART classifier on benign test examples
    logger.info("Running inference on benign examples...")
    predictions = classifier.predict(test_x)
    benign_accuracy = np.sum(
        np.argmax(predictions, axis=1) == test_y) / len(test_y)
    logger.info("Accuracy on benign test examples: {}%".format(
        benign_accuracy * 100))

    # Generate adversarial test examples
    attack_config = config["attack"]
    attack_module = import_module(attack_config["module"])
    attack_fn = getattr(attack_module, attack_config["name"])

    logger.info("Generating adversarial examples...")
    attack = attack_fn(classifier=classifier, **attack_config["kwargs"])
    test_x_adv = attack.generate(x=test_x)

    # Evaluate the ART classifier on adversarial test examples
    logger.info("Running inference on adversarial examples...")
    predictions = classifier.predict(test_x_adv)
    adversarial_accuracy = np.sum(
        np.argmax(predictions, axis=1) == test_y) / len(test_y)
    logger.info("Accuracy on adversarial test examples: {}%".format(
        adversarial_accuracy * 100))

    logger.info("Saving json output...")
    filepath = os.path.join(paths.OUTPUTS, "evaluation-results.json")
    with open(filepath, "w") as f:
        output_dict = {
            "config": config,
            "results": {
                "baseline_accuracy": str(benign_accuracy),
                "adversarial_accuracy": str(adversarial_accuracy),
            },
        }
        json.dump(output_dict, f, sort_keys=True, indent=4)
    logger.info(f"Evaluation Results written to {filepath}")