def test_8_keras_mnist(self):
    x_test_original = self.x_test_mnist.copy()

    # Keras classifier
    classifier = get_image_classifier_kr(from_logits=True)

    scores = classifier._model.evaluate(self.x_train_mnist, self.y_train_mnist)
    logger.info("[Keras, MNIST] Accuracy on training set: %.2f%%", (scores[1] * 100))
    scores = classifier._model.evaluate(self.x_test_mnist, self.y_test_mnist)
    logger.info("[Keras, MNIST] Accuracy on test set: %.2f%%", (scores[1] * 100))

    attack = DeepFool(classifier, max_iter=5, batch_size=11)
    x_train_adv = attack.generate(self.x_train_mnist)
    x_test_adv = attack.generate(self.x_test_mnist)

    self.assertFalse((self.x_train_mnist == x_train_adv).all())
    self.assertFalse((self.x_test_mnist == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((self.y_train_mnist == train_y_pred).all())
    self.assertFalse((self.y_test_mnist == test_y_pred).all())

    sum_0 = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(self.y_train_mnist, axis=1))
    accuracy_0 = sum_0 / self.y_train_mnist.shape[0]
    logger.info("Accuracy on adversarial train examples: %.2f%%", (accuracy_0 * 100))

    sum_1 = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(self.y_test_mnist, axis=1))
    accuracy_1 = sum_1 / self.y_test_mnist.shape[0]
    logger.info("Accuracy on adversarial test examples: %.2f%%", (accuracy_1 * 100))

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - self.x_test_mnist))), 0.0, delta=0.00001)
def test_5_pytorch_mnist(self):
    x_train = np.reshape(self.x_train_mnist, (self.x_train_mnist.shape[0], 1, 28, 28)).astype(np.float32)
    x_test = np.reshape(self.x_test_mnist, (self.x_test_mnist.shape[0], 1, 28, 28)).astype(np.float32)
    x_test_original = x_test.copy()

    # Create basic PyTorch model
    classifier = get_image_classifier_pt(from_logits=True)

    scores = get_labels_np_array(classifier.predict(x_train))
    sum6 = np.sum(np.argmax(scores, axis=1) == np.argmax(self.y_train_mnist, axis=1))
    accuracy = sum6 / self.y_train_mnist.shape[0]
    logger.info("[PyTorch, MNIST] Accuracy on training set: %.2f%%", (accuracy * 100))

    scores = get_labels_np_array(classifier.predict(x_test))
    sum7 = np.sum(np.argmax(scores, axis=1) == np.argmax(self.y_test_mnist, axis=1))
    accuracy = sum7 / self.y_test_mnist.shape[0]
    logger.info("[PyTorch, MNIST] Accuracy on test set: %.2f%%", (accuracy * 100))

    attack = DeepFool(classifier, max_iter=5, batch_size=11, verbose=False)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((self.y_train_mnist == train_y_pred).all())
    self.assertFalse((self.y_test_mnist == test_y_pred).all())

    sum8 = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(self.y_train_mnist, axis=1))
    accuracy = sum8 / self.y_train_mnist.shape[0]
    logger.info("Accuracy on adversarial train examples: %.2f%%", (accuracy * 100))

    sum9 = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(self.y_test_mnist, axis=1))
    accuracy = sum9 / self.y_test_mnist.shape[0]
    logger.info("Accuracy on adversarial test examples: %.2f%%", (accuracy * 100))

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def _df(model, data, labels, attack_args):
    max_iter = attack_args.get('max_iter', 100)
    eps = attack_args.get('eps', 0.01)
    nb_grads = attack_args.get('nb_grads', 10)

    attacker = DeepFool(classifier=model, max_iter=max_iter, epsilon=eps, nb_grads=nb_grads)
    return attacker.generate(data, labels)
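# Example usage (a minimal sketch; `art_classifier`, `x_test`, and `y_test` are
# hypothetical names for a trained ART classifier and its evaluation data):
#
#     x_adv = _df(art_classifier, x_test, y_test, {'max_iter': 50, 'nb_grads': 3})
#
# Any key omitted from attack_args falls back to the defaults above
# (max_iter=100, eps=0.01, nb_grads=10).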
def test_9_keras_mnist_partial_grads(self):
    classifier = get_image_classifier_kr(from_logits=True)
    attack = DeepFool(classifier, max_iter=2, nb_grads=3)
    x_test_adv = attack.generate(self.x_test_mnist)
    self.assertFalse((self.x_test_mnist == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((self.y_test_mnist == test_y_pred).all())

    sum10 = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(self.y_test_mnist, axis=1))
    accuracy = sum10 / self.y_test_mnist.shape[0]
    logger.info("Accuracy on adversarial test examples: %.2f%%", (accuracy * 100))
def test_3_tensorflow_mnist(self):
    x_test_original = self.x_test_mnist.copy()

    # Create basic CNN on MNIST using TensorFlow
    classifier, sess = get_image_classifier_tf(from_logits=True)

    scores = get_labels_np_array(classifier.predict(self.x_train_mnist))
    sum2 = np.sum(np.argmax(scores, axis=1) == np.argmax(self.y_train_mnist, axis=1))
    accuracy = sum2 / self.y_train_mnist.shape[0]
    logger.info("[TF, MNIST] Accuracy on training set: %.2f%%", (accuracy * 100))

    scores = get_labels_np_array(classifier.predict(self.x_test_mnist))
    sum3 = np.sum(np.argmax(scores, axis=1) == np.argmax(self.y_test_mnist, axis=1))
    accuracy = sum3 / self.y_test_mnist.shape[0]
    logger.info("[TF, MNIST] Accuracy on test set: %.2f%%", (accuracy * 100))

    attack = DeepFool(classifier, max_iter=5, batch_size=11, verbose=False)
    x_train_adv = attack.generate(self.x_train_mnist)
    x_test_adv = attack.generate(self.x_test_mnist)

    self.assertFalse((self.x_train_mnist == x_train_adv).all())
    self.assertFalse((self.x_test_mnist == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

    self.assertFalse((self.y_train_mnist == train_y_pred).all())
    self.assertFalse((self.y_test_mnist == test_y_pred).all())

    sum4 = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(self.y_train_mnist, axis=1))
    accuracy = sum4 / self.y_train_mnist.shape[0]
    logger.info("Accuracy on adversarial train examples: %.2f%%", (accuracy * 100))

    sum5 = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(self.y_test_mnist, axis=1))
    accuracy = sum5 / self.y_test_mnist.shape[0]
    logger.info("Accuracy on adversarial test examples: %.2f%%", (accuracy * 100))

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - self.x_test_mnist))), 0.0, delta=0.00001)
def test_4_pytorch_iris(self):
    classifier = get_tabular_classifier_pt()

    attack = DeepFool(classifier, max_iter=5, batch_size=128)
    x_test_adv = attack.generate(self.x_test_iris)
    self.assertFalse((self.x_test_iris == x_test_adv).all())
    self.assertLessEqual(np.amax(x_test_adv), 1.0)
    self.assertGreaterEqual(np.amin(x_test_adv), 0.0)

    predictions_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(self.y_test_iris, axis=1) == predictions_adv).all())
    accuracy = np.sum(predictions_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
    logger.info("Accuracy on Iris with DeepFool adversarial examples: %.2f%%", (accuracy * 100))
def test_7_keras_iris_unbounded(self):
    classifier = get_tabular_classifier_kr()

    # Recreate a classifier without clip values
    classifier = KerasClassifier(model=classifier._model, use_logits=False, channels_first=True)
    attack = DeepFool(classifier, max_iter=5, batch_size=128)
    x_test_adv = attack.generate(self.x_test_iris)
    self.assertFalse((self.x_test_iris == x_test_adv).all())

    predictions_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(self.y_test_iris, axis=1) == predictions_adv).all())
    accuracy = np.sum(predictions_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
    logger.info("Accuracy on Iris with DeepFool adversarial examples: %.2f%%", (accuracy * 100))
def test_two_attacks(self):
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()

    attack1 = FastGradientMethod(estimator=self.classifier, batch_size=16)
    attack2 = DeepFool(classifier=self.classifier, max_iter=5, batch_size=16)

    x_test_adv = attack1.generate(x_test)
    predictions = np.argmax(self.classifier.predict(x_test_adv), axis=1)
    accuracy = np.sum(predictions == np.argmax(y_test, axis=1)) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier, attacks=[attack1, attack2])
    adv_trainer.fit(x_train, y_train, nb_epochs=2, batch_size=16)

    predictions_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    accuracy_new = np.sum(predictions_new == np.argmax(y_test, axis=1)) / NB_TEST

    self.assertEqual(accuracy_new, 0.36)
    self.assertEqual(accuracy, 0.13)

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def get_adversarial_examples(X, Y, model, nb_classes, attack=None):
    assert model is not None
    assert attack is not None

    art_classifier = SklearnClassifier(model=model, clip_values=(0, nb_classes))

    attacker = None
    if attack == ATTACK.PGD:
        attacker = ProjectedGradientDescent(classifier=art_classifier, norm=np.inf, eps=0.2, eps_step=0.1,
                                            max_iter=3, targeted=False, num_random_init=0, batch_size=128)
    elif attack == ATTACK.DEEPFOOL:
        attacker = DeepFool(classifier=art_classifier, max_iter=5, epsilon=1e-6, nb_grads=3, batch_size=1)
    elif attack == ATTACK.FGSM:
        attacker = FastGradientMethod(classifier=art_classifier, norm=np.inf, eps=0.3, targeted=False,
                                      batch_size=128)
    elif attack == ATTACK.BIM:
        attacker = BasicIterativeMethod(classifier=art_classifier, eps=0.3, eps_step=0.1, targeted=False,
                                        batch_size=128)
    elif attack == ATTACK.JSMA:
        attacker = SaliencyMapMethod(classifier=art_classifier, theta=0.3, gamma=0.5, batch_size=128)
    elif attack == ATTACK.CW_L2:
        attacker = CarliniL2Method(classifier=art_classifier, learning_rate=0.1)
    elif attack == ATTACK.CW_Linf:
        attacker = CarliniLInfMethod(classifier=art_classifier, learning_rate=0.01)
    else:
        raise NotImplementedError(attack, 'is not implemented.')

    print('Generating [{}] adversarial examples, it will take a while...'.format(attack))
    X_adv = attacker.generate(X, y=Y)
    del attacker

    return X_adv
def test_two_attacks_with_generator(self):
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_train_original = x_train.copy()
    x_test_original = x_test.copy()

    class MyDataGenerator(DataGenerator):
        def __init__(self, x, y, size, batch_size):
            super().__init__(size=size, batch_size=batch_size)
            self.x = x
            self.y = y
            self._size = size
            self._batch_size = batch_size

        def get_batch(self):
            ids = np.random.choice(self.size, size=min(self.size, self.batch_size), replace=False)
            return self.x[ids], self.y[ids]

    generator = MyDataGenerator(x_train, y_train, size=x_train.shape[0], batch_size=16)

    attack1 = FastGradientMethod(estimator=self.classifier, batch_size=16)
    attack2 = DeepFool(classifier=self.classifier, max_iter=5, batch_size=16)

    x_test_adv = attack1.generate(x_test)
    predictions = np.argmax(self.classifier.predict(x_test_adv), axis=1)
    accuracy = np.sum(predictions == np.argmax(y_test, axis=1)) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier, attacks=[attack1, attack2])
    adv_trainer.fit_generator(generator, nb_epochs=3)

    predictions_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    accuracy_new = np.sum(predictions_new == np.argmax(y_test, axis=1)) / NB_TEST

    self.assertAlmostEqual(accuracy_new, 0.25, delta=0.02)
    self.assertAlmostEqual(accuracy, 0.11, delta=0.0)

    # Check that x_train and x_test have not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_train_original - x_train))), 0.0, delta=0.00001)
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def test_generate_attacks_and_targeted(fix_get_mnist_subset, is_tf_version_2):
    classifier, _ = get_image_classifier_tf(from_logits=True)

    norm = np.inf
    eps = 0.3
    eps_step = 0.1
    batch_size = 32

    attacks = list()
    attacks.append(
        AutoProjectedGradientDescent(
            estimator=classifier,
            norm=norm,
            eps=eps,
            eps_step=eps_step,
            max_iter=100,
            targeted=True,
            nb_random_init=5,
            batch_size=batch_size,
            loss_type="cross_entropy",
        )
    )
    if is_tf_version_2:
        loss_type_2 = "difference_logits_ratio"
    else:
        loss_type_2 = "cross_entropy"
    attacks.append(
        AutoProjectedGradientDescent(
            estimator=classifier,
            norm=norm,
            eps=eps,
            eps_step=eps_step,
            max_iter=100,
            targeted=False,
            nb_random_init=5,
            batch_size=batch_size,
            loss_type=loss_type_2,
        )
    )
    attacks.append(DeepFool(classifier=classifier, max_iter=100, epsilon=1e-6, nb_grads=3, batch_size=batch_size))
    attacks.append(SquareAttack(estimator=classifier, norm=norm, max_iter=5000, eps=eps, p_init=0.8, nb_restarts=5))

    (x_train_mnist, y_train_mnist, x_test_mnist, y_test_mnist) = fix_get_mnist_subset

    # First test with targeted=False
    attack = AutoAttack(
        estimator=classifier,
        norm=norm,
        eps=eps,
        eps_step=eps_step,
        attacks=attacks,
        batch_size=batch_size,
        estimator_orig=None,
        targeted=False,
    )

    x_train_mnist_adv = attack.generate(x=x_train_mnist, y=y_train_mnist)

    assert np.mean(np.abs(x_train_mnist_adv - x_train_mnist)) == pytest.approx(0.0182, abs=0.105)
    assert np.max(np.abs(x_train_mnist_adv - x_train_mnist)) == pytest.approx(0.3, abs=0.05)

    # Then test with targeted=True
    attack = AutoAttack(
        estimator=classifier,
        norm=norm,
        eps=eps,
        eps_step=eps_step,
        attacks=attacks,
        batch_size=batch_size,
        estimator_orig=None,
        targeted=True,
    )

    x_train_mnist_adv = attack.generate(x=x_train_mnist, y=y_train_mnist)

    assert np.mean(x_train_mnist_adv - x_train_mnist) == pytest.approx(0.0179, abs=0.0075)
    assert np.max(np.abs(x_train_mnist_adv - x_train_mnist)) == pytest.approx(eps, abs=0.005)
def __init__( self, estimator: "CLASSIFIER_TYPE", norm: Union[int, float, str] = np.inf, eps: float = 0.3, eps_step: float = 0.1, attacks: Optional[List[EvasionAttack]] = None, batch_size: int = 32, estimator_orig: Optional["CLASSIFIER_TYPE"] = None, targeted: bool = False, ): """ Create a :class:`.AutoAttack` instance. :param estimator: An trained estimator. :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2. :param eps: Maximum perturbation that the attacker can introduce. :param eps_step: Attack step size (input variation) at each iteration. :param attacks: The list of `art.attacks.EvasionAttack` attacks to be used for AutoAttack. If it is `None` or empty the standard attacks (PGD, APGD-ce, APGD-dlr, DeepFool, Square) will be used. :param batch_size: Size of the batch on which adversarial samples are generated. :param estimator_orig: Original estimator to be attacked by adversarial examples. :param targeted: If False run only untargeted attacks, if True also run targeted attacks against each possible target. """ super().__init__(estimator=estimator) if attacks is None or not attacks: attacks = list() attacks.append( AutoProjectedGradientDescent( estimator=estimator, # type: ignore norm=norm, eps=eps, eps_step=eps_step, max_iter=100, targeted=False, nb_random_init=5, batch_size=batch_size, loss_type="cross_entropy", ) ) attacks.append( AutoProjectedGradientDescent( estimator=estimator, # type: ignore norm=norm, eps=eps, eps_step=eps_step, max_iter=100, targeted=False, nb_random_init=5, batch_size=batch_size, loss_type="difference_logits_ratio", ) ) attacks.append( ( DeepFool( classifier=estimator, # type: ignore max_iter=100, epsilon=1e-3, nb_grads=10, batch_size=batch_size, ) ) ) attacks.append( SquareAttack(estimator=estimator, norm=norm, max_iter=5000, eps=eps, p_init=0.8, nb_restarts=5) ) self.norm = norm self.eps = eps self.eps_step = eps_step self.attacks = attacks self.batch_size = batch_size if estimator_orig is not None: self.estimator_orig = estimator_orig else: self.estimator_orig = estimator self._targeted = targeted self._check_params()
def craft(X, Y, art_classifier, attack=None, **attack_params):
    assert art_classifier is not None
    assert attack is not None

    attacker = None
    if attack == ATTACK.PGD:
        eps = attack_params.get('eps', 0.2)
        eps_step = attack_params.get('eps_step', eps / 5.)
        max_iter = attack_params.get('max_iter', 3)
        targeted = attack_params.get('targeted', False)
        batch_size = attack_params.get('batch_size', 128)

        attacker = ProjectedGradientDescent(classifier=art_classifier, norm=np.inf, eps=eps, eps_step=eps_step,
                                            max_iter=max_iter, targeted=targeted, num_random_init=0,
                                            batch_size=batch_size)
    elif attack == ATTACK.DEEPFOOL:
        eps = attack_params.get('eps', 1e-6)
        max_iter = attack_params.get('max_iter', 5)
        nb_grads = attack_params.get('nb_grads', 3)
        batch_size = attack_params.get('batch_size', 1)

        attacker = DeepFool(classifier=art_classifier, max_iter=max_iter, epsilon=eps, nb_grads=nb_grads,
                            batch_size=batch_size)
    elif attack == ATTACK.FGSM:
        eps = attack_params.get('eps', 0.3)
        targeted = attack_params.get('targeted', False)
        batch_size = attack_params.get('batch_size', 128)

        attacker = FastGradientMethod(classifier=art_classifier, norm=np.inf, eps=eps, targeted=targeted,
                                      batch_size=batch_size)
    elif attack == ATTACK.BIM:
        eps = attack_params.get('eps', 0.3)
        eps_step = attack_params.get('eps_step', eps / 5.)
        norm = attack_params.get('norm', np.inf)
        targeted = attack_params.get('targeted', False)
        batch_size = attack_params.get('batch_size', 128)

        attacker = BasicIterativeMethod(classifier=art_classifier, norm=norm, eps=eps, eps_step=eps_step,
                                        targeted=targeted, batch_size=batch_size)
    elif attack == ATTACK.JSMA:
        theta = attack_params.get('theta', 0.3)
        gamma = attack_params.get('gamma', 0.5)
        batch_size = attack_params.get('batch_size', 128)

        attacker = SaliencyMapMethod(classifier=art_classifier, theta=theta, gamma=gamma, batch_size=batch_size)
    elif attack == ATTACK.CW_L2:
        lr = attack_params.get('lr', 0.1)
        bsearch_steps = attack_params.get('bsearch_steps', 10)

        attacker = CarliniL2Method(classifier=art_classifier, learning_rate=lr,
                                   binary_search_steps=bsearch_steps)
    elif attack == ATTACK.CW_Linf:
        lr = attack_params.get('lr', 0.01)

        attacker = CarliniLInfMethod(classifier=art_classifier, learning_rate=lr)
    else:
        raise NotImplementedError(attack, 'is not implemented.')

    print('Generating [{}] adversarial examples, it will take a while...'.format(attack))
    X_adv = attacker.generate(X, y=Y)
    del attacker

    return X_adv
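# Example usage (a minimal sketch; `art_classifier`, `x_test`, and `y_test` are
# hypothetical names for a wrapped ART classifier and its evaluation data; ATTACK
# is the attack-name enum used in the dispatch above):
#
#     x_adv = craft(x_test, y_test, art_classifier, attack=ATTACK.DEEPFOOL,
#                   max_iter=10, nb_grads=5)
#
# Any parameter not supplied via **attack_params falls back to the per-attack
# defaults defined above.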
def __init__(
    self,
    estimator: ClassifierGradients,
    norm: Union[int, float] = np.inf,
    eps: float = 0.3,
    eps_step: float = 0.1,
    attacks: Optional[List[EvasionAttack]] = None,
    batch_size: int = 32,
    estimator_orig: Optional[BaseEstimator] = None,
):
    """
    Create a :class:`.AutoAttack` instance.

    :param estimator: A trained estimator.
    :param norm: The norm of the adversarial perturbation. Possible values: np.inf, 1 or 2.
    :param eps: Maximum perturbation that the attacker can introduce.
    :param eps_step: Attack step size (input variation) at each iteration.
    :param attacks: The list of `art.attacks.EvasionAttack` attacks to be used for AutoAttack. If it is `None`
                    the original AutoAttack (PGD, APGD-ce, APGD-dlr, FAB, Square) will be used.
    :param batch_size: Size of the batch on which adversarial samples are generated.
    :param estimator_orig: Original estimator to be attacked by adversarial examples.
    """
    super().__init__(estimator=estimator)

    if estimator_orig is None:
        estimator_orig = estimator

    if attacks is None:
        attacks = list()
        attacks.append(
            AutoProjectedGradientDescent(
                estimator=estimator,
                norm=norm,
                eps=eps,
                eps_step=eps_step,
                max_iter=100,
                targeted=False,
                nb_random_init=5,
                batch_size=batch_size,
                loss_type="cross_entropy",
            )
        )
        attacks.append(
            AutoProjectedGradientDescent(
                estimator=estimator,
                norm=norm,
                eps=eps,
                eps_step=eps_step,
                max_iter=100,
                targeted=False,
                nb_random_init=5,
                batch_size=batch_size,
                loss_type="difference_logits_ratio",
            )
        )
        attacks.append(
            DeepFool(classifier=estimator, max_iter=100, epsilon=1e-6, nb_grads=3, batch_size=batch_size)
        )
        attacks.append(
            SquareAttack(estimator=estimator, norm=norm, max_iter=5000, eps=eps, p_init=0.8, nb_restarts=5)
        )

    self.norm = norm
    self.eps = eps
    self.eps_step = eps_step
    self.attacks = attacks
    self.batch_size = batch_size
    self.estimator_orig = estimator_orig

    self._check_params()