def test_one_attack_mnist(self):
    """
    Exercise the static adversarial trainer with a single FGSM attack on MNIST.

    Adversarial samples are crafted on `self.classifier_k` and scored on `self.classifier_tf`, once before
    and once after the latter is adversarially trained. Both accuracies are printed; nothing is asserted.

    :return: None
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    true_labels = np.argmax(y_test, axis=1)

    # The attack is crafted on the source model and evaluated on the target model
    source_model = self.classifier_k
    target_model = self.classifier_tf

    fgsm = FastGradientMethod(source_model)
    x_adv = fgsm.generate(x_test)
    nb_samples = x_adv.shape[0]

    hits_before = np.argmax(target_model.predict(x_adv), axis=1) == true_labels
    acc = np.sum(hits_before) / nb_samples

    # Adversarially train the target model for one epoch
    adv_trainer = StaticAdversarialTrainer(target_model, fgsm)
    adv_trainer.fit(x_train, y_train, nb_epochs=1)

    # Score the same adversarial samples again with the trained model
    hits_after = np.argmax(adv_trainer.classifier.predict(x_adv), axis=1) == true_labels
    acc_adv_trained = np.sum(hits_after) / nb_samples

    print('\nAccuracy before adversarial training: %.2f%%' % (acc * 100))
    print('\nAccuracy after adversarial training: %.2f%%' % (acc_adv_trained * 100))
def __init__(self, model, targeted=False, step_size_iter=0.3, max_perturbation=0.1, norm_order=np.inf,
             num_random_init=0, minimal=False, batch_size=16):
    """
    Store the attack configuration and build the underlying `FastGradientMethod` instance.

    :param model: Model handed to the parent constructor; the attack is built on `self.model`.
    :param targeted: Forwarded to `FastGradientMethod` as `targeted`.
    :param step_size_iter: Forwarded to `FastGradientMethod` as `eps_step`.
    :param max_perturbation: Forwarded to `FastGradientMethod` as `eps`.
    :param norm_order: Forwarded to `FastGradientMethod` as `norm`.
    :param num_random_init: Forwarded to `FastGradientMethod` as `num_random_init`.
    :param minimal: Forwarded to `FastGradientMethod` as `minimal`.
    :param batch_size: Forwarded to `FastGradientMethod` as `batch_size` (not stored on the instance).
    """
    super().__init__(model=model)

    self._targeted = targeted
    self._step_size_iter = step_size_iter
    self._max_perturbation = max_perturbation
    self._norm_order = norm_order
    self._num_random_init = num_random_init
    self._minimal = minimal

    # Translate this wrapper's parameter names into the attack's own keyword names
    fgm_settings = {
        'classifier': self.model,
        'norm': self._norm_order,
        'eps': self._max_perturbation,
        'eps_step': self._step_size_iter,
        'targeted': self._targeted,
        'num_random_init': self._num_random_init,
        'batch_size': batch_size,
        'minimal': self._minimal,
    }
    self._method = FastGradientMethod(**fgm_settings)
def test_shared_model_mnist(self):
    """
    Exercise the static adversarial trainer when attack and defence share one model.

    FGSM samples are crafted on `self.classifier_k`, which is then adversarially trained for 2 epochs and
    scored again on the same samples. Both accuracies are printed; nothing is asserted.

    :return: None
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    true_labels = np.argmax(y_test, axis=1)
    nb_samples = y_test.shape[0]

    fit_kwargs = {'nb_epochs': 2, 'batch_size': BATCH_SIZE}
    shared_model = self.classifier_k

    # Craft the adversarial samples and measure the baseline accuracy on them
    fgsm = FastGradientMethod(shared_model)
    x_adv = fgsm.generate(x_test)
    hits_before = np.argmax(shared_model.predict(x_adv), axis=1) == true_labels
    acc = np.sum(hits_before) / nb_samples

    # Adversarial training on the very same model
    adv_trainer = StaticAdversarialTrainer(shared_model, fgsm)
    adv_trainer.fit(x_train, y_train, **fit_kwargs)

    # Measure accuracy on the previously generated adversarial samples
    hits_after = np.argmax(adv_trainer.classifier.predict(x_adv), axis=1) == true_labels
    acc_adv_trained = np.sum(hits_after) / nb_samples

    print('\nAccuracy before adversarial training: %.2f%%' % (acc * 100))
    print('\nAccuracy after adversarial training: %.2f%%' % (acc_adv_trained * 100))
def test_two_attacks_with_generator(self):
    """
    Adversarially train the Keras classifier with FGSM and DeepFool, feeding data through a generator.

    Asserts that accuracy on FGSM test samples stays at or above `ACCURACY_DROP` times its value before
    training, and that the training array is left unmodified.
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_train_backup = x_train.copy()
    true_labels = np.argmax(y_test, axis=1)

    class MyDataGenerator(DataGenerator):
        """Serve random batches, drawn without replacement, from in-memory arrays."""

        def __init__(self, x, y, size, batch_size):
            self.x = x
            self.y = y
            self.size = size
            self.batch_size = batch_size

        def get_batch(self):
            nb_picked = min(self.size, self.batch_size)
            picked = np.random.choice(self.size, size=nb_picked, replace=False)
            return self.x[picked], self.y[picked]

    generator = MyDataGenerator(x_train, y_train, x_train.shape[0], 128)

    fgsm = FastGradientMethod(self.classifier_k)
    deepfool = DeepFool(self.classifier_tf)

    # Baseline accuracy on FGSM samples before any adversarial training
    x_test_adv = fgsm.generate(x_test)
    preds = np.argmax(self.classifier_k.predict(x_test_adv), axis=1)
    acc = np.sum(preds == true_labels) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier_k, attacks=[fgsm, deepfool])
    adv_trainer.fit_generator(generator, nb_epochs=5)

    preds_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    acc_new = np.sum(preds_new == true_labels) / NB_TEST

    # The new accuracy is not required to be higher; a slight drop is tolerated
    self.assertGreaterEqual(acc_new, acc * ACCURACY_DROP)

    logger.info('Accuracy before adversarial training: %.2f%%', (acc * 100))
    logger.info('\nAccuracy after adversarial training: %.2f%%', (acc_new * 100))

    # The original training data must not have been changed along the way
    self.assertTrue((x_train == x_train_backup).all())
def test_iris_unbounded(self):
    """
    Run FGM with eps=1 against an EoT-wrapped Iris classifier that has no clip values.

    Asserts that the adversarial samples differ from the inputs, leave the [0, 1] range on both sides and
    are not all classified correctly.
    """
    (_, _), (x_test, y_test) = self.iris
    base_classifier = get_iris_classifier_kr()

    def identity(x):
        return x

    def transformation():
        # Endless stream of the identity transformation
        while True:
            yield identity

    # Rebuild the classifier around the same model, this time without clip values
    unclipped = KerasClassifier(model=base_classifier._model, use_logits=False, channel_index=1)
    classifier = ExpectationOverTransformations(unclipped, sample_size=1, transformation=transformation)

    attack = FastGradientMethod(classifier, eps=1)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv > 1).any())
    self.assertTrue((x_test_adv < 0).any())

    true_labels = np.argmax(y_test, axis=1)
    preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((true_labels == preds_adv).all())

    acc = np.sum(preds_adv == true_labels) / y_test.shape[0]
    logger.info('Accuracy on Iris with limited query info: %.2f%%', (acc * 100))
def test_iris_unbounded(self):
    """
    Run FGM with eps=1 against a randomized-smoothing wrapper around an unclipped Iris classifier.

    Asserts that the adversarial samples differ from the inputs, leave the [0, 1] range on both sides and
    are not all classified correctly by the smoothed classifier. Accuracy and coverage on clean and on
    adversarial inputs are logged, each under the label that matches the data it was computed from.
    """
    (_, _), (x_test, y_test) = self.iris
    classifier = get_iris_classifier_kr()

    # Recreate a classifier without clip values
    krc = KerasClassifier(model=classifier._model, use_logits=False, channel_index=1)
    rs = RandomizedSmoothing(classifier=krc, sample_size=100, scale=0.01, alpha=0.001)

    attack = FastGradientMethod(rs, eps=1)
    x_test_adv = attack.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv > 1).any())
    self.assertTrue((x_test_adv < 0).any())

    preds_smooth = np.argmax(rs.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(y_test, axis=1) == preds_smooth).all())

    pred_clean = rs.predict(x_test)
    pred_adv = rs.predict(x_test_adv)
    acc_clean, cov_clean = compute_accuracy(pred_clean, y_test)
    acc_adv, cov_adv = compute_accuracy(pred_adv, y_test)

    # Log the adversarial figures under the "adversarial examples" labels and the clean figures under
    # the plain labels (the two pairs used to be swapped).
    logger.info('Accuracy on Iris with smoothing on adversarial examples: %.2f%%', (acc_adv * 100))
    logger.info('Coverage on Iris with smoothing on adversarial examples: %.2f%%', (cov_adv * 100))
    logger.info('Accuracy on Iris with smoothing: %.2f%%', (acc_clean * 100))
    logger.info('Coverage on Iris with smoothing: %.2f%%', (cov_clean * 100))
def test_multi_attack_mnist(self):
    """
    Exercise the static adversarial trainer with two attacks, FGSM and DeepFool, on MNIST.

    Both attacks craft their samples on `self.classifier_tf`; the stacked samples are scored on
    `self.classifier_k` before and after it is adversarially trained for 2 epochs. Both accuracies are
    logged; nothing is asserted.

    :return: None
    """
    (x_train, y_train), (x_test, y_test) = self.mnist

    target_model = self.classifier_k
    source_model = self.classifier_tf

    # Both attacks share the same source classifier
    fgsm = FastGradientMethod(source_model)
    deepfool = DeepFool(source_model)

    # Stack the samples of both attacks, duplicating the labels accordingly
    x_adv = np.vstack((fgsm.generate(x_test), deepfool.generate(x_test)))
    y_adv = np.vstack((y_test, y_test))
    adv_labels = np.argmax(y_adv, axis=1)
    nb_adv = y_adv.shape[0]

    hits_before = np.argmax(target_model.predict(x_adv), axis=1) == adv_labels
    acc = np.sum(hits_before) / nb_adv

    # Adversarial training of the target model against both attacks
    adv_trainer = StaticAdversarialTrainer(target_model, [fgsm, deepfool])
    fit_kwargs = {'nb_epochs': 2, 'batch_size': BATCH_SIZE}
    adv_trainer.fit(x_train, y_train, **fit_kwargs)

    hits_after = np.argmax(adv_trainer.classifier.predict(x_adv), axis=1) == adv_labels
    acc_adv_trained = np.sum(hits_after) / nb_adv

    logger.info('Accuracy before adversarial training: %.2f%%', (acc * 100))
    logger.info('Accuracy after adversarial training: %.2f%%', (acc_adv_trained * 100))
def test_two_attacks(self):
    """
    Adversarially train `self.classifier` with FGSM and DeepFool and pin the resulting accuracies.

    Accuracy on FGSM test samples is measured before and after 2 epochs of adversarial training and
    compared with the expected reference values. The test also checks that `x_test` has not been modified
    by the attack or the classifier.
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()
    true_labels = np.argmax(y_test, axis=1)

    attack1 = FastGradientMethod(classifier=self.classifier, batch_size=16)
    attack2 = DeepFool(classifier=self.classifier, max_iter=5, batch_size=16)

    x_test_adv = attack1.generate(x_test)
    predictions = np.argmax(self.classifier.predict(x_test_adv), axis=1)
    accuracy = np.sum(predictions == true_labels) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier, attacks=[attack1, attack2])
    adv_trainer.fit(x_train, y_train, nb_epochs=2, batch_size=16)

    predictions_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    accuracy_new = np.sum(predictions_new == true_labels) / NB_TEST

    # Accuracies are floating-point ratios: compare with a tolerance instead of exact equality
    self.assertAlmostEqual(accuracy_new, 0.36)
    self.assertAlmostEqual(accuracy, 0.13)

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def test_iris_clipped(self):
    """
    Run an untargeted FGM attack with eps=0.1 against an EoT-wrapped Iris classifier.

    Asserts that the adversarial samples differ from the inputs, stay inside [0, 1] and are not all
    classified correctly.
    """
    (_, _), (x_test, y_test) = self.iris

    def identity(x):
        return x

    def transformation():
        # Endless stream of the identity transformation
        while True:
            yield identity

    wrapped = ExpectationOverTransformations(
        get_iris_classifier_kr(), sample_size=1, transformation=transformation)

    # Untargeted attack
    fgm = FastGradientMethod(wrapped, eps=.1)
    x_test_adv = fgm.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1).all())
    self.assertTrue((x_test_adv >= 0).all())

    true_labels = np.argmax(y_test, axis=1)
    preds_adv = np.argmax(wrapped.predict(x_test_adv), axis=1)
    self.assertFalse((true_labels == preds_adv).all())

    acc = np.sum(preds_adv == true_labels) / y_test.shape[0]
    logger.info('Accuracy on Iris with limited query info: %.2f%%', (acc * 100))
def test_minimal_perturbations_images(fix_get_mnist_subset, get_image_classifier_list_for_attack):
    """
    Check FGM in minimal-perturbation mode on every image classifier available for the attack.

    For each classifier the attack is reconfigured with `minimal=True`, `eps_step=0.1`, `eps=5.0` and the
    result is compared against the expected statistics by `backend_check_adverse_values`. The test returns
    early with a warning when no classifier list is available.
    """
    classifier_list = get_image_classifier_list_for_attack(FastGradientMethod)

    # TODO this guard must be removed once we have a classifier for both image and tabular data
    if classifier_list is None:
        logging.warning("Couldn't perform this test because no classifier is defined")
        return

    for image_classifier in classifier_list:
        fgm = FastGradientMethod(image_classifier, eps=1.0, batch_size=11)
        fgm.set_params(**{"minimal": True, "eps_step": 0.1, "eps": 5.0})

        expected_labels = np.asarray([4, 2, 4, 7, 0, 4, 7, 2, 0, 7, 0])
        expected_values = {
            "x_test_mean": ExpectedValue(0.03896513, 0.01),
            "x_test_min": ExpectedValue(-0.30000000, 0.00001),
            "x_test_max": ExpectedValue(0.30000000, 0.00001),
            "y_test_pred_adv_expected": ExpectedValue(expected_labels, 2),
        }
        backend_check_adverse_values(fgm, fix_get_mnist_subset, expected_values)
def test_subsetscan_detector(self):
    """
    Check the detection power reported by `SubsetScanningDetector` on clean and FGM-perturbed MNIST.

    Scanning clean against clean must give a detection power of 0.5; scanning clean against adversarial
    data, and against a clean/adversarial mix with explicit sizes 85 and 15, must give more than 0.5.
    """
    (x_train, y_train), (x_test, y_test), _, _ = load_dataset('mnist')
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Keras classifier under attack
    classifier = get_classifier_kr()

    # Adversarial counterparts of the training and test data
    fgm = FastGradientMethod(classifier, eps=0.5)
    x_train_adv = fgm.generate(x_train)
    x_test_adv = fgm.generate(x_test)

    # Mixed clean + adversarial pool used for the last scan
    mixed_pool = np.concatenate((x_train, x_train_adv), axis=0)

    background, clean, anomalous = x_train, x_test, x_test_adv
    detector = SubsetScanningDetector(classifier, background, layer=1)

    power = detector.scan(clean, clean)[2]
    self.assertAlmostEqual(power, 0.5)

    power = detector.scan(clean, anomalous)[2]
    self.assertGreater(power, 0.5)

    power = detector.scan(clean, mixed_pool, 85, 15)[2]
    self.assertGreater(power, 0.5)
def test_multi_attack_mnist_with_generator(self):
    """
    Exercise the static adversarial trainer with FGSM and DeepFool, feeding data through a generator.

    Both attacks craft their samples on `self.classifier_tf`; the stacked samples are scored on
    `self.classifier_k` before and after it is adversarially trained for 2 epochs from a data generator.
    Both accuracies are logged and the training data is asserted to be unchanged.

    :return: None
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_train_backup = x_train.copy()

    class MyDataGenerator(DataGenerator):
        """Serve random batches, drawn without replacement, from in-memory arrays."""

        def __init__(self, x, y, size, batch_size):
            self.x = x
            self.y = y
            self.size = size
            self.batch_size = batch_size

        def get_batch(self):
            nb_picked = min(self.size, self.batch_size)
            picked = np.random.choice(self.size, size=nb_picked, replace=False)
            return (self.x[picked], self.y[picked])

    generator = MyDataGenerator(x_train, y_train, x_train.shape[0], BATCH_SIZE)

    target_model = self.classifier_k
    source_model = self.classifier_tf

    # Both attacks share the same source classifier
    fgsm = FastGradientMethod(source_model)
    deepfool = DeepFool(source_model)

    # Stack the samples of both attacks, duplicating the labels accordingly
    x_adv = np.vstack((fgsm.generate(x_test), deepfool.generate(x_test)))
    y_adv = np.vstack((y_test, y_test))
    adv_labels = np.argmax(y_adv, axis=1)
    nb_adv = y_adv.shape[0]

    hits_before = np.argmax(target_model.predict(x_adv), axis=1) == adv_labels
    acc = np.sum(hits_before) / nb_adv

    # Adversarial training driven by the generator
    adv_trainer = StaticAdversarialTrainer(target_model, [fgsm, deepfool])
    fit_kwargs = {'nb_epochs': 2}
    adv_trainer.fit_generator(generator, **fit_kwargs)

    hits_after = np.argmax(adv_trainer.classifier.predict(x_adv), axis=1) == adv_labels
    acc_adv_trained = np.sum(hits_after) / nb_adv

    logger.info('Accuracy before adversarial training: %.2f%%', (acc * 100))
    logger.info('Accuracy after adversarial training: %.2f%%', (acc_adv_trained * 100))

    # The original training data must not have been changed along the way
    self.assertTrue((x_train == x_train_backup).all())
def __init__(self, model, loss_criterion, norm, batch_size=128):
    """
    Wrap a model for use with ART and prepare a `FastGradientMethod` attack on it.

    :param model: Model passed to `wrapModel` together with `loss_criterion`.
    :param loss_criterion: Loss passed to `wrapModel`.
    :param norm: Stored as `self.norm`; it is not forwarded to the attack constructor here.
    :param batch_size: Stored as `self.batch_size` and forwarded to the attack.
    """
    self.wrapped_pytorch_model = wrapModel(model, loss_criterion)
    self.norm = norm
    self.batch_size = batch_size
    self.attack = FastGradientMethod(self.wrapped_pytorch_model, batch_size=batch_size)

    # Prefer the first CUDA device when one is available, otherwise fall back to the CPU
    if torch.cuda.is_available():
        device_name = "cuda:0"
    else:
        device_name = "cpu"
    self.device = torch.device(device_name)
def atk_FastGradient(x_train, x_test, y_train, y_test, classifier):
    """
    Craft FGM adversarial samples (eps=0.1) for the train and test sets and evaluate the classifier.

    :param x_train: Training inputs to perturb.
    :param x_test: Test inputs to perturb.
    :param y_train: Training labels, forwarded to `evaluate`.
    :param y_test: Test labels, forwarded to `evaluate`.
    :param classifier: Classifier the attack is built on and that `evaluate` receives.
    :return: Tuple `(x_test_adv, x_train_adv)` - note the test samples come first.
    """
    epsilon = 0.1
    fgm = FastGradientMethod(classifier)

    x_train_adv = fgm.generate(x_train, eps=epsilon)
    x_test_adv = fgm.generate(x_test, eps=epsilon)

    print("After FastGradient Attack \n")
    evaluate(x_train, x_test, y_train, y_test, x_train_adv, x_test_adv, classifier)

    return x_test_adv, x_train_adv
def test_iris_clipped(self):
    """
    Attack the clipped Iris classifier with FGM (eps=0.1) and exercise the randomized-smoothing wrapper.

    The adversarial samples must differ from the inputs, stay inside [0, 1] and not all be classified
    correctly by the smoothed classifier. Accuracy and coverage on clean and on adversarial inputs are
    logged, each under the label that matches the data it was computed from. The test then checks the
    basic `RandomizedSmoothing` API: `predict`, `loss_gradient`, `class_gradient` and `certify`.
    """
    (_, _), (x_test, y_test) = self.iris
    krc, _ = get_iris_classifier_kr()
    rs = RandomizedSmoothing(classifier=krc, sample_size=100, scale=0.01, alpha=0.001)

    # Test untargeted attack
    attack = FastGradientMethod(krc, eps=.1)
    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1).all())
    self.assertTrue((x_test_adv >= 0).all())

    preds_smooth = np.argmax(rs.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(y_test, axis=1) == preds_smooth).all())

    pred_clean = rs.predict(x_test)
    pred_adv = rs.predict(x_test_adv)
    acc_clean, cov_clean = compute_accuracy(pred_clean, y_test)
    acc_adv, cov_adv = compute_accuracy(pred_adv, y_test)

    # Log the adversarial figures under the "adversarial examples" labels and the clean figures under
    # the plain labels (the two pairs used to be swapped).
    logger.info('Accuracy on Iris with smoothing on adversarial examples: %.2f%%', (acc_adv * 100))
    logger.info('Coverage on Iris with smoothing on adversarial examples: %.2f%%', (cov_adv * 100))
    logger.info('Accuracy on Iris with smoothing: %.2f%%', (acc_clean * 100))
    logger.info('Coverage on Iris with smoothing: %.2f%%', (cov_clean * 100))

    # Check basic functionality of RS object
    # check predict
    y_test_smooth = rs.predict(x=x_test)
    self.assertEqual(y_test_smooth.shape, y_test.shape)
    self.assertTrue((np.sum(y_test_smooth, axis=1) <= 1).all())

    # check gradients
    grad_smooth1 = rs.loss_gradient(x=x_test, y=y_test)
    grad_smooth2 = rs.class_gradient(x=x_test, label=None)
    grad_smooth3 = rs.class_gradient(x=x_test, label=np.argmax(y_test, axis=1))
    self.assertEqual(grad_smooth1.shape, x_test_adv.shape)
    self.assertEqual(grad_smooth2.shape[0], len(x_test))
    self.assertEqual(grad_smooth3.shape[0], len(x_test))

    # check certification
    pred, radius = rs.certify(x=x_test, n=250)
    self.assertEqual(len(pred), len(x_test))
    self.assertEqual(len(radius), len(x_test))
    self.assertTrue((radius <= 1).all())
    self.assertTrue((pred < y_test.shape[1]).all())
def run_fgsm_attacks(
    classifier,
    target_image,
    eps,
    mask_width=20,
    masked=True,
    img_show=True,
    debug=True,
    use_art=True,
    feature_extractor="blazeface",
    iter_step=1,
):
    """
    Craft an FGSM adversarial image for `target_image`, optionally restricting the noise with a mask.

    :param classifier: Classifier handed to `FastGradientMethod`.
    :param target_image: Image to attack; it is wrapped into a batch of one before generation.
    :param eps: Perturbation size handed to `FastGradientMethod`.
    :param mask_width: Forwarded to `apply_mask_to_adv_noise` (blazeface branch only).
    :param masked: When true, the adversarial image is post-processed by one of the masking helpers.
    :param img_show: Not referenced in this function body.
    :param debug: Not referenced in this function body.
    :param use_art: Must be true; the non-ART path is not implemented.
    :param feature_extractor: "blazeface" selects `apply_mask_to_adv_noise`; any other value selects
        `apply_mask_to_adv_noise_mtcnn`.
    :param iter_step: Number of times the generate/mask step is repeated.
    :return: The adversarial image of the last successful iteration, or `None` if no iteration succeeded
        (for example when `iter_step` is 0 or every iteration raised).
    :raises NotImplementedError: If `use_art` is false.
    """
    adv_image = None
    if use_art:
        attack = FastGradientMethod(classifier=classifier, eps=eps)
        x_adv = None
        for i in range(iter_step):
            try:
                # `start` / `end` bracket the generation call but are never read afterwards
                start = time.time()
                # The previous result is fed back in through `x_adv_init`.
                # NOTE(review): confirm that this attack's `generate` accepts `x_adv_init` / `resume`.
                x_adv = attack.generate(x=np.array([target_image]), x_adv_init=x_adv, resume=True)
                end = time.time()
                # Apply mask
                adv_image = x_adv[0].astype(np.uint)
                target_image_copy = target_image.copy()
                if masked:
                    if feature_extractor == "blazeface":
                        adv_image, _ = apply_mask_to_adv_noise(
                            target_image, adv_image, mask_width=mask_width)
                    else:
                        adv_image, _ = apply_mask_to_adv_noise_mtcnn(
                            target_image, adv_image)
                # The L-inf norm of the perturbation is computed against a 128x128 resize of the input.
                # presumably the masked adversarial image is 128x128 as well - verify against the helpers
                target_image_copy = cv.resize(target_image_copy, (128, 128))
                norm = np.linalg.norm(np.reshape(adv_image - target_image_copy, [-1]), ord=np.inf)
                logging.debug(f'debug: norm: {norm}')
            except Exception as e:
                # Any failure in an iteration is logged and the loop moves on to the next iteration
                logging.error(e)
        attack.max_iter = iter_step
    else:
        raise NotImplementedError
        # Unfinished sketch of a non-ART implementation, kept for reference:
        # loss_object = keras.losses.CategoricalCrossentropy()
        # with tf.GradientTape() as tape:
        #     tape.watch(target_image)
        #     prediction = classifier(target_image)
        #     loss = loss_object(prediciton, )
    return adv_image
def fgsm(model, X, y, optimizer, epsilon=0.1):
    """
    Construct FGSM adversarial examples on the examples X.

    The ART classifier is built around the module-level `model_concetenate` and `custom_loss`; the
    `model` argument is not referenced in the body.

    :param model: Not referenced in this function body.
    :param X: Input tensor; converted with `.numpy()` before the attack.
    :param y: Label tensor; converted with `.numpy()` and passed to the attack as `y`.
    :param optimizer: Optimizer handed to `PyTorchClassifier`.
    :param epsilon: Perturbation size handed to `FastGradientMethod` as `eps`.
    :return: The adversarial examples as a `torch.Tensor`.
    """
    art_classifier = PyTorchClassifier(
        model=model_concetenate,
        loss=custom_loss,
        optimizer=optimizer,
        input_shape=(1, 28, 28),
        nb_classes=10,
        device_type='gpu',
    )
    fgm = FastGradientMethod(classifier=art_classifier, eps=epsilon)

    inputs_np = X.numpy()
    labels_np = y.numpy()
    adversarial_np = fgm.generate(inputs_np, y=labels_np)
    return torch.Tensor(adversarial_np)
def test_two_attacks_with_generator(self):
    """
    Adversarially train `self.classifier` with FGSM and DeepFool from a data generator.

    Accuracy on FGSM test samples is measured before and after 3 epochs of adversarial training and
    compared with the expected reference values. The test also checks that neither `x_train` nor `x_test`
    has been modified.
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    x_train_backup = x_train.copy()
    x_test_backup = x_test.copy()
    true_labels = np.argmax(y_test, axis=1)

    class MyDataGenerator(DataGenerator):
        """Serve random batches, drawn without replacement, from in-memory arrays."""

        def __init__(self, x, y, size, batch_size):
            super().__init__(size=size, batch_size=batch_size)
            self.x = x
            self.y = y
            self._size = size
            self._batch_size = batch_size

        def get_batch(self):
            nb_picked = min(self.size, self.batch_size)
            picked = np.random.choice(self.size, size=nb_picked, replace=False)
            return self.x[picked], self.y[picked]

    generator = MyDataGenerator(x_train, y_train, size=x_train.shape[0], batch_size=16)

    fgsm = FastGradientMethod(classifier=self.classifier, batch_size=16)
    deepfool = DeepFool(classifier=self.classifier, max_iter=5, batch_size=16)

    # Baseline accuracy on FGSM samples before any adversarial training
    x_test_adv = fgsm.generate(x_test)
    predictions = np.argmax(self.classifier.predict(x_test_adv), axis=1)
    accuracy = np.sum(predictions == true_labels) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier, attacks=[fgsm, deepfool])
    adv_trainer.fit_generator(generator, nb_epochs=3)

    predictions_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    accuracy_new = np.sum(predictions_new == true_labels) / NB_TEST

    self.assertAlmostEqual(accuracy_new, 0.25, delta=0.02)
    self.assertAlmostEqual(accuracy, 0.11, delta=0.0)

    # Neither x_train nor x_test may have been modified by the attack or the classifier
    max_train_change = float(np.max(np.abs(x_train_backup - x_train)))
    max_test_change = float(np.max(np.abs(x_test_backup - x_test)))
    self.assertAlmostEqual(max_train_change, 0.0, delta=0.00001)
    self.assertAlmostEqual(max_test_change, 0.0, delta=0.00001)
def evaluate_fgsm(self, data_loader):
    """
    Adversarial evaluation by FGSM on the first batch of `data_loader`.

    The perturbation size is read from `attack_configs['FGSM'][self.dataset]['epsilon']` and the attack
    uses the L-inf norm.

    :param data_loader: Iterable yielding `(examples, labels)` tensor batches; only the first one is used.
    :return: Fraction of adversarial examples in that batch that are still classified correctly.
    """
    eps = attack_configs['FGSM'][self.dataset]['epsilon']
    fgm = FastGradientMethod(self.classifier, norm=np.inf, eps=eps)

    # Only the first batch of the loader is evaluated
    first_examples, first_labels = next(iter(data_loader))
    examples = first_examples.cpu().numpy()
    labels = first_labels.cpu().numpy()

    # One-hot encode the integer labels by indexing rows of an identity matrix
    one_hot = np.eye(self.nb_classes)[labels]
    examples_adv = fgm.generate(examples, y=one_hot)

    predicted = np.argmax(self.classifier.predict(examples_adv), axis=1)
    return np.sum(predicted == labels) / labels.shape[0]
def test_binary_activation_detector(self):
    """
    Test the binary activation detector end-to-end.

    A small CNN detector is trained on layer-0 activations of clean and FGM-perturbed MNIST training data,
    then applied to clean and adversarial test data. The test asserts at least one true positive and one
    true negative.

    :return: None
    """
    # Get MNIST, truncated to the sizes used by the test suite
    (x_train, y_train), (x_test, y_test), _, _ = load_mnist()
    x_train, y_train = x_train[:NB_TRAIN], y_train[:NB_TRAIN]
    x_test, y_test = x_test[:NB_TEST], y_test[:NB_TEST]

    # Keras classifier
    classifier = get_classifier_kr()

    # Generate adversarial samples. The arrays are already truncated above, so they are used as they are;
    # in particular the test set is no longer re-sliced with the training-set size.
    attacker = FastGradientMethod(classifier, eps=0.1)
    x_train_adv = attacker.generate(x_train)
    x_test_adv = attacker.generate(x_test)

    # Compile training data for detector: label [1, 0] marks clean samples, [0, 1] adversarial ones
    x_train_detector = np.concatenate((x_train, x_train_adv), axis=0)
    y_train_detector = np.concatenate((np.array([[1, 0]] * NB_TRAIN), np.array([[0, 1]] * NB_TRAIN)), axis=0)

    # Create a simple CNN for the detector; its input shape is that of the layer-0 activations
    activation_shape = classifier.get_activations(x_test[:1], 0).shape[1:]
    number_outputs = 2
    model = Sequential()
    model.add(MaxPooling2D(pool_size=(2, 2), input_shape=activation_shape))
    model.add(Flatten())
    model.add(Dense(number_outputs, activation='softmax'))
    model.compile(loss=keras.losses.categorical_crossentropy, optimizer=keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])

    # Create detector and train it.
    # Detector consider activations at layer=0:
    detector = BinaryActivationDetector(classifier=classifier,
                                        detector=KerasClassifier(model=model, clip_values=(0, 1),
                                                                 use_logits=False),
                                        layer=0)
    detector.fit(x_train_detector, y_train_detector, nb_epochs=2, batch_size=128)

    # Apply detector on clean and adversarial test data:
    test_detection = np.argmax(detector.predict(x_test), axis=1)
    test_adv_detection = np.argmax(detector.predict(x_test_adv), axis=1)

    # Assert there is at least one true positive and negative
    nb_true_positives = len(np.where(test_adv_detection == 1)[0])
    nb_true_negatives = len(np.where(test_detection == 0)[0])
    logger.debug('Number of true positives detected: %i', nb_true_positives)
    logger.debug('Number of true negatives detected: %i', nb_true_negatives)
    self.assertGreater(nb_true_positives, 0)
    self.assertGreater(nb_true_negatives, 0)
def test_targeted_images(fix_get_mnist_subset, get_image_classifier_list_for_attack):
    """
    Check targeted FGM in minimal-perturbation mode on every image classifier available for the attack.

    For each classifier the attack is reconfigured with `minimal=True`, `eps_step=0.01`, `eps=1.0` and
    handed to `backend_targeted_images`. The test returns early with a warning when no classifier list is
    available.
    """
    classifier_list = get_image_classifier_list_for_attack(FastGradientMethod)

    # TODO this guard must be removed once we have a classifier for both image and tabular data
    if classifier_list is None:
        logging.warning("Couldn't perform this test because no classifier is defined")
        return

    for image_classifier in classifier_list:
        fgm = FastGradientMethod(image_classifier, eps=1.0, targeted=True)
        fgm.set_params(**{"minimal": True, "eps_step": 0.01, "eps": 1.0})
        backend_targeted_images(fgm, fix_get_mnist_subset)
def test_classifier_match(self):
    """
    Check that a trainer built from a single attack keeps one attack bound to the trainer's classifier.
    """
    fgm = FastGradientMethod(self.classifier_k)
    trainer = AdversarialTrainer(self.classifier_k, fgm)

    self.assertEqual(len(trainer.attacks), 1)

    first_attack = trainer.attacks[0]
    self.assertEqual(first_attack.classifier, trainer.classifier)
class FGMAttack(AdversarialAttack):
    """Adversarial attack that delegates sample generation to ART's `FastGradientMethod`."""

    def __init__(self, model, targeted=False, step_size_iter=0.3, max_perturbation=0.1, norm_order=np.inf,
                 num_random_init=0, minimal=False, batch_size=16):
        """
        Store the attack configuration and build the underlying `FastGradientMethod` instance.

        :param model: Model handed to the parent constructor; the attack is built on `self.model`.
        :param targeted: Forwarded to `FastGradientMethod` as `targeted`.
        :param step_size_iter: Forwarded to `FastGradientMethod` as `eps_step`.
        :param max_perturbation: Forwarded to `FastGradientMethod` as `eps`.
        :param norm_order: Forwarded to `FastGradientMethod` as `norm`.
        :param num_random_init: Forwarded to `FastGradientMethod` as `num_random_init`.
        :param minimal: Forwarded to `FastGradientMethod` as `minimal`, and again on every `generate` call.
        :param batch_size: Forwarded to `FastGradientMethod` as `batch_size` (not stored on the instance).
        """
        super().__init__(model=model)

        self._targeted = targeted
        self._step_size_iter = step_size_iter
        self._max_perturbation = max_perturbation
        self._norm_order = norm_order
        self._num_random_init = num_random_init
        self._minimal = minimal

        # Translate this wrapper's parameter names into the attack's own keyword names
        fgm_settings = {
            'classifier': self.model,
            'norm': self._norm_order,
            'eps': self._max_perturbation,
            'eps_step': self._step_size_iter,
            'targeted': self._targeted,
            'num_random_init': self._num_random_init,
            'batch_size': batch_size,
            'minimal': self._minimal,
        }
        self._method = FastGradientMethod(**fgm_settings)

    def attack_method(self, x, y=None):
        """
        Generate adversarial samples for `x`.

        :param x: Inputs handed to `FastGradientMethod.generate`.
        :param y: Optional labels; forwarded as `y` only when not `None`.
        :return: Whatever `FastGradientMethod.generate` returns.
        """
        generate_kwargs = {'minimal': self._minimal}
        if y is None:
            return self._method.generate(x=x, **generate_kwargs)

        generate_kwargs['y'] = y
        return self._method.generate(x=x, **generate_kwargs)
def main(config_filepath):
    """
    Generate FGM adversarial examples for a saved dataset and model, and store them on disk.

    Paths for the inputs, labels, model, clip values and output all come from the loaded configuration.
    If the output file already exists the user is asked to confirm overwriting (declining aborts).

    :param config_filepath: Path handed to `load_config`.
    """
    config = load_config(config_filepath)
    output_path = config.x_adv_output_path

    if os.path.isfile(output_path):
        click.confirm(f"Overwrite {config.x_adv_output_path}?", abort=True)

    # Fixed seed for both NumPy and PyTorch
    seed = 45616451
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Load data; the labels are loaded as well although the attack below does not use them
    x = torch.load(config.x_filepath)
    original_shape = x.shape
    y = torch.load(config.y_filepath)

    # Flatten every sample to one dimension
    x = x.reshape(x.shape[0], -1)

    model = torch.load(config.model_filepath)

    with open(config.clip_values_filepath, "r") as f:
        clip_config = json.load(f)
    clip_values = (clip_config.get("min_pixel_value"), clip_config.get("max_pixel_value"))

    # TODO: move these parameters to config
    classifier = PyTorchClassifier(
        model=model,
        clip_values=clip_values,
        loss=model.criterion,
        optimizer=model.optimizer,
        input_shape=(1, 28, 28),
        nb_classes=10,
    )

    # TODO: move these parameters to config
    attack = FastGradientMethod(classifier=classifier, eps=0.2)
    x_adv_flat = attack.generate(x=x)

    # Restore the original data shape before saving
    x_adv = torch.from_numpy(x_adv_flat.reshape(original_shape))
    torch.save(x_adv, output_path)
def test_fit_predict(self):
    """
    Adversarially train the Keras classifier with a single FGSM attack and check `fit` / `predict`.

    Asserts that accuracy on FGSM test samples stays at or above `ACCURACY_DROP` times its value before
    training.
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    true_labels = np.argmax(y_test, axis=1)

    fgsm = FastGradientMethod(self.classifier_k)
    x_test_adv = fgsm.generate(x_test)

    # Baseline accuracy on the adversarial samples
    preds = np.argmax(self.classifier_k.predict(x_test_adv), axis=1)
    acc = np.sum(preds == true_labels) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier_k, fgsm)
    adv_trainer.fit(x_train, y_train, nb_epochs=5, batch_size=128)

    # Accuracy on the same samples after adversarial training
    preds_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    acc_new = np.sum(preds_new == true_labels) / NB_TEST

    self.assertGreaterEqual(acc_new, acc * ACCURACY_DROP)

    logger.info('Accuracy before adversarial training: %.2f%%', (acc * 100))
    logger.info('Accuracy after adversarial training: %.2f%%', (acc_new * 100))
def general_test_v2(model, optimizer, input_shape, nb_classes, test_loader, method, conf, btrain=False,
                    model_file='last_model_92_sgd.pkl'):
    """
    Evaluate `model` on adversarial versions of the test data crafted with the chosen attack.

    :param model: PyTorch model to evaluate; put into eval mode here.
    :param optimizer: Optimizer handed to `PyTorchClassifier`.
    :param input_shape: Input shape handed to `PyTorchClassifier`.
    :param nb_classes: Number of classes handed to `PyTorchClassifier`.
    :param test_loader: Data handed to `adv_generalization` to build the adversarial dataset.
    :param method: Attack name, one of 'Deepfool', 'BIM', 'JSMA', 'CW2', 'CWI' or 'FGSM'.
    :param conf: Forwarded unchanged to `adv_generalization`.
    :param btrain: When false, the weights are first loaded from `model_file` (key 'state_dict').
    :param model_file: Checkpoint path used when `btrain` is false.
    :return: `correct / total` accumulated over the adversarial batches.
    :raises ValueError: If `method` is not one of the supported attack names.
    """
    # Fail fast on an unknown attack name, before any checkpoint loading or model wrapping; otherwise
    # the function would only fail later with an unbound `adv_crafter`.
    supported_methods = ('Deepfool', 'BIM', 'JSMA', 'CW2', 'CWI', 'FGSM')
    if method not in supported_methods:
        raise ValueError('Unknown attack method %r; expected one of %s'
                         % (method, ', '.join(supported_methods)))

    if not btrain:
        checked_state = torch.load(model_file)['state_dict']
        model.load_state_dict(checked_state)
    model.eval()

    loss = nn.CrossEntropyLoss()
    warped_model = PyTorchClassifier(model, loss, optimizer, input_shape, nb_classes, clip_values=(.0, 1.))

    if method == 'Deepfool':
        adv_crafter = DeepFool(warped_model)
    elif method == 'BIM':
        adv_crafter = BasicIterativeMethod(warped_model, batch_size=32)
    elif method == 'JSMA':
        adv_crafter = SaliencyMapMethod(warped_model, batch_size=32)
    elif method == 'CW2':
        adv_crafter = CarliniL2Method(warped_model, batch_size=32)
    elif method == 'CWI':
        adv_crafter = CarliniLInfMethod(warped_model, batch_size=32)
    else:
        # Only 'FGSM' is left after the validation above
        adv_crafter = FastGradientMethod(warped_model, batch_size=32)

    correct, total = 0, 0
    adv_dataset = adv_generalization(test_loader, adv_crafter, conf)
    temp_loader = DataLoader(dataset=adv_dataset, batch_size=32, shuffle=False, drop_last=True)

    # Evaluation runs on the GPU: every batch is moved with `.cuda()`
    for images, labels in temp_loader:
        images = Variable(images.cuda())
        labels = Variable(labels.cuda())
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.data).sum()

    print('Accuracy of the model on the test images: %d %%' % (100 * float(correct) / total))
    print('Accuracy of the model on the test images:', float(correct) / total)
    return correct / total
def test_two_attacks(self):
    """
    Adversarially train the Keras classifier with FGSM and DeepFool.

    Asserts that accuracy on FGSM test samples stays at or above `ACCURACY_DROP` times its value before
    training.
    """
    (x_train, y_train), (x_test, y_test) = self.mnist
    true_labels = np.argmax(y_test, axis=1)

    fgsm = FastGradientMethod(self.classifier_k)
    deepfool = DeepFool(self.classifier_tf)

    # Baseline accuracy on FGSM samples before any adversarial training
    x_test_adv = fgsm.generate(x_test)
    preds = np.argmax(self.classifier_k.predict(x_test_adv), axis=1)
    acc = np.sum(preds == true_labels) / NB_TEST

    adv_trainer = AdversarialTrainer(self.classifier_k, attacks=[fgsm, deepfool])
    adv_trainer.fit(x_train, y_train, nb_epochs=5, batch_size=128)

    preds_new = np.argmax(adv_trainer.predict(x_test_adv), axis=1)
    acc_new = np.sum(preds_new == true_labels) / NB_TEST

    # The new accuracy is not required to be higher; a slight drop is tolerated
    self.assertGreaterEqual(acc_new, acc * ACCURACY_DROP)

    logger.info('Accuracy before adversarial training: %.2f%%', (acc * 100))
    logger.info('\nAccuracy after adversarial training: %.2f%%', (acc_new * 100))
def robust_score_test(eps=0.1, X_test=None, y_test=None, model=None, feature_selector=None, scorer=None):
    """
    Measure how much a model's score drops under an FGM attack on the feature-selected test data.

    :param eps: Perturbation size handed to `FastGradientMethod`.
    :param X_test: Test inputs; passed through `feature_selector.transform` first.
    :param y_test: Test labels handed to `scorer`.
    :param model: Model to evaluate; a deep copy is attacked and scored, the original is left untouched.
    :param feature_selector: Object whose `transform` selects the features the model expects.
    :param scorer: Callable invoked as `scorer(model, X, y)`.
    :return: Score on the clean data minus score on the adversarial data.
    """
    selected_features = feature_selector.transform(X_test)
    model_copy = copy.deepcopy(model)

    fgm = FastGradientMethod(SklearnClassifier(model=model_copy), eps=eps, batch_size=1)
    adversarial_features = fgm.generate(selected_features)

    clean_score = scorer(model_copy, selected_features, y_test)
    corrupted_score = scorer(model_copy, adversarial_features, y_test)
    return clean_score - corrupted_score
def _test_mnist_targeted(self, classifier, x_test, y_test):
    """
    Run a targeted, minimal-perturbation FGSM attack (np.inf norm) aimed at each sample's runner-up class.

    Asserts that the adversarial samples differ from the inputs and that at least half of them are
    classified as their target class.

    :param classifier: Classifier to attack and to evaluate the result with.
    :param x_test: Inputs to perturb.
    :param y_test: Not referenced in this function body.
    """
    attack = FastGradientMethod(classifier, eps=1.0, targeted=True)

    # Target for every sample: the class with the second-highest prediction, one-hot over 10 classes
    nb_samples = x_test.shape[0]
    runner_up = classifier.predict(x_test).argsort(axis=1)[:, -2]
    y_test_adv = np.zeros((nb_samples, 10))
    y_test_adv[np.arange(nb_samples), runner_up] = 1.0

    attack.set_params(**{"minimal": True, "eps_step": 0.01, "eps": 1.0})
    x_test_adv = attack.generate(x_test, y=y_test_adv)

    self.assertFalse((x_test == x_test_adv).all())

    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertEqual(y_test_adv.shape, test_y_pred.shape)

    nb_matching_entries = (y_test_adv == test_y_pred).sum()
    self.assertGreaterEqual(nb_matching_entries, nb_samples // 2)
def test_keras_iris_unbounded(self):
    """
    Run FGM with eps=1 against a Keras Iris classifier rebuilt without clip values.

    Asserts that the adversarial samples differ from the inputs, leave the [0, 1] range on both sides and
    are not all classified correctly.
    """
    (_, _), (x_test, y_test) = self.iris
    clipped = get_iris_classifier_kr()

    # Rebuild the classifier around the same model, this time without clip values
    classifier = KerasClassifier(model=clipped._model, use_logits=False, channel_index=1)

    fgm = FastGradientMethod(classifier, eps=1)
    x_test_adv = fgm.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv > 1).any())
    self.assertTrue((x_test_adv < 0).any())

    true_labels = np.argmax(y_test, axis=1)
    predictions_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((true_labels == predictions_adv).all())

    accuracy = np.sum(predictions_adv == true_labels) / y_test.shape[0]
    logger.info('Accuracy on Iris with FGM adversarial examples: %.2f%%', (accuracy * 100))