def test_scikitlearn(self):
    """Run untargeted and targeted PGD against scikit-learn models wrapped by ART."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC, LinearSVC
    from art.classifiers.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnSVC

    # Map each scikit-learn model class to its ART wrapper class.
    model_to_wrapper = {
        LogisticRegression: ScikitlearnLogisticRegression,
        SVC: ScikitlearnSVC,
        LinearSVC: ScikitlearnSVC,
    }

    (_, _), (x_test, y_test) = self.iris
    x_test_original = x_test.copy()

    for model_class, wrapper_class in model_to_wrapper.items():
        classifier = wrapper_class(model=model_class(), clip_values=(0, 1))
        classifier.fit(x=x_test, y=y_test)

        # Untargeted attack: adversarial inputs must change, stay inside
        # the clip range, and degrade accuracy.
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1)
        adv_x = attack.generate(x_test)
        self.assertFalse((x_test == adv_x).all())
        self.assertTrue((adv_x <= 1).all())
        self.assertTrue((adv_x >= 0).all())
        adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
        true_labels = np.argmax(y_test, axis=1)
        self.assertFalse((true_labels == adv_labels).all())
        acc = np.sum(adv_labels == true_labels) / y_test.shape[0]
        logger.info(
            'Accuracy of ' + classifier.__class__.__name__ + ' on Iris with PGD adversarial examples: '
            '%.2f%%', (acc * 100))

        # Targeted attack: at least one prediction should reach its target.
        targets = random_targets(y_test, nb_classes=3)
        attack = ProjectedGradientDescent(classifier, targeted=True, eps=1, eps_step=0.1)
        adv_x = attack.generate(x_test, y=targets)
        self.assertFalse((x_test == adv_x).all())
        self.assertTrue((adv_x <= 1).all())
        self.assertTrue((adv_x >= 0).all())
        adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
        target_labels = np.argmax(targets, axis=1)
        self.assertTrue((target_labels == adv_labels).any())
        acc = np.sum(adv_labels == target_labels) / y_test.shape[0]
        logger.info(
            'Success rate of ' + classifier.__class__.__name__ + ' on targeted PGD on Iris: %.2f%%',
            (acc * 100))

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(
            float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def test_iris_pt(self):
    """Run untargeted and targeted PGD against the PyTorch Iris classifier."""
    (_, _), (x_test, y_test) = self.iris
    classifier = get_iris_classifier_pt()

    # Untargeted attack: perturbed inputs change, stay in [0, 1], lose accuracy.
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1)
    adv_x = attack.generate(x_test)
    self.assertFalse((x_test == adv_x).all())
    self.assertTrue((adv_x <= 1).all())
    self.assertTrue((adv_x >= 0).all())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    true_labels = np.argmax(y_test, axis=1)
    self.assertFalse((true_labels == adv_labels).all())
    acc = np.sum(adv_labels == true_labels) / y_test.shape[0]
    logger.info('Accuracy on Iris with PGD adversarial examples: %.2f%%', (acc * 100))

    # Targeted attack: at least one prediction should match its random target.
    targets = random_targets(y_test, nb_classes=3)
    attack = ProjectedGradientDescent(classifier, targeted=True, eps=1, eps_step=0.1)
    adv_x = attack.generate(x_test, y=targets)
    self.assertFalse((x_test == adv_x).all())
    self.assertTrue((adv_x <= 1).all())
    self.assertTrue((adv_x >= 0).all())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    target_labels = np.argmax(targets, axis=1)
    self.assertTrue((target_labels == adv_labels).any())
    acc = np.sum(adv_labels == target_labels) / y_test.shape[0]
    logger.info('Success rate of targeted PGD on Iris: %.2f%%', (acc * 100))
def _test_backend_mnist(self, classifier):
    """Check PGD on MNIST, with and without random initialisations."""
    # Get MNIST
    (x_train, y_train), (x_test, y_test) = self.mnist

    def accuracy(pred_one_hot, truth_one_hot):
        # Fraction of examples whose predicted class matches the true class.
        return np.sum(
            np.argmax(pred_one_hot, axis=1) == np.argmax(truth_one_hot, axis=1)
        ) / truth_one_hot.shape[0]

    # Test PGD with np.inf norm
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    logger.info('Accuracy on adversarial train examples: %.2f%%',
                accuracy(train_y_pred, y_train) * 100)
    logger.info('Accuracy on adversarial test examples: %.2f%%',
                accuracy(test_y_pred, y_test) * 100)

    # Test PGD with 3 random initialisations
    attack = ProjectedGradientDescent(classifier, num_random_init=3)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    logger.info(
        'Accuracy on adversarial train examples with 3 random initialisations: %.2f%%',
        accuracy(train_y_pred, y_train) * 100)
    logger.info(
        'Accuracy on adversarial test examples with 3 random initialisations: %.2f%%',
        accuracy(test_y_pred, y_test) * 100)
class PGDAttack(AdversarialAttack):
    """Projected Gradient Descent attack, delegating to ART's implementation.

    The underlying ART attack object is configured once at construction
    time; ``attack_method`` then only forwards inputs (and optional labels
    or targets) to it.
    """

    def __init__(self, model, targeted=False, step_size_iter=.1,
                 max_perturbation=.3, norm_order=np.inf, max_iterations=100,
                 num_random_init=0, batch_size=16):
        super().__init__(model=model)
        # Keep the configuration on the instance so it can be inspected later.
        self._targeted = targeted
        self._step_size_iter = step_size_iter
        self._max_perturbation = max_perturbation
        self._norm_order = norm_order
        self._max_iterations = max_iterations
        self._num_random_init = num_random_init
        # Build the ART attack object once, up front.
        self._method = ProjectedGradientDescent(
            classifier=self.model,
            targeted=self._targeted,
            norm=self._norm_order,
            eps=self._max_perturbation,
            eps_step=self._step_size_iter,
            max_iter=self._max_iterations,
            num_random_init=self._num_random_init,
            batch_size=batch_size)

    def attack_method(self, x, y=None):
        """Generate adversarial examples for ``x``; ``y`` supplies labels/targets."""
        kwargs = {} if y is None else {'y': y}
        return self._method.generate(x=x, **kwargs)
def test_pytorch_iris_pt(self):
    """Untargeted and targeted PGD against the PyTorch tabular (Iris) classifier."""
    classifier = get_tabular_classifier_pt()

    # Untargeted attack: inputs must change, stay in [0, 1], and lose accuracy.
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
    adv_x = attack.generate(self.x_test_iris)
    self.assertFalse((self.x_test_iris == adv_x).all())
    self.assertTrue((adv_x <= 1).all())
    self.assertTrue((adv_x >= 0).all())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    true_labels = np.argmax(self.y_test_iris, axis=1)
    self.assertFalse((true_labels == adv_labels).all())
    acc = np.sum(adv_labels == true_labels) / self.y_test_iris.shape[0]
    logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%", (acc * 100))

    # Targeted attack: at least one prediction should reach its target class.
    targets = random_targets(self.y_test_iris, nb_classes=3)
    attack = ProjectedGradientDescent(classifier, targeted=True, eps=1, eps_step=0.1, max_iter=5)
    adv_x = attack.generate(self.x_test_iris, y=targets)
    self.assertFalse((self.x_test_iris == adv_x).all())
    self.assertTrue((adv_x <= 1).all())
    self.assertTrue((adv_x >= 0).all())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    target_labels = np.argmax(targets, axis=1)
    self.assertTrue((target_labels == adv_labels).any())
    acc = np.sum(adv_labels == target_labels) / self.y_test_iris.shape[0]
    logger.info("Success rate of targeted PGD on Iris: %.2f%%", (acc * 100))
def pgd_linf(model, X, y, optimizer, epsilon=0.1):
    """Construct PGD (L-inf) adversarial examples on the examples X.

    :param model: PyTorch module to attack; wrapped in an ART classifier.
    :param X: input batch as a torch.Tensor (CPU), shape (N, 1, 28, 28).
    :param y: labels for X as a torch.Tensor.
    :param optimizer: optimizer handed to the ART wrapper (required by its API).
    :param epsilon: maximum L-inf perturbation size.
    :return: adversarial batch as a torch.Tensor.
    """
    classifier = PyTorchClassifier(
        # BUG FIX: previously wrapped the module-level `model_concetenate`,
        # silently ignoring the `model` argument passed by the caller.
        model=model,
        # NOTE(review): `custom_loss` is a module-level name — confirm it is
        # the intended loss for this attack.
        loss=custom_loss,
        optimizer=optimizer,
        input_shape=(1, 28, 28),
        nb_classes=10,
        device_type='gpu',
    )
    attack = ProjectedGradientDescent(classifier=classifier, eps=epsilon)
    X_adv = attack.generate(X.numpy(), y.numpy())
    return torch.Tensor(X_adv)
def test_iris_k_unbounded(self):
    """PGD on an unclipped Keras Iris classifier may leave the [0, 1] range."""
    (_, _), (x_test, y_test) = self.iris
    classifier, _ = get_iris_classifier_kr()

    # Recreate a classifier without clip values
    classifier = KerasClassifier(model=classifier._model, use_logits=False, channel_index=1)
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.2)
    adv_x = attack.generate(x_test)
    self.assertFalse((x_test == adv_x).all())
    # Without clip values, some perturbed features should escape [0, 1].
    self.assertTrue((adv_x > 1).any())
    self.assertTrue((adv_x < 0).any())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    true_labels = np.argmax(y_test, axis=1)
    self.assertFalse((true_labels == adv_labels).all())
    acc = np.sum(adv_labels == true_labels) / y_test.shape[0]
    logger.info('Accuracy on Iris with PGD adversarial examples: %.2f%%', (acc * 100))
def evaluate_pgd(self, data_loader, num_iter=40):
    """Adversarial evaluation by PGD.

    Crafts L-inf PGD adversarial examples for the first batch of
    ``data_loader`` and returns the classifier's accuracy on them.
    """
    eps = attack_configs['PGD'][self.dataset]['epsilon']
    # Spread twice the budget over the available iterations.
    eps_step = 2 * eps / num_iter
    adv_crafter = ProjectedGradientDescent(
        self.classifier, norm=np.inf, eps=eps, eps_step=eps_step,
        max_iter=num_iter, random_init=True)

    # Only the first batch of the loader is evaluated.
    examples, labels = next(iter(data_loader))
    examples = examples.cpu().numpy()
    labels = labels.cpu().numpy()
    labels_one_hot = np.eye(self.nb_classes)[labels]

    examples_adv = adv_crafter.generate(examples, y=labels_one_hot)
    preds = np.argmax(self.classifier.predict(examples_adv), axis=1)
    return np.sum(preds == labels) / labels.shape[0]
def test_scikitlearn(self):
    """Untargeted PGD against scikit-learn models wrapped by ART."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC, LinearSVC
    from art.classifiers.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnSVC

    # Map each scikit-learn model class to its ART wrapper class.
    model_to_wrapper = {
        LogisticRegression: ScikitlearnLogisticRegression,
        SVC: ScikitlearnSVC,
        LinearSVC: ScikitlearnSVC,
    }
    (_, _), (x_test, y_test) = self.iris
    x_test_original = x_test.copy()

    for model_class, wrapper_class in model_to_wrapper.items():
        classifier = wrapper_class(model=model_class(), clip_values=(0, 1))
        classifier.fit(x=x_test, y=y_test)

        # Untargeted attack: inputs change, stay clipped, accuracy drops.
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
        adv_x = attack.generate(x_test)
        self.assertFalse((np.array(x_test) == adv_x).all())
        self.assertTrue((adv_x <= 1).all())
        self.assertTrue((adv_x >= 0).all())
        adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
        true_labels = np.argmax(np.array(y_test), axis=1)
        self.assertFalse((true_labels == adv_labels).all())
        acc = np.sum(adv_labels == true_labels) / len(y_test)
        logger.info(
            "Accuracy of " + classifier.__class__.__name__ + " on Iris with PGD adversarial examples: "
            "%.2f%%",
            (acc * 100),
        )
def test_pytorch_iris_pt(self):
    """Untargeted PGD against the PyTorch tabular (Iris) classifier."""
    (_, _), (x_test, y_test) = self.iris
    classifier = get_tabular_classifier_pt()

    # Test untargeted attack
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
    adv_x = attack.generate(x_test)
    self.assertFalse((np.array(x_test) == adv_x).all())
    self.assertTrue((adv_x <= 1).all())
    self.assertTrue((adv_x >= 0).all())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    true_labels = np.argmax(np.array(y_test), axis=1)
    self.assertFalse((true_labels == adv_labels).all())
    acc = np.sum(adv_labels == true_labels) / len(y_test)
    logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%", (acc * 100))
class PGD:
    """
    Class for adversarial attacks based on projected gradient descent (PGD).

    The implementation of PGD in ART executes projection on a feasible
    region after each iteration. However, random restarting is not used in
    this implementation. Not using random restarting is the difference
    between the PGD implemented in ART and the one described by Madry et
    al. This adversarial attack subsumes the iterative FGSM.
    """

    def __init__(self, model, loss_criterion, norm=np.inf, batch_size=128):
        """
        :param model: PyTorch model to attack.
        :param loss_criterion: loss used to compute attack gradients.
        :param norm: order of the perturbation norm (default: L-infinity).
        :param batch_size: batch size used when crafting examples.
        """
        self.wrapped_pytorch_model = wrapModel(model, loss_criterion)
        self.norm = norm
        self.batch_size = batch_size
        # random_init=False: no random restarts (see class docstring).
        self.attack = ProjectedGradientDescent(self.wrapped_pytorch_model,
                                               norm=norm,
                                               random_init=False,
                                               batch_size=batch_size)

        # Use GPU for computation if it is available
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

    def generatePerturbation(self, data, budget, max_iter=15):
        """
        Craft adversarial examples for ``data`` under the given budget.

        :param data: (images, labels) pair; only the images are used.
        :param budget: maximum perturbation size (eps).
        :param max_iter: number of PGD iterations.
        :return: adversarial images as a tensor on ``self.device``.
        """
        images, _ = data
        # eps_step is not allowed to be larger than budget according to the
        # documentation of ART.
        eps_step = budget / 5
        images_adv = self.attack.generate(x=images.cpu().numpy(),
                                          norm=self.norm,
                                          eps=budget,
                                          eps_step=eps_step,
                                          max_iter=max_iter,
                                          batch_size=self.batch_size)
        images_adv = torch.from_numpy(images_adv)

        # The output to be returned should be loaded on an appropriate device.
        return images_adv.to(self.device)
def test_keras_iris_unbounded(self):
    """PGD on an unclipped Keras Iris classifier may leave the [0, 1] range."""
    classifier = get_tabular_classifier_kr()

    # Recreate a classifier without clip values
    classifier = KerasClassifier(model=classifier._model, use_logits=False, channel_index=1)
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.2, max_iter=5)
    adv_x = attack.generate(self.x_test_iris)
    self.assertFalse((self.x_test_iris == adv_x).all())
    # Without clip values, some perturbed features should escape [0, 1].
    self.assertTrue((adv_x > 1).any())
    self.assertTrue((adv_x < 0).any())
    adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
    true_labels = np.argmax(self.y_test_iris, axis=1)
    self.assertFalse((true_labels == adv_labels).all())
    acc = np.sum(adv_labels == true_labels) / self.y_test_iris.shape[0]
    logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%", (acc * 100))
# ## training for CIFAR # classifier = KerasClassifier(model=model, use_logits=False) # attack = ProjectedGradientDescent(classifier, eps=8/255, eps_step=2/255, max_iter=10, batch_size=512) ## training for SVHN classifier = KerasClassifier(clip_values=(-0.5, 0.5), model=model, use_logits=False) attack = ProjectedGradientDescent(classifier, eps=8 / 255, eps_step=1 / 255, max_iter=20, batch_size=512) x_test_pgd = attack.generate(x_test, y_test) # np.save('./data/' + dataset + '_data/model/' + model_name + '_y_' + attack_name + '.npy', x_test_pgd) # Evaluate the benign trained model on adv test set labels_pgd = np.argmax(classifier.predict(x_test_pgd), axis=1) print('Accuracy on original PGD adversarial samples: %.2f%%' % (np.sum(labels_pgd == labels_true) / x_test.shape[0] * 100)) trainer = AdversarialTrainer(classifier, attack, ratio=1.0) trainer.fit(x_train, y_train, nb_epochs=60, batch_size=1024) classifier.save(filename='adv_' + model_name + '.h5', path='../data/' + dataset + '_data/model/') # Evaluate the adversarially trained model on clean test set labels_true = np.argmax(y_test, axis=1)
# NOTE(review): this fragment begins mid-call — `batch_size`/`epsilon` are the
# trailing keyword arguments of an attack constructor opened earlier in the file.
batch_size=batch_size, epsilon=epsilon)
# Inputs are scaled to [0, 1] for the attack and back to [0, 255] for the
# classifier's predict call — presumably the classifier expects pixel scale.
x_test_adv = adv_crafter_deepfool.generate(x=x_test / 255.0)
predictions = classifier.predict(x_test_adv * 255.0)
print(np.argmax(predictions, axis=1))
accuracy = np.sum(
    np.argmax(predictions, axis=1) == y_test) / len(y_test)
print('Accuracy on adversarial test examples: {}%'.format(accuracy * 100))

# pgd 20
adv_crafter_pgd_20 = ProjectedGradientDescent(classifier, eps=epsilon,
                                              eps_step=0.00775, max_iter=20,
                                              batch_size=batch_size)
x_test_adv = adv_crafter_pgd_20.generate(x=x_test / 255.0)
# print(x_test_adv)
predictions = classifier.predict(x_test_adv * 255.0)
accuracy = np.sum(
    np.argmax(predictions, axis=1) == y_test) / len(y_test)
print('Accuracy on adversarial test examples: {}%'.format(accuracy * 100))

# C&W 20
# adv_crafter_cwinf = CarliniLInfMethod(classifier, eps=epsilon, learning_rate=epsilon/10, max_iter=20, batch_size=batch_size)
# x_test_adv = adv_crafter_cwinf.generate(x=x_test/255.0)
# predictions = classifier.predict(x_test_adv*255.0)
# accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
# print('Accuracy after C&W attack: {}%'.format(accuracy * 100))
def _test_backend_mnist(self, classifier, x_train, y_train, x_test, y_test):
    """Check PGD on MNIST, with and without random initialisations."""
    x_test_original = x_test.copy()

    def accuracy(pred_one_hot, truth_one_hot):
        # Fraction of examples whose predicted class matches the true class.
        return np.sum(
            np.argmax(pred_one_hot, axis=1) == np.argmax(np.array(truth_one_hot), axis=1)
        ) / len(truth_one_hot)

    # Test PGD with np.inf norm
    attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    logger.info("Accuracy on adversarial train examples: %.2f%%",
                accuracy(train_y_pred, y_train) * 100)
    logger.info("Accuracy on adversarial test examples: %.2f%%",
                accuracy(test_y_pred, y_test) * 100)

    # Test PGD with 3 random initialisations
    attack = ProjectedGradientDescent(classifier, num_random_init=3, max_iter=5)
    x_train_adv = attack.generate(x_train)
    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_train == x_train_adv).all())
    self.assertFalse((x_test == x_test_adv).all())

    train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
    test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
    self.assertFalse((y_train == train_y_pred).all())
    self.assertFalse((y_test == test_y_pred).all())

    logger.info(
        "Accuracy on adversarial train examples with 3 random initialisations: %.2f%%",
        accuracy(train_y_pred, y_train) * 100)
    logger.info(
        "Accuracy on adversarial test examples with 3 random initialisations: %.2f%%",
        accuracy(test_y_pred, y_test) * 100)

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(
        float(np.max(np.abs(np.array(x_test_original) - np.array(x_test)))),
        0.0, delta=0.00001)
# NOTE(review): this fragment begins with the closing parenthesis of a call
# opened earlier in the file (presumably the data generator / setup above).
)

# Create a toy Keras CNN architecture & wrap it under ART interface
classifier = KerasClassifier(build_model(), clip_values=(0, 1), use_logits=False)

# Create attack for adversarial trainer; here, we use 2 attacks, both crafting adv examples on the target model
# NOTE(review): only one attack is constructed in this visible fragment, and
# eps=8 / eps_step=2 look like pixel-scale ([0, 255]) values while
# clip_values=(0, 1) implies [0, 1] inputs — with eps > 1 the perturbation
# saturates the clip range immediately; confirm the intended input scale.
pgd = ProjectedGradientDescent(classifier, eps=8, eps_step=2, max_iter=10, num_random_init=20)

# Create some adversarial samples for evaluation
x_test_pgd = pgd.generate(x_test)

# Create adversarial trainer and perform adversarial training
adv_trainer = AdversarialTrainer(classifier, attacks=pgd, ratio=1.0)
adv_trainer.fit_generator(art_datagen, nb_epochs=83)

# Evaluate the adversarially trained model on clean test set
labels_true = np.argmax(y_test, axis=1)
labels_test = np.argmax(classifier.predict(x_test), axis=1)
print("Accuracy test set: %.2f%%" % (np.sum(labels_test == labels_true) / x_test.shape[0] * 100))

# Evaluate the adversarially trained model on original adversarial samples
labels_pgd = np.argmax(classifier.predict(x_test_pgd), axis=1)
print("Accuracy on original PGD adversarial samples: %.2f%%" % (np.sum(labels_pgd == labels_true) / x_test.shape[0] * 100))
def test_scikitlearn(self):
    """Untargeted and targeted PGD against scikit-learn models via SklearnClassifier."""
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC, LinearSVC
    from art.classifiers.scikitlearn import SklearnClassifier

    models_under_test = [
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        SVC(gamma="auto"),
        LinearSVC(),
    ]
    x_test_original = self.x_test_iris.copy()

    for model in models_under_test:
        classifier = SklearnClassifier(model=model, clip_values=(0, 1))
        classifier.fit(x=self.x_test_iris, y=self.y_test_iris)

        # Untargeted attack: inputs change, stay clipped, accuracy drops.
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
        adv_x = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == adv_x).all())
        self.assertTrue((adv_x <= 1).all())
        self.assertTrue((adv_x >= 0).all())
        adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
        true_labels = np.argmax(self.y_test_iris, axis=1)
        self.assertFalse((true_labels == adv_labels).all())
        acc = np.sum(adv_labels == true_labels) / self.y_test_iris.shape[0]
        logger.info(
            "Accuracy of " + classifier.__class__.__name__ + " on Iris with PGD adversarial examples: "
            "%.2f%%",
            (acc * 100),
        )

        # Targeted attack: at least one prediction should reach its target.
        targets = random_targets(self.y_test_iris, nb_classes=3)
        attack = ProjectedGradientDescent(classifier, targeted=True, eps=1, eps_step=0.1, max_iter=5)
        adv_x = attack.generate(self.x_test_iris, y=targets)
        self.assertFalse((self.x_test_iris == adv_x).all())
        self.assertTrue((adv_x <= 1).all())
        self.assertTrue((adv_x >= 0).all())
        adv_labels = np.argmax(classifier.predict(adv_x), axis=1)
        target_labels = np.argmax(targets, axis=1)
        self.assertTrue((target_labels == adv_labels).any())
        acc = np.sum(adv_labels == target_labels) / self.y_test_iris.shape[0]
        logger.info(
            "Success rate of " + classifier.__class__.__name__ + " on targeted PGD on Iris: %.2f%%",
            (acc * 100))

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(
            float(np.max(np.abs(x_test_original - self.x_test_iris))), 0.0,
            delta=0.00001)
# NOTE(review): this fragment relies on names defined earlier in the script
# (`adv_inputs`, `nb_elements`, `model2`, `atk`, `classifier`, `inputs`).
adv_inputs = adv_inputs[:nb_elements]
out1 = model2.predict(inputs)

#adv1 = atk.generate(x_train[:5])
#out_adv1 = model2.predict(adv1)
#success = compute_success(classifier, x_train[:5],y_train2[:5],adv1)
#print(success)
#plt.imshow(x_train[0].squeeze(), cmap='gray')
#plt.show()
#plt.imshow(adv1[0].squeeze(), cmap='gray')
#plt.show()

# Use the model's predictions on `adv_inputs` as targets for the attack.
out2 = model2.predict(adv_inputs)
adv2 = atk.generate(inputs, out2)
out_adv2 = model2.predict(adv2)

# Untargeted success (prediction changed from out1) vs targeted success
# (prediction matches the target out2).
success = compute_success(classifier, inputs, out1, adv2)
success_targeted = compute_success(classifier, inputs, out2, adv2, targeted=True)
print(success, success_targeted)

# Keep only the examples whose adversarial prediction hit the target class,
# then display the first surviving adversarial image.
good_targeted = np.argmax(out_adv2, axis=1) == np.argmax(out2, axis=1)
good_adv, good_clean = adv2[good_targeted], inputs[good_targeted]
plt.imshow(good_adv[0].squeeze(), cmap='gray')