def test_scikitlearn(self):
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC, LinearSVC
    from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
    from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

    from art.classifiers.scikitlearn import ScikitlearnDecisionTreeClassifier, ScikitlearnExtraTreeClassifier
    from art.classifiers.scikitlearn import ScikitlearnAdaBoostClassifier, ScikitlearnBaggingClassifier
    from art.classifiers.scikitlearn import ScikitlearnExtraTreesClassifier, ScikitlearnGradientBoostingClassifier
    from art.classifiers.scikitlearn import ScikitlearnRandomForestClassifier, ScikitlearnLogisticRegression
    from art.classifiers.scikitlearn import ScikitlearnSVC

    scikitlearn_test_cases = {DecisionTreeClassifier: ScikitlearnDecisionTreeClassifier,
                              ExtraTreeClassifier: ScikitlearnExtraTreeClassifier,
                              AdaBoostClassifier: ScikitlearnAdaBoostClassifier,
                              BaggingClassifier: ScikitlearnBaggingClassifier,
                              ExtraTreesClassifier: ScikitlearnExtraTreesClassifier,
                              GradientBoostingClassifier: ScikitlearnGradientBoostingClassifier,
                              RandomForestClassifier: ScikitlearnRandomForestClassifier,
                              LogisticRegression: ScikitlearnLogisticRegression,
                              SVC: ScikitlearnSVC,
                              LinearSVC: ScikitlearnSVC}

    for (model_class, classifier_class) in scikitlearn_test_cases.items():
        model = model_class()
        classifier = classifier_class(model=model, clip_values=(0, 1))
        classifier.fit(x=self.x_test, y=self.y_test)

        attack = BoundaryAttack(classifier, targeted=False, delta=0.01, epsilon=0.01, step_adapt=0.667,
                                max_iter=50, num_trial=25, sample_size=20, init_size=100)
        x_test_adv = attack.generate(self.x_test)
        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
        accuracy = np.sum(preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
        logger.info('Accuracy of %s on Iris with BoundaryAttack adversarial examples: %.2f%%',
                    classifier.__class__.__name__, accuracy * 100)
def test_iris_pt(self):
    classifier = get_iris_classifier_pt()

    attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
    x_test_adv = attack.generate(self.x_test.astype(np.float32))
    self.assertFalse((self.x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1).all())
    self.assertTrue((x_test_adv >= 0).all())

    preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
    accuracy = np.sum(preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
    logger.info('Accuracy on Iris with boundary adversarial examples: %.2f%%', accuracy * 100)
def test_keras_iris_clipped(self):
    (_, _), (x_test, y_test) = self.iris
    classifier = get_iris_classifier_kr()

    attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
    x_test_adv = attack.generate(x_test)
    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1).all())
    self.assertTrue((x_test_adv >= 0).all())

    preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(y_test, axis=1) == preds_adv).all())
    accuracy = np.sum(preds_adv == np.argmax(y_test, axis=1)) / y_test.shape[0]
    logger.info('Accuracy on Iris with boundary adversarial examples: %.2f%%', accuracy * 100)
def test_iris_k_unbounded(self):
    classifier, _ = get_iris_classifier_kr()

    # Recreate a classifier without clip values
    classifier = KerasClassifier(model=classifier._model, use_logits=False, channel_index=1)
    attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
    x_test_adv = attack.generate(self.x_test)
    self.assertFalse((self.x_test == x_test_adv).all())

    preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
    accuracy = np.sum(preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
    logger.info('Accuracy on Iris with boundary adversarial examples: %.2f%%', accuracy * 100)
def GetAttackers(classifier, x_test, attacker_name):
    """
    Build the attacker given by `attacker_name` and generate adversarial samples for `x_test`.
    """
    t_start = time.time()
    if attacker_name == "FGSM":
        attacker = FastGradientMethod(classifier=classifier, eps=0.3)
    elif attacker_name == "Elastic":
        attacker = ElasticNet(classifier=classifier, confidence=0.5)
    elif attacker_name == "BasicIterativeMethod":
        attacker = BasicIterativeMethod(classifier=classifier, eps=0.3)
    elif attacker_name == "NewtonFool":
        attacker = NewtonFool(classifier=classifier, max_iter=20)
    elif attacker_name == "HopSkipJump":
        attacker = HopSkipJump(classifier=classifier, max_iter=20)
    elif attacker_name == "ZooAttack":
        attacker = ZooAttack(classifier=classifier, max_iter=20)
    elif attacker_name == "VirtualAdversarialMethod":
        attacker = VirtualAdversarialMethod(classifier=classifier, max_iter=20)
    elif attacker_name == "UniversalPerturbation":
        attacker = UniversalPerturbation(classifier=classifier, max_iter=20)
    elif attacker_name == "AdversarialPatch":
        attacker = AdversarialPatch(classifier=classifier, max_iter=20)
    elif attacker_name == "Attack":
        attacker = Attack(classifier=classifier)
    elif attacker_name == "BoundaryAttack":
        attacker = BoundaryAttack(classifier=classifier, targeted=False, epsilon=0.05, max_iter=20)
    elif attacker_name == "CarliniL2":
        attacker = CarliniL2Method(classifier=classifier, confidence=0.5, learning_rate=0.001, max_iter=15)
    elif attacker_name == "CarliniLinf":
        attacker = CarliniLInfMethod(classifier=classifier, confidence=0.5, learning_rate=0.001, max_iter=15)
    elif attacker_name == "DeepFool":
        attacker = DeepFool(classifier)
    elif attacker_name == "SMM":
        attacker = SaliencyMapMethod(classifier=classifier, theta=2)
    elif attacker_name == "PGD":
        attacker = ProjectedGradientDescent(classifier=classifier, norm=2, eps=1, eps_step=0.5)
    else:
        raise ValueError("Please pass a valid attacker name.")
    test_adv = attacker.generate(x_test)
    dt = time.time() - t_start
    return test_adv, dt
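# A minimal usage sketch for GetAttackers (not from the original file). It assumes
# ART's SklearnClassifier wrapper (import path may vary across ART versions) and
# scikit-learn's Iris loader; "HopSkipJump" is chosen because it only needs
# black-box predictions, so it works with any wrapped scikit-learn model.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from art.classifiers import SklearnClassifier

def demo_get_attackers():
    x, y = load_iris(return_X_y=True)
    x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))  # scale features to [0, 1]
    model = LogisticRegression(max_iter=1000).fit(x, y)
    classifier = SklearnClassifier(model=model, clip_values=(0.0, 1.0))
    test_adv, dt = GetAttackers(classifier, x[:10], "HopSkipJump")
    print("Generated {} adversarial samples in {:.2f}s".format(len(test_adv), dt))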
def test_krclassifier(self):
    """
    Second test with the KerasClassifier.
    :return:
    """
    # Build KerasClassifier
    krc = get_classifier_kr()

    # First targeted attack
    boundary = BoundaryAttack(classifier=krc, targeted=True, max_iter=20)
    params = {'y': random_targets(self.y_test, krc.nb_classes())}
    x_test_adv = boundary.generate(self.x_test, **params)

    self.assertFalse((self.x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    self.assertTrue((target == y_pred_adv).any())

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=krc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(self.x_test)

    self.assertFalse((self.x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(krc.predict(self.x_test), axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())

    # Clean-up session
    k.clear_session()
def test_iris_tf(self):
    classifier, _ = get_iris_classifier_tf()

    # Test untargeted attack
    attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
    x_test_adv = attack.generate(self.x_test)
    self.assertFalse((self.x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1).all())
    self.assertTrue((x_test_adv >= 0).all())

    preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
    accuracy = np.sum(preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
    logger.info('Accuracy on Iris with boundary adversarial examples: %.2f%%', accuracy * 100)

    # Test targeted attack
    targets = random_targets(self.y_test, nb_classes=3)
    attack = BoundaryAttack(classifier, targeted=True, max_iter=10)
    x_test_adv = attack.generate(self.x_test, **{'y': targets})
    self.assertFalse((self.x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1).all())
    self.assertTrue((x_test_adv >= 0).all())

    preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
    self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
    accuracy = np.sum(preds_adv == np.argmax(targets, axis=1)) / self.y_test.shape[0]
    logger.info('Success rate of targeted boundary on Iris: %.2f%%', accuracy * 100)
def test_ptclassifier(self):
    """
    Third test with the PyTorchClassifier.
    :return:
    """
    # Build PyTorchClassifier
    ptc = get_classifier_pt()
    x_test = np.swapaxes(self.x_test, 1, 3).astype(np.float32)

    # First targeted attack
    boundary = BoundaryAttack(classifier=ptc, targeted=True, max_iter=20)
    params = {'y': random_targets(self.y_test, ptc.nb_classes())}
    x_test_adv = boundary.generate(x_test, **params)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((target == y_pred_adv).any())

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=ptc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(ptc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())
def test_classifier_type_check_fail_classifier(self):
    # Use a useless test classifier to test basic classifier properties
    class ClassifierNoAPI:
        pass

    classifier = ClassifierNoAPI
    with self.assertRaises(TypeError) as context:
        _ = BoundaryAttack(classifier=classifier)

    self.assertIn('For `BoundaryAttack` classifier must be an instance of '
                  '`art.classifiers.classifier.Classifier`, the provided classifier is instance of '
                  '(<class \'object\'>,).', str(context.exception))
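# For contrast, a sketch of a construction that satisfies the type check: the model
# is first wrapped in an ART classifier. SklearnClassifier is an illustrative choice
# here, not part of the test above; the appropriate wrapper depends on the framework.
from sklearn.svm import SVC
from art.classifiers import SklearnClassifier

def build_valid_boundary_attack():
    wrapped = SklearnClassifier(model=SVC(), clip_values=(0.0, 1.0))
    # `wrapped` implements the `art.classifiers.classifier.Classifier` API,
    # so BoundaryAttack's isinstance check passes.
    return BoundaryAttack(classifier=wrapped, targeted=False)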
def test_images(fix_get_mnist_subset, get_image_classifier_list_for_attack, framework, targeted):
    classifier_list = get_image_classifier_list_for_attack(BoundaryAttack)
    if classifier_list is None:
        logging.warning("Couldn't perform this test because no classifier is defined")
        return

    for classifier in classifier_list:
        attack = BoundaryAttack(classifier=classifier, targeted=targeted, max_iter=20)
        if targeted:
            backend_targeted_images(attack, fix_get_mnist_subset)
        else:
            back_end_untargeted_images(attack, fix_get_mnist_subset, framework)
def test_tabular(get_tabular_classifier_list, framework, get_iris_dataset, clipped_classifier, targeted):
    classifier_list = get_tabular_classifier_list(BoundaryAttack, clipped=clipped_classifier)
    if classifier_list is None:
        logging.warning("Couldn't perform this test because no classifier is defined")
        return

    for classifier in classifier_list:
        attack = BoundaryAttack(classifier, targeted=targeted, max_iter=10)
        if targeted:
            backend_targeted_tabular(attack, get_iris_dataset)
        else:
            backend_untargeted_tabular(attack, get_iris_dataset, clipped=clipped_classifier)
def GetAttackers(classifier, x_test, attacker_name):
    """
    Build the attacker given by `attacker_name` and generate adversarial samples for `x_test`.
    """
    t_start = time.time()
    if attacker_name == "AdversarialPatch":
        attacker = AdversarialPatch(classifier=classifier, max_iter=10)
    elif attacker_name == "Attack":
        attacker = Attack(classifier=classifier)
    elif attacker_name == "BoundaryAttack":
        attacker = BoundaryAttack(classifier=classifier, targeted=False, epsilon=0.05, max_iter=10)
    else:
        raise ValueError("Please pass a valid attacker name.")
    test_adv = attacker.generate(x_test)
    dt = time.time() - t_start
    return test_adv, dt
def test_keras_mnist(self):
    """
    Second test with the KerasClassifier.
    :return:
    """
    (_, _), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()

    # Build KerasClassifier
    krc = get_classifier_kr()

    # First targeted attack
    boundary = BoundaryAttack(classifier=krc, targeted=True, max_iter=20)
    params = {'y': random_targets(y_test, krc.nb_classes())}
    x_test_adv = boundary.generate(x_test, **params)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    self.assertTrue((target == y_pred_adv).any())

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=krc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(krc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    # Clean-up session
    k.clear_session()
def test_pytorch_mnist(self):
    """
    Third test with the PyTorchClassifier.
    :return:
    """
    (_, _), (x_test, y_test) = self.mnist
    x_test = np.swapaxes(x_test, 1, 3).astype(np.float32)
    x_test = np.reshape(x_test, (x_test.shape[0], 1, 28, 28)).astype(np.float32)
    x_test_original = x_test.copy()

    # Build PyTorchClassifier
    ptc = get_classifier_pt()

    # First targeted attack
    boundary = BoundaryAttack(classifier=ptc, targeted=True, max_iter=20)
    params = {'y': random_targets(y_test, ptc.nb_classes())}
    x_test_adv = boundary.generate(x_test, **params)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((target == y_pred_adv).any())

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=ptc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(ptc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)
def test_tfclassifier(self):
    """
    First test with the TFClassifier.
    :return:
    """
    # Build TFClassifier
    tfc, sess = get_classifier_tf()

    # Get MNIST
    (_, _), (x_test, y_test) = self.mnist

    # First targeted attack
    boundary = BoundaryAttack(classifier=tfc, targeted=True, max_iter=20)
    params = {'y': random_targets(y_test, tfc.nb_classes)}
    x_test_adv = boundary.generate(x_test, **params)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    target = np.argmax(params['y'], axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    self.assertTrue((target == y_pred_adv).any())

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=tfc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(tfc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())

    # Clean-up session
    sess.close()
    tf.reset_default_graph()
nb_classes=nclasses)  # tail of a classifier construction truncated in this excerpt

test_data, test_label = load_svmlight_file(data_path, n_features=nfeatures)
test_data = test_data.toarray()
test_label = test_label.astype('int')
n = len(test_label)

# Shuffle the test set
df = pd.DataFrame(test_data)
df['label'] = test_label
df = df.sample(frac=1)
test_label = df['label'].tolist()
test_data = np.array(df.drop(columns=['label']))

predictions = np.argmax(classifier.predict(test_data), axis=1)

attack = BoundaryAttack(classifier=classifier, targeted=False, delta=0.05, epsilon=0.05, step_adapt=0.5)

# Select up to n_selected correctly classified samples to attack
n_selected = 100
corrected = []
c_labels = []
for i in range(len(test_label)):
    if test_label[i] == predictions[i]:
        corrected.append(test_data[i])
        c_labels.append(test_label[i])
    if len(corrected) >= n_selected:
        break
corrected = np.array(corrected)

start = time.time()
test_adv = attack.generate(corrected)
end = time.time()
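# A hedged follow-up (not in the original snippet): with `test_adv` in hand, one
# would typically report post-attack accuracy and mean L2 distortion over the
# selected, correctly classified samples. The variable names reuse the block above.
preds_adv = np.argmax(classifier.predict(test_adv), axis=1)
adv_accuracy = np.sum(preds_adv == np.array(c_labels)) / len(c_labels)
mean_l2 = np.mean(np.linalg.norm(test_adv - corrected, axis=1))
print("Adversarial accuracy: {:.2f}%, mean L2 distortion: {:.4f}, attack time: {:.2f}s".format(
    adv_accuracy * 100, mean_l2, end - start))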
def test_tfclassifier(self):
    """
    First test with the TensorFlowClassifier.
    :return:
    """
    # Build TensorFlowClassifier
    tfc, sess = get_classifier_tf()

    # First targeted attack
    boundary = BoundaryAttack(classifier=tfc, targeted=True, max_iter=200, delta=0.5)
    params = {'y': random_targets(self.y_test, tfc.nb_classes())}
    x_test_adv = boundary.generate(self.x_test, **params)

    expected_x_test_adv_1 = np.asarray([0.42622495, 0.0, 0.0, 0.33005068, 0.2277837, 0.0, 0.18348512,
                                        0.42622495, 0.27452883, 0.0, 0.0, 0.0, 0.1653487, 0.70523715,
                                        0.7367977, 0.7974912, 0.28579983, 0.0, 0.36499417, 0.0, 0.0,
                                        0.0, 0.42622495, 0.0, 0.26680174, 0.42622495, 0.0, 0.19260764])
    expected_x_test_adv_2 = np.asarray([0.0459, 0., 0., 0.0756, 0.2048, 0.037, 0., 0., 0.0126, 0.4338,
                                        0.1566, 0.3061, 0., 0.296, 0.8318, 0.7267, 0.2252, 0.074, 0.,
                                        0.1208, 0.4362, 0., 0., 0., 0., 0.0359, 0., 0.1191])

    try:
        np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_1, decimal=4)
    except AssertionError:
        np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_2, decimal=4)

    self.assertLessEqual(np.max(x_test_adv), 1.0)
    self.assertGreaterEqual(np.min(x_test_adv), 0.0)

    y_pred_adv = tfc.predict(x_test_adv)
    y_pred_adv_expected = np.asarray([1.57103419e-01, -7.31061280e-01, -4.03979905e-02, -4.79048371e-01,
                                      9.37852338e-02, -8.01057637e-01, -4.77534801e-01, 1.08687377e+00,
                                      -3.06577891e-01, -5.74976981e-01])
    np.testing.assert_array_almost_equal(y_pred_adv[0], y_pred_adv_expected, decimal=4)

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=tfc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(self.x_test)

    self.assertFalse((self.x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(tfc.predict(self.x_test), axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())

    # Clean-up session
    sess.close()
def test_tensorflow_mnist(self):
    """
    First test with the TensorFlowClassifier.
    :return:
    """
    (_, _), (x_test, y_test) = self.mnist
    x_test_original = x_test.copy()

    # Build TensorFlowClassifier
    tfc, sess = get_classifier_tf()

    # First targeted attack
    boundary = BoundaryAttack(classifier=tfc, targeted=True, max_iter=200, delta=0.5)
    params = {'y': random_targets(y_test, tfc.nb_classes())}
    x_test_adv = boundary.generate(x_test, **params)

    # expected_x_test_adv_1 = np.asarray([0.42622495, 0.0, 0.0, 0.33005068, 0.2277837, 0.0,
    #                                     0.18348512, 0.42622495, 0.27452883, 0.0, 0.0, 0.0,
    #                                     0.1653487, 0.70523715, 0.7367977, 0.7974912, 0.28579983, 0.0,
    #                                     0.36499417, 0.0, 0.0, 0.0, 0.42622495, 0.0,
    #                                     0.26680174, 0.42622495, 0.0, 0.19260764])
    # expected_x_test_adv_2 = np.asarray([0.0459, 0., 0., 0.0756, 0.2048, 0.037, 0., 0.,
    #                                     0.0126, 0.4338, 0.1566, 0.3061, 0., 0.296, 0.8318, 0.7267,
    #                                     0.2252, 0.074, 0., 0.1208, 0.4362, 0., 0., 0.,
    #                                     0., 0.0359, 0., 0.1191])
    #
    # expected_x_test_adv_3 = np.asarray([0.0671, 0.0644, 0.3012, 0., 0., 0., 0.3407, 0.,
    #                                     0.1507, 0.0478, 0.3253, 0., 0.3334, 0.3473, 1., 0.8649,
    #                                     0.5639, 0.5198, 0., 0., 0.6173, 0., 0.3116, 0.,
    #                                     0.3937, 0.6173, 0., 0.0021])
    #
    # try:
    #     np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_1, decimal=4)
    # except AssertionError:
    #     try:
    #         np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_2, decimal=4)
    #     except AssertionError:
    #         np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_3, decimal=4)

    self.assertLessEqual(np.max(x_test_adv), 1.0)
    self.assertGreaterEqual(np.min(x_test_adv), 0.0)

    y_pred_adv = tfc.predict(x_test_adv)
    y_pred_adv_expected = np.asarray([1.57103419e-01, -7.31061280e-01, -4.03979905e-02, -4.79048371e-01,
                                      9.37852338e-02, -8.01057637e-01, -4.77534801e-01, 1.08687377e+00,
                                      -3.06577891e-01, -5.74976981e-01])
    # np.testing.assert_array_almost_equal(y_pred_adv[0], y_pred_adv_expected, decimal=4)

    # Second untargeted attack
    boundary = BoundaryAttack(classifier=tfc, targeted=False, max_iter=20)
    x_test_adv = boundary.generate(x_test)

    self.assertFalse((x_test == x_test_adv).all())
    self.assertTrue((x_test_adv <= 1.0001).all())
    self.assertTrue((x_test_adv >= -0.0001).all())

    y_pred = np.argmax(tfc.predict(x_test), axis=1)
    y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
    self.assertTrue((y_pred != y_pred_adv).any())

    # Check that x_test has not been modified by attack and classifier
    self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))), 0.0, delta=0.00001)

    # Clean-up session
    sess.close()
def adversarial_attack_shift(x, y, delta=1.0, model=RandomForestClassifier(), attack_type='zoo',
                             numerical_features=None, feat_delta=1.0):
    # Here delta is the portion of half the data on which to generate attacks:
    # at minimum the first half has to be used to train the model against which
    # the attacks are generated.
    assert attack_type in ['zoo', 'boundary', 'hop-skip-jump']

    le = preprocessing.LabelEncoder()
    le.fit(np.squeeze(y))
    y = le.transform(y)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=(0.5 * delta))

    # Pick the subset of (numerical) features to attack
    if numerical_features is not None:
        n_numerical = len(numerical_features)
        feat_indices = np.random.choice(n_numerical, ceil(n_numerical * feat_delta), replace=False)
        feat_indices = np.array(numerical_features)[feat_indices]
    else:
        feat_indices = np.random.choice(x.shape[1], ceil(x.shape[1] * feat_delta), replace=False)
    other_features = list(set(range(x.shape[1])) - set(feat_indices))

    x_train_other = x_train[:, other_features]
    x_train_numerical = x_train[:, feat_indices]
    x_test_other = x_test[:, other_features]
    x_test_numerical = x_test[:, feat_indices]

    classifier = SklearnClassifier(model=model, clip_values=(0, np.max(x_train_numerical)))

    # Train the ART classifier
    classifier.fit(x_train_numerical, y_train)

    # Evaluate the ART classifier on benign test examples
    predictions = classifier.predict(x_test_numerical)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    # Generate adversarial test examples
    if attack_type == 'zoo':
        attack = ZooAttack(classifier=classifier, confidence=0.0, targeted=False, learning_rate=1e-1,
                           max_iter=10, binary_search_steps=10, initial_const=1e-3, abort_early=True,
                           use_resize=False, use_importance=False, nb_parallel=x_test_numerical.shape[1],
                           batch_size=1, variable_h=0.01)
    elif attack_type == 'boundary':
        attack = BoundaryAttack(classifier, targeted=False, epsilon=0.02, max_iter=20, num_trial=10)
    elif attack_type == 'hop-skip-jump':
        attack = HopSkipJump(classifier, targeted=False, norm=2, max_iter=20, max_eval=10,
                             init_eval=9, init_size=10)
    x_adv = attack.generate(x=x_test_numerical, y=y_test)

    # Evaluate the ART classifier on adversarial test examples
    predictions_adv = classifier.predict(x_adv)
    accuracy = np.sum(np.argmax(predictions_adv, axis=1) == y_test) / len(y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))
    print("Max difference: {}".format(np.max(np.abs(x_test_numerical - x_adv) / x_test_numerical)))

    # Reassemble the full matrix: attacked numerical features plus untouched other features
    x_final = np.zeros_like(x)
    x_final[:, feat_indices] = np.vstack([x_train_numerical, x_adv])
    x_final[:, other_features] = np.vstack([x_train_other, x_test_other])
    y_final = np.concatenate([y_train, y_test], axis=0)
    y_final = le.inverse_transform(y_final)
    adv_indices = list(range(len(y_train), len(y)))

    return x_final, y_final, adv_indices, feat_indices
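# A hypothetical invocation of adversarial_attack_shift on synthetic data;
# make_classification and all parameter values below are illustrative only.
import numpy as np
from sklearn.datasets import make_classification

def demo_adversarial_attack_shift():
    x, y = make_classification(n_samples=200, n_features=8, n_informative=5, random_state=0)
    x = np.abs(x)  # keep features non-negative so clip_values=(0, max) is meaningful
    x_shift, y_shift, adv_indices, feat_indices = adversarial_attack_shift(
        x, y, delta=1.0, attack_type='boundary', feat_delta=0.5)
    print("Perturbed {} rows over feature indices {}".format(len(adv_indices), sorted(feat_indices)))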
def attack_run_rejection_policy(model, hps):
    """
    An attack run with a rejection policy.
    :param model: PyTorch model.
    :param hps: hyperparameters
    :return:
    """
    model.eval()

    # Get per-class thresholds from correctly classified in-distribution samples
    threshold_list1 = []
    threshold_list2 = []
    for label_id in range(hps.n_classes):
        # No data augmentation (crop_flip=False) when getting in-distribution thresholds
        dataset = get_dataset(data_name=hps.problem, train=True, label_id=label_id, crop_flip=False)
        in_test_loader = DataLoader(dataset=dataset, batch_size=hps.n_batch_test, shuffle=False)

        print('Inference on {}, label_id {}'.format(hps.problem, label_id))
        in_ll_list = []
        for batch_id, (x, y) in enumerate(in_test_loader):
            x = x.to(hps.device)
            y = y.to(hps.device)
            ll = model(x)

            correct_idx = ll.argmax(dim=1) == y
            ll_, y_ = ll[correct_idx], y[correct_idx]  # keep only correctly classified samples
            in_ll_list += list(ll_[:, label_id].detach().cpu().numpy())

        thresh_idx = int(0.01 * len(in_ll_list))
        thresh1 = sorted(in_ll_list)[thresh_idx]
        thresh_idx = int(0.02 * len(in_ll_list))
        thresh2 = sorted(in_ll_list)[thresh_idx]
        threshold_list1.append(thresh1)  # 1st percentile as threshold
        threshold_list2.append(thresh2)  # 2nd percentile as threshold
        print('1st & 2nd percentile thresholds: {:.3f}, {:.3f}'.format(thresh1, thresh2))

    # Evaluation
    n_total = 0  # total number of samples classified correctly by the clean classifier
    n_successful_adv = 0  # total number of successful adversarial examples generated
    n_rejected_adv1 = 0  # successfully rejected (successful) adversarial examples, <= n_successful_adv
    n_rejected_adv2 = 0  # successfully rejected (successful) adversarial examples, <= n_successful_adv

    attack_path = os.path.join(hps.attack_dir, hps.attack)
    if not os.path.exists(attack_path):
        os.mkdir(attack_path)

    thresholds1 = torch.tensor(threshold_list1).to(hps.device)
    thresholds2 = torch.tensor(threshold_list2).to(hps.device)

    l2_distortion_list = []
    n_eval = 0

    wrapped_target_model = PyTorchClassifier(model=model, loss=None, optimizer=None,
                                             input_shape=(hps.image_channel, 32, 32),
                                             nb_classes=hps.n_classes)
    if hps.attack == 'boundary':
        attack = BoundaryAttack(wrapped_target_model, targeted=hps.targeted)
    elif hps.attack == 'cw':
        attack = CarliniL2Method(wrapped_target_model, confidence=hps.cw_confidence, targeted=hps.targeted)

    hps.n_batch_test = 1
    for label_id in range(hps.n_classes):
        dataset = get_dataset(data_name=hps.problem, train=False, label_id=label_id)
        test_loader = DataLoader(dataset=dataset, batch_size=hps.n_batch_test, shuffle=False)

        for batch_id, (x, y) in enumerate(test_loader):
            # Note that images are scaled to [0., 1.0]
            x, y = x.to(hps.device), y.to(hps.device)

            with torch.no_grad():
                output = model(x)

            pred = output.argmax(dim=1)
            correct_idx = pred == y
            # Only evaluate on samples classified correctly by the clean classifier
            x, y = x[correct_idx], y[correct_idx]
            n_eval += correct_idx.sum().item()

            for id in range(hps.n_classes):
                if label_id != id:
                    n_total += 1
                    y_cur = torch.LongTensor([id] * x.size(0)).to(hps.device)
                    # adv_x = adversary.perturb(x, y_cur)
                    x_ = x.cpu().numpy().astype(np.float32)
                    y_ = y_cur.cpu().numpy().astype(np.float32)
                    adv_x = attack.generate(x_, y_)

                    with torch.no_grad():
                        adv_x = torch.tensor(adv_x).to(hps.device)
                        output = model(adv_x)

                    logits, preds = output.max(dim=1)
                    success_idx = preds == y_cur
                    n_successful_adv += success_idx.sum().item()

                    diff = adv_x - x
                    l2_distortion = diff.norm(p=2, dim=-1).mean().item()  # mean L2 distortion
                    l2_distortion_list.append(l2_distortion)

                    rej_idx1 = logits < thresholds1[preds]
                    n_rejected_adv1 += rej_idx1.sum().item()
                    rej_idx2 = logits < thresholds2[preds]
                    n_rejected_adv2 += rej_idx2.sum().item()

            break  # only one batch per class
        print('Evaluating on samples of class {} ...'.format(label_id))

    reject_rate1 = n_rejected_adv1 / n_successful_adv
    reject_rate2 = n_rejected_adv2 / n_successful_adv
    success_adv_rate = n_successful_adv / n_total
    print('success rate of adv examples generation: {}/{}={:.4f}'.format(
        n_successful_adv, n_total, success_adv_rate))
    print('Mean L2 distortion of Adv Examples: {:.4f}'.format(np.mean(l2_distortion_list)))
    print('1st percentile, reject success rate: {}/{}={:.4f}'.format(
        n_rejected_adv1, n_successful_adv, reject_rate1))
    print('2nd percentile, reject success rate: {}/{}={:.4f}'.format(
        n_rejected_adv2, n_successful_adv, reject_rate2))
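# A distilled sketch of the rejection rule used above (illustrative only, with numpy
# in place of torch): thresholds[c] is a low percentile of the class-c scores of
# correctly classified clean samples, and an adversarial example is rejected when its
# winning score falls below the threshold of its predicted class.
import numpy as np

def percentile_threshold(in_dist_scores, q=0.01):
    # in_dist_scores: 1-D array of scores for one class on clean data
    return np.sort(in_dist_scores)[int(q * len(in_dist_scores))]

def rejection_mask(max_scores, preds, thresholds):
    # max_scores: (n,) winning scores; preds: (n,) predicted labels;
    # thresholds: (n_classes,) per-class cutoffs
    return max_scores < thresholds[preds]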