Example #1
    def test_scikitlearn(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, LinearSVC
        from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
        from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
        from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

        from art.classifiers.scikitlearn import ScikitlearnDecisionTreeClassifier, ScikitlearnExtraTreeClassifier
        from art.classifiers.scikitlearn import ScikitlearnAdaBoostClassifier, ScikitlearnBaggingClassifier
        from art.classifiers.scikitlearn import ScikitlearnExtraTreesClassifier, ScikitlearnGradientBoostingClassifier
        from art.classifiers.scikitlearn import ScikitlearnRandomForestClassifier, ScikitlearnLogisticRegression
        from art.classifiers.scikitlearn import ScikitlearnSVC

        scikitlearn_test_cases = {
            DecisionTreeClassifier: ScikitlearnDecisionTreeClassifier,
            ExtraTreeClassifier: ScikitlearnExtraTreeClassifier,
            AdaBoostClassifier: ScikitlearnAdaBoostClassifier,
            BaggingClassifier: ScikitlearnBaggingClassifier,
            ExtraTreesClassifier: ScikitlearnExtraTreesClassifier,
            GradientBoostingClassifier: ScikitlearnGradientBoostingClassifier,
            RandomForestClassifier: ScikitlearnRandomForestClassifier,
            LogisticRegression: ScikitlearnLogisticRegression,
            SVC: ScikitlearnSVC,
            LinearSVC: ScikitlearnSVC
        }

        for (model_class, classifier_class) in scikitlearn_test_cases.items():
            model = model_class()
            classifier = classifier_class(model=model, clip_values=(0, 1))
            classifier.fit(x=self.x_test, y=self.y_test)

            attack = BoundaryAttack(classifier,
                                    targeted=False,
                                    delta=0.01,
                                    epsilon=0.01,
                                    step_adapt=0.667,
                                    max_iter=50,
                                    num_trial=25,
                                    sample_size=20,
                                    init_size=100)
            x_test_adv = attack.generate(self.x_test)
            self.assertFalse((self.x_test == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
            accuracy = np.sum(preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
            logger.info('Accuracy of %s on Iris with BoundaryAttack adversarial examples: %.2f%%',
                        classifier.__class__.__name__, accuracy * 100)
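All of these cases follow one pattern: wrap a fitted scikit-learn model in the matching ART classifier, hand it to BoundaryAttack, and check that the adversarial points stay inside clip_values while flipping predictions. As a minimal self-contained sketch of that pattern, assuming an ART 0.x install (where the attack lives in art.attacks and the wrappers in art.classifiers.scikitlearn; on ART 1.x+ the paths are art.attacks.evasion and art.estimators.classification):

# Minimal sketch, assuming ART 0.x and scikit-learn are installed.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from art.attacks import BoundaryAttack
from art.classifiers.scikitlearn import ScikitlearnLogisticRegression

x, y = load_iris(return_X_y=True)
x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))  # scale features to [0, 1]

model = LogisticRegression().fit(x, y)
classifier = ScikitlearnLogisticRegression(model=model, clip_values=(0, 1))

# Decision-based attack: it only queries predictions, no gradients needed.
attack = BoundaryAttack(classifier, targeted=False, max_iter=50)
x_adv = attack.generate(x)
print('flipped:', np.mean(np.argmax(classifier.predict(x_adv), axis=1) != y))

Because the boundary attack is decision-based, predict() is all it needs, which is why the same loop works unchanged across every wrapper in the dictionary above.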
Example #2
    def test_iris_pt(self):
        classifier = get_iris_classifier_pt()
        attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
        x_test_adv = attack.generate(self.x_test.astype(np.float32))
        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
        accuracy = np.sum(
            preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
        logger.info(
            'Accuracy on Iris with boundary adversarial examples: %.2f%%',
            (accuracy * 100))
Example #3
    def test_keras_iris_clipped(self):
        (_, _), (x_test, y_test) = self.iris
        classifier = get_iris_classifier_kr()
        attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(y_test, axis=1) == preds_adv).all())
        accuracy = np.sum(
            preds_adv == np.argmax(y_test, axis=1)) / y_test.shape[0]
        logger.info(
            'Accuracy on Iris with boundary adversarial examples: %.2f%%',
            (accuracy * 100))
Example #4
    def test_iris_k_unbounded(self):
        classifier, _ = get_iris_classifier_kr()

        # Recreate a classifier without clip values
        classifier = KerasClassifier(model=classifier._model,
                                     use_logits=False,
                                     channel_index=1)
        attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
        x_test_adv = attack.generate(self.x_test)
        self.assertFalse((self.x_test == x_test_adv).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
        accuracy = np.sum(
            preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
        logger.info(
            'Accuracy on Iris with boundary adversarial examples: %.2f%%',
            (accuracy * 100))
Example #5
def GetAttackers(classifier, x_test, attacker_name):
    """
    Function:
        Load classifier and generate adversarial samples
    """
    t_start = time.time()
    if attacker_name == "FGSM":
        attacker = FastGradientMethod(classifier=classifier, eps=0.3)
    elif attacker_name == "Elastic":
        attacker = ElasticNet(classifier=classifier, confidence=0.5)
    elif attacker_name == "BasicIterativeMethod":
        attacker = BasicIterativeMethod(classifier=classifier, eps=0.3)
    elif attacker_name == "NewtonFool":
        attacker = NewtonFool(classifier=classifier, max_iter=20)
    elif attacker_name == "HopSkipJump":
        attacker = HopSkipJump(classifier=classifier, max_iter=20)
    elif attacker_name == "ZooAttack":
        attacker = ZooAttack(classifier=classifier, max_iter=20)
    elif attacker_name == "VirtualAdversarialMethod":
        attacker = VirtualAdversarialMethod(classifier=classifier, max_iter=20)
    elif attacker_name == "UniversalPerturbation":
        attacker = UniversalPerturbation(classifier=classifier, max_iter=20)
    elif attacker_name == "AdversarialPatch":
        attacker = AdversarialPatch(classifier=classifier, max_iter=20)
    elif attacker_name == "Attack":
        attacker = Attack(classifier=classifier)
    elif attacker_name == "BoundaryAttack":
        attacker = BoundaryAttack(classifier=classifier,
                                  targeted=False,
                                  epsilon=0.05,
                                  max_iter=20)
    elif attacker_name == "CarliniL2":
        attacker = CarliniL2Method(classifier=classifier,
                                   confidence=0.5,
                                   learning_rate=0.001,
                                   max_iter=15)
    elif attacker_name == "CarliniLinf":
        attacker = CarliniLInfMethod(classifier=classifier,
                                     confidence=0.5,
                                     learning_rate=0.001,
                                     max_iter=15)
    elif attacker_name == "DeepFool":
        attacker = DeepFool(classifier)
    elif attacker_name == "SMM":
        attacker = SaliencyMapMethod(classifier=classifier, theta=2)
    elif attacker_name == "PGD":
        attacker = ProjectedGradientDescent(classifier=classifier,
                                            norm=2,
                                            eps=1,
                                            eps_step=0.5)
    else:
        raise ValueError("Please get the right attacker's name for the input.")
    test_adv = attacker.generate(x_test)
    dt = time.time() - t_start
    return test_adv, dt
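GetAttackers is a factory: it builds the named ART attack around the given classifier, runs generate on x_test, and returns the adversarial batch together with the elapsed wall-clock time. A hypothetical call, assuming classifier and x_test were prepared earlier in the script:

# Hypothetical usage of GetAttackers; `classifier` is any wrapped ART
# classifier and `x_test` a NumPy batch defined elsewhere in the script.
for name in ["FGSM", "DeepFool", "BoundaryAttack"]:
    test_adv, dt = GetAttackers(classifier, x_test, name)
    print("{}: {} adversarial samples in {:.1f}s".format(name, len(test_adv), dt))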
Example #6
    def test_krclassifier(self):
        """
        Second test with the KerasClassifier.
        :return:
        """
        # Build KerasClassifier
        krc = get_classifier_kr()

        # First targeted attack
        boundary = BoundaryAttack(classifier=krc, targeted=True, max_iter=20)
        params = {'y': random_targets(self.y_test, krc.nb_classes())}
        x_test_adv = boundary.generate(self.x_test, **params)

        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        self.assertTrue((target == y_pred_adv).any())

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=krc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(self.x_test)

        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(krc.predict(self.x_test), axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())

        # Clean-up session
        k.clear_session()
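The targeted branch relies on random_targets from art.utils, which draws for each sample a class different from its true label and returns it one-hot encoded. A small sketch of the y that generate receives:

# Sketch of random_targets: a one-hot target class != the true label per sample.
import numpy as np
from art.utils import random_targets

y_true = np.eye(10)[[3, 3, 7]]      # one-hot true labels for three samples
y_tgt = random_targets(y_true, 10)  # one-hot random wrong classes
assert not np.any(np.argmax(y_tgt, axis=1) == np.argmax(y_true, axis=1))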
Example #7
    def test_iris_tf(self):
        classifier, _ = get_iris_classifier_tf()

        # Test untargeted attack
        attack = BoundaryAttack(classifier, targeted=False, max_iter=10)
        x_test_adv = attack.generate(self.x_test)
        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test, axis=1) == preds_adv).all())
        accuracy = np.sum(
            preds_adv == np.argmax(self.y_test, axis=1)) / self.y_test.shape[0]
        logger.info(
            'Accuracy on Iris with boundary adversarial examples: %.2f%%',
            (accuracy * 100))

        # Test targeted attack
        targets = random_targets(self.y_test, nb_classes=3)
        attack = BoundaryAttack(classifier, targeted=True, max_iter=10)
        x_test_adv = attack.generate(self.x_test, **{'y': targets})
        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
        accuracy = np.sum(
            preds_adv == np.argmax(targets, axis=1)) / self.y_test.shape[0]
        logger.info('Success rate of targeted boundary on Iris: %.2f%%',
                    (accuracy * 100))
Example #8
    def test_ptclassifier(self):
        """
        Third test with the PyTorchClassifier.
        :return:
        """
        # Build PyTorchClassifier
        ptc = get_classifier_pt()

        x_test = np.swapaxes(self.x_test, 1, 3).astype(np.float32)

        # First targeted attack
        boundary = BoundaryAttack(classifier=ptc, targeted=True, max_iter=20)
        params = {'y': random_targets(self.y_test, ptc.nb_classes())}
        x_test_adv = boundary.generate(x_test, **params)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((target == y_pred_adv).any())

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=ptc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(x_test)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(ptc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())
Example #9
    def test_classifier_type_check_fail_classifier(self):
        # Use a useless test classifier to test basic classifier properties
        class ClassifierNoAPI:
            pass

        classifier = ClassifierNoAPI
        with self.assertRaises(TypeError) as context:
            _ = BoundaryAttack(classifier=classifier)

        self.assertIn(
            'For `BoundaryAttack` classifier must be an instance of `art.classifiers.classifier.Classifier`, '
            'the provided classifier is instance of (<class \'object\'>,).',
            str(context.exception))
Example #10
def test_images(fix_get_mnist_subset, get_image_classifier_list_for_attack,
                framework, targeted):
    classifier_list = get_image_classifier_list_for_attack(BoundaryAttack)
    if classifier_list is None:
        logging.warning(
            "Couldn't perform this test because no classifier is defined")
        return

    for classifier in classifier_list:

        attack = BoundaryAttack(classifier=classifier,
                                targeted=targeted,
                                max_iter=20)
        if targeted:
            backend_targeted_images(attack, fix_get_mnist_subset)
        else:
            back_end_untargeted_images(attack, fix_get_mnist_subset, framework)
Example #11
def test_tabular(get_tabular_classifier_list, framework, get_iris_dataset,
                 clipped_classifier, targeted):
    classifier_list = get_tabular_classifier_list(BoundaryAttack,
                                                  clipped=clipped_classifier)
    if classifier_list is None:
        logging.warning(
            "Couldn't perform this test because no classifier is defined")
        return

    for classifier in classifier_list:

        attack = BoundaryAttack(classifier, targeted=targeted, max_iter=10)
        if targeted:
            backend_targeted_tabular(attack, get_iris_dataset)
        else:
            backend_untargeted_tabular(attack,
                                       get_iris_dataset,
                                       clipped=clipped_classifier)
Example #12
def GetAttackers(classifier, x_test, attacker_name):
    """
    Function:
        Load classifier and generate adversarial samples
    """
    t_start = time.time()
    if attacker_name == "AdversarialPatch":
        attacker = AdversarialPatch(classifier=classifier, max_iter=10)
    elif attacker_name == "Attack":
        attacker = Attack(classifier=classifier)
    elif attacker_name == "BoundaryAttack":
        attacker = BoundaryAttack(classifier=classifier,
                                  targeted=False,
                                  epsilon=0.05,
                                  max_iter=10)
    else:
        raise ValueError("Please get the right attacker's name for the input.")
    test_adv = attacker.generate(x_test)
    dt = time.time() - t_start
    return test_adv, dt
Example #13
    def test_keras_mnist(self):
        """
        Second test with the KerasClassifier.
        :return:
        """
        (_, _), (x_test, y_test) = self.mnist

        x_test_original = x_test.copy()

        # Build KerasClassifier
        krc = get_classifier_kr()

        # First targeted attack
        boundary = BoundaryAttack(classifier=krc, targeted=True, max_iter=20)
        params = {'y': random_targets(y_test, krc.nb_classes())}
        x_test_adv = boundary.generate(x_test, **params)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        self.assertTrue((target == y_pred_adv).any())

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=krc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(x_test)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(krc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(krc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        # Clean-up session
        k.clear_session()
Example #14
    def test_pytorch_mnist(self):
        """
        Third test with the PyTorchClassifier.
        :return:
        """
        (_, _), (x_test, y_test) = self.mnist
        x_test = np.swapaxes(x_test, 1, 3).astype(np.float32)
        x_test = np.reshape(x_test,
                            (x_test.shape[0], 1, 28, 28)).astype(np.float32)
        x_test_original = x_test.copy()

        # Build PyTorchClassifier
        ptc = get_classifier_pt()

        # First targeted attack
        boundary = BoundaryAttack(classifier=ptc, targeted=True, max_iter=20)
        params = {'y': random_targets(y_test, ptc.nb_classes())}
        x_test_adv = boundary.generate(x_test, **params)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((target == y_pred_adv).any())

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=ptc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(x_test)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(ptc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(ptc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)
Example #15
    def test_tfclassifier(self):
        """
        First test with the TFClassifier.
        :return:
        """
        # Build TFClassifier
        tfc, sess = get_classifier_tf()

        # Get MNIST
        (_, _), (x_test, y_test) = self.mnist

        # First targeted attack
        boundary = BoundaryAttack(classifier=tfc, targeted=True, max_iter=20)
        params = {'y': random_targets(y_test, tfc.nb_classes)}
        x_test_adv = boundary.generate(x_test, **params)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        target = np.argmax(params['y'], axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        self.assertTrue((target == y_pred_adv).any())

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=tfc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(x_test)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(tfc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())

        # Clean-up session
        sess.close()
        tf.reset_default_graph()
Example #16
                                   nb_classes=nclasses)

    test_data, test_label = load_svmlight_file(data_path, n_features=nfeatures)
    test_data = test_data.toarray()
    test_label = test_label.astype('int')
    n = len(test_label)
    df = pd.DataFrame(test_data)
    df['label'] = test_label
    df = df.sample(frac=1)
    test_label = df['label'].tolist()
    test_data = np.array(df.drop(columns=['label']))

    predictions = np.argmax(classifier.predict(test_data), axis=1)
    attack = BoundaryAttack(classifier=classifier,
                            targeted=False,
                            delta=0.05,
                            epsilon=0.05,
                            step_adapt=0.5)
    n_selected = 100
    corrected = []
    c_labels = []
    for i in range(len(test_label)):
        if test_label[i] == predictions[i]:
            corrected.append(test_data[i])
            c_labels.append(test_label[i])
        if len(corrected) >= n_selected:
            break
    corrected = np.array(corrected)
    start = time.time()
    test_adv = attack.generate(corrected)
    end = time.time()
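The fragment stops after timing generate. A hedged follow-up, reusing the variables above, would measure how many of the selected correctly-classified samples actually flipped and at what L2 cost:

# Follow-up sketch reusing `classifier`, `corrected`, `c_labels`, `test_adv`,
# `start` and `end` from above: flip rate and mean L2 perturbation.
adv_preds = np.argmax(classifier.predict(test_adv), axis=1)
flip_rate = np.mean(adv_preds != np.array(c_labels))
l2 = np.linalg.norm((test_adv - corrected).reshape(len(corrected), -1), axis=1)
print("attack time: {:.1f}s, flip rate: {:.2%}, mean L2: {:.4f}".format(
    end - start, flip_rate, l2.mean()))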
Example #17
    def test_tfclassifier(self):
        """
        First test with the TensorFlowClassifier.
        :return:
        """
        # Build TensorFlowClassifier
        tfc, sess = get_classifier_tf()

        # First targeted attack
        boundary = BoundaryAttack(classifier=tfc,
                                  targeted=True,
                                  max_iter=200,
                                  delta=0.5)
        params = {'y': random_targets(self.y_test, tfc.nb_classes())}
        x_test_adv = boundary.generate(self.x_test, **params)
        expected_x_test_adv_1 = np.asarray([
            0.42622495, 0.0, 0.0, 0.33005068, 0.2277837, 0.0, 0.18348512,
            0.42622495, 0.27452883, 0.0, 0.0, 0.0, 0.1653487, 0.70523715,
            0.7367977, 0.7974912, 0.28579983, 0.0, 0.36499417, 0.0, 0.0, 0.0,
            0.42622495, 0.0, 0.26680174, 0.42622495, 0.0, 0.19260764
        ])
        expected_x_test_adv_2 = np.asarray([
            0.0459, 0., 0., 0.0756, 0.2048, 0.037, 0., 0., 0.0126, 0.4338,
            0.1566, 0.3061, 0., 0.296, 0.8318, 0.7267, 0.2252, 0.074, 0.,
            0.1208, 0.4362, 0., 0., 0., 0., 0.0359, 0., 0.1191
        ])
        try:
            np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0],
                                                 expected_x_test_adv_1,
                                                 decimal=4)
        except AssertionError:
            np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0],
                                                 expected_x_test_adv_2,
                                                 decimal=4)
        self.assertLessEqual(np.max(x_test_adv), 1.0)
        self.assertGreaterEqual(np.min(x_test_adv), 0.0)

        y_pred_adv = tfc.predict(x_test_adv)
        y_pred_adv_expected = np.asarray([
            1.57103419e-01, -7.31061280e-01, -4.03979905e-02, -4.79048371e-01,
            9.37852338e-02, -8.01057637e-01, -4.77534801e-01, 1.08687377e+00,
            -3.06577891e-01, -5.74976981e-01
        ])
        np.testing.assert_array_almost_equal(y_pred_adv[0],
                                             y_pred_adv_expected,
                                             decimal=4)

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=tfc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(self.x_test)

        self.assertFalse((self.x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(tfc.predict(self.x_test), axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())

        # Clean-up session
        sess.close()
Example #18
    def test_tensorflow_mnist(self):
        """
        First test with the TensorFlowClassifier.
        :return:
        """
        (_, _), (x_test, y_test) = self.mnist

        x_test_original = x_test.copy()

        # Build TensorFlowClassifier
        tfc, sess = get_classifier_tf()

        # First targeted attack
        boundary = BoundaryAttack(classifier=tfc,
                                  targeted=True,
                                  max_iter=200,
                                  delta=0.5)
        params = {'y': random_targets(y_test, tfc.nb_classes())}
        x_test_adv = boundary.generate(x_test, **params)
        # expected_x_test_adv_1 = np.asarray([0.42622495, 0.0, 0.0, 0.33005068, 0.2277837, 0.0,
        #                                     0.18348512, 0.42622495, 0.27452883, 0.0, 0.0, 0.0,
        #                                     0.1653487, 0.70523715, 0.7367977, 0.7974912, 0.28579983, 0.0,
        #                                     0.36499417, 0.0, 0.0, 0.0, 0.42622495, 0.0,
        #                                     0.26680174, 0.42622495, 0.0, 0.19260764])
        # expected_x_test_adv_2 = np.asarray([0.0459, 0., 0., 0.0756, 0.2048, 0.037, 0., 0.,
        #                                     0.0126, 0.4338, 0.1566, 0.3061, 0., 0.296, 0.8318, 0.7267,
        #                                     0.2252, 0.074, 0., 0.1208, 0.4362, 0., 0., 0.,
        #                                     0., 0.0359, 0., 0.1191])
        #
        # expected_x_test_adv_3 = np.asarray([0.0671, 0.0644, 0.3012, 0., 0., 0., 0.3407, 0.,
        #                                     0.1507, 0.0478, 0.3253, 0., 0.3334, 0.3473, 1., 0.8649,
        #                                     0.5639, 0.5198, 0., 0., 0.6173, 0., 0.3116, 0.,
        #                                     0.3937, 0.6173, 0., 0.0021])
        # try:
        #     np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_1, decimal=4)
        # except AssertionError:
        #     try:
        #         np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_2, decimal=4)
        #     except AssertionError:
        #         np.testing.assert_array_almost_equal(x_test_adv[2, 14, :, 0], expected_x_test_adv_3, decimal=4)
        self.assertLessEqual(np.max(x_test_adv), 1.0)
        self.assertGreaterEqual(np.min(x_test_adv), 0.0)

        y_pred_adv = tfc.predict(x_test_adv)
        y_pred_adv_expected = np.asarray([
            1.57103419e-01, -7.31061280e-01, -4.03979905e-02, -4.79048371e-01,
            9.37852338e-02, -8.01057637e-01, -4.77534801e-01, 1.08687377e+00,
            -3.06577891e-01, -5.74976981e-01
        ])
        # np.testing.assert_array_almost_equal(y_pred_adv[0], y_pred_adv_expected, decimal=4)

        # Second untargeted attack
        boundary = BoundaryAttack(classifier=tfc, targeted=False, max_iter=20)
        x_test_adv = boundary.generate(x_test)

        self.assertFalse((x_test == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1.0001).all())
        self.assertTrue((x_test_adv >= -0.0001).all())

        y_pred = np.argmax(tfc.predict(x_test), axis=1)
        y_pred_adv = np.argmax(tfc.predict(x_test_adv), axis=1)
        self.assertTrue((y_pred != y_pred_adv).any())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        # Clean-up session
        sess.close()
Example #19
def adversarial_attack_shift(x, y, delta=1.0, model=RandomForestClassifier(), attack_type='zoo',
                             numerical_features=None, feat_delta=1.0):
    # Here, delta is the fraction of the held-out half of the data on which to generate attacks;
    # the first half, at a minimum, has to be used to train the model against which the attacks are generated.
    assert (attack_type in ['zoo', 'boundary', 'hop-skip-jump'])

    le = preprocessing.LabelEncoder()
    le.fit(np.squeeze(y))
    y = le.transform(y)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=(0.5 * delta))

    if numerical_features is not None:

        n_numerical = len(numerical_features)
        feat_indices = np.random.choice(n_numerical, ceil(n_numerical * feat_delta), replace=False)
        feat_indices = np.array(numerical_features)[feat_indices]

    else:

        feat_indices = np.random.choice(x.shape[1], ceil(x.shape[1] * feat_delta), replace=False)

    other_features = list(set(range(x.shape[1])) - set(feat_indices))

    x_train_other = x_train[:, other_features]
    x_train_numerical = x_train[:, feat_indices]
    x_test_other = x_test[:, other_features]
    x_test_numerical = x_test[:, feat_indices]

    classifier = SklearnClassifier(model=model, clip_values=(0, np.max(x_train_numerical)))

    # Train the ART classifier

    classifier.fit(x_train_numerical, y_train)

    # Evaluate the ART classifier on benign test examples

    predictions = classifier.predict(x_test_numerical)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    # Generate adversarial test examples
    if attack_type == 'zoo':
        attack = ZooAttack(
            classifier=classifier,
            confidence=0.0,
            targeted=False,
            learning_rate=1e-1,
            max_iter=10,
            binary_search_steps=10,
            initial_const=1e-3,
            abort_early=True,
            use_resize=False,
            use_importance=False,
            nb_parallel=x_test_numerical.shape[1],
            batch_size=1,
            variable_h=0.01,
        )
    elif attack_type == 'boundary':
        attack = BoundaryAttack(classifier, targeted=False, epsilon=0.02, max_iter=20, num_trial=10)
    elif attack_type == 'hop-skip-jump':
        attack = HopSkipJump(classifier,
                             targeted=False,
                             norm=2,
                             max_iter=20,
                             max_eval=10,
                             init_eval=9,
                             init_size=10)

    x_adv = attack.generate(x=x_test_numerical, y=y_test)

    # Evaluate the ART classifier on adversarial test examples

    predictions_adv = classifier.predict(x_adv)
    accuracy = np.sum(np.argmax(predictions_adv, axis=1) == y_test) / len(y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))
    print("Max difference: {}".format(np.max(np.abs(x_test_numerical - x_adv) / x_test_numerical)))

    x_final = np.zeros_like(x)
    x_final[:, feat_indices] = np.vstack([x_train_numerical, x_adv])
    x_final[:, other_features] = np.vstack([x_train_other, x_test_other])

    y_final = np.concatenate([y_train, y_test], axis=0)
    y_final = le.inverse_transform(y_final)

    adv_indices = list(range(len(y_train), len(y)))

    return x_final, y_final, adv_indices, feat_indices
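A hypothetical driver for adversarial_attack_shift, with a synthetic dataset standing in for real data (make_classification and the non-negativity shift are assumptions, not part of the original):

# Hypothetical driver; features are made non-negative so that
# clip_values=(0, max) inside adversarial_attack_shift is sensible.
import numpy as np
from sklearn.datasets import make_classification

x, y = make_classification(n_samples=200, n_features=10, random_state=0)
x = np.abs(x)
x_shift, y_shift, adv_idx, feat_idx = adversarial_attack_shift(
    x, y, delta=1.0, attack_type='boundary', feat_delta=0.5)
print("{} adversarial rows on features {}".format(len(adv_idx), feat_idx))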
Example #20
def attack_run_rejection_policy(model, hps):
    """
    An attack run with rejection policy.
    :param model: Pytorch model.
    :param hps: hyperparameters
    :return:
    """
    model.eval()

    # Get thresholds
    threshold_list1 = []
    threshold_list2 = []
    for label_id in range(hps.n_classes):
        # No data augmentation (crop_flip=False) when getting in-distribution thresholds
        dataset = get_dataset(data_name=hps.problem, train=True, label_id=label_id, crop_flip=False)
        in_test_loader = DataLoader(dataset=dataset, batch_size=hps.n_batch_test, shuffle=False)

        print('Inference on {}, label_id {}'.format(hps.problem, label_id))
        in_ll_list = []
        for batch_id, (x, y) in enumerate(in_test_loader):
            x = x.to(hps.device)
            y = y.to(hps.device)
            ll = model(x)

            correct_idx = ll.argmax(dim=1) == y

            ll_, y_ = ll[correct_idx], y[correct_idx]  # keep only the correctly classified samples
            in_ll_list += list(ll_[:, label_id].detach().cpu().numpy())

        thresh_idx = int(0.01 * len(in_ll_list))
        thresh1 = sorted(in_ll_list)[thresh_idx]
        thresh_idx = int(0.02 * len(in_ll_list))
        thresh2 = sorted(in_ll_list)[thresh_idx]
        threshold_list1.append(thresh1)  # 1st-percentile log-likelihood threshold
        threshold_list2.append(thresh2)  # 2nd-percentile log-likelihood threshold
        print('1st & 2nd percentile thresholds: {:.3f}, {:.3f}'.format(thresh1, thresh2))

    # Evaluation
    n_total = 0   # total number of samples correctly classified by the clean classifier
    n_successful_adv = 0  # total number of successful adversarial examples generated
    n_rejected_adv1 = 0   # successful adversarial examples rejected at the 1st-percentile threshold, <= n_successful_adv
    n_rejected_adv2 = 0   # successful adversarial examples rejected at the 2nd-percentile threshold, <= n_successful_adv

    attack_path = os.path.join(hps.attack_dir, hps.attack)
    if not os.path.exists(attack_path):
        os.mkdir(attack_path)

    thresholds1 = torch.tensor(threshold_list1).to(hps.device)
    thresholds2 = torch.tensor(threshold_list2).to(hps.device)

    l2_distortion_list = []
    n_eval = 0

    wrapped_target_model = PyTorchClassifier(model=model,
                                             loss=None,
                                             optimizer=None,
                                             input_shape=(hps.image_channel, 32, 32),
                                             nb_classes=hps.n_classes)

    if hps.attack == 'boundary':
        attack = BoundaryAttack(wrapped_target_model, targeted=hps.targeted)
    elif hps.attack == 'cw':
        attack = CarliniL2Method(wrapped_target_model, confidence=hps.cw_confidence, targeted=hps.targeted)


    hps.n_batch_test = 1
    for label_id in range(hps.n_classes):
        dataset = get_dataset(data_name=hps.problem, train=False, label_id=label_id)
        test_loader = DataLoader(dataset=dataset, batch_size=hps.n_batch_test, shuffle=False)
        for batch_id, (x, y) in enumerate(test_loader):
            # Note that images are scaled to [0., 1.0]
            x, y = x.to(hps.device), y.to(hps.device)
            with torch.no_grad():
                output = model(x)

            pred = output.argmax(dim=1)
            correct_idx = pred == y  # Only evaluate samples correctly classified by the clean classifier.
            x, y = x[correct_idx], y[correct_idx]

            n_eval += correct_idx.sum().item()

            for target_id in range(hps.n_classes):
                if label_id != target_id:
                    n_total += 1
                    y_cur = torch.LongTensor([target_id] * x.size(0)).to(hps.device)
                    # adv_x = adversary.perturb(x, y_cur)
                    x_ = x.cpu().numpy().astype(np.float32)
                    y_ = y_cur.cpu().numpy().astype(np.float32)
                    adv_x = attack.generate(x_, y_)

                    with torch.no_grad():
                        adv_x = torch.tensor(adv_x).to(hps.device)
                        output = model(adv_x)

                    logits, preds = output.max(dim=1)

                    success_idx = preds == y_cur
                    n_successful_adv += success_idx.sum().item()

                    diff = adv_x - x
                    l2_distortion = diff.norm(p=2, dim=-1).mean().item()  # mean l2 distortion
                    l2_distortion_list.append(l2_distortion)

                    rej_idx1 = logits < thresholds1[preds]
                    n_rejected_adv1 += rej_idx1.sum().item()

                    rej_idx2 = logits < thresholds2[preds]
                    n_rejected_adv2 += rej_idx2.sum().item()

            break  # only one batch

        print('Evaluating on samples of class {} ...'.format(label_id))

    reject_rate1 = n_rejected_adv1 / n_successful_adv
    reject_rate2 = n_rejected_adv2 / n_successful_adv
    success_adv_rate = n_successful_adv / n_total
    print('success rate of adv examples generation: {}/{}={:.4f}'.format(n_successful_adv, n_total, success_adv_rate))
    print('Mean L2 distortion of Adv Examples: {:.4f}'.format(np.mean(l2_distortion_list)))
    print('1st percentile, reject success rate: {}/{}={:.4f}'.format(n_rejected_adv1, n_successful_adv, reject_rate1))
    print('2nd percentile, reject success rate: {}/{}={:.4f}'.format(n_rejected_adv2, n_successful_adv, reject_rate2))
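attack_run_rejection_policy expects an hps object exposing the attributes it reads; a hypothetical driver follows, where every field is inferred from those attribute accesses rather than taken from the original script:

# Hypothetical hyperparameter object; all fields are assumptions inferred
# from the attribute accesses in attack_run_rejection_policy.
from types import SimpleNamespace

hps = SimpleNamespace(problem='cifar10', n_classes=10, n_batch_test=200,
                      image_channel=3, device='cuda', attack='boundary',
                      attack_dir='attacks', targeted=True, cw_confidence=0.0)
# attack_run_rejection_policy(model, hps)  # `model`: a trained classifier
# returning per-class log-likelihood scores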