Example #1
def __init__(self,
             model,
             max_iter=1000,
             search_iter=10,
             is_targeted=False,
             init_attack=None,
             sparse=True):
    super(PointWiseAttack, self).__init__()
    self._model = check_model('model', model, BlackModel)
    self._max_iter = check_int_positive('max_iter', max_iter)
    self._search_iter = check_int_positive('search_iter', search_iter)
    self._is_targeted = check_param_type('is_targeted', is_targeted, bool)
    if init_attack is None:
        self._init_attack = SaltAndPepperNoiseAttack(
            model, is_targeted=self._is_targeted)
    else:
        self._init_attack = init_attack
    self._sparse = check_param_type('sparse', sparse, bool)
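The constructor above only validates and stores its arguments. As a minimal usage sketch (assuming `model` is already wrapped in a BlackModel subclass, e.g. the ModelToBeAttacked fixture used in the tests below), the parameters map onto a call like this:

# Hedged sketch: `model` is assumed to be a BlackModel wrapper around a trained network.
attack = PointWiseAttack(model,
                         max_iter=1000,      # max rounds of pixel-recovery iteration
                         search_iter=10,     # max rounds of per-pixel binary search
                         is_targeted=False,  # untargeted attack
                         sparse=True)        # labels passed as class indices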
Example #2
def test_salt_and_pepper_attack_method():
    """
    Salt and pepper attack method unit test.
    """
    batch_size = 6
    np.random.seed(123)
    net = SimpleNet()
    inputs = np.random.rand(batch_size, 10)

    model = ModelToBeAttacked(net)
    labels = np.random.randint(low=0, high=10, size=batch_size)
    labels = np.eye(10)[labels]
    labels = labels.astype(np.float32)

    attack = SaltAndPepperNoiseAttack(model, sparse=False)
    is_adv, adv_data, query_times = attack.generate(inputs, labels)
    assert np.any(adv_data[0] != inputs[0]), 'Salt and pepper attack method: ' \
                                             'generate value must not be equal' \
                                             ' to original value.'
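Example #2 feeds one-hot labels because the attack is built with sparse=False. For comparison, a minimal sketch of the sparse=True case (reusing the SimpleNet and ModelToBeAttacked test fixtures assumed above), where the labels stay as plain class indices:

# Hedged sketch: same fixtures as the test above, labels kept as class indices.
np.random.seed(123)
net = SimpleNet()
model = ModelToBeAttacked(net)
inputs = np.random.rand(6, 10)
labels = np.random.randint(low=0, high=10, size=6)  # sparse (index) labels

attack = SaltAndPepperNoiseAttack(model, sparse=True)
is_adv, adv_data, query_times = attack.generate(inputs, labels)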
Example #3
def test_salt_and_pepper_attack_on_mnist():
    """
    Salt-and-Pepper-Attack test
    """
    # upload trained network
    ckpt_name = './trained_ckpt_file/checkpoint_lenet-10_1875.ckpt'
    net = LeNet5()
    load_dict = load_checkpoint(ckpt_name)
    load_param_into_net(net, load_dict)

    # get test data
    data_list = "./MNIST_unzip/test"
    batch_size = 32
    ds = generate_mnist_dataset(data_list, batch_size=batch_size)

    # prediction accuracy before attack
    model = ModelToBeAttacked(net)
    batch_num = 3  # the number of batches of attacking samples
    test_images = []
    test_labels = []
    predict_labels = []
    i = 0
    for data in ds.create_tuple_iterator():
        i += 1
        images = data[0].astype(np.float32)
        labels = data[1]
        test_images.append(images)
        test_labels.append(labels)
        pred_labels = np.argmax(model.predict(images), axis=1)
        predict_labels.append(pred_labels)
        if i >= batch_num:
            break
    LOGGER.debug(
        TAG,
        'model input image shape is: {}'.format(np.array(test_images).shape))
    predict_labels = np.concatenate(predict_labels)
    true_labels = np.concatenate(test_labels)
    accuracy = np.mean(np.equal(predict_labels, true_labels))
    LOGGER.info(TAG, "prediction accuracy before attacking is : %g", accuracy)

    # attacking
    is_target = False
    attack = SaltAndPepperNoiseAttack(model=model,
                                      is_targeted=is_target,
                                      sparse=True)
    if is_target:
        targeted_labels = np.random.randint(0, 10, size=len(true_labels))
        for i, true_l in enumerate(true_labels):
            if targeted_labels[i] == true_l:
                targeted_labels[i] = (targeted_labels[i] + 1) % 10
    else:
        targeted_labels = true_labels
    LOGGER.debug(
        TAG, 'input shape is: {}'.format(np.concatenate(test_images).shape))
    success_list, adv_data, query_list = attack.generate(
        np.concatenate(test_images), targeted_labels)
    success_list = np.arange(success_list.shape[0])[success_list]
    LOGGER.info(TAG, 'success_list: %s', success_list)
    LOGGER.info(TAG, 'average of query times is : %s', np.mean(query_list))
    adv_preds = []
    for ite_data in adv_data:
        pred_logits_adv = model.predict(ite_data)
        # rescale predict confidences into (0, 1).
        pred_logits_adv = softmax(pred_logits_adv, axis=1)
        adv_preds.extend(pred_logits_adv)
    accuracy_adv = np.mean(np.equal(np.argmax(adv_preds, axis=1),
                                    true_labels))
    LOGGER.info(TAG, "prediction accuracy after attacking is : %g",
                accuracy_adv)
    test_labels_onehot = np.eye(10)[true_labels]
    attack_evaluate = AttackEvaluate(np.concatenate(test_images),
                                     test_labels_onehot,
                                     adv_data,
                                     adv_preds,
                                     targeted=is_target,
                                     target_label=targeted_labels)
    LOGGER.info(TAG, 'mis-classification rate of adversaries is : %s',
                attack_evaluate.mis_classification_rate())
    LOGGER.info(TAG, 'The average confidence of adversarial class is : %s',
                attack_evaluate.avg_conf_adv_class())
    LOGGER.info(TAG, 'The average confidence of true class is : %s',
                attack_evaluate.avg_conf_true_class())
    LOGGER.info(
        TAG, 'The average distance (l0, l2, linf) between original '
        'samples and adversarial samples are: %s',
        attack_evaluate.avg_lp_distance())
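The softmax call above (likely scipy.special.softmax, which is an assumption here) rescales each row of logits into probabilities before evaluation. A minimal numpy equivalent, shown only to clarify that rescaling step:

import numpy as np

def softmax_rows(logits):
    """Row-wise softmax: subtract the row max for numerical stability."""
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)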
Example #4
class PointWiseAttack(Attack):
    """
    The Pointwise Attack makes sure to use the minimum number of changed pixels
    to generate an adversarial sample for each original sample. Those changed
    pixels are then refined with binary search so that the distance between the
    adversarial sample and the original sample is as small as possible.

    References: `L. Schott, J. Rauber, M. Bethge, W. Brendel: "Towards the
    first adversarially robust neural network model on MNIST", ICLR (2019)
    <https://arxiv.org/abs/1805.09190>`_

    Args:
        model (BlackModel): Target model.
        max_iter (int): Max rounds of iteration to generate adversarial image.
        search_iter (int): Max rounds of binary search.
        is_targeted (bool): If True, targeted attack. If False, untargeted
            attack. Default: False.
        init_attack (Attack): Attack used to find a starting point. Default:
            None.
        sparse (bool): If True, input labels are sparse-encoded. If False,
            input labels are one-hot-encoded. Default: True.

    Examples:
        >>> attack = PointWiseAttack(model)
    """
    def __init__(self,
                 model,
                 max_iter=1000,
                 search_iter=10,
                 is_targeted=False,
                 init_attack=None,
                 sparse=True):
        super(PointWiseAttack, self).__init__()
        self._model = check_model('model', model, BlackModel)
        self._max_iter = check_int_positive('max_iter', max_iter)
        self._search_iter = check_int_positive('search_iter', search_iter)
        self._is_targeted = check_param_type('is_targeted', is_targeted, bool)
        if init_attack is None:
            self._init_attack = SaltAndPepperNoiseAttack(
                model, is_targeted=self._is_targeted)
        else:
            self._init_attack = init_attack
        self._sparse = check_param_type('sparse', sparse, bool)

    def generate(self, inputs, labels):
        """
        Generate adversarial examples based on input samples and targeted labels.

        Args:
            inputs (numpy.ndarray): Benign input samples used as references to create
                adversarial examples.
            labels (numpy.ndarray): For targeted attack, labels are adversarial
                target labels. For untargeted attack, labels are ground-truth labels.

        Returns:
            - numpy.ndarray, bool values for each attack result.

            - numpy.ndarray, generated adversarial examples.

            - numpy.ndarray, query times for each sample.

        Examples:
            >>> is_adv_list, adv_list, query_times_each_adv = attack.generate(
            >>>     [[0.1, 0.2, 0.6], [0.3, 0, 0.4]],
            >>>     [2, 3])
        """
        arr_x, arr_y = check_pair_numpy_param('inputs', inputs, 'labels',
                                              labels)
        if not self._sparse:
            arr_y = np.argmax(arr_y, axis=1)
        ini_bool, ini_advs, ini_count = self._initialize_starting_point(
            arr_x, arr_y)
        is_adv_list = list()
        adv_list = list()
        query_times_each_adv = list()
        for sample, sample_label, start_adv, ite_bool, ite_c in zip(
                arr_x, arr_y, ini_advs, ini_bool, ini_count):
            if ite_bool:
                LOGGER.info(TAG, 'Start optimizing.')
                ori_label = np.argmax(
                    self._model.predict(np.expand_dims(sample, axis=0))[0])
                ini_label = np.argmax(
                    self._model.predict(np.expand_dims(start_adv, axis=0))[0])
                is_adv, adv_x, query_times = self._decision_optimize(
                    sample, sample_label, start_adv)
                adv_label = np.argmax(
                    self._model.predict(np.expand_dims(adv_x, axis=0))[0])
                LOGGER.debug(
                    TAG, 'before ini attack label is :{}'.format(ori_label))
                LOGGER.debug(TAG,
                             'after ini attack label is :{}'.format(ini_label))
                LOGGER.debug(
                    TAG, 'INPUT optimize label is :{}'.format(sample_label))
                LOGGER.debug(
                    TAG,
                    'after pointwise attack label is :{}'.format(adv_label))
                is_adv_list.append(is_adv)
                adv_list.append(adv_x)
                query_times_each_adv.append(query_times + ite_c)
            else:
                LOGGER.info(TAG, 'Initial sample is not adversarial, pass.')
                is_adv_list.append(False)
                adv_list.append(start_adv)
                query_times_each_adv.append(ite_c)
        is_adv_list = np.array(is_adv_list)
        adv_list = np.array(adv_list)
        query_times_each_adv = np.array(query_times_each_adv)
        LOGGER.debug(TAG, 'ret list is: {}'.format(adv_list))
        return is_adv_list, adv_list, query_times_each_adv

    def _decision_optimize(self, unperturbed_img, input_label, perturbed_img):
        """
        Make the perturbed sample more similar to the unperturbed sample while
        keeping it adversarial (preserving the perturbed label).

        Args:
            unperturbed_img (numpy.ndarray): Input sample as reference to create
                adversarial example.
            input_label (numpy.ndarray): Input label.
            perturbed_img (numpy.ndarray): Starting point to optimize.

        Returns:
            - bool, whether the optimized sample is adversarial.

            - numpy.ndarray, the optimized adversarial example.

            - int, number of model queries used during optimization.

        Raises:
            ValueError: If the unperturbed and perturbed samples have different dtypes.
        """
        query_count = 0
        img_size = unperturbed_img.size
        img_shape = unperturbed_img.shape
        perturbed_img = perturbed_img.reshape(-1)
        unperturbed_img = unperturbed_img.reshape(-1)
        recover = np.copy(perturbed_img)

        if unperturbed_img.dtype != perturbed_img.dtype:
            msg = 'unperturbed sample and perturbed sample must have the same' \
                  ' dtype, but got dtype of unperturbed is: {}, dtype of perturbed ' \
                  'is: {}'.format(unperturbed_img.dtype, perturbed_img.dtype)
            LOGGER.error(TAG, msg)
            raise ValueError(msg)
        l2_dis = np.linalg.norm(perturbed_img - unperturbed_img)
        LOGGER.debug(
            TAG, 'Before optimize, the l2 distance between original '
            'sample and adversarial sample is: {}'.format(l2_dis))
        # recover pixel if image is adversarial
        for _ in range(self._max_iter):
            is_improve = False
            # at the premise of adversarial feature, recover pixels
            pixels_ind = np.arange(img_size)
            mask = unperturbed_img != perturbed_img
            np.random.shuffle(pixels_ind)
            for ite_ind in pixels_ind:
                if mask[ite_ind]:
                    recover[ite_ind] = unperturbed_img[ite_ind]
                    query_count += 1
                    is_adv = self._model.is_adversarial(
                        recover.reshape(img_shape), input_label,
                        self._is_targeted)
                    if is_adv:
                        is_improve = True
                        perturbed_img[ite_ind] = recover[ite_ind]
                        break
                    else:
                        recover[ite_ind] = perturbed_img[ite_ind]
            l2_dis = np.linalg.norm(perturbed_img - unperturbed_img)
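            # Stop when no pixel could be recovered in this round, or when the
            # squared L2 distance, scaled by the square root of the pixel count,
            # drops below the early-stop threshold.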
            if not is_improve or (np.square(l2_dis) / np.sqrt(len(pixels_ind))
                                  <= self._get_threshold()):
                break
        LOGGER.debug(TAG, 'first round: Query count {}'.format(query_count))
        LOGGER.debug(TAG, 'Starting binary searches.')
        # tag the optimized pixels.
        mask = unperturbed_img != perturbed_img
        for _ in range(self._max_iter):
            is_improve = False
            pixels_ind = np.arange(img_size)
            np.random.shuffle(pixels_ind)
            for ite_ind in pixels_ind:
                if not mask[ite_ind]:
                    continue
                recover[ite_ind] = unperturbed_img[ite_ind]
                query_count += 1
                is_adv = self._model.is_adversarial(recover.reshape(img_shape),
                                                    input_label,
                                                    self._is_targeted)
                if is_adv:
                    is_improve = True
                    mask[ite_ind] = True
                    perturbed_img[ite_ind] = recover[ite_ind]
                    l2_dis = np.linalg.norm(perturbed_img - unperturbed_img)
                    LOGGER.debug(
                        TAG, 'Reset {}th pixel value to original, '
                        'l2 distance: {}.'.format(ite_ind, l2_dis))
                    break
                else:
                    # use binary searches
                    optimized_value, b_query = self._binary_search(
                        perturbed_img, unperturbed_img, ite_ind, input_label,
                        img_shape)
                    query_count += b_query
                    if optimized_value != perturbed_img[ite_ind]:
                        is_improve = True
                        mask[ite_ind] = True
                        perturbed_img[ite_ind] = optimized_value
                        l2_dis = np.linalg.norm(perturbed_img -
                                                unperturbed_img)
                        LOGGER.debug(
                            TAG, 'Moved {}th pixel value towards original via '
                            'binary search, l2 distance: {}.'.format(ite_ind, l2_dis))
                        break
            l2_dis = np.linalg.norm(perturbed_img - unperturbed_img)
            if not is_improve or (np.square(l2_dis) / np.sqrt(len(pixels_ind))
                                  <= self._get_threshold()):
                LOGGER.debug(TAG, 'Second-stage optimization finished.')
                break
        LOGGER.info(
            TAG, 'Optimization finished, query count is {}'.format(query_count))
        # return the optimized adversarial sample together with the query count
        return True, perturbed_img.reshape(img_shape), query_count

    def _binary_search(self, perturbed_img, unperturbed_img, ite_ind,
                       input_label, img_shape):
        """
        For a given pixel of the input, use binary search to find the value
        closest to the original pixel value that still keeps the sample adversarial.

        Args:
            perturbed_img (numpy.ndarray): Adversarial sample.
            unperturbed_img (numpy.ndarray): Input sample.
            ite_ind (int): The index of pixel in inputs.
            input_label (numpy.ndarray): Input labels.
            img_shape (tuple): Shape of the original sample.

        Returns:
            - float, adversarial pixel value.

            - int, number of model queries used by the search.
        """
        query_count = 0
        adv_value = perturbed_img[ite_ind]
        non_adv_value = unperturbed_img[ite_ind]
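        # Halve the interval between the adversarial value and the original value
        # for search_iter rounds, keeping the endpoint that stays adversarial.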
        for _ in range(self._search_iter):
            next_value = (adv_value + non_adv_value) / 2
            recover = np.copy(perturbed_img)
            recover[ite_ind] = next_value
            query_count += 1
            is_adversarial = self._model.is_adversarial(
                recover.reshape(img_shape), input_label, self._is_targeted)
            if is_adversarial:
                adv_value = next_value
            else:
                non_adv_value = next_value
        return adv_value, query_count

    def _initialize_starting_point(self, inputs, labels):
        """
        Use init_attack to generate initial adversarial examples as starting points.

        Args:
            inputs (numpy.ndarray): Benign input samples used as references to create
                adversarial examples.
            labels (numpy.ndarray): For targeted attack, labels are adversarial
                target labels. For untargeted attack, labels are ground-truth labels.

        Returns:
            - numpy.ndarray, bool values indicating whether each starting point is adversarial.

            - numpy.ndarray, adversarial images generated by the init_attack method.

            - numpy.ndarray, query times for each sample.
        """
        is_adv, start_adv, query_c = self._init_attack.generate(inputs, labels)
        return is_adv, start_adv, query_c

    def _get_threshold(self):
        """
        Return a float value; when the distance is smaller than this value,
        the optimization aborts early.

        Returns:
            float, the optimization threshold. The smaller the value, the
            better the resulting adversarial sample.
        """
        predefined_threshold = 0.01
        return predefined_threshold
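Putting the pieces together, a minimal end-to-end sketch of the PointWiseAttack API (assuming `model`, `test_images` and `true_labels` are prepared exactly as in Example #3):

# Hedged sketch: inputs and model come from the MNIST test above.
attack = PointWiseAttack(model, max_iter=1000, search_iter=10,
                         is_targeted=False, sparse=True)
is_adv, adv_images, query_times = attack.generate(
    np.concatenate(test_images), true_labels)
print('attack success rate: {:.2%}'.format(np.mean(is_adv)))
print('average query times: {:.1f}'.format(np.mean(query_times)))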