Example #1
    def __call__(self, x, transformed_classifier):
        """
        Perform the defensive distillation defence mechanism and return a more robust classifier.

        :param x: Dataset for training the transformed classifier.
        :type x: `np.ndarray`
        :param transformed_classifier: A classifier to be transformed for increased robustness. Note that the
            objective loss function used for fitting inside the input transformed_classifier must support soft labels,
            i.e. probability labels.
        :type transformed_classifier: :class:`.Classifier`
        :return: The transformed classifier.
        :rtype: :class:`.Classifier`
        """
        # Check if the trained classifier produces probability outputs
        preds = self.classifier.predict(x=x, batch_size=self.batch_size)
        are_probability = [is_probability(y) for y in preds]
        all_probability = np.sum(are_probability) == preds.shape[0]

        if not all_probability:
            raise ValueError("The input trained classifier do not produce probability outputs.")

        # Check if the transformed classifier produces probability outputs
        transformed_preds = transformed_classifier.predict(x=x, batch_size=self.batch_size)
        are_probability = [is_probability(y) for y in transformed_preds]
        all_probability = np.sum(are_probability) == transformed_preds.shape[0]

        if not all_probability:
            raise ValueError("The input transformed classifier do not produce probability outputs.")

        # Train the transformed classifier with soft labels
        transformed_classifier.fit(x=x, y=preds, batch_size=self.batch_size, nb_epochs=self.nb_epochs)

        return transformed_classifier
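A minimal usage sketch for the transformer above. The import path, the constructor arguments, and the objects `teacher`, `student` and `x_train` are assumptions for illustration: the teacher is assumed to be fitted and to output probabilities, while the student is unfitted and trained with a loss that accepts soft labels.

from art.defences.transformer.evasion import DefensiveDistillation  # assumed import path

# Assumed objects: `teacher` (fitted ART classifier with probability output),
# `student` (unfitted classifier whose loss supports soft labels), `x_train` (np.ndarray).
distiller = DefensiveDistillation(classifier=teacher, batch_size=128, nb_epochs=10)
robust_student = distiller(x=x_train, transformed_classifier=student)
soft_label_preds = robust_student.predict(x_train)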
Example #2
    def test_is_probability(self):
        probabilities = np.array([0.1, 0.3, 0.6])
        self.assertTrue(is_probability(probabilities))

        not_probabilities = np.array([0.1, 0.3, 0.8])
        self.assertFalse(is_probability(not_probabilities))

        not_probabilities = np.array([0.1, 0.3, 1.8])
        self.assertFalse(is_probability(not_probabilities))

        not_probabilities = np.array([-1.1, 0.3, 1.8])
        self.assertFalse(is_probability(not_probabilities))

        not_probabilities = np.array([-1.1, 0.3, 0.7])
        self.assertFalse(is_probability(not_probabilities))
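The assertions above pin down what is_probability accepts: every entry must lie in [0, 1] and the vector must sum to (approximately) 1. A minimal sketch consistent with those tests follows; the tolerance value is an assumption, not necessarily the library's exact choice.

import math
import numpy as np

def is_probability_sketch(vector: np.ndarray) -> bool:
    # Treat a vector as a probability distribution if it sums to ~1
    # and all entries lie within the unit interval.
    sums_to_one = math.isclose(float(np.sum(vector)), 1.0, rel_tol=1e-3)
    within_unit_interval = float(np.amin(vector)) >= 0.0 and float(np.amax(vector)) <= 1.0
    return sums_to_one and within_unit_interval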
    def __call__(self, preds):
        """
        Perform model postprocessing and return postprocessed output.

        :param preds: model output to be postprocessed.
        :type preds: `np.ndarray`
        :return: Postprocessed model output.
        :rtype: `np.ndarray`
        """
        # Generate random noise
        noise = np.random.normal(loc=0.0, scale=self.scale, size=preds.shape)

        # Add noise to model output
        post_preds = preds.copy()
        post_preds += noise

        if preds.shape[1] > 1:
            # Check if model output is logits or probability
            are_probability = [is_probability(x) for x in preds]
            all_probability = np.sum(are_probability) == preds.shape[0]

            # Finally normalize probability output
            if all_probability:
                post_preds[post_preds < 0.0] = 0.0
                sums = np.sum(post_preds, axis=1, keepdims=True)
                post_preds /= sums
        else:
            post_preds[post_preds < 0.0] = 0.0

        return post_preds
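A hedged usage sketch for the postprocessing defence above. The class name GaussianNoise, its argument names, and the objects `classifier` and `x_test` are assumptions for illustration.

from art.defences.postprocessor import GaussianNoise  # assumed import path

# Assumed objects: `classifier` (fitted ART classifier) and `x_test` (np.ndarray batch).
postprocessor = GaussianNoise(scale=0.1, apply_fit=False, apply_predict=True)
raw_preds = classifier.predict(x_test)
noisy_preds = postprocessor(raw_preds)  # probability rows are clipped at 0 and re-normalised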
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :param y: An array with the original labels to be predicted.
        :return: An array holding the adversarial examples.
        """
        x_adv = x.astype(ART_NUMPY_DTYPE)
        preds = self.estimator.predict(x, batch_size=self.batch_size)

        if is_probability(preds[0]):
            logger.warning(
                "It seems that the attacked model is predicting probabilities. DeepFool expects logits as model output "
                "to achieve its full attack strength.")

        # Determine the class labels for which to compute the gradients
        use_grads_subset = self.nb_grads < self.estimator.nb_classes
        if use_grads_subset:
            # TODO compute set of unique labels per batch
            grad_labels = np.argsort(-preds, axis=1)[:, :self.nb_grads]
            labels_set = np.unique(grad_labels)
        else:
            labels_set = np.arange(self.estimator.nb_classes)
        sorter = np.arange(len(labels_set))

        # Pick a small scalar to avoid division by 0
        tol = 10e-8

        # Compute perturbation with implicit batching
        for batch_id in trange(int(
                np.ceil(x_adv.shape[0] / float(self.batch_size))),
                               desc="DeepFool",
                               disable=not self.verbose):
            batch_index_1, batch_index_2 = batch_id * self.batch_size, (
                batch_id + 1) * self.batch_size
            batch = x_adv[batch_index_1:batch_index_2].copy()

            # Get predictions and gradients for batch
            f_batch = preds[batch_index_1:batch_index_2]
            fk_hat = np.argmax(f_batch, axis=1)
            if use_grads_subset:
                # Compute gradients only for top predicted classes
                grd = np.array([
                    self.estimator.class_gradient(batch, label=_)
                    for _ in labels_set
                ])
                grd = np.squeeze(np.swapaxes(grd, 0, 2), axis=0)
            else:
                # Compute gradients for all classes
                grd = self.estimator.class_gradient(batch)

            # Get current predictions
            active_indices = np.arange(len(batch))
            current_step = 0
            while active_indices.size > 0 and current_step < self.max_iter:
                # Compute difference in predictions and gradients only for selected top predictions
                labels_indices = sorter[np.searchsorted(labels_set,
                                                        fk_hat,
                                                        sorter=sorter)]
                grad_diff = grd - grd[np.arange(len(grd)),
                                      labels_indices][:, None]
                f_diff = f_batch[:,
                                 labels_set] - f_batch[np.arange(len(f_batch)),
                                                       labels_indices][:, None]

                # Choose coordinate and compute perturbation
                norm = np.linalg.norm(grad_diff.reshape(
                    len(grad_diff), len(labels_set), -1),
                                      axis=2) + tol
                value = np.abs(f_diff) / norm
                value[np.arange(len(value)), labels_indices] = np.inf
                l_var = np.argmin(value, axis=1)
                absolute1 = abs(f_diff[np.arange(len(f_diff)), l_var])
                draddiff = grad_diff[np.arange(len(grad_diff)),
                                     l_var].reshape(len(grad_diff), -1)
                pow1 = (pow(
                    np.linalg.norm(draddiff, axis=1),
                    2,
                ) + tol)
                r_var = absolute1 / pow1
                r_var = r_var.reshape((-1, ) + (1, ) * (len(x.shape) - 1))
                r_var = r_var * grad_diff[np.arange(len(grad_diff)), l_var]

                # Add perturbation and clip result
                if self.estimator.clip_values is not None:
                    batch[active_indices] = np.clip(
                        batch[active_indices] + r_var[active_indices] *
                        (self.estimator.clip_values[1] -
                         self.estimator.clip_values[0]),
                        self.estimator.clip_values[0],
                        self.estimator.clip_values[1],
                    )
                else:
                    batch[active_indices] += r_var[active_indices]

                # Recompute prediction for new x
                f_batch = self.estimator.predict(batch)
                fk_i_hat = np.argmax(f_batch, axis=1)

                # Recompute gradients for new x
                if use_grads_subset:
                    # Compute gradients only for (originally) top predicted classes
                    grd = np.array([
                        self.estimator.class_gradient(batch, label=_)
                        for _ in labels_set
                    ])
                    grd = np.squeeze(np.swapaxes(grd, 0, 2), axis=0)
                else:
                    # Compute gradients for all classes
                    grd = self.estimator.class_gradient(batch)

                # Stop if misclassification has been achieved
                active_indices = np.where(fk_i_hat == fk_hat)[0]

                current_step += 1

            # Apply overshoot parameter
            x_adv1 = x_adv[batch_index_1:batch_index_2]
            x_adv2 = (1 + self.epsilon) * (batch -
                                           x_adv[batch_index_1:batch_index_2])
            x_adv[batch_index_1:batch_index_2] = x_adv1 + x_adv2
            if self.estimator.clip_values is not None:
                np.clip(
                    x_adv[batch_index_1:batch_index_2],
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                    out=x_adv[batch_index_1:batch_index_2],
                )

        logger.info(
            "Success rate of DeepFool attack: %.2f%%",
            100 * compute_success(
                self.estimator, x, y, x_adv, batch_size=self.batch_size),
        )
        return x_adv
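A short usage sketch for the generate method above; the objects `classifier` and `x_test` are assumptions, and the classifier is assumed to return logits so that DeepFool reaches its full attack strength.

from art.attacks.evasion import DeepFool  # assumed import path

# Assumed objects: `classifier` (fitted ART classifier returning logits), `x_test` (np.ndarray).
attack = DeepFool(classifier, max_iter=100, epsilon=1e-6, nb_grads=10, batch_size=32)
x_adv = attack.generate(x=x_test)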
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate an adversarial patch and return the patch and its mask in arrays.

        :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
        :param y: An array with the original true labels.
        :param mask: A boolean array of shape equal to the shape of a single sample (1, H, W) or the shape of `x`
                     (N, H, W) without the channel dimension. Any feature for which the mask is True can be the
                     center location of the patch during sampling.
        :type mask: `np.ndarray`
        :param reset_patch: If `True`, reset the patch to its initial value (the mean of the minimal and maximal clip
                            values); if `False` (default), continue from the patch values created by a previous call
                            to `generate`, or from the initial value if this is the first call to `generate`.
        :type reset_patch: bool
        :return: An array with adversarial patch and an array of the patch mask.
        """
        import tensorflow as tf  # lgtm [py/repeated-import]

        shuffle = kwargs.get("shuffle", True)
        mask = kwargs.get("mask")
        if mask is not None:
            mask = mask.copy()
        mask = self._check_mask(mask=mask, x=x)

        if kwargs.get("reset_patch"):
            self.reset_patch(initial_patch_value=self._initial_value)

        y = check_and_transform_label_format(
            labels=y, nb_classes=self.estimator.nb_classes)

        # check if logits or probabilities
        y_pred = self.estimator.predict(x=x[[0]])

        if is_probability(y_pred):
            self.use_logits = False
        else:
            self.use_logits = True

        if mask is None:
            if shuffle:
                dataset = (tf.data.Dataset.from_tensor_slices(
                    (x, y)).shuffle(10000).batch(self.batch_size).repeat(
                        math.ceil(x.shape[0] / self.batch_size)))
            else:
                dataset = (tf.data.Dataset.from_tensor_slices(
                    (x, y)).batch(self.batch_size).repeat(
                        math.ceil(x.shape[0] / self.batch_size)))
        else:
            if shuffle:
                dataset = (tf.data.Dataset.from_tensor_slices(
                    (x, y, mask)).shuffle(10000).batch(self.batch_size).repeat(
                        math.ceil(x.shape[0] / self.batch_size)))
            else:
                dataset = (tf.data.Dataset.from_tensor_slices(
                    (x, y, mask)).batch(self.batch_size).repeat(
                        math.ceil(x.shape[0] / self.batch_size)))

        for _ in trange(self.max_iter,
                        desc="Adversarial Patch TensorFlow v2",
                        disable=not self.verbose):
            if mask is None:
                for images, target in dataset:
                    _ = self._train_step(images=images,
                                         target=target,
                                         mask=None)
            else:
                for images, target, mask_i in dataset:
                    _ = self._train_step(images=images,
                                         target=target,
                                         mask=mask_i)

        return (
            self._patch.numpy(),
            self._get_circular_patch_mask(nb_samples=1).numpy()[0],
        )
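A hedged usage sketch for the TensorFlow v2 adversarial patch generator above. The constructor arguments and the objects `classifier`, `x_train` and `y_train` are assumptions; `x_train` is assumed to hold NHWC images and `y_train` one-hot labels.

from art.attacks.evasion import AdversarialPatchTensorFlowV2  # assumed import path

# Assumed objects: `classifier` (TensorFlowV2Classifier), `x_train`, `y_train`.
attack = AdversarialPatchTensorFlowV2(classifier, rotation_max=22.5, scale_min=0.1,
                                      scale_max=1.0, learning_rate=5.0, max_iter=500,
                                      batch_size=16, patch_shape=(32, 32, 3))
patch, patch_mask = attack.generate(x=x_train, y=y_train)
x_patched = attack.apply_patch(x_train, scale=0.5)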
def PDTP(  # pylint: disable=C0103
    target_estimator: "Classifier",
    extra_estimator: "Classifier",
    x: np.ndarray,
    y: np.ndarray,
    indexes: Optional[np.ndarray] = None,
    num_iter: Optional[int] = 10,
) -> np.ndarray:
    """
    Compute the pointwise differential training privacy metric for the given classifier and training set.
    | Paper link: https://arxiv.org/abs/1712.09136

    :param target_estimator: The classifier to be analyzed.
    :param extra_estimator: Another classifier of the same type as the target classifier, but not yet fit.
    :param x: The training data of the classifier.
    :param y: Target values (class labels) of `x`, one-hot-encoded of shape (nb_samples, nb_classes) or indices of
              shape (nb_samples,).
    :param indexes: the subset of indexes of `x` to compute the PDTP metric on. If not supplied, PDTP will be
                    computed for all samples in `x`.
    :param num_iter: the number of iterations of PDTP computation to run for each sample. If not supplied,
                     defaults to 10. The result is the average across iterations.
    :return: an array containing the average PDTP value for each sample in the training set. The higher the value,
             the higher the privacy leakage for that sample.
    """
    from art.estimators.classification.pytorch import PyTorchClassifier
    from art.estimators.classification.tensorflow import TensorFlowV2Classifier
    from art.estimators.classification.scikitlearn import ScikitlearnClassifier

    supported_classifiers = (PyTorchClassifier, TensorFlowV2Classifier,
                             ScikitlearnClassifier)

    if not isinstance(target_estimator,
                      supported_classifiers) or not isinstance(
                          extra_estimator, supported_classifiers):
        raise ValueError(
            "PDTP metric only supports classifiers of type PyTorch, TensorFlowV2 and ScikitLearn."
        )
    if target_estimator.input_shape[0] != x.shape[1]:
        raise ValueError("Shape of x does not match input_shape of classifier")
    y = check_and_transform_label_format(y, target_estimator.nb_classes)
    if y.shape[0] != x.shape[0]:
        raise ValueError("Number of rows in x and y do not match")

    results = []

    for _ in range(num_iter):
        iter_results = []
        # get probabilities from original model
        pred = target_estimator.predict(x)
        if not is_probability(pred):
            try:
                pred = scipy.special.softmax(pred, axis=1)
            except Exception as exc:
                raise ValueError(
                    "PDTP metric only supports classifiers that output logits or probabilities."
                ) from exc
        # divide into 100 bins and return center of bin
        bins = np.array(np.arange(0.0, 1.01, 0.01).round(decimals=2))
        pred_bin_indexes = np.digitize(pred, bins)
        pred_bin = bins[pred_bin_indexes] - 0.005

        if indexes is None:
            indexes = range(x.shape[0])
        for row in indexes:
            # create new model without sample in training data
            alt_x = np.delete(x, row, 0)
            alt_y = np.delete(y, row, 0)
            try:
                extra_estimator.reset()
            except NotImplementedError as exc:
                raise ValueError(
                    "PDTP metric can only be applied to classifiers that implement the reset method."
                ) from exc
            extra_estimator.fit(alt_x, alt_y)
            # get probabilities from new model
            alt_pred = extra_estimator.predict(x)
            if not is_probability(alt_pred):
                alt_pred = scipy.special.softmax(alt_pred, axis=1)
            # divide into 100 bins and return center of bin
            alt_pred_bin_indexes = np.digitize(alt_pred, bins)
            alt_pred_bin = bins[alt_pred_bin_indexes] - 0.005
            ratio_1 = pred_bin / alt_pred_bin
            ratio_2 = alt_pred_bin / pred_bin
            # get max value
            max_value = max(ratio_1.max(), ratio_2.max())
            iter_results.append(max_value)
        results.append(iter_results)

    # get average of iterations for each sample
    # We now have a list of lists; each internal list represents one iteration. We need to transpose and get averages.
    per_sample = list(map(list, zip(*results)))
    avg_per_sample = np.array([sum(val) / len(val) for val in per_sample])

    # return leakage per sample
    return avg_per_sample
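A hedged usage sketch for the PDTP metric above. The import path and the objects `target_model`, `shadow_model`, `x_train` and `y_train` are assumptions; the extra estimator must be an unfitted classifier of the same type as the target.

import numpy as np
from art.metrics import PDTP  # assumed import path

# Assumed objects: `target_model` (fitted classifier), `shadow_model` (same type, unfitted),
# and the training data (x_train, y_train).
leakage = PDTP(target_model, shadow_model, x_train, y_train,
               indexes=np.arange(10), num_iter=5)
print("Average PDTP per sample:", leakage)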
    def __init__(
        self,
        estimator: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
        norm: Union[int, float, str] = np.inf,
        eps: float = 0.3,
        eps_step: float = 0.1,
        max_iter: int = 100,
        targeted: bool = False,
        nb_random_init: int = 5,
        batch_size: int = 32,
        loss_type: Optional[str] = None,
        verbose: bool = True,
    ):
        """
        Create a :class:`.AutoProjectedGradientDescent` instance.

        :param estimator: A trained estimator.
        :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2.
        :param eps: Maximum perturbation that the attacker can introduce.
        :param eps_step: Attack step size (input variation) at each iteration.
        :param max_iter: The maximum number of iterations.
        :param targeted: Indicates whether the attack is targeted (True) or untargeted (False).
        :param nb_random_init: Number of random initialisations within the epsilon ball. For nb_random_init=0 the
            attack starts at the original input.
        :param batch_size: Size of the batch on which adversarial samples are generated.
        :param loss_type: Defines the loss to attack. Available options: None (Use loss defined by estimator),
            "cross_entropy", or "difference_logits_ratio"
        :param verbose: Show progress bars.
        """
        from art.estimators.classification import TensorFlowClassifier, TensorFlowV2Classifier, PyTorchClassifier

        if loss_type not in self._predefined_losses:
            raise ValueError(
                "The argument loss_type has an invalid value. The following options for `loss_type` are currently "
                "supported: {}".format(self._predefined_losses)
            )

        if loss_type is None:
            if hasattr(estimator, "predict") and is_probability(
                estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
            ):
                raise ValueError(
                    "AutoProjectedGradientDescent is expecting logits as estimator output, the provided "
                    "estimator seems to predict probabilities."
                )

            estimator_apgd = estimator
        else:
            if isinstance(estimator, TensorFlowClassifier):
                import tensorflow as tf

                if loss_type == "cross_entropy":
                    if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                        raise NotImplementedError("Cross-entropy loss is not implemented for probability output.")

                    self._loss_object = tf.reduce_mean(
                        tf.keras.losses.categorical_crossentropy(
                            y_pred=estimator._output, y_true=estimator._labels_ph, from_logits=True
                        )
                    )

                elif loss_type == "difference_logits_ratio":
                    if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                        raise ValueError(
                            "The provided estimator seems to predict probabilities. "
                            "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                        )

                    raise ValueError(
                        "The loss `difference_logits_ratio` has not been validate completely. It seems that the "
                        "commented implemented below is failing to selected the second largest logit for cases "
                        "where the largest logit is the true logit. For future work `difference_logits_ratio` and "
                        "loss_fn should return the same loss value."
                    )

                    # def difference_logits_ratio(y_true, y_pred):
                    #     i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                    #     i_y_pred_arg = tf.argsort(y_pred, axis=1)
                    #     # Not completely sure if the following line is correct.
                    #     # `i_y_pred_arg[:, -2], i_y_pred_arg[:, -1]` seems closer to the output of `loss_fn` than
                    #     # `i_y_pred_arg[:, -1], i_y_pred_arg[:, -2]`
                    #     i_z_i = tf.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -2],
                    #                      i_y_pred_arg[:, -1])
                    #
                    #     z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                    #     z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                    #     z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                    #     z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)
                    #
                    #     z_1 = tf.linalg.diag_part(z_1)
                    #     z_3 = tf.linalg.diag_part(z_3)
                    #     z_i = tf.linalg.diag_part(z_i)
                    #     z_y = tf.linalg.diag_part(z_y)
                    #
                    #     dlr = -(z_y - z_i) / (z_1 - z_3)
                    #
                    #     return tf.reduce_mean(dlr)
                    #
                    # def loss_fn(y_true, y_pred):
                    #     i_y_true = np.argmax(y_true, axis=1)
                    #     i_y_pred_arg = np.argsort(y_pred, axis=1)
                    #     i_z_i = np.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -1],
                    #                      i_y_pred_arg[:, -2])
                    #
                    #     z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                    #     z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                    #     z_i = y_pred[:, i_z_i]
                    #     z_y = y_pred[:, i_y_true]
                    #
                    #     z_1 = np.diag(z_1)
                    #     z_3 = np.diag(z_3)
                    #     z_i = np.diag(z_i)
                    #     z_y = np.diag(z_y)
                    #
                    #     dlr = -(z_y - z_i) / (z_1 - z_3)
                    #
                    #     return np.mean(dlr)
                    #
                    # self._loss_fn = loss_fn
                    # self._loss_object = difference_logits_ratio(y_true=estimator._labels_ph,
                    #                                             y_pred=estimator._output)

                estimator_apgd = TensorFlowClassifier(
                    input_ph=estimator._input_ph,
                    output=estimator._output,
                    labels_ph=estimator._labels_ph,
                    train=estimator._train,
                    loss=self._loss_object,
                    learning=estimator._learning,
                    sess=estimator._sess,
                    channels_first=estimator.channels_first,
                    clip_values=estimator.clip_values,
                    preprocessing_defences=estimator.preprocessing_defences,
                    postprocessing_defences=estimator.postprocessing_defences,
                    preprocessing=estimator.preprocessing,
                    feed_dict=estimator._feed_dict,
                )

            elif isinstance(estimator, TensorFlowV2Classifier):
                import tensorflow as tf

                if loss_type == "cross_entropy":
                    if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                        self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
                    else:
                        self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
                elif loss_type == "difference_logits_ratio":
                    if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                        raise ValueError(
                            "The provided estimator seems to predict probabilities. "
                            "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                        )

                    class difference_logits_ratio:
                        def __init__(self):
                            self.reduction = "mean"

                        def __call__(self, y_true, y_pred):
                            i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                            i_y_pred_arg = tf.argsort(y_pred, axis=1)
                            i_z_i_list = list()

                            for i in range(y_true.shape[0]):
                                if i_y_pred_arg[i, -1] != i_y_true[i]:
                                    i_z_i_list.append(i_y_pred_arg[i, -1])
                                else:
                                    i_z_i_list.append(i_y_pred_arg[i, -2])

                            i_z_i = tf.stack(i_z_i_list)

                            z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                            z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                            z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                            z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                            z_1 = tf.linalg.diag_part(z_1)
                            z_3 = tf.linalg.diag_part(z_3)
                            z_i = tf.linalg.diag_part(z_i)
                            z_y = tf.linalg.diag_part(z_y)

                            dlr = -(z_y - z_i) / (z_1 - z_3)

                            return tf.reduce_mean(dlr)

                    self._loss_fn = difference_logits_ratio()
                    self._loss_object = difference_logits_ratio()

                estimator_apgd = TensorFlowV2Classifier(
                    model=estimator.model,
                    nb_classes=estimator.nb_classes,
                    input_shape=estimator.input_shape,
                    loss_object=self._loss_object,
                    train_step=estimator._train_step,
                    channels_first=estimator.channels_first,
                    clip_values=estimator.clip_values,
                    preprocessing_defences=estimator.preprocessing_defences,
                    postprocessing_defences=estimator.postprocessing_defences,
                    preprocessing=estimator.preprocessing,
                )
            elif isinstance(estimator, PyTorchClassifier):
                import torch

                if loss_type == "cross_entropy":
                    if is_probability(
                        estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
                    ):
                        raise ValueError(
                            "The provided estimator seems to predict probabilities. If loss_type='cross_entropy' "
                            "the estimator has to to predict logits."
                        )

                    self._loss_object = torch.nn.CrossEntropyLoss(reduction="mean")
                elif loss_type == "difference_logits_ratio":
                    if is_probability(
                        estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=ART_NUMPY_DTYPE))
                    ):
                        raise ValueError(
                            "The provided estimator seems to predict probabilities. "
                            "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                        )

                    class difference_logits_ratio:
                        def __init__(self):
                            self.reduction = "mean"

                        def __call__(self, y_pred, y_true):  # type: ignore
                            if isinstance(y_true, np.ndarray):
                                y_true = torch.from_numpy(y_true)
                            if isinstance(y_pred, np.ndarray):
                                y_pred = torch.from_numpy(y_pred)

                            y_true = y_true.float()

                            i_y_true = torch.argmax(y_true, axis=1)
                            i_y_pred_arg = torch.argsort(y_pred, axis=1)
                            i_z_i_list = list()

                            for i in range(y_true.shape[0]):
                                if i_y_pred_arg[i, -1] != i_y_true[i]:
                                    i_z_i_list.append(i_y_pred_arg[i, -1])
                                else:
                                    i_z_i_list.append(i_y_pred_arg[i, -2])

                            i_z_i = torch.stack(i_z_i_list)

                            z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                            z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                            z_i = y_pred[:, i_z_i]
                            z_y = y_pred[:, i_y_true]

                            z_1 = torch.diagonal(z_1)
                            z_3 = torch.diagonal(z_3)
                            z_i = torch.diagonal(z_i)
                            z_y = torch.diagonal(z_y)

                            dlr = -(z_y - z_i) / (z_1 - z_3)

                            return torch.mean(dlr.float())

                    self._loss_object = difference_logits_ratio()

                estimator_apgd = PyTorchClassifier(
                    model=estimator.model,
                    loss=self._loss_object,
                    input_shape=estimator.input_shape,
                    nb_classes=estimator.nb_classes,
                    optimizer=None,
                    channels_first=estimator.channels_first,
                    clip_values=estimator.clip_values,
                    preprocessing_defences=estimator.preprocessing_defences,
                    postprocessing_defences=estimator.postprocessing_defences,
                    preprocessing=estimator.preprocessing,
                    device_type=estimator._device,
                )

            else:
                raise ValueError("The loss type {} is not supported for the provided estimator.".format(loss_type))

        super().__init__(estimator=estimator_apgd)
        self.norm = norm
        self.eps = eps
        self.eps_step = eps_step
        self.max_iter = max_iter
        self.targeted = targeted
        self.nb_random_init = nb_random_init
        self.batch_size = batch_size
        self.loss_type = loss_type
        self.verbose = verbose
        self._check_params()
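A hedged usage sketch for the constructor above; the objects `classifier`, `x_test` and `y_test` are assumptions, and the classifier is assumed to output logits as required for loss_type="cross_entropy".

import numpy as np
from art.attacks.evasion import AutoProjectedGradientDescent  # assumed import path

# Assumed objects: `classifier` (fitted ART classifier returning logits), `x_test`, `y_test`.
attack = AutoProjectedGradientDescent(estimator=classifier, norm=np.inf, eps=0.3,
                                      eps_step=0.1, max_iter=100, targeted=False,
                                      nb_random_init=5, batch_size=32,
                                      loss_type="cross_entropy")
x_adv = attack.generate(x=x_test, y=y_test)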
Example #8
    def generate(self,
                 x: np.ndarray,
                 y: Optional[np.ndarray] = None,
                 **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate an adversarial patch and return the patch and its mask in arrays.

        :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
        :param y: An array with the original true labels.
        :param mask: A boolean array of shape equal to the shape of a single sample (1, H, W) or the shape of `x`
                     (N, H, W) without the channel dimension. Any feature for which the mask is True can be the
                     center location of the patch during sampling.
        :type mask: `np.ndarray`
        :param reset_patch: If `True`, reset the patch to its initial value (the mean of the minimal and maximal clip
                            values); if `False` (default), continue from the patch values created by a previous call
                            to `generate`, or from the initial value if this is the first call to `generate`.
        :type reset_patch: bool
        :return: An array with adversarial patch and an array of the patch mask.
        """
        import tensorflow as tf  # lgtm [py/repeated-import]

        shuffle = kwargs.get("shuffle", True)
        mask = kwargs.get("mask")
        if mask is not None:
            mask = mask.copy()
        mask = self._check_mask(mask=mask, x=x)

        if y is None:  # pragma: no cover
            logger.info(
                "Setting labels to estimator predictions and running untargeted attack because `y=None`."
            )
            y = to_categorical(np.argmax(self.estimator.predict(x=x), axis=1),
                               nb_classes=self.estimator.nb_classes)
            self.targeted = False
        else:
            self.targeted = True

        if kwargs.get("reset_patch"):
            self.reset_patch(initial_patch_value=self._initial_value)

        y = check_and_transform_label_format(
            labels=y, nb_classes=self.estimator.nb_classes)

        # check if logits or probabilities
        y_pred = self.estimator.predict(x=x[[0]])

        if is_probability(y_pred):
            self.use_logits = False
        else:
            self.use_logits = True

        if mask is None:
            if shuffle:
                dataset = tf.data.Dataset.from_tensor_slices(
                    (x, y)).shuffle(10000).batch(self.batch_size)
            else:
                dataset = tf.data.Dataset.from_tensor_slices(
                    (x, y)).batch(self.batch_size)
        else:
            if shuffle:
                dataset = tf.data.Dataset.from_tensor_slices(
                    (x, y, mask)).shuffle(10000).batch(self.batch_size)
            else:
                dataset = tf.data.Dataset.from_tensor_slices(
                    (x, y, mask)).batch(self.batch_size)

        for i_iter in trange(self.max_iter,
                             desc="Adversarial Patch TensorFlow v2",
                             disable=not self.verbose):
            if mask is None:
                counter = 0
                for images, target in dataset:
                    counter += 1
                    _ = self._train_step(images=images,
                                         target=target,
                                         mask=None)
            else:
                for images, target, mask_i in dataset:
                    _ = self._train_step(images=images,
                                         target=target,
                                         mask=mask_i)

            if self.summary_writer is not None:  # pragma: no cover
                self.summary_writer.add_image(
                    "patch",
                    self._patch.numpy().transpose((2, 0, 1)),
                    global_step=i_iter,
                )

                if hasattr(self.estimator, "compute_losses"):
                    x_patched = self._random_overlay(images=x,
                                                     patch=self._patch,
                                                     mask=mask)
                    losses = self.estimator.compute_losses(x=x_patched, y=y)

                    for key, value in losses.items():
                        self.summary_writer.add_scalar(
                            "loss/{}".format(key),
                            np.mean(value),
                            global_step=i_iter,
                        )

        return (
            self._patch.numpy(),
            self._get_circular_patch_mask(nb_samples=1).numpy()[0],
        )
Example #9
    def generate(self,
                 inputs: np.ndarray,
                 labels: Optional[np.ndarray] = None,
                 **kwargs) -> np.ndarray:
        """
        Create adversarial examples and return them in a NumPy array.

        :param inputs: Array with original inputs (to be attacked)
        :param labels: Array with original labels (to be predicted)
        :return: Array containing adversarial examples
        """

        adv_inputs = inputs.astype(ART_NUMPY_DTYPE)
        predictions = self.estimator.predict(inputs,
                                             batch_size=self.batch_size)

        if is_probability(predictions[0]):
            logger.warning(
                "Targeted model should output logits, not probabilities for predictions."
            )

        # Determine class labels for gradients
        use_grad_subset, labels_set = self.define_class_labels(predictions)
        sorter = np.arange(len(labels_set))

        # Calculate perturbation with batch
        for batch_nb in trange(int(
                np.ceil(adv_inputs.shape[0] / float(self.batch_size))),
                               desc="DeepFool",
                               disable=not self.show_prog):
            batch_idx_1, batch_idx_2 = batch_nb * self.batch_size, (
                batch_nb + 1) * self.batch_size
            batch = adv_inputs[batch_idx_1:batch_idx_2].copy()

            # Predictions for batch
            f_batch, fk_hat = self.batch_predict(predictions, batch_idx_1,
                                                 batch_idx_2)

            # Gradient for batch
            grads = self.batch_gradient(batch, use_grad_subset, labels_set)

            # Gets current predictions
            active_idxs = np.arange(len(batch))
            step = 0
            while (active_idxs.size > 0) and (step < self.total_iter):
                # Difference in gradients and predictions for selected predictions
                labels_idxs = sorter[np.searchsorted(labels_set,
                                                     fk_hat,
                                                     sorter=sorter)]
                grad_dif = grads - grads[np.arange(len(grads)),
                                         labels_idxs][:, None]
                f_dif = f_batch[:,
                                labels_set] - f_batch[np.arange(len(f_batch)),
                                                      labels_idxs][:, None]

                # Select coordinate and compute perturbation
                r_var = self.perturbation(adv_inputs, labels_set, labels_idxs,
                                          grad_dif, f_dif)

                # Add perturbation and clip result
                if self.estimator.clip_values is not None:
                    batch[active_idxs] = np.clip(
                        batch[active_idxs] + r_var[active_idxs] *
                        (self.estimator.clip_values[1] -
                         self.estimator.clip_values[0]),
                        self.estimator.clip_values[0],
                        self.estimator.clip_values[1],
                    )
                else:
                    batch[active_idxs] += r_var[active_idxs]

                # Recalculate prediction
                f_batch = self.estimator.predict(batch)
                fk_i_hat = np.argmax(f_batch, axis=1)

                # Recalculate gradient
                grads = self.batch_gradient(batch, use_grad_subset, labels_set)

                # Check if misclassification has occurred
                active_idxs = np.where(fk_i_hat == fk_hat)[0]

                step += 1

            # Apply overshoot parameters
            adv_inputs[batch_idx_1:batch_idx_2] = self.overshoot(
                adv_inputs, batch_idx_1, batch_idx_2, batch, batch_nb)

            if self.estimator.clip_values is not None:
                np.clip(
                    adv_inputs[batch_idx_1:batch_idx_2],
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                    out=adv_inputs[batch_idx_1:batch_idx_2],
                )

        logger.info(
            "DeepFool attack success rate: %.2f%%",
            100 * compute_success(self.estimator,
                                  inputs,
                                  labels,
                                  adv_inputs,
                                  batch_size=self.batch_size),
        )

        return adv_inputs
Example #10
    def generate(  # type: ignore
        self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate an adversarial patch and return the patch and its mask in arrays.

        :param x: An array with the original input images of shape NCHW or input videos of shape NFCHW.
        :param y: An array with the original true labels.
        :param mask: A boolean array of shape equal to the shape of a single sample (1, H, W) or the shape of `x`
                     (N, H, W) without the channel dimension. Any feature for which the mask is True can be the
                     center location of the patch during sampling.
        :type mask: `np.ndarray`
        :return: An array with adversarial patch and an array of the patch mask.
        """
        import torch  # lgtm [py/repeated-import]

        shuffle = kwargs.get("shuffle", True)
        mask = kwargs.get("mask")
        if mask is not None:
            mask = mask.copy()
        mask = self._check_mask(mask=mask, x=x)

        if self.patch_location is not None and mask is not None:
            raise ValueError("Masks can only be used if the `patch_location` is `None`.")

        if y is None:  # pragma: no cover
            logger.info("Setting labels to estimator predictions and running untargeted attack because `y=None`.")
            y = to_categorical(np.argmax(self.estimator.predict(x=x), axis=1), nb_classes=self.estimator.nb_classes)

        if hasattr(self.estimator, "nb_classes"):
            y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

            # check if logits or probabilities
            y_pred = self.estimator.predict(x=x[[0]])

            if is_probability(y_pred):
                self.use_logits = False
            else:
                self.use_logits = True

        if isinstance(y, np.ndarray):
            x_tensor = torch.Tensor(x)
            y_tensor = torch.Tensor(y)

            if mask is None:
                dataset = torch.utils.data.TensorDataset(x_tensor, y_tensor)
                data_loader = torch.utils.data.DataLoader(
                    dataset=dataset,
                    batch_size=self.batch_size,
                    shuffle=shuffle,
                    drop_last=False,
                )
            else:
                mask_tensor = torch.Tensor(mask)
                dataset = torch.utils.data.TensorDataset(x_tensor, y_tensor, mask_tensor)
                data_loader = torch.utils.data.DataLoader(
                    dataset=dataset,
                    batch_size=self.batch_size,
                    shuffle=shuffle,
                    drop_last=False,
                )
        else:

            class ObjectDetectionDataset(torch.utils.data.Dataset):
                """
                Object detection dataset in PyTorch.
                """

                def __init__(self, x, y):
                    self.x = x
                    self.y = y

                def __len__(self):
                    return self.x.shape[0]

                def __getitem__(self, idx):
                    img = torch.from_numpy(self.x[idx])

                    target = {}
                    target["boxes"] = torch.from_numpy(self.y[idx]["boxes"])
                    target["labels"] = torch.from_numpy(self.y[idx]["labels"])
                    target["scores"] = torch.from_numpy(self.y[idx]["scores"])

                    return img, target

            class ObjectDetectionDatasetMask(torch.utils.data.Dataset):
                """
                Object detection dataset in PyTorch.
                """

                def __init__(self, x, y, mask):
                    self.x = x
                    self.y = y
                    self.mask = mask

                def __len__(self):
                    return self.x.shape[0]

                def __getitem__(self, idx):
                    img = torch.from_numpy(self.x[idx])

                    target = {}
                    target["boxes"] = torch.from_numpy(y[idx]["boxes"])
                    target["labels"] = torch.from_numpy(y[idx]["labels"])
                    target["scores"] = torch.from_numpy(y[idx]["scores"])
                    mask_i = torch.from_numpy(self.mask[idx])

                    return img, target, mask_i

            dataset_object_detection: Union[ObjectDetectionDataset, ObjectDetectionDatasetMask]
            if mask is None:
                dataset_object_detection = ObjectDetectionDataset(x, y)
            else:
                dataset_object_detection = ObjectDetectionDatasetMask(x, y, mask)

            data_loader = torch.utils.data.DataLoader(
                dataset=dataset_object_detection,
                batch_size=self.batch_size,
                shuffle=shuffle,
                drop_last=False,
            )

        for i_iter in trange(self.max_iter, desc="Adversarial Patch PyTorch", disable=not self.verbose):
            if mask is None:
                for images, target in data_loader:
                    images = images.to(self.estimator.device)
                    if isinstance(target, torch.Tensor):
                        target = target.to(self.estimator.device)
                    else:
                        target["boxes"] = target["boxes"].to(self.estimator.device)
                        target["labels"] = target["labels"].to(self.estimator.device)
                        target["scores"] = target["scores"].to(self.estimator.device)
                    _ = self._train_step(images=images, target=target, mask=None)
            else:
                for images, target, mask_i in data_loader:
                    images = images.to(self.estimator.device)
                    if isinstance(target, torch.Tensor):
                        target = target.to(self.estimator.device)
                    else:
                        target["boxes"] = target["boxes"].to(self.estimator.device)
                        target["labels"] = target["labels"].to(self.estimator.device)
                        target["scores"] = target["scores"].to(self.estimator.device)
                    mask_i = mask_i.to(self.estimator.device)
                    _ = self._train_step(images=images, target=target, mask=mask_i)

            # Write summary
            if self.summary_writer is not None:  # pragma: no cover
                x_patched = (
                    self._random_overlay(
                        images=torch.from_numpy(x).to(self.estimator.device), patch=self._patch, mask=mask
                    )
                    .detach()
                    .cpu()
                    .numpy()
                )

                self.summary_writer.update(
                    batch_id=0,
                    global_step=i_iter,
                    grad=None,
                    patch=self._patch,
                    estimator=self.estimator,
                    x=x_patched,
                    y=y,
                    targeted=self.targeted,
                )

        if self.summary_writer is not None:
            self.summary_writer.reset()

        return (
            self._patch.detach().cpu().numpy(),
            self._get_circular_patch_mask(nb_samples=1).cpu().numpy()[0],
        )
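A hedged usage sketch for the PyTorch patch generator above; the constructor arguments and the objects `classifier`, `x_train` and `y_train` are assumptions, with `x_train` assumed to hold NCHW images and `y_train` one-hot labels.

from art.attacks.evasion import AdversarialPatchPyTorch  # assumed import path

# Assumed objects: `classifier` (PyTorchClassifier), `x_train`, `y_train`.
attack = AdversarialPatchPyTorch(classifier, rotation_max=22.5, scale_min=0.1,
                                 scale_max=1.0, learning_rate=5.0, max_iter=500,
                                 batch_size=16, patch_shape=(3, 32, 32))
patch, patch_mask = attack.generate(x=x_train, y=y_train)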
Example #11
    def __init__(
        self,
        estimator: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
        norm: Union[int, float, str] = np.inf,
        eps: float = 0.3,
        eps_step: float = 0.1,
        max_iter: int = 100,
        targeted: bool = False,
        nb_random_init: int = 5,
        batch_size: int = 32,
        loss_type: Optional[str] = None,
    ):
        """
        Create a :class:`.AutoProjectedGradientDescent` instance.

        :param estimator: A trained estimator.
        :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2.
        :param eps: Maximum perturbation that the attacker can introduce.
        :param eps_step: Attack step size (input variation) at each iteration.
        :param max_iter: The maximum number of iterations.
        :param targeted: Indicates whether the attack is targeted (True) or untargeted (False).
        :param nb_random_init: Number of random initialisations within the epsilon ball. For nb_random_init=0 the
            attack starts at the original input.
        :param batch_size: Size of the batch on which adversarial samples are generated.
        """
        from art.estimators.classification import TensorFlowClassifier, TensorFlowV2Classifier, PyTorchClassifier

        if isinstance(estimator, TensorFlowClassifier):
            import tensorflow as tf

            if loss_type == "cross_entropy":
                if is_probability(
                        estimator.predict(x=np.ones(
                            shape=(1, *estimator.input_shape)))):
                    raise NotImplementedError(
                        "Cross-entropy loss is not implemented for probability output."
                    )
                else:
                    self._loss_object = tf.reduce_mean(
                        tf.keras.losses.categorical_crossentropy(
                            y_pred=estimator._output,
                            y_true=estimator._labels_ph,
                            from_logits=True))

                    def loss_fn(y_true, y_pred):
                        y_pred_norm = y_pred - np.amax(
                            y_pred, axis=1, keepdims=True)
                        loss_value = -(y_true * y_pred_norm - np.log(
                            np.sum(np.exp(y_pred_norm), axis=1,
                                   keepdims=True)))
                        return np.mean(loss_value)

                    self._loss_fn = loss_fn
            elif loss_type == "difference_logits_ratio":
                if is_probability(
                        estimator.predict(x=np.ones(
                            shape=(1, *estimator.input_shape)))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
                        "the estimator has to to predict logits.")
                else:

                    def difference_logits_ratio(y_true, y_pred):
                        i_y_true = tf.cast(
                            tf.math.argmax(tf.cast(y_true, tf.int32), axis=1),
                            tf.int32)
                        i_y_pred_arg = tf.argsort(y_pred, axis=1)
                        i_z_i = tf.where(i_y_pred_arg[:, -1] != i_y_true[:],
                                         i_y_pred_arg[:, -2], i_y_pred_arg[:,
                                                                           -1])

                        z_1 = tf.gather(y_pred,
                                        i_y_pred_arg[:, -1],
                                        axis=1,
                                        batch_dims=0)
                        z_3 = tf.gather(y_pred,
                                        i_y_pred_arg[:, -3],
                                        axis=1,
                                        batch_dims=0)
                        z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                        z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                        z_1 = tf.linalg.diag_part(z_1)
                        z_3 = tf.linalg.diag_part(z_3)
                        z_i = tf.linalg.diag_part(z_i)
                        z_y = tf.linalg.diag_part(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return tf.reduce_mean(dlr)

                    def loss_fn(y_true, y_pred):
                        i_y_true = np.argmax(y_true, axis=1)
                        i_y_pred_arg = np.argsort(y_pred, axis=1)
                        i_z_i = np.where(i_y_pred_arg[:, -1] != i_y_true[:],
                                         i_y_pred_arg[:, -1], i_y_pred_arg[:,
                                                                           -2])

                        z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                        z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                        z_i = y_pred[:, i_z_i]
                        z_y = y_pred[:, i_y_true]

                        z_1 = np.diag(z_1)
                        z_3 = np.diag(z_3)
                        z_i = np.diag(z_i)
                        z_y = np.diag(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return np.mean(dlr)

                    self._loss_fn = loss_fn
                    self._loss_object = difference_logits_ratio(
                        y_true=estimator._labels_ph, y_pred=estimator._output)
            elif loss_type is None:
                self._loss_object = estimator._loss_object
            else:
                raise ValueError(
                    "The argument loss_type has an invalid value. The following options for loss_type are "
                    "supported: {}".format(
                        [None, "cross_entropy", "difference_logits_ratio"]))

            estimator_apgd = TensorFlowClassifier(
                input_ph=estimator._input_ph,
                output=estimator._output,
                labels_ph=estimator._labels_ph,
                train=estimator._train,
                loss=self._loss_object,
                learning=estimator._learning,
                sess=estimator._sess,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
                feed_dict=estimator._feed_dict,
            )

        elif isinstance(estimator, TensorFlowV2Classifier):
            import tensorflow as tf

            if loss_type == "cross_entropy":
                if is_probability(
                        estimator.predict(x=np.ones(
                            shape=(1, *estimator.input_shape)))):
                    self._loss_object = tf.keras.losses.CategoricalCrossentropy(
                        from_logits=False)
                    self._loss_fn = self._loss_object
                else:
                    self._loss_object = tf.keras.losses.CategoricalCrossentropy(
                        from_logits=True)
                    self._loss_fn = self._loss_object
            elif loss_type == "difference_logits_ratio":
                if is_probability(
                        estimator.predict(x=np.ones(
                            shape=(1, *estimator.input_shape)))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
                        "the estimator has to to predict logits.")
                else:

                    def difference_logits_ratio(y_true, y_pred):
                        i_y_true = tf.cast(
                            tf.math.argmax(tf.cast(y_true, tf.int32), axis=1),
                            tf.int32)
                        i_y_pred_arg = tf.argsort(y_pred, axis=1)
                        i_z_i_list = list()

                        for i in range(y_true.shape[0]):
                            if i_y_pred_arg[i, -1] != i_y_true[i]:
                                i_z_i_list.append(i_y_pred_arg[i, -1])
                            else:
                                i_z_i_list.append(i_y_pred_arg[i, -2])

                        i_z_i = tf.stack(i_z_i_list)

                        z_1 = tf.gather(y_pred,
                                        i_y_pred_arg[:, -1],
                                        axis=1,
                                        batch_dims=0)
                        z_3 = tf.gather(y_pred,
                                        i_y_pred_arg[:, -3],
                                        axis=1,
                                        batch_dims=0)
                        z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                        z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                        z_1 = tf.linalg.diag_part(z_1)
                        z_3 = tf.linalg.diag_part(z_3)
                        z_i = tf.linalg.diag_part(z_i)
                        z_y = tf.linalg.diag_part(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return tf.reduce_mean(dlr)

                    self._loss_fn = difference_logits_ratio
                    self._loss_object = difference_logits_ratio
            elif loss_type is None:
                self._loss_object = estimator._loss_object
            else:
                raise ValueError(
                    "The argument loss_type has an invalid value. The following options for loss_type are "
                    "supported: {}".format(
                        [None, "cross_entropy", "difference_logits_ratio"]))

            estimator_apgd = TensorFlowV2Classifier(
                model=estimator.model,
                nb_classes=estimator.nb_classes,
                input_shape=estimator.input_shape,
                loss_object=self._loss_object,
                train_step=estimator._train_step,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
            )
        elif isinstance(estimator, PyTorchClassifier):
            import torch

            if loss_type == "cross_entropy":
                if is_probability(
                        estimator.predict(
                            x=np.ones(shape=(1, *estimator.input_shape),
                                      dtype=np.float32))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. If loss_type='cross_entropy' "
                        "the estimator has to to predict logits.")
                else:

                    def loss_fn(y_true, y_pred):
                        return torch.nn.CrossEntropyLoss()(
                            torch.from_numpy(y_pred),
                            torch.from_numpy(np.argmax(y_true, axis=1)))

                    self._loss_fn = loss_fn
                    self._loss_object = torch.nn.CrossEntropyLoss()
            elif loss_type == "difference_logits_ratio":
                if is_probability(
                        estimator.predict(
                            x=np.ones(shape=(1, *estimator.input_shape),
                                      dtype=ART_NUMPY_DTYPE))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
                        "the estimator has to to predict logits.")
                else:

                    def difference_logits_ratio(y_pred, y_true):  # type: ignore
                        if isinstance(y_true, np.ndarray):
                            y_true = torch.from_numpy(y_true)
                        if isinstance(y_pred, np.ndarray):
                            y_pred = torch.from_numpy(y_pred)

                        y_true = y_true.float()

                        i_y_true = torch.argmax(y_true, axis=1)
                        i_y_pred_arg = torch.argsort(y_pred, axis=1)
                        i_z_i_list = list()

                        for i in range(y_true.shape[0]):
                            if i_y_pred_arg[i, -1] != i_y_true[i]:
                                i_z_i_list.append(i_y_pred_arg[i, -1])
                            else:
                                i_z_i_list.append(i_y_pred_arg[i, -2])

                        i_z_i = torch.stack(i_z_i_list)

                        z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                        z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                        z_i = y_pred[:, i_z_i]
                        z_y = y_pred[:, i_y_true]

                        z_1 = torch.diagonal(z_1)
                        z_3 = torch.diagonal(z_3)
                        z_i = torch.diagonal(z_i)
                        z_y = torch.diagonal(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return torch.mean(dlr.float())

                    self._loss_fn = difference_logits_ratio
                    self._loss_object = difference_logits_ratio
            elif loss_type is None:
                self._loss_object = estimator._loss_object
            else:
                raise ValueError(
                    "The argument loss_type has an invalid value. The following options for loss_type are "
                    "supported: {}".format(
                        [None, "cross_entropy", "difference_logits_ratio"]))

            estimator_apgd = PyTorchClassifier(
                model=estimator.model,
                loss=self._loss_object,
                input_shape=estimator.input_shape,
                nb_classes=estimator.nb_classes,
                optimizer=None,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
                device_type=estimator._device,
            )

        else:
            estimator_apgd = None

        super().__init__(estimator=estimator_apgd)
        self.norm = norm
        self.eps = eps
        self.eps_step = eps_step
        self.max_iter = max_iter
        self.targeted = targeted
        self.nb_random_init = nb_random_init
        self.batch_size = batch_size
        self.loss_type = loss_type
        self._check_params()
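
All three difference_logits_ratio implementations above compute the same quantity, dlr = -(z_y - z_i) / (z_1 - z_3), where z_1 and z_3 are the largest and third-largest logits, z_y is the logit of the true class, and z_i is the largest logit that does not belong to the true class. A minimal NumPy sketch of that formula on a toy batch (the function name and toy values are illustrative and not part of the original code):

import numpy as np

def dlr_loss(y_true, y_pred):
    # y_true: one-hot labels, y_pred: raw logits, both of shape (batch, classes)
    idx = np.arange(len(y_pred))
    i_y_true = np.argmax(y_true, axis=1)
    order = np.argsort(y_pred, axis=1)  # ascending, so order[:, -1] is the predicted class
    z_1 = y_pred[idx, order[:, -1]]
    z_3 = y_pred[idx, order[:, -3]]
    z_y = y_pred[idx, i_y_true]
    # largest logit that does not belong to the true class
    i_z_i = np.where(order[:, -1] != i_y_true, order[:, -1], order[:, -2])
    z_i = y_pred[idx, i_z_i]
    return np.mean(-(z_y - z_i) / (z_1 - z_3))

# toy batch: two samples, four classes
logits = np.array([[2.0, 0.5, -1.0, 0.1], [0.2, 3.0, 1.0, -0.5]])
labels = np.array([[1, 0, 0, 0], [0, 0, 1, 0]])
print(dlr_loss(labels, logits))  # mean DLR over the toy batch
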
    def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate an adversarial patch and return the patch and its mask in arrays.

        :param x: An array with the original input images of shape NHWC or input videos of shape NFHWC.
        :param y: An array with the original true labels.
        :param mask: A boolean array of shape equal to the shape of a single sample (1, H, W) or the shape of `x`
                     (N, H, W) without the channel dimension. Any feature for which the mask is True can be the
                     center location of the patch during sampling.
        :type mask: `np.ndarray`
        :return: An array with the adversarial patch and an array of the patch mask.
        """
        import torch  # lgtm [py/repeated-import]

        shuffle = kwargs.get("shuffle", True)
        mask = kwargs.get("mask")
        if mask is not None:
            mask = mask.copy()
        mask = self._check_mask(mask=mask, x=x)

        if y is None:
            logger.info("Setting labels to estimator predictions and running untargeted attack because `y=None`.")
            y = to_categorical(np.argmax(self.estimator.predict(x=x), axis=1), nb_classes=self.estimator.nb_classes)
            self.targeted = False
        else:
            self.targeted = True

        y = check_and_transform_label_format(labels=y, nb_classes=self.estimator.nb_classes)

        # check if logits or probabilities
        y_pred = self.estimator.predict(x=x[[0]])

        if is_probability(y_pred):
            self.use_logits = False
        else:
            self.use_logits = True

        x_tensor = torch.Tensor(x)
        y_tensor = torch.Tensor(y)

        if mask is None:
            dataset = torch.utils.data.TensorDataset(x_tensor, y_tensor)
            data_loader = torch.utils.data.DataLoader(
                dataset=dataset,
                batch_size=self.batch_size,
                shuffle=shuffle,
                drop_last=False,
            )
        else:
            mask_tensor = torch.Tensor(mask)
            dataset = torch.utils.data.TensorDataset(x_tensor, y_tensor, mask_tensor)
            data_loader = torch.utils.data.DataLoader(
                dataset=dataset,
                batch_size=self.batch_size,
                shuffle=shuffle,
                drop_last=False,
            )

        for i_iter in trange(self.max_iter, desc="Adversarial Patch PyTorch", disable=not self.verbose):
            if mask is None:
                for images, target in data_loader:
                    _ = self._train_step(images=images, target=target, mask=None)
            else:
                for images, target, mask_i in data_loader:
                    _ = self._train_step(images=images, target=target, mask=mask_i)

            if self.summary_writer is not None:
                self.summary_writer.add_image(
                    "patch",
                    self._patch,
                    global_step=i_iter,
                )

                if hasattr(self.estimator, "compute_losses"):
                    x_patched = self._random_overlay(
                        images=torch.from_numpy(x).to(self.estimator.device), patch=self._patch, mask=mask
                    )
                    losses = self.estimator.compute_losses(x=x_patched, y=torch.from_numpy(y).to(self.estimator.device))

                    for key, value in losses.items():
                        self.summary_writer.add_scalar(
                            "loss/{}".format(key),
                            np.mean(value.detach().cpu().numpy()),
                            global_step=i_iter,
                        )

        return (
            self._patch.detach().cpu().numpy(),
            self._get_circular_patch_mask(nb_samples=1).numpy()[0],
        )
Beispiel #13
0
    def generate(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
        """
        Generate adversarial samples and return them in an array.

        :param x: An array with the original inputs to be attacked.
        :param y: An array with the original labels to be predicted.
        :return: An array holding the adversarial examples.
        """
        x_adv = x.astype(ART_NUMPY_DTYPE)
        preds = self.estimator.predict(x, batch_size=self.batch_size)

        if y is None:
            raise ValueError("Labels `y` cannot be None.")

        if self.estimator.nb_classes == 2 and preds.shape[1] == 1:
            raise ValueError(  # pragma: no cover
                "This attack has not yet been tested for binary classification with a single output classifier."
            )

        if is_probability(preds[0]):
            logger.warning(
                "It seems that the attacked model is predicting probabilities. DeepFool expects logits as model output "
                "to achieve its full attack strength."
            )

        # Determine the class labels for which to compute the gradients
        labels_set = np.arange(self.estimator.nb_classes)
        sorter = np.arange(len(labels_set))

        # Pick a small scalar to avoid division by 0
        tol = 10e-8

        # Compute perturbation with implicit batching
        for batch_id in trange(
            int(np.ceil(x_adv.shape[0] / float(self.batch_size))), desc="TargetedDeepFool_simple", disable=not self.verbose
        ):
            batch_index_1, batch_index_2 = batch_id * self.batch_size, (batch_id + 1) * self.batch_size
            batch = x_adv[batch_index_1:batch_index_2].copy()

            # Get predictions, labels, and gradients for batch
            f_batch = preds[batch_index_1:batch_index_2]
            f_target_y = y[batch_index_1:batch_index_2]
            fk_hat = np.argmax(f_batch, axis=1)
            # Compute gradients for all classes
            grd = self.estimator.class_gradient(batch)

            # Exclude the inputs that have already been classified into their target label
            active_indices = np.where(fk_hat != np.argmax(f_target_y, axis=1))[0]
            target_labels_indices = sorter[np.searchsorted(labels_set, np.argmax(f_target_y, axis=1), sorter=sorter)]
            current_step = 0
            while active_indices.size > 0 and current_step < self.max_iter:
                # Compute difference in predictions and gradients only for selected top predictions
                pred_labels_indices = sorter[np.searchsorted(labels_set, fk_hat, sorter=sorter)]
                grad_diff = grd - grd[np.arange(len(grd)), pred_labels_indices][:, None]
                f_diff = f_batch[:, labels_set] - f_batch[np.arange(len(f_batch)), pred_labels_indices][:, None]

                # Targeted variant: perturb towards the target class directly
                # (untargeted DeepFool would instead pick the closest class by minimising |f_diff| / ||grad_diff||)
                l_var = target_labels_indices
                abs_f_diff = np.abs(f_diff[np.arange(len(f_diff)), l_var])
                grad_diff_target = grad_diff[np.arange(len(grad_diff)), l_var].reshape(len(grad_diff), -1)
                norm_sq = np.linalg.norm(grad_diff_target, axis=1) ** 2 + tol
                r_var = abs_f_diff / norm_sq
                r_var = r_var.reshape((-1,) + (1,) * (len(x.shape) - 1))
                r_var = r_var * grad_diff[np.arange(len(grad_diff)), l_var]

                # Add perturbation and clip result
                if self.estimator.clip_values is not None:
                    batch[active_indices] = np.clip(
                        batch[active_indices]
                        + r_var[active_indices] * (self.estimator.clip_values[1] - self.estimator.clip_values[0]),
                        self.estimator.clip_values[0],
                        self.estimator.clip_values[1],
                    )
                else:
                    batch[active_indices] += r_var[active_indices]

                # Recompute prediction for new x
                f_batch = self.estimator.predict(batch)
                fk_i_hat = np.argmax(f_batch, axis=1)

                # Recompute gradients for new x
                grd = self.estimator.class_gradient(batch)

                # Stop if misclassification has been achieved
                active_indices = np.where(fk_i_hat != np.argmax(f_target_y, axis=1))[0]
                fk_hat = fk_i_hat

                current_step += 1

            # Apply overshoot parameter
            x_adv1 = x_adv[batch_index_1:batch_index_2]
            x_adv2 = (1 + self.epsilon) * (batch - x_adv[batch_index_1:batch_index_2])
            x_adv[batch_index_1:batch_index_2] = x_adv1 + x_adv2
            if self.estimator.clip_values is not None:
                np.clip(
                    x_adv[batch_index_1:batch_index_2],
                    self.estimator.clip_values[0],
                    self.estimator.clip_values[1],
                    out=x_adv[batch_index_1:batch_index_2],
                )

        logger.info(
            "Success rate of TargetedDeepFool(simple) attack: %.2f%%",
            100 * compute_success(self.estimator, x, y, x_adv, targeted=True, batch_size=self.batch_size),
        )
        return x_adv
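
A hedged usage sketch of this targeted DeepFool variant; `classifier` and `attack` are placeholders for a fitted ART classifier with 10 classes and an attack instance exposing the generate() above:

import numpy as np

# classifier = ...  # placeholder: fitted ART classifier
# attack = ...      # placeholder: targeted DeepFool instance wrapping `classifier`
x_test = np.random.rand(16, 28, 28, 1).astype(np.float32)

# pick a target class per sample that differs from the current prediction
preds = np.argmax(classifier.predict(x_test), axis=1)
targets = (preds + 1) % 10
y_target = np.eye(10)[targets]  # one-hot targets; y=None would raise a ValueError above

x_adv = attack.generate(x=x_test, y=y_target)
print("mean L2 perturbation:", np.mean(np.linalg.norm((x_adv - x_test).reshape(len(x_test), -1), axis=1)))
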
def check_softmax(model, images):
    """Return True if every row of the model output is a valid probability vector, i.e. the model ends in a softmax."""
    modelout = model(images).detach().cpu().numpy()
    return all(is_probability(modelout[i, :]) for i in range(modelout.shape[0]))
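
A small sketch of how check_softmax could be exercised with PyTorch models, assuming is_probability is available (e.g. imported from art.utils); the two toy models are illustrative only:

import torch
import torch.nn as nn

# Two toy models over 4-dimensional inputs: one returns raw logits, the other ends in a softmax.
logit_model = nn.Linear(4, 3)
softmax_model = nn.Sequential(nn.Linear(4, 3), nn.Softmax(dim=1))

images = torch.rand(5, 4)

print(check_softmax(logit_model, images))    # usually False: raw logits are not probability vectors
print(check_softmax(softmax_model, images))  # True: every output row lies in [0, 1] and sums to 1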