Esempio n. 1
0
    def _loss_gradient_masking_threshold(
        self,
        perturbation: np.ndarray,
        x: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute the loss gradient of the global masking threshold w.r.t. the PSD approximate of the perturbation.

        The loss is the hinge loss between the frequency masking threshold of the original audio input `x` and the
        normalized power spectral density estimate of the perturbation. Both the masking threshold and the PSD maximum
        are mapped through `10 ** (value * 0.1)` before being handed to the framework-specific gradient computation,
        which cancels out the `10*log` terms and stabilizes the optimization during back-propagation.

        :param perturbation: Adversarial perturbation.
        :param x: An array with the original inputs to be attacked.
        :return: Tuple consisting of the per-sample loss gradients with padding removed (collected in an array of
            dtype `object`) and the loss value.
        :raises NotImplementedError: If the configured framework is neither `"tensorflow"` nor `"pytorch"`.
        """
        # bring perturbation and original input to a common padded length
        delta_padded, delta_mask = pad_sequence_input(perturbation)
        x_padded, _ = pad_sequence_input(x)

        # one (masking threshold, PSD maximum) pair per padded sample, evaluated in batch order
        threshold_psd_pairs = [
            (threshold, maximum)
            for threshold, maximum in (
                self.masker.calculate_threshold_and_psd_maximum(sample) for sample in x_padded
            )
        ]
        thresholds = np.array([pair[0] for pair in threshold_psd_pairs])
        maxima = np.array([pair[1] for pair in threshold_psd_pairs])

        # cancel out the "10*log" term of the masking threshold and of the power spectral density maximum
        thresholds_stabilized = 10 ** (thresholds * 0.1)
        maxima_stabilized = 10 ** (maxima * 0.1)

        framework = self._framework
        if framework == "tensorflow":
            # get loss gradients (TensorFlow): evaluate the pre-built op in the estimator's session
            placeholders = {
                self._delta: delta_padded,
                self._power_spectral_density_maximum_tf: maxima_stabilized,
                self._masking_threshold_tf: thresholds_stabilized,
            }
            gradients_padded, loss = self.estimator._sess.run(
                self._loss_gradient_masking_threshold_op_tf, placeholders
            )
        elif framework == "pytorch":
            # get loss gradients (PyTorch)
            gradients_padded, loss = self._loss_gradient_masking_threshold_torch(
                delta_padded, maxima_stabilized, thresholds_stabilized
            )
        else:
            raise NotImplementedError

        # undo padding, i.e. change gradients shape from (nb_samples, max_length) to (nb_samples)
        # NOTE(review): presumably `delta_mask` flags the non-padded positions, so row sums are the original lengths
        lengths = delta_mask.sum(axis=1)
        unpadded = [padded[:length] for padded, length in zip(gradients_padded, lengths)]

        return np.array(unpadded, dtype=object), loss
Esempio n. 2
0
    def _loss_gradient_masking_threshold(
        self,
        perturbation: np.ndarray,
        x: np.ndarray,
        masking_threshold_stabilized: np.ndarray,
        psd_maximum_stabilized: np.ndarray,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute loss gradient of the global masking threshold w.r.t. the PSD approximate of the perturbation.

        The loss is defined as the hinge loss w.r.t. to the frequency masking threshold of the original audio input `x`
        and the normalized power spectral density estimate of the perturbation. In order to stabilize the optimization
        problem during back-propagation, the `10*log`-terms are canceled out.

        :param perturbation: Adversarial perturbation.
        :param x: An array with the original inputs to be attacked. Within this method it is only inspected to decide
            the dtype of the returned gradients (a 1-dimensional `x` is treated as ragged input).
        :param masking_threshold_stabilized: Stabilized masking threshold for the original input `x`.
        :param psd_maximum_stabilized: Stabilized maximum across frames, i.e. shape is `(batch_size, frame_length)`, of
            the original unnormalized PSD of `x`.
        :return: Tuple consisting of the loss gradient, which has same shape as `perturbation`, and loss value.
        :raises NotImplementedError: If the configured framework is neither `"tensorflow"` nor `"pytorch"`.
        """
        # pad input
        perturbation_padded, delta_mask = pad_sequence_input(perturbation)

        if self._framework == "tensorflow":
            # get loss gradients (TensorFlow)
            feed_dict = {
                self._delta: perturbation_padded,
                self._power_spectral_density_maximum_tf: psd_maximum_stabilized,
                self._masking_threshold_tf: masking_threshold_stabilized,
            }
            # pylint: disable=W0212
            gradients_padded, loss = self.estimator._sess.run(
                self._loss_gradient_masking_threshold_op_tf, feed_dict
            )
        elif self._framework == "pytorch":
            # get loss gradients (PyTorch)
            gradients_padded, loss = self._loss_gradient_masking_threshold_torch(
                perturbation_padded, psd_maximum_stabilized, masking_threshold_stabilized
            )
        else:
            raise NotImplementedError

        # undo padding, i.e. change gradients shape from (nb_samples, max_length) to (nb_samples)
        # NOTE(review): presumably `delta_mask` flags the non-padded positions, so row sums are the original lengths
        lengths = delta_mask.sum(axis=1)
        gradients = [gradient_padded[:length] for gradient_padded, length in zip(gradients_padded, lengths)]

        # for ragged input, use object dtype; the `np.object` alias was deprecated in NumPy 1.20 and removed in
        # NumPy 1.24, so the builtin `object` is used instead (it is what the alias pointed to)
        dtype = np.float32 if x.ndim != 1 else object
        return np.array(gradients, dtype=dtype), loss
    def _stabilized_threshold_and_psd_maximum(self, x: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute the stabilized masking thresholds and PSD maxima for a batch of inputs.

        The inputs are padded, the masker is queried once per padded sample, and every returned value is mapped
        through `10 ** (value * 0.1)`, which cancels out the `10*log` term in the power spectral density maximum and
        the masking threshold.

        :param x: An array with the original inputs to be attacked.
        :return: Tuple consisting of stabilized masking thresholds and PSD maxima.
        """
        x_padded, _ = pad_sequence_input(x)

        # one (masking threshold, PSD maximum) pair per padded sample, evaluated in batch order
        threshold_psd_pairs = [
            (threshold, maximum)
            for threshold, maximum in (
                self.masker.calculate_threshold_and_psd_maximum(sample) for sample in x_padded
            )
        ]
        thresholds = np.array([pair[0] for pair in threshold_psd_pairs])
        psd_maxima = np.array([pair[1] for pair in threshold_psd_pairs])

        # stabilize imperceptible loss by canceling out the "10*log" term
        return 10 ** (thresholds * 0.1), 10 ** (psd_maxima * 0.1)