Code example #1
def test_imperceptible_asr_pytorch(art_warning, expected_values, use_amp,
                                   device_type):
    # Deferred imports: these resolve only when deepspeech_pytorch is installed.
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
    from art.preprocessing.audio import LFilterPyTorch

    try:
        # Mixed-precision runs require a GPU; bail out quietly otherwise.
        if use_amp and not torch.cuda.is_available():
            return

        # Assemble the input batch from the three fixture signals, amplified x200.
        expected_data = expected_values()
        x = np.array([
            np.array(expected_data[key] * 200, dtype=ART_NUMPY_DTYPE)
            for key in ("x1", "x2", "x3")
        ])

        # Target transcriptions for the attack.
        y = np.array(["S", "I", "GD"])

        # Near-zero filter defence: it effectively silences the input, which
        # is why the predictions below are expected to be empty strings.
        numerator_coef = np.array(
            [0.0000001, 0.0000002, -0.0000001, -0.0000002],
            dtype=ART_NUMPY_DTYPE)
        denominator_coef = np.array([1.0, 0.0, 0.0, 0.0],
                                    dtype=ART_NUMPY_DTYPE)
        audio_filter = LFilterPyTorch(numerator_coef=numerator_coef,
                                      denominator_coef=denominator_coef,
                                      device_type=device_type)

        # DeepSpeech estimator with the silencing preprocessing defence attached.
        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type=device_type,
            use_amp=use_amp,
            preprocessing_defences=audio_filter,
        )

        # Two-stage imperceptible ASR attack, with tiny iteration counts so
        # the test stays fast.
        asr_attack = ImperceptibleASRPyTorch(
            estimator=speech_recognizer,
            eps=0.001,
            max_iter_1=5,
            max_iter_2=5,
            learning_rate_1=0.00001,
            learning_rate_2=0.001,
            optimizer_1=torch.optim.Adam,
            optimizer_2=torch.optim.Adam,
            global_max_length=3200,
            initial_rescale=1.0,
            decrease_factor_eps=0.8,
            num_iter_decrease_eps=5,
            alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=5,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=5,
            win_length=2048,
            hop_length=512,
            n_fft=2048,
            batch_size=2,
            use_amp=use_amp,
            opt_level="O1",
        )

        # The silencing defence should make every transcription empty.
        transcriptions_preprocessing = speech_recognizer.predict(
            x, batch_size=2, transcription_output=True)

        expected_transcriptions = np.array(["", "", ""])

        assert (expected_transcriptions == transcriptions_preprocessing).all()

        # Run the attack.
        x_adv_preprocessing = asr_attack.generate(x, y)

        # Per-sample sanity checks: shape preserved, signal actually
        # perturbed, and sums finite and non-degenerate.
        for adv, clean in zip(x_adv_preprocessing, x):
            assert adv.shape == clean.shape
            assert not (adv == clean).all()
            assert np.sum(adv) != np.inf
            assert np.sum(adv) != 0

    except ARTTestException as e:
        art_warning(e)
Code example #2
def test_check_params(art_warning):
    try:
        from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
        from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch

        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type="cpu",
            use_amp=False,
            preprocessing_defences=None,
        )

        # Each entry is a kwargs dict that must make the attack constructor
        # raise ValueError: either the wrong type (float where an int is
        # required, str where a float is required) or an out-of-range value.
        invalid_param_sets = [
            {"eps": -1},
            {"max_iter_1": 1.0},
            {"max_iter_1": -1},
            {"max_iter_2": 1.0},
            {"max_iter_2": -1},
            {"learning_rate_1": "1"},
            {"learning_rate_1": -1.0},
            {"learning_rate_2": "1"},
            {"learning_rate_2": -1.0},
            {"global_max_length": 1.0},
            {"global_max_length": -1},
            {"initial_rescale": "1"},
            {"initial_rescale": -1.0},
            {"decrease_factor_eps": "1"},
            {"decrease_factor_eps": -1.0},
            {"num_iter_decrease_eps": 1.0},
            {"num_iter_decrease_eps": -1},
            {"alpha": "1"},
            {"alpha": -1.0},
            {"increase_factor_alpha": "1"},
            {"increase_factor_alpha": -1.0},
            {"num_iter_increase_alpha": 1.0},
            {"num_iter_increase_alpha": -1},
            {"decrease_factor_alpha": "1"},
            {"decrease_factor_alpha": -1.0},
            {"num_iter_decrease_alpha": 1.0},
            {"num_iter_decrease_alpha": -1},
            {"win_length": 1.0},
            {"win_length": -1},
            {"hop_length": 1.0},
            {"hop_length": -1},
            {"n_fft": 1.0},
            {"n_fft": -1},
            {"win_length": 5, "n_fft": 1},  # n_fft must cover the window length
            {"batch_size": -1},
        ]

        for bad_kwargs in invalid_param_sets:
            with pytest.raises(ValueError):
                _ = ImperceptibleASRPyTorch(speech_recognizer, **bad_kwargs)

    except ARTTestException as e:
        art_warning(e)
Code example #3
class AsrAttack():
    '''
    Configuration holder and driver for a targeted adversarial attack
    against a DeepSpeech speech-recognition model.

    The attack implementation comes from `Trusted-AI/adversarial-robustness-toolbox`
    (ART). Check their github page for more information.

    TODO: Use modified version of the attack module written specifically for audio captcha.
    '''

    # Sample rate (Hz) used for both loading and saving audio.
    SAMPLE_RATE = 16000

    def __init__(self, pretrained_model="librispeech", 
                       gpus="0",
                       debug=False, 
                       **attack_kwargs):
        '''
        Create a class `.AsrAttack` instance: select a device, load the
        pretrained DeepSpeech target model, and build the ART
        `ImperceptibleASRPyTorch` attack.

        Args:
            pretrained_model (str) : The choice of target model. Currently this attack supports
                                     3 different pretrained models consisting of `an4`, `librispeech`
                                     and `tedlium`, representing which dataset the model was trained with.
            gpus (str) : assign specific gpu to use. Default is "0".
                         If gpu is unavailable, use cpu instead.
            debug (bool) : whether to print the debug message
            attack_kwargs (dict) : keyword arguments forwarded verbatim to
                                   `ImperceptibleASRPyTorch`. Read the documentation below.

            NOTE(review): the kwarg names documented here (`initial_eps`,
            `max_iter_1st_stage`, ...) match an older ART release; newer
            releases use `eps`, `max_iter_1`, etc. — verify against the
            installed ART version.

            Args for `attack_kwargs`:
                estimator (PyTorchDeepSpeech) : A trained estimator.
                initial_eps (float) : Initial maximum perturbation that the attacker can introduce.
                max_iter_1st_stage (int): The maximum number of iterations applied for the first
                                          stage of the optimization of the attack.
                max_iter_2nd_stage (int): The maximum number of iterations applied for the second
                                          stage of the optimization of the attack.
                learning_rate_1st_stage (float) : The initial learning rate applied for the first
                                                  stage of the optimization of the attack.
                learning_rate_2nd_stage (float) : The initial learning rate applied for the second
                                                  stage of the optimization of the attack.
                optimizer_1st_stage: The optimizer applied for the first stage of the optimization
                                     of the attack. If `None` attack will use `torch.optim.SGD`.
                optimizer_2nd_stage: The optimizer applied for the second stage of the optimization
                                     of the attack. If `None` attack will use `torch.optim.SGD`.
                global_max_length (int) : The length of the longest audio signal allowed by this attack.
                initial_rescale (float) : Initial rescale coefficient to speedup the decrease of the
                                          perturbation size during the first stage of the optimization of the attack.
                rescale_factor (float) : The factor to adjust the rescale coefficient during the first
                                         stage of the optimization of the attack.
                num_iter_adjust_rescale (int) : Number of iterations to adjust the rescale coefficient.
                initial_alpha (float) : The initial value of the alpha coefficient used in the second
                                        stage of the optimization of the attack.
                increase_factor_alpha (float) : The factor to increase the alpha coefficient used in the second
                                                stage of the optimization of the attack.
                num_iter_increase_alpha (int) : Number of iterations to increase alpha.
                decrease_factor_alpha (float) : The factor to decrease the alpha coefficient used in the second stage of the
                                                optimization of the attack.
                num_iter_decrease_alpha (int) : Number of iterations to decrease alpha.
                batch_size (int) : Size of the batch on which adversarial samples are generated.
                use_amp (bool) : Whether to use the automatic mixed precision tool to enable mixed precision training or
                                 gradient computation, e.g. with loss gradient computation. When set to True, this option is
                                 only triggered if there are GPUs available.
                opt_level (str) : Specify a pure or mixed precision optimization level. Used when use_amp is True. Accepted
                                  values are `O0`, `O1`, `O2`, and `O3`.
        '''

        self.pretrained_model = pretrained_model
        self.gpus = gpus
        self.debug = debug
        self.attack_kwargs = attack_kwargs

        # set gpu device here
        # NOTE(review): torch.cuda.is_available() initializes the CUDA state,
        # so setting CUDA_VISIBLE_DEVICES afterwards may have no effect on
        # device selection in this process — confirm.
        if torch.cuda.is_available():
            os.environ["CUDA_VISIBLE_DEVICES"] = self.gpus
            self.device_type = "gpu"
        else:
            self.device_type = "cpu"

        # TODO : Set up optimizer in `attack_kwargs`

        # initialize target asr model
        self.asr_model = PyTorchDeepSpeech(pretrained_model=self.pretrained_model,
                                           device_type=self.device_type)

        # attack!
        self.asr_attack = ImperceptibleASRPyTorch(estimator=self.asr_model, **self.attack_kwargs)

    
    def load_audio(self, path):
        '''
        Load an audio file and return it as a 1-D mono signal sampled at
        SAMPLE_RATE. It's the same loader used by deepspeech-pytorch.
        '''
        sound, _ = librosa.load(path, sr=AsrAttack.SAMPLE_RATE)
        if len(sound.shape) > 1:
            sound = sound.mean(axis=1)  # multiple channels, average

        return sound


    def save_audio(self, path, audio):
        '''
        Save audio file. Will be rescaled in 16-bits integer.
        '''

        wavfile.write(path, AsrAttack.SAMPLE_RATE, audio)


    def generate_adv_example(self, input_path, target, output_path):
        '''
        Generate an adversarial example and write it to disk.

        Args:
            input_path (str) : the path of audio being attacked.
            target (str) : target output in capital letter. Ex: "OPEN THE DOOR".
            output_path (str) : the path where targeted audio is stored.
        '''
        
        # Transcribe the clean audio first so it can be compared against
        # the adversarial transcription below.
        audio = self.load_audio(input_path)
        prediction = self.asr_model.predict(np.array([audio]), batch_size=1, transcription_output=True)
        if self.debug:
            print('input path:', input_path)
            print('original prediction:', prediction)
            print('target:', target)

        # start generating adv example
        adv_audio = self.asr_attack.generate(np.array([audio]), np.array([target]), batch_size=1)

        # check the transcription of targeted audio
        adv_transcriptions = self.asr_model.predict(adv_audio, batch_size=1, transcription_output=True)
        print("Groundtruth transcriptions: ", prediction)
        print("Target      transcriptions: ", target)
        print("Adversarial transcriptions: ", adv_transcriptions)

        # save adv audio
        self.save_audio(output_path, adv_audio[0])
        if self.debug:
            print('Generated audio stored at:', output_path)
Code example #4
    def __init__(self, pretrained_model="librispeech", 
                       gpus="0",
                       debug=False, 
                       **attack_kwargs):
        '''
        Create a class `.AsrAttack` instance: choose a device, load the
        pretrained DeepSpeech target model, and build the ART
        `ImperceptibleASRPyTorch` attack object.

        Args:
            pretrained_model (str) : Which pretrained target model to attack.
                                     One of `an4`, `librispeech` or `tedlium`,
                                     naming the dataset the model was trained on.
            gpus (str) : GPU index string assigned to CUDA_VISIBLE_DEVICES.
                         Default is "0". Falls back to CPU when no GPU is available.
            debug (bool) : whether to print the debug message
            attack_kwargs (dict) : keyword arguments forwarded verbatim to
                `ImperceptibleASRPyTorch`: `estimator`, `initial_eps`,
                `max_iter_1st_stage`, `max_iter_2nd_stage`,
                `learning_rate_1st_stage`, `learning_rate_2nd_stage`,
                `optimizer_1st_stage`, `optimizer_2nd_stage`,
                `global_max_length`, `initial_rescale`, `rescale_factor`,
                `num_iter_adjust_rescale`, `initial_alpha`,
                `increase_factor_alpha`, `num_iter_increase_alpha`,
                `decrease_factor_alpha`, `num_iter_decrease_alpha`,
                `batch_size`, `use_amp`, `opt_level`. See the ART
                `ImperceptibleASRPyTorch` documentation for their meaning.
        '''

        self.pretrained_model = pretrained_model
        self.gpus = gpus
        self.debug = debug
        self.attack_kwargs = attack_kwargs

        # Prefer the GPU when available, restricting visibility to the
        # requested device index; otherwise fall back to CPU.
        cuda_available = torch.cuda.is_available()
        if cuda_available:
            os.environ["CUDA_VISIBLE_DEVICES"] = self.gpus
        self.device_type = "gpu" if cuda_available else "cpu"

        # TODO : Set up optimizer in `attack_kwargs`

        # Target ASR model under attack.
        self.asr_model = PyTorchDeepSpeech(pretrained_model=self.pretrained_model,
                                           device_type=self.device_type)

        # The attack object itself; behavior is configured via attack_kwargs.
        self.asr_attack = ImperceptibleASRPyTorch(estimator=self.asr_model, **self.attack_kwargs)
def test_imperceptible_asr_pytorch(art_warning, expected_values, use_amp,
                                   device_type):
    # Deferred imports: these resolve only when deepspeech_pytorch is installed.
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
    from art.defences.preprocessor import LFilterPyTorch

    try:
        # Assemble the input batch from the three fixture signals, amplified x100.
        expected_data = expected_values()
        x = np.array([
            np.array(expected_data[i] * 100, dtype=ART_NUMPY_DTYPE)
            for i in range(3)
        ])

        # Target transcriptions for the attack.
        y = np.array(["S", "I", "GD"])

        # Near-zero filter defence: it effectively silences the input, which
        # is why the predictions below are expected to be empty strings.
        numerator_coef = np.array(
            [0.0000001, 0.0000002, -0.0000001, -0.0000002])
        denominator_coef = np.array([1.0, 0.0, 0.0, 0.0])
        audio_filter = LFilterPyTorch(numerator_coef=numerator_coef,
                                      denominator_coef=denominator_coef,
                                      device_type=device_type)

        # DeepSpeech estimator with the silencing preprocessing defence attached.
        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type=device_type,
            use_amp=use_amp,
            preprocessing_defences=audio_filter,
        )

        # Two-stage imperceptible ASR attack (older ART kwarg names), with
        # tiny iteration counts so the test stays fast.
        asr_attack = ImperceptibleASRPyTorch(
            estimator=speech_recognizer,
            initial_eps=0.001,
            max_iter_1st_stage=5,
            max_iter_2nd_stage=5,
            learning_rate_1st_stage=0.00001,
            learning_rate_2nd_stage=0.001,
            optimizer_1st_stage=torch.optim.SGD,
            optimizer_2nd_stage=torch.optim.SGD,
            global_max_length=2000,
            initial_rescale=1.0,
            rescale_factor=0.8,
            num_iter_adjust_rescale=5,
            initial_alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=5,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=5,
            batch_size=2,
            use_amp=use_amp,
            opt_level="O1",
        )

        # The silencing defence should make every transcription empty.
        transcriptions_preprocessing = speech_recognizer.predict(
            x, batch_size=2, transcription_output=True)

        expected_transcriptions = np.array(["", "", ""])

        assert (expected_transcriptions == transcriptions_preprocessing).all()

        # Run the attack and check that each sample keeps its shape.
        x_adv_preprocessing = asr_attack.generate(x, y)
        for adv, clean in zip(x_adv_preprocessing, x):
            assert adv.shape == clean.shape

    except ARTTestException as e:
        art_warning(e)