def main(args):
    """Transcribe one WAV file with a pretrained DeepSpeech model and print it.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``gpus`` (assigned to ``CUDA_VISIBLE_DEVICES``), ``model``
            (forwarded as ``pretrained_model``) and ``input`` (path passed
            to ``wavfile.read``).

    Raises:
        ValueError: If the audio file is not sampled at 16000 Hz.
    """

    # select device here
    # NOTE(review): CUDA_VISIBLE_DEVICES is set after torch.cuda.is_available()
    # has already been called -- confirm the restriction still takes effect.
    if torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        device_type = "gpu"
    else:
        device_type = "cpu"

    deepspeech = PyTorchDeepSpeech(pretrained_model=args.model,
                                   device_type=device_type)

    # load audio
    sample_rate, sound = wavfile.read(args.input)
    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would silently let unsupported audio through.
    if sample_rate != 16000:
        raise ValueError(
            "This module only supports audio with sample rate of 16000 currently."
        )

    # start prediction (a batch containing the single loaded signal)
    transcription = deepspeech.predict(np.array([sound]),
                                       batch_size=1,
                                       transcription_output=True)

    print("output:", transcription)
Beispiel #2
0
def test_pytorch_deep_speech(art_warning, expected_values, use_amp,
                             device_type):
    """Exercise predict, loss_gradient and fit of ``PyTorchDeepSpeech``.

    Args:
        art_warning: Callable invoked with the caught ``ARTTestException``.
        expected_values: Callable returning an indexable collection holding,
            in order: three input signals, expected sizes, two expected
            transcription arrays, expected probabilities and three expected
            gradient prefixes.
        use_amp: Forwarded to the estimator's ``use_amp`` argument.
        device_type: Forwarded to the estimator's ``device_type`` argument.
    """
    # Only import if deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

    try:
        # Load data for testing
        expected_data = expected_values()

        x1 = expected_data[0]
        x2 = expected_data[1]
        x3 = expected_data[2]
        expected_sizes = expected_data[3]
        expected_transcriptions1 = expected_data[4]
        expected_transcriptions2 = expected_data[5]
        expected_probs = expected_data[6]
        expected_gradients = (
            expected_data[7],
            expected_data[8],
            expected_data[9],
        )

        # Create signal data.
        # The three signals have different lengths (the gradient shape checks
        # below expect 1300, 1500 and 1400 samples), so the outer array must be
        # an object array: NumPy >= 1.24 raises ValueError for ragged input
        # unless dtype=object is requested explicitly.
        x = np.array(
            [
                np.array(x1 * 100, dtype=ART_NUMPY_DTYPE),
                np.array(x2 * 100, dtype=ART_NUMPY_DTYPE),
                np.array(x3 * 100, dtype=ART_NUMPY_DTYPE),
            ],
            dtype=object,
        )

        # Create labels
        y = np.array(["SIX", "HI", "GOOD"])

        # Test probability outputs
        speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech",
                                              device_type=device_type,
                                              use_amp=use_amp)
        probs, sizes = speech_recognizer.predict(x, batch_size=2)

        np.testing.assert_array_almost_equal(probs[1][1],
                                             expected_probs,
                                             decimal=3)
        np.testing.assert_array_almost_equal(sizes, expected_sizes)

        # Test transcription outputs
        transcriptions = speech_recognizer.predict(x,
                                                   batch_size=2,
                                                   transcription_output=True)

        assert (expected_transcriptions1 == transcriptions).all()

        # Test transcription outputs, corner case: a single sample with a
        # batch size larger than the number of samples
        transcriptions = speech_recognizer.predict(np.array([x[0]]),
                                                   batch_size=2,
                                                   transcription_output=True)

        assert (expected_transcriptions2 == transcriptions).all()

        # Now test loss gradients
        # Compute gradients
        grads = speech_recognizer.loss_gradient(x, y)

        # Each gradient must have the same length as its input signal
        for idx, expected_shape in enumerate(((1300, ), (1500, ), (1400, ))):
            assert grads[idx].shape == expected_shape

        # Only the first 20 entries are compared; decimal=-2 is a deliberately
        # coarse tolerance
        for idx, expected_gradient in enumerate(expected_gradients):
            np.testing.assert_array_almost_equal(grads[idx][0:20],
                                                 expected_gradient,
                                                 decimal=-2)

        # Now test fit function
        # Create the optimizer
        parameters = speech_recognizer.model.parameters()
        speech_recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)

        # Before train
        transcriptions1 = speech_recognizer.predict(x,
                                                    batch_size=2,
                                                    transcription_output=True)

        # Train the estimator
        speech_recognizer.fit(x=x, y=y, batch_size=2, nb_epochs=5)

        # After train
        transcriptions2 = speech_recognizer.predict(x,
                                                    batch_size=2,
                                                    transcription_output=True)

        # Training is expected to change at least one transcription
        assert not ((transcriptions1 == transcriptions2).all())

    except ARTTestException as e:
        art_warning(e)
Beispiel #3
0
class TestPyTorchDeepSpeech:
    """
    This class tests the PyTorchDeepSpeech estimator.
    """

    @pytest.fixture
    def setup_class(self):
        """Call ``master_seed(seed=1234)`` and store three test signals in ``self.x``.

        The signals are 13-, 15- and 14-value patterns repeated 100 times,
        i.e. 1300, 1500 and 1400 samples long.
        """
        master_seed(seed=1234)

        # Small data for testing
        x1 = np.array(
            [
                -1.0376293e-03,
                -1.0681478e-03,
                -1.0986663e-03,
                -1.1291848e-03,
                -1.1291848e-03,
                -1.1291848e-03,
                -1.1902219e-03,
                -1.1597034e-03,
                -1.1902219e-03,
                -1.1291848e-03,
                -1.1291848e-03,
                -1.0681478e-03,
                -9.1555528e-04,
            ]
            * 100
        )

        x2 = np.array(
            [
                -1.8311106e-04,
                -1.2207404e-04,
                -6.1037019e-05,
                0.0000000e00,
                3.0518509e-05,
                0.0000000e00,
                -3.0518509e-05,
                0.0000000e00,
                0.0000000e00,
                9.1555528e-05,
                2.1362957e-04,
                3.3570360e-04,
                4.2725913e-04,
                4.5777764e-04,
                -1.8311106e-04,
            ]
            * 100
        )

        x3 = np.array(
            [
                -8.2399976e-04,
                -7.0192572e-04,
                -5.4933317e-04,
                -4.2725913e-04,
                -3.6622211e-04,
                -2.7466659e-04,
                -2.1362957e-04,
                5.4933317e-04,
                5.7985168e-04,
                6.1037019e-04,
                6.7140721e-04,
                7.0192572e-04,
                6.7140721e-04,
                -1.5259255e-04,
            ]
            * 100
        )

        # The signals have different lengths, so the batch must be an object
        # array: NumPy >= 1.24 raises ValueError for ragged input unless
        # dtype=object is requested explicitly.
        self.x = np.array([x1, x2, x3], dtype=object)

    def test_all(self, _test_all):
        """Run the parametrized ``_test_all`` fixture, which holds all checks."""
        pass

    @pytest.fixture(params=[False, True])
    def _test_all(self, request, setup_class):
        """Check predict, loss_gradient and fit, without and with AMP.

        ``request.param`` selects the variant: ``True`` builds the estimator
        with ``device_type="gpu"`` and ``use_amp=True``; ``False`` uses the
        estimator defaults.
        """
        # Only import if deep speech module is available
        import torch

        from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

        use_amp = request.param is True

        # Build the estimator for this variant. The instance attributes are
        # still set as before; `recognizer` is a local alias so that the
        # checks below are written only once.
        if use_amp:
            self.speech_recognizer_amp = PyTorchDeepSpeech(
                pretrained_model="librispeech", device_type="gpu", use_amp=True
            )
            recognizer = self.speech_recognizer_amp
        else:
            self.speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
            recognizer = self.speech_recognizer

        # Test probability outputs
        probs, sizes = recognizer.predict(self.x, batch_size=2)

        expected_sizes = np.asarray([5, 5, 5])
        np.testing.assert_array_almost_equal(sizes, expected_sizes)

        expected_probs = np.asarray(
            [
                1.0000000e00,
                7.0154901e-14,
                1.9170589e-13,
                8.2194836e-13,
                8.9967915e-13,
                1.8518193e-12,
                1.7883164e-10,
                1.8951663e-12,
                1.8818237e-13,
                3.2806991e-12,
                3.5664666e-16,
                3.3147299e-14,
                2.3439516e-13,
                8.4845603e-12,
                1.2017718e-13,
                1.1180213e-12,
                6.5572378e-15,
                3.0194697e-12,
                4.9065188e-15,
                1.9765363e-13,
                4.1670646e-11,
                2.6884213e-12,
                1.1436632e-13,
                7.1931783e-15,
                2.8135227e-11,
                4.5599673e-14,
                6.4587983e-13,
                2.4159567e-15,
                4.6668241e-13,
            ]
        )
        np.testing.assert_array_almost_equal(probs[1][1], expected_probs, decimal=3)

        # Test transcription outputs
        transcriptions = recognizer.predict(self.x, batch_size=2, transcription_output=True)

        expected_transcriptions = np.array(["", "", ""])
        assert (expected_transcriptions == transcriptions).all()

        # Test transcription outputs, corner case: a single sample with a
        # batch size larger than the number of samples
        transcriptions = recognizer.predict(
            np.array([self.x[0]]), batch_size=2, transcription_output=True
        )

        expected_transcriptions = np.array([""])
        assert (expected_transcriptions == transcriptions).all()

        # Now test loss gradients
        # Create labels
        y = np.array(["SIX", "HI", "GOOD"])

        # Compute gradients
        grads = recognizer.loss_gradient(self.x, y)

        # Each gradient must have the same length as its input signal
        assert grads[0].shape == (1300,)
        assert grads[1].shape == (1500,)
        assert grads[2].shape == (1400,)

        # Reference values differ between the AMP and non-AMP variants
        if use_amp:
            expected_gradients1 = np.asarray(
                [
                    -3485.7,
                    659.0,
                    -111.7,
                    283.6,
                    1691.9,
                    715.0,
                    1480.4,
                    -3522.3,
                    -4087.9,
                    -8824.2,
                    -304.7,
                    2013.4,
                    -445.1,
                    4125.0,
                    1754.1,
                    -503.6,
                    1160.0,
                    7051.7,
                    -1992.2,
                    350.4,
                ]
            )

        else:
            expected_gradients1 = np.asarray(
                [
                    -3482.77892371,
                    665.64673575,
                    -116.24408896,
                    265.93803869,
                    1667.02236699,
                    688.33557577,
                    1455.14911883,
                    -3524.90476617,
                    -4082.06471587,
                    -8802.39419605,
                    -277.74274789,
                    2034.54679277,
                    -428.53153241,
                    4114.63683848,
                    1722.53840709,
                    -513.68916798,
                    1159.88786568,
                    7072.47761446,
                    -1963.71829047,
                    382.65287411,
                ]
            )
        np.testing.assert_array_almost_equal(grads[0][0:20], expected_gradients1, decimal=0)

        if use_amp:
            expected_gradients2 = np.asarray(
                [
                    20924.5,
                    3046.3,
                    -7872.5,
                    15525.1,
                    -15766.9,
                    -18494.1,
                    19139.6,
                    6446.2,
                    26323.1,
                    4230.0,
                    -31122.4,
                    -2890.9,
                    12936.7,
                    13834.1,
                    17649.9,
                    8866.1,
                    -16454.6,
                    -6953.1,
                    -17899.6,
                    4100.7,
                ]
            )

        else:
            expected_gradients2 = np.asarray(
                [
                    20992.44844133,
                    3048.78701634,
                    -7849.13725934,
                    15557.59663939,
                    -15760.10725159,
                    -18422.9438386,
                    19132.22699435,
                    6508.51437337,
                    26292.5249963,
                    4232.62414548,
                    -31128.82664215,
                    -2894.85284984,
                    13008.74538039,
                    13845.08921681,
                    17657.67725957,
                    8807.42144017,
                    -16477.89414508,
                    -6977.8092622,
                    -17914.22352666,
                    4086.51150059,
                ]
            )
        np.testing.assert_array_almost_equal(grads[1][0:20], expected_gradients2, decimal=0)

        if use_amp:
            expected_gradients3 = np.asarray(
                [
                    -1687.3,
                    6715.0,
                    16448.4,
                    -3848.9,
                    16521.1,
                    -15736.1,
                    -26204.0,
                    -8992.2,
                    9697.9,
                    13999.6,
                    -7595.3,
                    14181.0,
                    -24507.2,
                    5481.9,
                    7166.7,
                    -6182.3,
                    2510.3,
                    -7229.0,
                    -10821.9,
                    -11134.2,
                ]
            )

        else:
            expected_gradients3 = np.asarray(
                [
                    -1693.10472689,
                    6711.39788693,
                    16480.14166546,
                    -3786.95541286,
                    16448.3969823,
                    -15702.45621671,
                    -26162.89260564,
                    -8979.81601681,
                    9657.87483965,
                    13955.78845296,
                    -7552.01438108,
                    14170.60635269,
                    -24434.37243957,
                    5502.81163675,
                    7171.56926943,
                    -6154.06511686,
                    2483.93980406,
                    -7244.24618697,
                    -10798.70438903,
                    -11129.57632319,
                ]
            )
        np.testing.assert_array_almost_equal(grads[2][0:20], expected_gradients3, decimal=0)

        # Now test fit function
        # Create the optimizer
        parameters = recognizer.model.parameters()
        recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)

        # Before train
        transcriptions1 = recognizer.predict(self.x, batch_size=2, transcription_output=True)

        # Train the estimator
        recognizer.fit(x=self.x, y=y, batch_size=2, nb_epochs=5)

        # After train
        transcriptions2 = recognizer.predict(self.x, batch_size=2, transcription_output=True)

        # Training is expected to change at least one transcription
        assert not ((transcriptions1 == transcriptions2).all())
Beispiel #4
0
def test_imperceptible_asr_pytorch(art_warning, expected_values, use_amp,
                                   device_type):
    """Run ``ImperceptibleASRPyTorch`` against a filtered DeepSpeech estimator.

    Checks that the clean inputs transcribe to empty strings and that the
    generated adversarial signals keep their shapes, differ from the inputs
    and have sums that are neither infinite nor zero.

    Args:
        art_warning: Callable invoked with the caught ``ARTTestException``.
        expected_values: Callable returning a mapping with keys ``"x1"``,
            ``"x2"`` and ``"x3"``.
        use_amp: Forwarded to both the estimator and the attack.
        device_type: Forwarded to both the filter and the estimator.
    """
    # Only import if deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
    from art.preprocessing.audio import LFilterPyTorch

    try:
        # Mixed precision is only exercised when a GPU is present; otherwise
        # the test returns early without checking anything
        if use_amp and not torch.cuda.is_available():
            return

        # Load data for testing
        expected_data = expected_values()
        raw_signals = (
            expected_data["x1"],
            expected_data["x2"],
            expected_data["x3"],
        )

        # Create signal data: every raw signal is repeated 200 times
        x = np.array([
            np.array(signal * 200, dtype=ART_NUMPY_DTYPE)
            for signal in raw_signals
        ])

        # Create labels
        y = np.array(["S", "I", "GD"])

        # Create DeepSpeech estimator with an LFilterPyTorch preprocessing defence
        lfilter = LFilterPyTorch(
            numerator_coef=np.array(
                [0.0000001, 0.0000002, -0.0000001, -0.0000002],
                dtype=ART_NUMPY_DTYPE),
            denominator_coef=np.array([1.0, 0.0, 0.0, 0.0],
                                      dtype=ART_NUMPY_DTYPE),
            device_type=device_type,
        )

        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type=device_type,
            use_amp=use_amp,
            preprocessing_defences=lfilter,
        )

        # Create attack
        attack = ImperceptibleASRPyTorch(
            estimator=speech_recognizer,
            eps=0.001,
            max_iter_1=5,
            max_iter_2=5,
            learning_rate_1=0.00001,
            learning_rate_2=0.001,
            optimizer_1=torch.optim.Adam,
            optimizer_2=torch.optim.Adam,
            global_max_length=3200,
            initial_rescale=1.0,
            decrease_factor_eps=0.8,
            num_iter_decrease_eps=5,
            alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=5,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=5,
            win_length=2048,
            hop_length=512,
            n_fft=2048,
            batch_size=2,
            use_amp=use_amp,
            opt_level="O1",
        )

        # Test transcription output on the unperturbed signals
        clean_transcriptions = speech_recognizer.predict(
            x, batch_size=2, transcription_output=True)

        assert (np.array(["", "", ""]) == clean_transcriptions).all()

        # Generate attack
        x_adv = attack.generate(x, y)

        sample_indices = range(3)

        # Test shape
        for idx in sample_indices:
            assert x_adv[idx].shape == x[idx].shape

        # Test content: each adversarial signal must differ from its input
        for idx in sample_indices:
            assert not (x_adv[idx] == x[idx]).all()

        # Sums must not be infinite ...
        for idx in sample_indices:
            assert np.sum(x_adv[idx]) != np.inf

        # ... and must not be zero
        for idx in sample_indices:
            assert np.sum(x_adv[idx]) != 0

    except ARTTestException as e:
        art_warning(e)
Beispiel #5
0
def test_check_params(art_warning):
    """Verify that ``ImperceptibleASRPyTorch`` rejects invalid arguments.

    Each entry of the table below is passed to the constructor on its own
    and must raise ``ValueError``.

    Args:
        art_warning: Callable invoked with the caught ``ARTTestException``.
    """
    try:
        from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
        from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch

        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type="cpu",
            use_amp=False,
            preprocessing_defences=None,
        )

        # One invalid keyword set per constructor call, in the order checked
        invalid_kwargs = (
            {"eps": -1},
            {"max_iter_1": 1.0},
            {"max_iter_1": -1},
            {"max_iter_2": 1.0},
            {"max_iter_2": -1},
            {"learning_rate_1": "1"},
            {"learning_rate_1": -1.0},
            {"learning_rate_2": "1"},
            {"learning_rate_2": -1.0},
            {"global_max_length": 1.0},
            {"global_max_length": -1},
            {"initial_rescale": "1"},
            {"initial_rescale": -1.0},
            {"decrease_factor_eps": "1"},
            {"decrease_factor_eps": -1.0},
            {"num_iter_decrease_eps": 1.0},
            {"num_iter_decrease_eps": -1},
            {"alpha": "1"},
            {"alpha": -1.0},
            {"increase_factor_alpha": "1"},
            {"increase_factor_alpha": -1.0},
            {"num_iter_increase_alpha": 1.0},
            {"num_iter_increase_alpha": -1},
            {"decrease_factor_alpha": "1"},
            {"decrease_factor_alpha": -1.0},
            {"num_iter_decrease_alpha": 1.0},
            {"num_iter_decrease_alpha": -1},
            {"win_length": 1.0},
            {"win_length": -1},
            {"hop_length": 1.0},
            {"hop_length": -1},
            {"n_fft": 1.0},
            {"n_fft": -1},
            # win_length larger than n_fft
            {"win_length": 5, "n_fft": 1},
            {"batch_size": -1},
        )

        for bad_kwargs in invalid_kwargs:
            with pytest.raises(ValueError):
                _ = ImperceptibleASRPyTorch(speech_recognizer, **bad_kwargs)

    except ARTTestException as e:
        art_warning(e)
    def __init__(self, pretrained_model="librispeech",
                 gpus="0",
                 debug=False,
                 **attack_kwargs):
        '''
        Build the target ASR model and the attack that operates on it.

        Args:
            pretrained_model (str) : Which pretrained target model to load. The supported
                                     choices are `an4`, `librispeech` and `tedlium`, named
                                     after the dataset the model was trained with.
            gpus (str) : Value written to `CUDA_VISIBLE_DEVICES` when CUDA is available.
                         Default is "0". Without CUDA the cpu is used instead.
            debug (bool) : whether to print the debug message
            attack_kwargs (dict) : keyword arguments forwarded unchanged to
                                   `ImperceptibleASRPyTorch`. See the list below.

            Args for `attack_kwargs`:
                NOTE: the estimator (PyTorchDeepSpeech) is created here and passed to the
                attack as `estimator=...`, so `estimator` must not appear in `attack_kwargs`.

                initial_eps (float) : Initial maximum perturbation that the attacker can introduce.
                max_iter_1st_stage (int): The maximum number of iterations applied for the first
                                          stage of the optimization of the attack.
                max_iter_2nd_stage (int): The maximum number of iterations applied for the second
                                          stage of the optimization of the attack.
                learning_rate_1st_stage (float) : The initial learning rate applied for the first
                                                  stage of the optimization of the attack.
                learning_rate_2nd_stage (float) : The initial learning rate applied for the second
                                                  stage of the optimization of the attack.
                optimizer_1st_stage: The optimizer applied for the first stage of the optimization
                                     of the attack. If `None` attack will use `torch.optim.SGD`.
                optimizer_2nd_stage: The optimizer applied for the second stage of the optimization
                                     of the attack. If `None` attack will use `torch.optim.SGD`.
                global_max_length (int) : The length of the longest audio signal allowed by this attack.
                initial_rescale (float) : Initial rescale coefficient to speed up the decrease of the
                                          perturbation size during the first stage of the optimization
                                          of the attack.
                rescale_factor (float) : The factor to adjust the rescale coefficient during the first
                                         stage of the optimization of the attack.
                num_iter_adjust_rescale (int) : Number of iterations to adjust the rescale coefficient.
                initial_alpha (float) : The initial value of the alpha coefficient used in the second
                                        stage of the optimization of the attack.
                increase_factor_alpha (float) : The factor to increase the alpha coefficient used in the
                                                second stage of the optimization of the attack.
                num_iter_increase_alpha (int) : Number of iterations to increase alpha.
                decrease_factor_alpha (float) : The factor to decrease the alpha coefficient used in the
                                                second stage of the optimization of the attack.
                num_iter_decrease_alpha (int) : Number of iterations to decrease alpha.
                batch_size (int) : Size of the batch on which adversarial samples are generated.
                use_amp (bool) : Whether to use the automatic mixed precision tool to enable mixed
                                 precision training or gradient computation, e.g. with loss gradient
                                 computation. When set to True, this option is only triggered if
                                 there are GPUs available.
                opt_level (str) : Specify a pure or mixed precision optimization level. Used when
                                  use_amp is True. Accepted values are `O0`, `O1`, `O2`, and `O3`.
        '''

        # keep the raw configuration on the instance
        self.pretrained_model = pretrained_model
        self.gpus = gpus
        self.debug = debug
        self.attack_kwargs = attack_kwargs

        # set gpu device here; fall back to cpu when CUDA is unavailable
        cuda_available = torch.cuda.is_available()
        if cuda_available:
            os.environ["CUDA_VISIBLE_DEVICES"] = self.gpus
        self.device_type = "gpu" if cuda_available else "cpu"

        # TODO : Set up optimizer in `attack_kwargs`

        # initialize target asr model
        self.asr_model = PyTorchDeepSpeech(
            pretrained_model=self.pretrained_model,
            device_type=self.device_type,
        )

        # attack!
        self.asr_attack = ImperceptibleASRPyTorch(
            estimator=self.asr_model,
            **self.attack_kwargs
        )
class AsrAttack:
    '''
    Bundle a target ASR model with an adversarial attack and expose helpers
    for loading audio, saving audio and generating an adversarial example.

    The target model is a `PyTorchDeepSpeech` estimator and the attack is
    `ImperceptibleASRPyTorch`, both from `Trusted-AI/adversarial-robustness-toolbox`.
    Check their github page for more information.

    TODO: Use modified version of the attack module written specifically for audio captcha.
    '''

    # Sample rate (Hz) used when loading audio and when writing WAV files.
    SAMPLE_RATE = 16000

    def __init__(self, pretrained_model="librispeech",
                       gpus="0",
                       debug=False,
                       **attack_kwargs):
        '''
        Build the target model and the attack that wraps it.

        Args:
            pretrained_model (str) : The choice of target model, forwarded to `PyTorchDeepSpeech`.
                                     Currently this attack supports 3 different pretrained models
                                     consisting of `an4`, `librispeech` and `tedlium`, representing
                                     which dataset the model was trained with.
            gpus (str) : value written to the `CUDA_VISIBLE_DEVICES` environment variable when
                         CUDA is available. Default is "0". If gpu is unavailable, cpu is used
                         and the environment is left untouched.
            debug (bool) : whether to print the debug message
            attack_kwargs (dict) : keyword arguments forwarded unchanged to `ImperceptibleASRPyTorch`.
                                   Do NOT pass `estimator` here: this class always supplies it
                                   itself (the model built from `pretrained_model`).

            Args for `attack_kwargs` (descriptions follow the attack's own documentation):
                initial_eps (float) : Initial maximum perturbation that the attacker can introduce.
                max_iter_1st_stage (int): The maximum number of iterations applied for the first
                                          stage of the optimization of the attack.
                max_iter_2nd_stage (int): The maximum number of iterations applied for the second
                                          stage of the optimization of the attack.
                learning_rate_1st_stage (float) : The initial learning rate applied for the first
                                                  stage of the optimization of the attack.
                learning_rate_2nd_stage (float) : The initial learning rate applied for the second
                                                  stage of the optimization of the attack.
                optimizer_1st_stage: The optimizer applied for the first stage of the optimization
                                     of the attack. If `None` attack will use `torch.optim.SGD`.
                optimizer_2nd_stage: The optimizer applied for the second stage of the optimization
                                     of the attack. If `None` attack will use `torch.optim.SGD`.
                global_max_length (int) : The length of the longest audio signal allowed by this attack.
                initial_rescale (float) : Initial rescale coefficient to speedup the decrease of the
                                          perturbation size during the first stage of the optimization of the attack.
                rescale_factor (float) : The factor to adjust the rescale coefficient during the first
                                         stage of the optimization of the attack.
                num_iter_adjust_rescale (int) : Number of iterations to adjust the rescale coefficient.
                initial_alpha (float) : The initial value of the alpha coefficient used in the second
                                        stage of the optimization of the attack.
                increase_factor_alpha (float) : The factor to increase the alpha coefficient used in the second
                                                stage of the optimization of the attack.
                num_iter_increase_alpha (int) : Number of iterations to increase alpha.
                decrease_factor_alpha (float) : The factor to decrease the alpha coefficient used in the second
                                                stage of the optimization of the attack.
                num_iter_decrease_alpha (int) : Number of iterations to decrease alpha.
                batch_size (int) : Size of the batch on which adversarial samples are generated.
                use_amp (bool) : Whether to use the automatic mixed precision tool to enable mixed precision
                                 training or gradient computation, e.g. with loss gradient computation.
                                 When set to True, this option is only triggered if there are GPUs available.
                opt_level (str) : Specify a pure or mixed precision optimization level. Used when use_amp
                                  is True. Accepted values are `O0`, `O1`, `O2`, and `O3`.
        '''

        # Keep the raw configuration around for later inspection.
        self.pretrained_model = pretrained_model
        self.gpus = gpus
        self.debug = debug
        self.attack_kwargs = attack_kwargs

        # Pick the device: restrict the visible GPUs only when CUDA is usable.
        # NOTE(review): the environment variable is written *after* CUDA has been
        # probed; it may be too late to influence device selection - confirm.
        cuda_ready = torch.cuda.is_available()
        if cuda_ready:
            os.environ["CUDA_VISIBLE_DEVICES"] = self.gpus
        self.device_type = "gpu" if cuda_ready else "cpu"

        # TODO : Set up optimizer in `attack_kwargs`

        # Target ASR model under attack.
        self.asr_model = PyTorchDeepSpeech(
            pretrained_model=self.pretrained_model,
            device_type=self.device_type,
        )

        # The attack operates on the model created above.
        self.asr_attack = ImperceptibleASRPyTorch(
            estimator=self.asr_model,
            **self.attack_kwargs
        )

    def load_audio(self, path):
        '''
        Load the audio file at `path` through `librosa.load`, requesting
        `SAMPLE_RATE` as the sample rate.

        It's the same loader used by deepspeech-pytorch.

        Args:
            path (str) : path of the audio file to read.

        Returns:
            The samples returned by `librosa.load`; if that array has more than
            one dimension it is averaged over axis 1 first.
        '''
        waveform = librosa.load(path, sr=AsrAttack.SAMPLE_RATE)[0]

        # NOTE(review): librosa.load presumably down-mixes to mono by default,
        # in which case this branch never runs - confirm before relying on it.
        if waveform.ndim > 1:
            waveform = waveform.mean(axis=1)  # multiple channels, average

        return waveform

    def save_audio(self, path, audio):
        '''
        Write `audio` to `path` as a WAV file at `SAMPLE_RATE`.

        The samples are handed to `wavfile.write` unchanged: no rescaling or
        dtype conversion is performed by this method.

        Args:
            path (str) : destination file path.
            audio : array of samples to write.
        '''
        wavfile.write(path, AsrAttack.SAMPLE_RATE, audio)

    def generate_adv_example(self, input_path, target, output_path):
        '''
        Generate an adversarial example for one audio file and store it.

        The model's transcription of the original audio, the target and the
        transcription of the adversarial audio are always printed; extra
        messages are printed when `debug` is enabled.

        Args:
            input_path (str) : the path of audio being attacked.
            target (str) : target output in capital letter. Ex: "OPEN THE DOOR".
            output_path (str) : the path where targeted audio is stored.
        '''
        verbose = self.debug
        waveform = self.load_audio(input_path)

        # Transcription of the untouched audio (single-item batch).
        clean_batch = np.array([waveform])
        prediction = self.asr_model.predict(clean_batch, batch_size=1, transcription_output=True)
        if verbose:
            print('input path:', input_path)
            print('original prediction:', prediction)
            print('target:', target)

        # Run the attack on a fresh single-item batch towards `target`.
        attack_batch = np.array([waveform])
        target_batch = np.array([target])
        adv_audio = self.asr_attack.generate(attack_batch, target_batch, batch_size=1)

        # Transcribe the adversarial audio and report all three results.
        adv_transcriptions = self.asr_model.predict(adv_audio, batch_size=1, transcription_output=True)
        report = (
            ("Groundtruth transcriptions: ", prediction),
            ("Target      transcriptions: ", target),
            ("Adversarial transcriptions: ", adv_transcriptions),
        )
        for label, value in report:
            print(label, value)

        # Persist the first (and only) adversarial sample.
        self.save_audio(output_path, adv_audio[0])
        if verbose:
            print('Generated audio stored at:', output_path)
def test_imperceptible_asr_pytorch(art_warning, expected_values, use_amp,
                                   device_type):
    """
    Smoke-test `ImperceptibleASRPyTorch` on a `PyTorchDeepSpeech` estimator
    that has an `LFilterPyTorch` preprocessing defence attached.

    Checks that the estimator transcribes the three test signals as empty
    strings and that every adversarial sample keeps the shape of its input.
    Any `ARTTestException` is forwarded to `art_warning`.
    """
    # Only import if deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
    from art.defences.preprocessor import LFilterPyTorch

    try:
        # The first three entries of the expected data are the raw signals.
        fixture = expected_values()
        raw_signals = [fixture[0], fixture[1], fixture[2]]

        # Scale each signal by 100 and cast it to ART's numpy dtype.
        x = np.array([
            np.array(signal * 100, dtype=ART_NUMPY_DTYPE)
            for signal in raw_signals
        ])

        # Target labels, one per signal.
        y = np.array(["S", "I", "GD"])

        # Preprocessing defence: a linear filter with the coefficients below.
        audio_filter = LFilterPyTorch(
            numerator_coef=np.array([0.0000001, 0.0000002, -0.0000001, -0.0000002]),
            denominator_coef=np.array([1.0, 0.0, 0.0, 0.0]),
            device_type=device_type,
        )

        # DeepSpeech estimator with the filter attached.
        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type=device_type,
            use_amp=use_amp,
            preprocessing_defences=audio_filter,
        )

        # Attack configuration (few iterations to keep the test short).
        attack_params = {
            "initial_eps": 0.001,
            "max_iter_1st_stage": 5,
            "max_iter_2nd_stage": 5,
            "learning_rate_1st_stage": 0.00001,
            "learning_rate_2nd_stage": 0.001,
            "optimizer_1st_stage": torch.optim.SGD,
            "optimizer_2nd_stage": torch.optim.SGD,
            "global_max_length": 2000,
            "initial_rescale": 1.0,
            "rescale_factor": 0.8,
            "num_iter_adjust_rescale": 5,
            "initial_alpha": 0.01,
            "increase_factor_alpha": 1.2,
            "num_iter_increase_alpha": 5,
            "decrease_factor_alpha": 0.8,
            "num_iter_decrease_alpha": 5,
            "batch_size": 2,
            "use_amp": use_amp,
            "opt_level": "O1",
        }
        asr_attack = ImperceptibleASRPyTorch(estimator=speech_recognizer, **attack_params)

        # The estimator is expected to transcribe all three signals as "".
        transcriptions_preprocessing = speech_recognizer.predict(
            x, batch_size=2, transcription_output=True)
        expected_transcriptions = np.array(["", "", ""])
        assert (expected_transcriptions == transcriptions_preprocessing).all()

        # Run the attack.
        x_adv_preprocessing = asr_attack.generate(x, y)

        # Every adversarial sample must keep the shape of its source signal.
        for idx in range(3):
            assert x_adv_preprocessing[idx].shape == x[idx].shape

    except ARTTestException as err:
        art_warning(err)
    def _test_all(self, request, setup_class):
        """
        Run the imperceptible ASR attack once, with or without mixed precision.

        `request.param` selects the variant: `False` builds a default
        estimator and an attack with `use_amp=False`; anything else builds a
        GPU estimator and an attack with `use_amp=True`. The method then checks
        that `self.x` is transcribed as empty strings and that every
        adversarial sample keeps the shape of its input.
        """
        # Only import if deep speech module is available
        from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
        from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPytorch

        # NOTE(review): `torch` is not imported in this method; presumably it
        # is available from the enclosing module - confirm.
        amp_enabled = request.param is not False

        # Create DeepSpeech estimator; the two variants differ only in the
        # estimator arguments and in the iteration counts used below.
        if amp_enabled:
            # With amp
            speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech", device_type="gpu", use_amp=True)
            schedule_iters = 2
        else:
            # Without amp
            speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
            schedule_iters = 5

        # Create attack
        asr_attack = ImperceptibleASRPytorch(
            estimator=speech_recognizer,
            initial_eps=0.001,
            max_iter_1st_stage=50,
            max_iter_2nd_stage=50,
            learning_rate_1st_stage=0.00001,
            learning_rate_2nd_stage=0.001,
            optimizer_1st_stage=torch.optim.SGD,
            optimizer_2nd_stage=torch.optim.SGD,
            global_max_length=2000,
            initial_rescale=1.0,
            rescale_factor=0.8,
            num_iter_adjust_rescale=schedule_iters,
            initial_alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=schedule_iters,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=schedule_iters,
            batch_size=2,
            use_amp=amp_enabled,
            opt_level="O1",
            loss_scale=1,
        )

        # Test transcription output: all inputs are expected to decode to "".
        transcriptions = speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)
        expected_transcriptions = np.array(["", "", ""])
        assert (expected_transcriptions == transcriptions).all()

        # Generate attack
        x_adv = asr_attack.generate(self.x, self.y)

        # Test shape: each adversarial sample matches its source signal.
        for i in range(3):
            assert x_adv[i].shape == self.x[i].shape