Example 1
def test_pytorch_deep_speech(art_warning, expected_values, use_amp,
                             device_type):
    """End-to-end check of the PyTorchDeepSpeech estimator: probability and
    transcription predictions, loss gradients, and a short training run.

    :param art_warning: Fixture that records an ARTTestException as a warning.
    :param expected_values: Fixture returning the reference signals and
        expected outputs (indexed 0-9 as unpacked below).
    :param use_amp: Whether to enable automatic mixed precision.
    :param device_type: Device to run the estimator on ("cpu"/"gpu").
    """
    # Only import if deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

    try:
        # Unpack the reference data driving the assertions below.
        (sig1, sig2, sig3, expected_sizes,
         expected_transcriptions1, expected_transcriptions2,
         expected_probs, expected_gradients1,
         expected_gradients2, expected_gradients3) = expected_values()[:10]

        # Amplify the raw signals and stack them into the test batch.
        x = np.array([
            np.array(sig * 100, dtype=ART_NUMPY_DTYPE)
            for sig in (sig1, sig2, sig3)
        ])

        # Ground-truth transcriptions for the three signals.
        y = np.array(["SIX", "HI", "GOOD"])

        # Build the estimator under test.
        speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech",
                                              device_type=device_type,
                                              use_amp=use_amp)

        # Probability outputs: spot-check one frame plus the output sizes.
        probs, sizes = speech_recognizer.predict(x, batch_size=2)
        np.testing.assert_array_almost_equal(probs[1][1],
                                             expected_probs,
                                             decimal=3)
        np.testing.assert_array_almost_equal(sizes, expected_sizes)

        # Transcription outputs over the full batch.
        transcriptions = speech_recognizer.predict(x,
                                                   batch_size=2,
                                                   transcription_output=True)
        assert (expected_transcriptions1 == transcriptions).all()

        # Corner case: a single-sample batch with batch_size larger than it.
        transcriptions = speech_recognizer.predict(np.array([x[0]]),
                                                   batch_size=2,
                                                   transcription_output=True)
        assert (expected_transcriptions2 == transcriptions).all()

        # Loss gradients: shapes match the signal lengths, and the leading
        # 20 values agree with the references to within ~100 (decimal=-2).
        grads = speech_recognizer.loss_gradient(x, y)

        for grad, length in zip(grads, (1300, 1500, 1400)):
            assert grad.shape == (length,)

        for grad, reference in zip(grads, (expected_gradients1,
                                           expected_gradients2,
                                           expected_gradients3)):
            np.testing.assert_array_almost_equal(grad[0:20],
                                                 reference,
                                                 decimal=-2)

        # Fit: attach an optimizer, then verify that a short training run
        # changes at least one predicted transcription.
        speech_recognizer._optimizer = torch.optim.SGD(
            speech_recognizer.model.parameters(), lr=0.01
        )

        transcriptions1 = speech_recognizer.predict(x,
                                                    batch_size=2,
                                                    transcription_output=True)

        speech_recognizer.fit(x=x, y=y, batch_size=2, nb_epochs=5)

        transcriptions2 = speech_recognizer.predict(x,
                                                    batch_size=2,
                                                    transcription_output=True)

        assert not ((transcriptions1 == transcriptions2).all())

    except ARTTestException as e:
        art_warning(e)
Example 2
class TestPyTorchDeepSpeech:
    """
    This class tests the PyTorchDeepSpeech estimator.
    """

    @pytest.fixture
    def setup_class(self):
        """Seed the RNGs and build three short test waveforms on ``self.x``.

        The base patterns are tiled 100x, giving signals of length 1300,
        1500 and 1400 samples respectively (matching the gradient-shape
        assertions in ``_test_all``).
        """
        master_seed(seed=1234)

        # Small data for testing
        # 13-sample pattern * 100 -> 1300 samples
        x1 = np.array(
            [
                -1.0376293e-03,
                -1.0681478e-03,
                -1.0986663e-03,
                -1.1291848e-03,
                -1.1291848e-03,
                -1.1291848e-03,
                -1.1902219e-03,
                -1.1597034e-03,
                -1.1902219e-03,
                -1.1291848e-03,
                -1.1291848e-03,
                -1.0681478e-03,
                -9.1555528e-04,
            ]
            * 100
        )

        # 15-sample pattern * 100 -> 1500 samples
        x2 = np.array(
            [
                -1.8311106e-04,
                -1.2207404e-04,
                -6.1037019e-05,
                0.0000000e00,
                3.0518509e-05,
                0.0000000e00,
                -3.0518509e-05,
                0.0000000e00,
                0.0000000e00,
                9.1555528e-05,
                2.1362957e-04,
                3.3570360e-04,
                4.2725913e-04,
                4.5777764e-04,
                -1.8311106e-04,
            ]
            * 100
        )

        # 14-sample pattern * 100 -> 1400 samples
        x3 = np.array(
            [
                -8.2399976e-04,
                -7.0192572e-04,
                -5.4933317e-04,
                -4.2725913e-04,
                -3.6622211e-04,
                -2.7466659e-04,
                -2.1362957e-04,
                5.4933317e-04,
                5.7985168e-04,
                6.1037019e-04,
                6.7140721e-04,
                7.0192572e-04,
                6.7140721e-04,
                -1.5259255e-04,
            ]
            * 100
        )

        # NOTE(review): the three signals have different lengths, so this
        # stacks into a ragged (object-dtype) array — confirm against the
        # installed numpy version's behavior for ragged input.
        self.x = np.array([x1, x2, x3])

    def test_all(self, _test_all):
        """All assertions run inside the ``_test_all`` fixture; requesting it
        here is what triggers them (once per fixture parameter)."""
        pass

    @pytest.fixture(params=[False, True])
    def _test_all(self, request, setup_class):
        """Run the full estimator check, parametrized over mixed precision:
        ``request.param`` True builds a GPU estimator with AMP enabled,
        False builds the default estimator."""
        # Only import if deep speech module is available
        import torch

        from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

        # Test probability outputs
        if request.param is True:
            self.speech_recognizer_amp = PyTorchDeepSpeech(
                pretrained_model="librispeech", device_type="gpu", use_amp=True
            )
            probs, sizes = self.speech_recognizer_amp.predict(self.x, batch_size=2)

        else:
            self.speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
            probs, sizes = self.speech_recognizer.predict(self.x, batch_size=2)

        expected_sizes = np.asarray([5, 5, 5])
        np.testing.assert_array_almost_equal(sizes, expected_sizes)

        # Reference probabilities for probs[1][1] (second sample, second
        # output step); checked only to 3 decimals.
        expected_probs = np.asarray(
            [
                1.0000000e00,
                7.0154901e-14,
                1.9170589e-13,
                8.2194836e-13,
                8.9967915e-13,
                1.8518193e-12,
                1.7883164e-10,
                1.8951663e-12,
                1.8818237e-13,
                3.2806991e-12,
                3.5664666e-16,
                3.3147299e-14,
                2.3439516e-13,
                8.4845603e-12,
                1.2017718e-13,
                1.1180213e-12,
                6.5572378e-15,
                3.0194697e-12,
                4.9065188e-15,
                1.9765363e-13,
                4.1670646e-11,
                2.6884213e-12,
                1.1436632e-13,
                7.1931783e-15,
                2.8135227e-11,
                4.5599673e-14,
                6.4587983e-13,
                2.4159567e-15,
                4.6668241e-13,
            ]
        )
        np.testing.assert_array_almost_equal(probs[1][1], expected_probs, decimal=3)

        # Test transcription outputs
        if request.param is True:
            transcriptions = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)
        else:
            transcriptions = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

        # The synthetic signals decode to empty transcriptions.
        expected_transcriptions = np.array(["", "", ""])
        assert (expected_transcriptions == transcriptions).all()

        # Test transcription outputs, corner case
        # (single-sample batch with batch_size larger than the batch)
        if request.param is True:
            transcriptions = self.speech_recognizer_amp.predict(
                np.array([self.x[0]]), batch_size=2, transcription_output=True
            )
        else:
            transcriptions = self.speech_recognizer.predict(
                np.array([self.x[0]]), batch_size=2, transcription_output=True
            )

        expected_transcriptions = np.array([""])
        assert (expected_transcriptions == transcriptions).all()

        # Now test loss gradients
        # Create labels
        y = np.array(["SIX", "HI", "GOOD"])

        # Compute gradients
        if request.param is True:
            grads = self.speech_recognizer_amp.loss_gradient(self.x, y)
        else:
            grads = self.speech_recognizer.loss_gradient(self.x, y)

        # One gradient per input sample, same length as the signal.
        assert grads[0].shape == (1300,)
        assert grads[1].shape == (1500,)
        assert grads[2].shape == (1400,)

        # Reference gradients differ slightly between the AMP and the
        # full-precision paths, hence the per-branch expected values.
        if request.param is True:
            expected_gradients1 = np.asarray(
                [
                    -3485.7,
                    659.0,
                    -111.7,
                    283.6,
                    1691.9,
                    715.0,
                    1480.4,
                    -3522.3,
                    -4087.9,
                    -8824.2,
                    -304.7,
                    2013.4,
                    -445.1,
                    4125.0,
                    1754.1,
                    -503.6,
                    1160.0,
                    7051.7,
                    -1992.2,
                    350.4,
                ]
            )

        else:
            expected_gradients1 = np.asarray(
                [
                    -3482.77892371,
                    665.64673575,
                    -116.24408896,
                    265.93803869,
                    1667.02236699,
                    688.33557577,
                    1455.14911883,
                    -3524.90476617,
                    -4082.06471587,
                    -8802.39419605,
                    -277.74274789,
                    2034.54679277,
                    -428.53153241,
                    4114.63683848,
                    1722.53840709,
                    -513.68916798,
                    1159.88786568,
                    7072.47761446,
                    -1963.71829047,
                    382.65287411,
                ]
            )
        np.testing.assert_array_almost_equal(grads[0][0:20], expected_gradients1, decimal=0)

        if request.param is True:
            expected_gradients2 = np.asarray(
                [
                    20924.5,
                    3046.3,
                    -7872.5,
                    15525.1,
                    -15766.9,
                    -18494.1,
                    19139.6,
                    6446.2,
                    26323.1,
                    4230.0,
                    -31122.4,
                    -2890.9,
                    12936.7,
                    13834.1,
                    17649.9,
                    8866.1,
                    -16454.6,
                    -6953.1,
                    -17899.6,
                    4100.7,
                ]
            )

        else:
            expected_gradients2 = np.asarray(
                [
                    20992.44844133,
                    3048.78701634,
                    -7849.13725934,
                    15557.59663939,
                    -15760.10725159,
                    -18422.9438386,
                    19132.22699435,
                    6508.51437337,
                    26292.5249963,
                    4232.62414548,
                    -31128.82664215,
                    -2894.85284984,
                    13008.74538039,
                    13845.08921681,
                    17657.67725957,
                    8807.42144017,
                    -16477.89414508,
                    -6977.8092622,
                    -17914.22352666,
                    4086.51150059,
                ]
            )
        np.testing.assert_array_almost_equal(grads[1][0:20], expected_gradients2, decimal=0)

        if request.param is True:
            expected_gradients3 = np.asarray(
                [
                    -1687.3,
                    6715.0,
                    16448.4,
                    -3848.9,
                    16521.1,
                    -15736.1,
                    -26204.0,
                    -8992.2,
                    9697.9,
                    13999.6,
                    -7595.3,
                    14181.0,
                    -24507.2,
                    5481.9,
                    7166.7,
                    -6182.3,
                    2510.3,
                    -7229.0,
                    -10821.9,
                    -11134.2,
                ]
            )

        else:
            expected_gradients3 = np.asarray(
                [
                    -1693.10472689,
                    6711.39788693,
                    16480.14166546,
                    -3786.95541286,
                    16448.3969823,
                    -15702.45621671,
                    -26162.89260564,
                    -8979.81601681,
                    9657.87483965,
                    13955.78845296,
                    -7552.01438108,
                    14170.60635269,
                    -24434.37243957,
                    5502.81163675,
                    7171.56926943,
                    -6154.06511686,
                    2483.93980406,
                    -7244.24618697,
                    -10798.70438903,
                    -11129.57632319,
                ]
            )
        np.testing.assert_array_almost_equal(grads[2][0:20], expected_gradients3, decimal=0)

        # Now test fit function: a short training run must change at least
        # one predicted transcription.
        if request.param is True:
            # Create the optimizer
            parameters = self.speech_recognizer_amp.model.parameters()
            self.speech_recognizer_amp._optimizer = torch.optim.SGD(parameters, lr=0.01)

            # Before train
            transcriptions1 = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)

            # Train the estimator
            self.speech_recognizer_amp.fit(x=self.x, y=y, batch_size=2, nb_epochs=5)

            # After train
            transcriptions2 = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)

            assert not ((transcriptions1 == transcriptions2).all())

        else:
            # Create the optimizer
            parameters = self.speech_recognizer.model.parameters()
            self.speech_recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)

            # Before train
            transcriptions1 = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

            # Train the estimator
            self.speech_recognizer.fit(x=self.x, y=y, batch_size=2, nb_epochs=5)

            # After train
            transcriptions2 = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

            assert not ((transcriptions1 == transcriptions2).all())