Ejemplo n.º 1
0
def test_vad_is_triggered(mock_class):
    """Check that one frame flags the context as speech when both delays are 0.

    Args:
        mock_class: not used in the body; presumably injected by a patch
            decorator that is outside this snippet (TODO confirm).
    """
    ctx = SpeechContext()
    settings = {
        "sample_rate": 16000,
        "frame_width": 10,
        "vad_rise_delay": 0,
        "vad_fall_delay": 0,
    }
    vad = VoiceActivityDetector(**settings)

    # 160 int16 samples == one 10 ms frame at 16 kHz
    vad(ctx, np.zeros(160, dtype=np.int16))
    assert ctx.is_speech

    vad.close()
Ejemplo n.º 2
0
def test_vad_rise_delay():
    """Check that speech is only reported on the third voiced frame.

    The detector is built with 10 ms frames and a 30 ms rise delay; the
    first two voiced frames must leave the context untriggered.
    """
    ctx = SpeechContext()
    vad = VoiceActivityDetector(frame_width=10, vad_rise_delay=30)

    # frames one and two: still inside the rise delay
    for _ in range(2):
        vad(ctx, voice_frame())
        assert not ctx.is_speech

    # frame three: the delay has elapsed
    vad(ctx, voice_frame())
    assert ctx.is_speech

    vad.close()
Ejemplo n.º 3
0
def test_vad_is_triggered():
    """Check that silence leaves the context idle and voice triggers it."""
    ctx = SpeechContext()
    vad = VoiceActivityDetector(frame_width=10)

    # a silent frame must not be reported as speech
    vad(ctx, silence_frame())
    assert not ctx.is_speech

    # a voiced frame must flip the flag
    vad(ctx, voice_frame())
    assert ctx.is_speech

    vad.close()
Ejemplo n.º 4
0
def test_vad_rise_delay(mock_class):
    """Check that a 30 ms rise delay postpones speech until the third frame.

    Args:
        mock_class: not used in the body; presumably injected by a patch
            decorator that is outside this snippet (TODO confirm).
    """
    ctx = SpeechContext()
    vad = VoiceActivityDetector(
        sample_rate=16000,
        frame_width=10,
        vad_rise_delay=30,
        vad_fall_delay=0,
    )

    # the first two 10 ms frames fall inside the rise delay
    for _ in range(2):
        vad(ctx, np.zeros(160, dtype=np.int16))
        assert not ctx.is_speech

    # the third frame is the first one allowed to trigger
    vad(ctx, np.zeros(160, dtype=np.int16))
    assert ctx.is_speech

    vad.close()
Ejemplo n.º 5
0
    def create(spokestack_id: str,
               spokestack_secret: str,
               sample_rate: int = 16000,
               frame_width: int = 20,
               **kwargs) -> SpeechPipeline:
        """Assemble a voice-activity-triggered pipeline with cloud recognition.

        The pipeline reads from a PyAudioInput source and runs, in order,
        VoiceActivityDetector, VoiceActivityTrigger, ActivationTimeout and
        CloudSpeechRecognizer. Extra keyword arguments are forwarded
        unchanged to the input source, the detector, the timeout and the
        recognizer.

        Args:
            spokestack_id (str): Spokestack API id.
            spokestack_secret (str): Spokestack API secret.
            sample_rate (int): sample rate of the audio (Hz).
            frame_width (int): width of the audio frame: 10, 20, or 30 (ms).
            **kwargs: additional options passed through to the components.

        Returns:
            SpeechPipeline instance with profile configuration.

        """
        # Components are built in the same order the pipeline consumes them.
        audio_source = PyAudioInput(sample_rate=sample_rate,
                                    frame_width=frame_width,
                                    **kwargs)
        detector = VoiceActivityDetector(sample_rate=sample_rate,
                                         frame_width=frame_width,
                                         **kwargs)
        trigger = VoiceActivityTrigger()
        timeout = ActivationTimeout(frame_width=frame_width, **kwargs)
        recognizer = CloudSpeechRecognizer(spokestack_id=spokestack_id,
                                           spokestack_secret=spokestack_secret,
                                           sample_rate=sample_rate,
                                           frame_width=frame_width,
                                           **kwargs)
        return SpeechPipeline(
            input_source=audio_source,
            stages=[detector, trigger, timeout, recognizer],
        )
Ejemplo n.º 6
0
def test_invalid_dtype():
    """Check that the detector rejects a frame that is not int16 audio.

    ``np.random.rand`` yields float64 samples; passing such a frame to the
    detector is expected to raise.
    """
    context = SpeechContext()
    detector = VoiceActivityDetector()

    bad_frame = np.random.rand(160)
    try:
        with pytest.raises(Exception):
            detector(context, bad_frame)
    finally:
        # Release the detector even when the expectation fails, matching
        # the other tests that close the detector they create.
        detector.close()
Ejemplo n.º 7
0
def test_vad_fall_untriggered():
    """Check that speech persists through trailing silence, then clears.

    After one voiced frame the context must stay in speech for ten silent
    frames and drop out of speech on the eleventh.
    """
    ctx = SpeechContext()
    vad = VoiceActivityDetector(frame_width=10, vad_fall_delay=20)
    speech, quiet = voice_frame(), silence_frame()

    # trigger the detector
    vad(ctx, speech)
    assert ctx.is_speech

    # silence does not end the speech state right away
    for _ in range(10):
        vad(ctx, quiet)
        assert ctx.is_speech

    # one more silent frame finally releases it
    vad(ctx, quiet)
    assert not ctx.is_speech

    vad.close()
Ejemplo n.º 8
0
def test_vad_fall_untriggered():
    """Check that a 20 ms fall delay keeps speech alive for one silent frame.

    ``webrtcvad.Vad.is_speech`` is patched so the frame contents do not
    matter: the first call reports voice, the next two report silence.
    """
    ctx = SpeechContext()
    vad = VoiceActivityDetector(
        sample_rate=16000,
        frame_width=10,
        vad_rise_delay=0,
        vad_fall_delay=20,
    )
    target = "webrtcvad.Vad.is_speech"

    # voiced frame: triggers immediately because the rise delay is 0
    with patch(target, return_value=True):
        vad(ctx, np.zeros(160, dtype=np.int16))
        assert ctx.is_speech

    with patch(target, return_value=False):
        # first silent frame: still within the fall delay
        vad(ctx, np.zeros(160, dtype=np.int16))
        assert ctx.is_speech

        # second silent frame: the speech state is released
        vad(ctx, np.zeros(160, dtype=np.int16))
        assert not ctx.is_speech

    vad.close()
Ejemplo n.º 9
0
    def create(
        spokestack_id: str,
        spokestack_secret: str,
        sample_rate: int = 16000,
        frame_width: int = 20,
        model_dir: str = "",
        **kwargs: Any,
    ) -> SpeechPipeline:
        """Create a wakeword-activated speech pipeline with cloud recognition.

        The pipeline reads from a PyAudioInput source and runs, in order,
        AutomaticGainControl, AutomaticNoiseSuppression,
        VoiceActivityDetector, WakewordTrigger, ActivationTimeout and
        CloudSpeechRecognizer. Extra keyword arguments are forwarded to the
        input source, detector, wakeword trigger, timeout and recognizer.

        NOTE(review): the recognizer is not given ``sample_rate`` or
        ``frame_width`` here, so it relies on whatever it defaults to;
        confirm that is intended for non-default sample rates.

        Args:
            spokestack_id (str): Spokestack API id.
            spokestack_secret (str): Spokestack API secret.
            sample_rate (int): sample rate of the audio (Hz).
            frame_width (int): width of the audio frame: 10, 20, or 30 (ms).
            model_dir (str): Directory containing the tflite wakeword models.
            **kwargs (Any): additional options passed through to components.

        Returns:
            SpeechPipeline: the assembled pipeline.

        """
        # Components are built in the same order the pipeline consumes them.
        audio_source = PyAudioInput(
            frame_width=frame_width, sample_rate=sample_rate, **kwargs
        )
        gain_control = AutomaticGainControl(
            sample_rate=sample_rate, frame_width=frame_width
        )
        noise_suppression = AutomaticNoiseSuppression(sample_rate=sample_rate)
        detector = VoiceActivityDetector(
            frame_width=frame_width, sample_rate=sample_rate, **kwargs
        )
        wakeword = WakewordTrigger(model_dir=model_dir, **kwargs)
        timeout = ActivationTimeout(frame_width=frame_width, **kwargs)
        recognizer = CloudSpeechRecognizer(
            spokestack_secret=spokestack_secret,
            spokestack_id=spokestack_id,
            **kwargs,
        )
        return SpeechPipeline(
            input_source=audio_source,
            stages=[
                gain_control,
                noise_suppression,
                detector,
                wakeword,
                timeout,
                recognizer,
            ],
        )
Ejemplo n.º 10
0
def main():
    """Run the demo: build the pipeline, hook up handlers, greet and listen.

    On each recognition the pipeline is paused, the transcript is handed to
    the dialogue manager, the answer is synthesized, and the pipeline is
    resumed.
    """
    audio_source = PyAudioInput(
        frame_width=20, sample_rate=16000, exception_on_overflow=False
    )
    stages = [
        VoiceActivityDetector(),
        WakewordTrigger(pre_emphasis=0.97, model_dir="tflite"),
        GoogleSpeechRecognizer(GOOGLE_CREDS),
        ActivationTimeout(),
    ]
    pipeline = SpeechPipeline(audio_source, stages)

    dialogue_manager = DialogueManager(
        "tflite",
        "distilbert-base-cased-distilled-squad",
    )
    tts_client = TextToSpeechClient(KEY_ID, KEY_SECRET)
    manager = TextToSpeechManager(tts_client, PyAudioOutput())

    # Handler names are kept as-is since the pipeline may register by name.
    @pipeline.event
    def on_activate(context):
        print(context.is_active)

    @pipeline.event
    def on_recognize(context):
        # stop listening while the answer is computed and spoken
        pipeline.pause()
        reply = dialogue_manager(context.transcript)
        manager.synthesize(reply, "text", "demo-male")
        pipeline.resume()

    @pipeline.event
    def on_deactivate(context):
        print(context.is_active)

    greeting = dialogue_manager.greet()
    manager.synthesize(greeting, "text", "demo-male")
    pipeline.start()
    pipeline.run()
Ejemplo n.º 11
0
def main():
    """Run the demo: build the pipeline, hook up handlers, welcome and listen.

    On each recognition the pipeline is paused, the transcript is classified
    by the NLU model, any dialogue response is synthesized, and the pipeline
    is resumed. An ``AMAZON.StopIntent`` result stops the pipeline.
    """
    audio_source = PyAudioInput(
        frame_width=20,
        sample_rate=16000,
        exception_on_overflow=False,
    )
    stages = [
        VoiceActivityDetector(),
        WakewordTrigger(pre_emphasis=0.97, model_dir="tflite"),
        CloudSpeechRecognizer(spokestack_id=KEY_ID, spokestack_secret=KEY_SECRET),
        ActivationTimeout(),
    ]
    pipeline = SpeechPipeline(audio_source, stages)

    nlu = TFLiteNLU("tflite")
    dialogue_manager = DialogueManager()
    tts_client = TextToSpeechClient(KEY_ID, KEY_SECRET)
    manager = TextToSpeechManager(tts_client, PyAudioOutput())

    # Handler names are kept as-is since the pipeline may register by name.
    @pipeline.event
    def on_activate(context):
        print("active")

    @pipeline.event
    def on_recognize(context):
        # stop listening while the reply is computed and spoken
        pipeline.pause()
        parsed = nlu(context.transcript)
        reply = dialogue_manager(parsed)
        if reply:
            manager.synthesize(reply, "text", "demo-male")
        pipeline.resume()

        # the stop intent shuts the pipeline down after resuming
        if parsed.intent == "AMAZON.StopIntent":
            pipeline.stop()

    manager.synthesize(Response.WELCOME.value, "text", "demo-male")
    pipeline.start()
    pipeline.run()
Ejemplo n.º 12
0
    def create(classes: List[str],
               model_dir: str,
               sample_rate: int = 16000,
               frame_width: int = 20,
               **kwargs: Any) -> SpeechPipeline:
        """Create a voice-activity-triggered keyword recognition pipeline.

        The pipeline reads from a PyAudioInput source and runs, in order,
        AutomaticGainControl, AutomaticNoiseSuppression,
        VoiceActivityDetector, VoiceActivityTrigger, KeywordRecognizer and
        ActivationTimeout. Extra keyword arguments are forwarded to the
        input source, detector, keyword recognizer and timeout.

        Args:
            classes (List[str]): Classes for the keyword model to recognize.
            model_dir (str): Directory containing the tflite keyword models.
            sample_rate (int): sample rate of the audio (Hz).
            frame_width (int): width of the audio frame: 10, 20, or 30 (ms).
            **kwargs (Any): additional options passed through to components.

        Returns:
            SpeechPipeline: the assembled pipeline.

        """
        # Components are built in the same order the pipeline consumes them.
        audio_source = PyAudioInput(
            frame_width=frame_width, sample_rate=sample_rate, **kwargs
        )
        gain_control = AutomaticGainControl(
            sample_rate=sample_rate, frame_width=frame_width
        )
        noise_suppression = AutomaticNoiseSuppression(sample_rate=sample_rate)
        detector = VoiceActivityDetector(
            sample_rate=sample_rate, frame_width=frame_width, **kwargs
        )
        trigger = VoiceActivityTrigger()
        keyword = KeywordRecognizer(
            classes=classes,
            model_dir=model_dir,
            sample_rate=sample_rate,
            **kwargs,
        )
        timeout = ActivationTimeout(frame_width=frame_width, **kwargs)
        return SpeechPipeline(
            input_source=audio_source,
            stages=[
                gain_control,
                noise_suppression,
                detector,
                trigger,
                keyword,
                timeout,
            ],
        )
Ejemplo n.º 13
0
def test_invalid_sample_rate():
    """Check that building a detector with a 9000 Hz rate raises ValueError."""
    unsupported_rate = 9000
    with pytest.raises(ValueError):
        VoiceActivityDetector(sample_rate=unsupported_rate)
Ejemplo n.º 14
0
def test_invalid_frame_width():
    """Check that building a detector with a 30 ms frame raises ValueError."""
    unsupported_width = 30
    with pytest.raises(ValueError):
        VoiceActivityDetector(frame_width=unsupported_width)