def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Classify one audio frame and update the shared speech flag.

        The frame is serialized to bytes and handed to the wrapped detector
        together with the configured sample rate. Consecutive identical
        decisions are counted as a run; ``context.is_speech`` is flipped only
        once the current run is at least ``_rise_length`` (for speech) or
        ``_fall_length`` (for non-speech) frames long.

        Args:
            context (SpeechContext): State based information that is shared
            between pieces of the pipeline; its ``is_speech`` flag is updated
            frame (np.ndarray): Single frame of PCM-16 audio from an input source

        """
        pcm_bytes = frame.tobytes()
        detected = self._vad.is_speech(pcm_bytes, self._sample_rate) > 0

        # Track how many frames in a row produced the same raw decision.
        if detected != self._run_value:
            self._run_value = detected
            self._run_length = 1
        else:
            self._run_length += 1

        # Nothing to do when the context already agrees with the current run.
        if self._run_value == context.is_speech:
            return

        if self._run_value:
            # Rising edge: enough consecutive speech frames were seen.
            if self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
        elif self._run_length >= self._fall_length:
            # Falling edge: enough consecutive non-speech frames were seen.
            context.is_speech = False
            _LOG.info("vad: false")
Example #2
0
    def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
        """Classify one audio frame and update the shared speech flag.

        The frame must have a signed-integer dtype. It is passed to the wrapped
        detector, and consecutive identical decisions are counted as a run;
        ``context.is_speech`` is flipped only once the current run is at least
        ``_rise_length`` (for speech) or ``_fall_length`` (for non-speech)
        frames long.

        Args:
            context (SpeechContext): State based information that is shared
            between pieces of the pipeline; its ``is_speech`` flag is updated
            frame (np.ndarray): Single frame of PCM-16 audio from an input source

        Raises:
            TypeError: If ``frame.dtype`` is not a signed integer type.

        """
        # Reject frames that are not signed-integer samples.
        is_signed_int = np.issubdtype(frame.dtype, np.signedinteger)
        if not is_signed_int:
            raise TypeError("invalid_dtype")

        detected = self._vad.is_speech(frame) > 0

        # Track how many frames in a row produced the same raw decision.
        if detected != self._run_value:
            self._run_value = detected
            self._run_length = 1
        else:
            self._run_length += 1

        # Nothing to do when the context already agrees with the current run.
        if self._run_value == context.is_speech:
            return

        if self._run_value:
            # Rising edge: enough consecutive speech frames were seen.
            if self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
        elif self._run_length >= self._fall_length:
            # Falling edge: enough consecutive non-speech frames were seen.
            context.is_speech = False
            _LOG.info("vad: false")
Example #3
0
def test_detect_activate(_mock):
    """The wakeword trigger activates the context on a speech frame.

    ``_mock`` is unused here; presumably it is injected by a patch decorator
    that is outside this view.
    """
    ctx = SpeechContext()
    trigger = WakewordTrigger(model_dir="wakeword_model")
    # Pin the detect model's first output to 0.6 (the model is presumably a
    # mock, given the ``return_value`` attribute).
    trigger.detect_model.return_value[0][:] = 0.6

    samples = np.random.rand(512).astype(np.float32)
    ctx.is_speech = True
    trigger(ctx, samples)
    # Dropping the speech flag afterwards must not undo the activation.
    ctx.is_speech = False
    assert ctx.is_active
Example #4
0
def test_detect_inactive_vad_deactivate(_mock):
    """The context stays inactive across repeated speech frames.

    ``_mock`` is unused here; presumably it is injected by a patch decorator
    that is outside this view.
    """
    ctx = SpeechContext()
    trigger = WakewordTrigger(model_dir="wakeword_model")

    for _ in range(3):
        samples = np.random.rand(160).astype(np.float32)
        ctx.is_speech = True
        trigger(ctx, samples)
        ctx.is_speech = False
        assert not ctx.is_active

    # One more call with the speech flag cleared, reusing the last frame.
    # NOTE(review): nothing is asserted after this call - confirm whether it
    # is only meant to exercise the non-speech path without raising.
    trigger(ctx, samples)
Example #5
0
def test_context():
    """Check SpeechContext defaults, attribute assignment and ``reset()``."""
    ctx = SpeechContext()

    # is_speech: falsy by default, truthy after assignment
    assert not ctx.is_speech
    ctx.is_speech = True
    assert ctx.is_speech

    # is_active: falsy by default, truthy after assignment
    assert not ctx.is_active
    ctx.is_active = True
    assert ctx.is_active

    # transcript: empty by default, truthy once text is stored
    assert not ctx.transcript
    ctx.transcript = "this is a test"
    assert ctx.transcript

    # confidence: starts at 0.0 and keeps the assigned value
    assert ctx.confidence == 0.0
    ctx.confidence = 1.0
    assert ctx.confidence == 1.0

    # reset() must restore every field checked above to its default
    ctx.reset()
    assert not ctx.is_speech
    assert not ctx.is_active
    assert not ctx.transcript
    assert ctx.confidence == 0.0
Example #6
0
def test_detect_vad_inactive(_mock):
    """The wakeword trigger leaves the context inactive without speech.

    ``_mock`` is unused here; presumably it is injected by a patch decorator
    that is outside this view.
    """
    ctx = SpeechContext()

    trigger = WakewordTrigger(model_dir="wakeword_model")

    samples = np.random.rand(160).astype(np.float32)
    ctx.is_speech = False
    trigger(ctx, samples)
    assert not ctx.is_active
def test_min_active():
    """The context must stay active until ``min_active`` has been satisfied."""
    active_max = 500
    active_min = 120
    ctx = SpeechContext()
    stage = ActivationTimeout(min_active=active_min, max_active=active_max)

    ctx.is_active = True

    # First call while speech is present.
    ctx.is_speech = True
    stage(ctx)

    # Speech stops; the context is expected to remain active on this call.
    ctx.is_speech = False
    stage(ctx)
    assert ctx.is_active

    # A further call after the speech fall still leaves the context active.
    stage(ctx)
    assert ctx.is_active

    # The context is expected to stay active on every one of these calls.
    # NOTE(review): the divisor 20 is presumably the per-call frame width -
    # confirm against ActivationTimeout's defaults.
    for _ in range(active_min // 20):
        stage(ctx)
        assert ctx.is_active

    # Speech is present again for one call.
    ctx.is_speech = True
    stage(ctx)

    # Speech stops once more; by now the minimum is expected to be met,
    # so this call should deactivate the context.
    ctx.is_speech = False
    stage(ctx)
    assert not ctx.is_active

    stage.close()
def test_timeout_vad_fall():
    """The context must deactivate after speech falls and enough calls pass."""
    active_max = 500
    active_min = 20
    ctx = SpeechContext()
    stage = ActivationTimeout(min_active=active_min, max_active=active_max)

    ctx.is_active = True
    ctx.is_speech = False

    # One call without speech, then one with speech: still active.
    stage(ctx)
    ctx.is_speech = True

    stage(ctx)
    assert ctx.is_active

    ctx.is_speech = False

    # After this many non-speech calls the context is expected to be inactive.
    # NOTE(review): the divisor 20 is presumably the per-call frame width -
    # confirm against ActivationTimeout's defaults.
    remaining_calls = (active_min // 20) + 2
    for _ in range(remaining_calls):
        stage(ctx)
    assert not ctx.is_active

    stage.close()
Example #9
0
def test_voice_activity_trigger():
    """The trigger activates the context only once speech is flagged."""
    ctx = SpeechContext()
    stage = VoiceActivityTrigger()

    silence = np.zeros(160, dtype=np.int16)

    # Without the speech flag the context must stay inactive.
    stage(ctx, silence)
    assert not ctx.is_active

    # With the speech flag set, the same frame must activate the context.
    ctx.is_speech = True
    stage(ctx, silence)
    assert ctx.is_active

    stage.close()