def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
    """Run voice activity detection on one frame and update the context.

    The per-frame decision is tracked as a run of identical values.
    ``context.is_speech`` is only flipped once the current run is at
    least ``self._rise_length`` frames long (for speech) or at least
    ``self._fall_length`` frames long (for non-speech).

    Args:
        context (SpeechContext): State shared between the pieces of the
            pipeline; its ``is_speech`` flag is read and may be updated
        frame (np.ndarray): Single frame of PCM-16 audio from an input source

    """
    pcm_bytes = frame.tobytes()
    detected: bool = self._vad.is_speech(pcm_bytes, self._sample_rate)
    is_voiced = detected > 0

    # extend the current run, or start a new run when the decision flips
    if is_voiced == self._run_value:
        self._run_length += 1
    else:
        self._run_value, self._run_length = is_voiced, 1

    # only touch the context when the run disagrees with its current flag
    if self._run_value != context.is_speech:
        if self._run_value:
            if self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
        elif self._run_length >= self._fall_length:
            context.is_speech = False
            _LOG.info("vad: false")
def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
    """Run voice activity detection on one frame and update the context.

    The per-frame decision is tracked as a run of identical values.
    ``context.is_speech`` is only flipped once the current run is at
    least ``self._rise_length`` frames long (for speech) or at least
    ``self._fall_length`` frames long (for non-speech).

    Args:
        context (SpeechContext): State shared between the pieces of the
            pipeline; its ``is_speech`` flag is read and may be updated
        frame (np.ndarray): Single frame of PCM-16 audio from an input source

    Raises:
        TypeError: If the dtype of ``frame`` is not a signed integer type

    """
    # reject frames whose dtype is not a signed integer type
    dtype_ok = np.issubdtype(frame.dtype, np.signedinteger)
    if not dtype_ok:
        raise TypeError("invalid_dtype")

    detected: bool = self._vad.is_speech(frame)
    is_voiced = detected > 0

    # extend the current run, or start a new run when the decision flips
    if is_voiced == self._run_value:
        self._run_length += 1
    else:
        self._run_value, self._run_length = is_voiced, 1

    # only touch the context when the run disagrees with its current flag
    if self._run_value != context.is_speech:
        if self._run_value:
            if self._run_length >= self._rise_length:
                context.is_speech = True
                _LOG.info("vad: true")
        elif self._run_length >= self._fall_length:
            context.is_speech = False
            _LOG.info("vad: false")
def test_detect_activate(_mock):
    """The context is active after a frame is processed with speech flagged.

    The detect model's first output is filled with 0.6 before the call.
    """
    speech_context = SpeechContext()
    trigger = WakewordTrigger(model_dir="wakeword_model")
    # fill the first output of the detect model's return value with 0.6
    trigger.detect_model.return_value[0][:] = 0.6

    audio = np.random.rand(512).astype(np.float32)

    # process one frame while speech is flagged, then clear the flag
    speech_context.is_speech = True
    trigger(speech_context, audio)
    speech_context.is_speech = False

    assert speech_context.is_active
# Exercises WakewordTrigger(model_dir="wakeword_model") with random float32
# frames of 160 samples while toggling context.is_speech, and asserts that
# context.is_active is falsy after speech is switched off.
# NOTE(review): the original line breaks and indentation of this test were
# lost (the whole function sits on one physical line), so the extent of the
# `for _ in range(3)` loop body cannot be determined from here. Restore the
# formatting from version control before editing this test -- TODO confirm.
# NOTE(review): `_mock` is presumably injected by a patch decorator that is
# not visible in this chunk -- verify.
def test_detect_inactive_vad_deactivate(_mock): context = SpeechContext() detector = WakewordTrigger(model_dir="wakeword_model") for _ in range(3): test_frame = np.random.rand(160, ).astype(np.float32) context.is_speech = True detector(context, test_frame) context.is_speech = False assert not context.is_active detector(context, test_frame)
def test_context():
    """Check SpeechContext defaults, attribute assignment and reset()."""
    ctx = SpeechContext()

    # is_speech: falsy by default, truthy once set
    assert not ctx.is_speech
    ctx.is_speech = True
    assert ctx.is_speech

    # is_active: falsy by default, truthy once set
    assert not ctx.is_active
    ctx.is_active = True
    assert ctx.is_active

    # transcript: falsy by default, truthy once a string is assigned
    assert not ctx.transcript
    ctx.transcript = "this is a test"
    assert ctx.transcript

    # confidence: 0.0 by default, keeps the assigned value
    assert ctx.confidence == 0.0
    ctx.confidence = 1.0
    assert ctx.confidence == 1.0

    # reset() puts every attribute back to its default
    ctx.reset()
    assert not ctx.is_speech
    assert not ctx.is_active
    assert not ctx.transcript
    assert ctx.confidence == 0.0
def test_detect_vad_inactive(_mock):
    """The context stays inactive when a frame is processed without speech."""
    speech_context = SpeechContext()
    trigger = WakewordTrigger(model_dir="wakeword_model")
    audio = np.random.rand(160).astype(np.float32)

    # process a single frame with the speech flag cleared
    speech_context.is_speech = False
    trigger(speech_context, audio)

    assert not speech_context.is_active
# Drives ActivationTimeout(min_active=120, max_active=500) through a series
# of context.is_speech on/off transitions, starting from an active context.
# It asserts that context.is_active stays truthy through the early calls and
# is falsy after the final speech-off call, then closes the timeout.
# NOTE(review): the original line breaks and indentation of this test were
# lost (the whole function sits on one physical line, so the inline `#`
# comments swallow the code that follows them). In particular it cannot be
# determined from here whether `assert context.is_active` belongs inside or
# after the `for _ in range(steps_before_deactivate)` loop. Restore the
# formatting from version control before editing this test -- TODO confirm.
def test_min_active(): max_active = 500 min_active = 120 context = SpeechContext() timeout = ActivationTimeout(min_active=min_active, max_active=max_active) context.is_active = True # call with speech active context.is_speech = True timeout(context) # call timeout after speech is no longer detected context.is_speech = False timeout(context) assert context.is_active # vad fall should be True # with context still active timeout(context) assert context.is_active # context should remain active until min active steps_before_deactivate = min_active // 20 for _ in range(steps_before_deactivate): timeout(context) assert context.is_active # call with speech active context.is_speech = True timeout(context) # call timeout after speech is no longer detected # min active should be satisfied context.is_speech = False timeout(context) assert not context.is_active timeout.close()
def test_timeout_vad_fall():
    """The context is deactivated once speech has stopped for enough calls."""
    active_ceiling = 500
    active_floor = 20
    ctx = SpeechContext()
    timeout = ActivationTimeout(min_active=active_floor, max_active=active_ceiling)

    # start from an active context with no speech
    ctx.is_active = True
    ctx.is_speech = False
    timeout(ctx)

    # speech starts: the context must still be active
    ctx.is_speech = True
    timeout(ctx)
    assert ctx.is_active

    # speech stops: call the timeout for the computed number of steps
    ctx.is_speech = False
    remaining_steps = active_floor // 20 + 2
    for _step in range(remaining_steps):
        timeout(ctx)

    assert not ctx.is_active
    timeout.close()
def test_voice_activity_trigger():
    """The trigger activates the context only when speech is flagged."""
    speech_context = SpeechContext()
    vad_trigger = VoiceActivityTrigger()
    zero_frame = np.zeros(160, dtype=np.int16)

    # speech flag left at its default: the context must not become active
    vad_trigger(speech_context, zero_frame)
    assert not speech_context.is_active

    # with the speech flag set, the same call activates the context
    speech_context.is_speech = True
    vad_trigger(speech_context, zero_frame)
    assert speech_context.is_active

    vad_trigger.close()