def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
    """Runs one frame of audio through the VAD and smooths the decision.

    Args:
        context (SpeechContext): State based information that needs
            to be shared between pieces of the pipeline
        frame (np.ndarray): Single frame of PCM-16 audio from an input source
    """
    # the underlying vad consumes raw bytes at the configured sample rate
    detected = self._vad.is_speech(frame.tobytes(), self._sample_rate) > 0
    # count how many consecutive frames produced the same decision
    if detected != self._run_value:
        self._run_value = detected
        self._run_length = 1
    else:
        self._run_length += 1
    # only flip the shared flag once the rise/fall run-length is satisfied
    if self._run_value != context.is_speech:
        if self._run_value and self._run_length >= self._rise_length:
            context.is_speech = True
            _LOG.info("vad: true")
        if not self._run_value and self._run_length >= self._fall_length:
            context.is_speech = False
            _LOG.info("vad: false")
def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
    """Runs one frame of audio through the VAD and smooths the decision.

    Args:
        context (SpeechContext): State based information that needs
            to be shared between pieces of the pipeline
        frame (np.ndarray): Single frame of PCM-16 audio from an input source

    Raises:
        TypeError: if the frame is not signed integer PCM
    """
    # reject anything that is not signed integer PCM
    if not np.issubdtype(frame.dtype, np.signedinteger):
        raise TypeError("invalid_dtype")
    detected = self._vad.is_speech(frame) > 0
    # count how many consecutive frames produced the same decision
    if detected != self._run_value:
        self._run_value = detected
        self._run_length = 1
    else:
        self._run_length += 1
    # only flip the shared flag once the rise/fall run-length is satisfied
    if self._run_value != context.is_speech:
        if self._run_value and self._run_length >= self._rise_length:
            context.is_speech = True
            _LOG.info("vad: true")
        if not self._run_value and self._run_length >= self._fall_length:
            context.is_speech = False
            _LOG.info("vad: false")
def test_reset():
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()
    partial = {
        "error": None,
        "final": False,
        "hypotheses": [{
            "confidence": 0.5,
            "transcript": "this is a test"
        }],
        "status": "ok",
    }
    recognizer._client._socket.recv.return_value = json.dumps(partial)
    frame = np.random.rand(160).astype(np.int16)

    # first call exercises _begin plus the initial _send,
    # second call exercises _send on its own
    context.is_active = True
    recognizer(context, frame)
    recognizer(context, frame)

    # _commit never ran and no final frame was sent, so both the context
    # and the recognizer's internal flag are still active at this point
    recognizer.reset()
    assert not recognizer._is_active
    assert not recognizer._client.is_connected
def test_detect_vad_inactive(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    frame = np.random.rand(160).astype(np.float32)
    # without speech present the trigger must leave the context inactive
    context.is_speech = False
    detector(context, frame)
    assert not context.is_active
def test_detect_activate(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    # force the mocked detect model to return a posterior above threshold
    detector.detect_model.return_value[0][:] = 0.6
    frame = np.random.rand(512).astype(np.float32)
    context.is_speech = True
    detector(context, frame)
    context.is_speech = False
    # the high posterior during speech should have activated the context
    assert context.is_active
def test_detect_inactive_vad_deactivate(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    frame = None
    # pulse the speech flag on and off; nothing should activate
    for _ in range(3):
        frame = np.random.rand(160).astype(np.float32)
        context.is_speech = True
        detector(context, frame)
        context.is_speech = False
        assert not context.is_active
    # one more frame after speech has fallen
    detector(context, frame)
def test_detect_manual_min_delay(_mock):
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    # the mocked detect model always reports a maximal posterior
    detector.detect_model.return_value[0][:] = 1
    # activate manually, then feed several frames; activation must persist
    context.is_active = True
    frame = np.random.rand(512).astype(np.float32)
    for _ in range(3):
        detector(context, frame)
    assert context.is_active
def test_recognize():
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()
    response = {
        "error": None,
        "final": False,
        "hypotheses": [{
            "confidence": 0.5,
            "transcript": "this is a test"
        }],
        "status": "ok",
    }
    recognizer._client._socket.recv.return_value = json.dumps(response)
    frame = np.random.rand(160).astype(np.int16)

    # an active context drives _begin and the first _send
    context.is_active = True
    recognizer(context, frame)
    # a second call runs with the internal _is_active flag already set
    recognizer(context, frame)
    # deactivating the context drives _commit
    context.is_active = False
    recognizer(context, frame)

    # switch the mocked server to a final response to exercise _receive
    response["final"] = True
    recognizer._client._socket.recv.return_value = json.dumps(response)
    recognizer(context, frame)

    # drive well past max_idle_time to exercise the timeout path
    recognizer._client._socket.max_idle_time = 500
    for _ in range(501):
        recognizer(context, frame)
    assert not context.is_active
    assert not recognizer._client.is_connected
def test_recognize(*args):
    context = SpeechContext()
    audio = np.zeros(160).astype(np.int16)
    recognizer = GoogleSpeechRecognizer(language="en-US", credentials="")
    # stream a handful of frames: active for the first four,
    # deactivated mid-stream for the rest
    context.is_active = True
    for step in range(10):
        context.is_active = step <= 3
        recognizer(context, audio)
    recognizer.reset()
    recognizer.close()
def test_voice_activity_trigger():
    context = SpeechContext()
    trigger = VoiceActivityTrigger()
    frame = np.zeros(160, np.int16)
    # no speech yet: the context must stay inactive
    trigger(context, frame)
    assert not context.is_active
    # rising speech edge activates the context
    context.is_speech = True
    trigger(context, frame)
    assert context.is_active
    trigger.close()
def test_context():
    context = SpeechContext()

    # is_speech round-trips
    assert not context.is_speech
    context.is_speech = True
    assert context.is_speech

    # is_active round-trips
    assert not context.is_active
    context.is_active = True
    assert context.is_active

    # transcript round-trips
    assert not context.transcript
    context.transcript = "this is a test"
    assert context.transcript

    # confidence round-trips
    assert context.confidence == 0.0
    context.confidence = 1.0
    assert context.confidence == 1.0

    # reset restores every field to its default
    context.reset()
    for value in (context.is_speech, context.is_active, context.transcript):
        assert not value
    assert context.confidence == 0.0
def test_max_active():
    max_active = 500
    min_active = 20
    context = SpeechContext()
    timeout = ActivationTimeout(min_active=min_active, max_active=max_active)
    context.is_active = True
    # each call counts one 20ms step; one extra pushes past max_active
    steps = max_active // 20 + 1
    for _ in range(steps):
        timeout(context)
    assert not context.is_active
    timeout.close()
def test_processing():
    context = SpeechContext()
    sample_rate = 8000
    frequency = 2000
    agc = AutomaticGainControl(
        sample_rate=sample_rate,
        frame_width=10,
        target_level_dbfs=9,
        compression_gain_db=15,
    )

    # a quiet sine frame should be amplified in place
    quiet = sin_frame(sample_rate, frequency, amplitude=0.08)
    before = rms(quiet)
    agc(context, quiet)
    assert rms(quiet) > before

    # a full-scale sine frame should be attenuated in place
    loud = sin_frame(sample_rate, frequency)
    before = rms(loud)
    agc(context, loud)
    assert rms(loud) < before

    agc.close()
def test_invalid_frame_size():
    context = SpeechContext()
    agc = AutomaticGainControl()
    # 100 samples does not match the configured frame width
    bad_frame = np.random.rand(100)
    with pytest.raises(ValueError):
        agc(context, bad_frame)
def test_invalid_frame_dtype():
    context = SpeechContext()
    agc = AutomaticGainControl()
    # correct length but float dtype instead of PCM-16
    bad_frame = np.random.rand(320)
    with pytest.raises(TypeError):
        agc(context, bad_frame)
def test_invalid_dtype():
    context = SpeechContext()
    detector = VoiceActivityDetector()
    # float frames are not valid PCM-16 input for the detector
    bad_frame = np.random.rand(160)
    with pytest.raises(Exception):
        detector(context, bad_frame)
def test_response():
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()
    response = {
        "error": None,
        "final": False,
        "hypotheses": [{
            "confidence": 0.5,
            "transcript": "this is a test"
        }],
        "status": "ok",
    }
    recognizer._client._socket.recv.return_value = json.dumps(response)
    frame = np.random.rand(160).astype(np.int16)

    # walk the recognizer through begin/send/commit
    context.is_active = True
    recognizer(context, frame)
    recognizer(context, frame)
    context.is_active = False
    recognizer(context, frame)

    # deliver the final transcript and verify it lands on the context
    response["final"] = True
    recognizer._client._socket.recv.return_value = json.dumps(response)
    recognizer(context, frame)
    assert context.transcript == "this is a test"
    assert context.confidence == 0.5
    recognizer.close()
def test_recognize(*args):
    context = SpeechContext()
    recognizer = KeywordRecognizer(classes=["one", "two", "three"])
    frame = np.random.rand(160).astype(np.float32)
    # feed frames while active, then one extra
    context.is_active = True
    for _ in range(10):
        recognizer(context, frame)
    recognizer(context, frame)
    # deactivation triggers classification of the buffered audio
    context.is_active = False
    recognizer(context, frame)
    assert context.transcript == "one"
    recognizer.close()
def test_timeout(*args):
    context = SpeechContext()
    recognizer = KeywordRecognizer(classes=["one", "two", "three"])
    # all-zero posteriors: no class should ever be recognized
    recognizer.detect_model.return_value = [[[0.0, 0.0, 0.0]]]
    frame = np.random.rand(160).astype(np.float32)
    context.is_active = True
    for _ in range(10):
        recognizer(context, frame)
    recognizer(context, frame)
    # deactivation with no confident class leaves the transcript empty
    context.is_active = False
    recognizer(context, frame)
    assert not context.transcript
    recognizer.close()
def test_vad_is_triggered(mock_class):
    context = SpeechContext()
    detector = VoiceActivityDetector(
        sample_rate=16000,
        frame_width=10,
        vad_rise_delay=0,
        vad_fall_delay=0,
    )
    # with zero rise delay a single frame is enough to flip the flag
    detector(context, np.zeros(160, np.int16))
    assert context.is_speech
    detector.close()
def test_vad_rise_delay():
    context = SpeechContext()
    detector = VoiceActivityDetector(frame_width=10, vad_rise_delay=30)
    # a 30ms rise delay over 10ms frames: the third voiced frame triggers
    for step in range(3):
        detector(context, voice_frame())
        if step < 2:
            assert not context.is_speech
        else:
            assert context.is_speech
    detector.close()
def test_vad_is_triggered():
    context = SpeechContext()
    detector = VoiceActivityDetector(frame_width=10)
    # silence keeps the flag clear
    detector(context, silence_frame())
    assert not context.is_speech
    # a voiced frame sets it
    detector(context, voice_frame())
    assert context.is_speech
    detector.close()
def _detect(self, context: SpeechContext) -> None:
    # batch the full contents of the encode window and score it with the
    # detect model, which yields a scalar wakeword posterior for the window
    window = np.expand_dims(self.encode_window.read_all(), 0)
    posterior = self.detect_model(window)[0][0][0]
    # track the running maximum posterior for logging
    if posterior > self._posterior_max:
        self._posterior_max = posterior
    # crossing the threshold activates the pipeline
    if posterior > self._posterior_threshold:
        context.is_active = True
        _LOG.info(f"wake: {self._posterior_max}")
def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
    """Activates the speech context whenever speech is detected.

    Args:
        context (SpeechContext): State based information that needs
            to be shared between pieces of the pipeline
        frame (np.ndarray): Single frame of PCM-16 audio from an input source
    """
    # only act on edges of the speech flag, not on every frame
    edge = context.is_speech != self._is_speech
    if edge:
        # a rising edge activates the pipeline; falling edges are ignored
        if context.is_speech:
            context.is_active = True
        self._is_speech = context.is_speech
def test_vad_rise_delay(mock_class):
    context = SpeechContext()
    detector = VoiceActivityDetector(
        sample_rate=16000,
        frame_width=10,
        vad_rise_delay=30,
        vad_fall_delay=0,
    )
    # 30ms rise delay over 10ms frames: only the third frame triggers
    for step in range(3):
        frame = np.zeros(160, np.int16)
        detector(context, frame)
        if step < 2:
            assert not context.is_speech
        else:
            assert context.is_speech
    detector.close()
def _receive(self, context: SpeechContext) -> None:
    # drain buffered audio into the streaming recognizer and consume
    # only the top result / best alternative of each response
    responses = self._client.streaming_recognize(self._config, self._drain())
    for response in responses:
        for result in response.results[:1]:
            for alternative in result.alternatives[:1]:
                context.transcript = alternative.transcript
                context.confidence = alternative.confidence
                if context.transcript:
                    context.event("partial_recognize")
            # a final result either commits the transcript or times out
            if result.is_final:
                if context.transcript:
                    context.event("recognize")
                    _LOG.debug("recognize event")
                else:
                    context.event("timeout")
                    _LOG.debug("timeout event")
def test_receive(*args):
    context = SpeechContext()
    audio = np.zeros(160).astype(np.int16)
    recognizer = GoogleSpeechRecognizer(language="en-US", credentials="")
    recognizer._queue.put([audio, audio, audio])
    # mock a single streaming response with one confident alternative
    alternative = mock.Mock(transcript="test", confidence=0.99)
    recognizer._client.streaming_recognize.return_value = [
        mock.Mock(results=[mock.Mock(alternatives=[alternative])])
    ]
    # active for the first few frames, then deactivated mid-stream
    context.is_active = True
    for step in range(10):
        if step > 3:
            context.is_active = False
        recognizer(context, audio)
    recognizer._thread = mock.Mock()
    recognizer.reset()
    recognizer.close()
def _receive(self, context: SpeechContext) -> None:
    # pull the next server response and publish the best hypothesis
    self._client.receive()
    hypotheses = self._client.response.get("hypotheses")
    if hypotheses:
        best = hypotheses[0]
        context.transcript = best["transcript"]
        context.confidence = best["confidence"]
        if context.transcript:
            context.event("partial_recognize")
    # a final response either commits the transcript or times out
    if self._client.is_final:
        if context.transcript:
            context.event("recognize")
            _LOG.debug("recognize event")
        else:
            context.event("timeout")
            _LOG.debug("timeout event")
def test_handler():
    # handler mutates the context so its effect is observable
    def on_speech(ctx):
        ctx.transcript = "event handled"

    context = SpeechContext()
    context.add_handler("recognize", on_speech)
    context.event("recognize")
    assert context.transcript == "event handled"
def test_vad_fall_untriggered():
    context = SpeechContext()
    detector = VoiceActivityDetector(frame_width=10, vad_fall_delay=20)
    voiced = voice_frame()
    quiet = silence_frame()
    # a voiced frame trips the detector
    detector(context, voiced)
    assert context.is_speech
    # silence for a while: flag still holds
    for _ in range(10):
        detector(context, quiet)
    assert context.is_speech
    # one more silent frame and the flag falls
    detector(context, quiet)
    assert not context.is_speech
    detector.close()