def test_recognize():
    """Exercise the full cloud-recognizer lifecycle: begin, send, commit,
    receive, and idle timeout."""

    def transcript_payload(is_final):
        # fake server response carrying a single hypothesis
        return json.dumps({
            "error": None,
            "final": is_final,
            "hypotheses": [{
                "confidence": 0.5,
                "transcript": "this is a test"
            }],
            "status": "ok",
        })

    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()
    recognizer._client._socket.recv.return_value = transcript_payload(False)
    frame = np.random.rand(160).astype(np.int16)

    # active context: exercises _begin and the first _send
    context.is_active = True
    recognizer(context, frame)
    # second call exercises the path with internal _is_active already True
    recognizer(context, frame)

    # deactivating the context exercises _commit
    context.is_active = False
    recognizer(context, frame)

    # server marks the response final: exercises _receive
    recognizer._client._socket.recv.return_value = transcript_payload(True)
    recognizer(context, frame)

    # drive enough frames past max_idle_time to trigger the idle timeout
    recognizer._client._socket.max_idle_time = 500
    for _ in range(501):
        recognizer(context, frame)

    assert not context.is_active
    assert not recognizer._client.is_connected
def test_recognize(*args):
    """Drive the Google recognizer through activation and deactivation."""
    context = SpeechContext()
    audio = np.zeros(160).astype(np.int16)
    recognizer = GoogleSpeechRecognizer(language="en-US", credentials="")

    context.is_active = True
    for step in range(10):
        # deactivate partway through to cover both active and inactive paths
        if step > 3:
            context.is_active = False
        recognizer(context, audio)

    recognizer.reset()
    recognizer.close()
def test_context():
    """Check every SpeechContext field round-trips and reset() clears them."""
    context = SpeechContext()

    # every field starts cleared
    assert not context.is_speech
    assert not context.is_active
    assert not context.transcript
    assert context.confidence == 0.0

    # set each field and read it back
    context.is_speech = True
    context.is_active = True
    context.transcript = "this is a test"
    context.confidence = 1.0
    assert context.is_speech
    assert context.is_active
    assert context.transcript
    assert context.confidence == 1.0

    # reset() returns everything to the cleared state
    context.reset()
    assert not context.is_speech
    assert not context.is_active
    assert not context.transcript
    assert context.confidence == 0.0
def test_reset():
    """reset() must clear internal activation state and disconnect."""
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()
    recognizer._client._socket.recv.return_value = json.dumps({
        "error": None,
        "final": False,
        "hypotheses": [{
            "confidence": 0.5,
            "transcript": "this is a test"
        }],
        "status": "ok",
    })
    context = SpeechContext()
    frame = np.random.rand(160).astype(np.int16)

    # first call triggers _begin plus the first _send, second triggers _send
    context.is_active = True
    for _ in range(2):
        recognizer(context, frame)

    # neither _commit nor a final frame has happened, so _is_active is still
    # True — reset() must tear that down and drop the connection
    recognizer.reset()
    assert not recognizer._is_active
    assert not recognizer._client.is_connected
def test_response():
    """The final server response should populate transcript and confidence."""
    context = SpeechContext()
    recognizer = CloudSpeechRecognizer()
    recognizer._client._socket = mock.MagicMock()

    partial = {
        "error": None,
        "final": False,
        "hypotheses": [{
            "confidence": 0.5,
            "transcript": "this is a test"
        }],
        "status": "ok",
    }
    recognizer._client._socket.recv.return_value = json.dumps(partial)
    frame = np.random.rand(160).astype(np.int16)

    # walk the recognizer through begin, send, and commit
    context.is_active = True
    recognizer(context, frame)
    recognizer(context, frame)
    context.is_active = False
    recognizer(context, frame)

    # flip the payload to final and process the closing frame
    recognizer._client._socket.recv.return_value = json.dumps(
        dict(partial, final=True)
    )
    recognizer(context, frame)

    assert context.transcript == "this is a test"
    assert context.confidence == 0.5
    recognizer.close()
def test_recognize(*args):
    """Keyword recognizer should label the utterance with the top class."""
    context = SpeechContext()
    recognizer = KeywordRecognizer(classes=["one", "two", "three"])
    test_frame = np.random.rand(160).astype(np.float32)

    # feed frames while active, then one more before deactivating
    context.is_active = True
    for _ in range(10):
        recognizer(context, test_frame)
    recognizer(context, test_frame)

    # deactivation finalizes the recognition
    context.is_active = False
    recognizer(context, test_frame)

    assert context.transcript == "one"
    recognizer.close()
def test_timeout(*args):
    """With all-zero posteriors no keyword should be recognized."""
    context = SpeechContext()
    recognizer = KeywordRecognizer(classes=["one", "two", "three"])
    # force the classifier to report no detections at all
    recognizer.detect_model.return_value = [[[0.0, 0.0, 0.0]]]
    test_frame = np.random.rand(160).astype(np.float32)

    context.is_active = True
    for _ in range(10):
        recognizer(context, test_frame)
    recognizer(context, test_frame)

    # deactivation finalizes recognition; nothing should have been detected
    context.is_active = False
    recognizer(context, test_frame)

    assert not context.transcript
    recognizer.close()
def test_detect_manual_min_delay(_mock):
    """A manually-activated context stays active across wakeword frames."""
    context = SpeechContext()
    detector = WakewordTrigger(model_dir="wakeword_model")
    # saturate the detect posterior so every frame looks like a wake
    detector.detect_model.return_value[0][:] = 1

    context.is_active = True
    test_frame = np.random.rand(512).astype(np.float32)
    for _ in range(3):
        detector(context, test_frame)

    assert context.is_active
def _detect(self, context: SpeechContext) -> None:
    """Run the detect model on the encode window and activate on a wake.

    Reads the full contents of the encode window, adds a batch dimension,
    and computes a scalar posterior probability that the window contains
    the wakeword. Tracks the running posterior maximum and activates the
    speech context when the posterior crosses the configured threshold.

    Args:
        context (SpeechContext): pipeline state to activate on detection
    """
    # read the full contents of the encode window and add the batch dimension
    frame = self.encode_window.read_all()
    frame = np.expand_dims(frame, 0)
    # scalar probability that the frame contains the wakeword
    posterior = self.detect_model(frame)[0][0][0]
    if posterior > self._posterior_max:
        self._posterior_max = posterior
    if posterior > self._posterior_threshold:
        context.is_active = True
        # lazy %-style args: the message is only built if INFO is enabled,
        # unlike the previous eager f-string
        _LOG.info("wake: %s", self._posterior_max)
def __call__(self, context: SpeechContext, frame: np.ndarray) -> None:
    """Activates speech context whenever speech is detected

    Args:
        context (SpeechContext): State based information that needs to be shared
        between pieces of the pipeline
        frame (np.ndarray): Single frame of PCM-16 audio from an input source
    """
    speech_now = context.is_speech
    # only react to transitions of the speech flag
    if speech_now != self._is_speech:
        if speech_now:
            # rising edge: speech just started, so activate the pipeline
            context.is_active = True
        self._is_speech = speech_now
def test_receive(*args):
    """Streaming results from the mocked client are consumed cleanly."""
    context = SpeechContext()
    audio = np.zeros(160).astype(np.int16)
    recognizer = GoogleSpeechRecognizer(language="en-US", credentials="")
    recognizer._queue.put([audio, audio, audio])

    # build the streaming response layer by layer instead of inline
    alternative = mock.Mock(transcript="test", confidence=0.99)
    result = mock.Mock(alternatives=[alternative])
    recognizer._client.streaming_recognize.return_value = [
        mock.Mock(results=[result])
    ]

    context.is_active = True
    for step in range(10):
        # deactivate partway through to cover both paths
        if step > 3:
            context.is_active = False
        recognizer(context, audio)

    recognizer._thread = mock.Mock()
    recognizer.reset()
    recognizer.close()
def test_max_active():
    """Context must deactivate once the max-active budget is exceeded."""
    max_active = 500
    min_active = 20
    context = SpeechContext()
    timeout = ActivationTimeout(min_active=min_active, max_active=max_active)

    context.is_active = True
    # one step past the max-active budget forces the deactivation
    for _ in range((max_active // 20) + 1):
        timeout(context)

    assert not context.is_active
    timeout.close()
def test_min_active():
    """Context stays active for at least min_active worth of steps."""
    max_active = 500
    min_active = 120
    context = SpeechContext()
    timeout = ActivationTimeout(min_active=min_active, max_active=max_active)
    context.is_active = True

    # speech rises then falls: the vad-fall path starts the countdown
    context.is_speech = True
    timeout(context)
    context.is_speech = False
    timeout(context)
    assert context.is_active

    # still inside the min-active window, so the context survives
    timeout(context)
    assert context.is_active
    for _ in range(min_active // 20):
        timeout(context)
    assert context.is_active

    # a second rise/fall after min_active is satisfied deactivates
    context.is_speech = True
    timeout(context)
    context.is_speech = False
    timeout(context)
    assert not context.is_active
    timeout.close()
def test_timeout_vad_fall():
    """After a vad fall, the context deactivates once min_active elapses."""
    max_active = 500
    min_active = 20
    context = SpeechContext()
    timeout = ActivationTimeout(min_active=min_active, max_active=max_active)
    context.is_active = True

    # no speech, then a rise: the context must remain active
    context.is_speech = False
    timeout(context)
    context.is_speech = True
    timeout(context)
    assert context.is_active

    # speech falls; step past the min-active window to trigger the timeout
    context.is_speech = False
    for _ in range((min_active // 20) + 2):
        timeout(context)

    assert not context.is_active
    timeout.close()
def deactivate(self, context: SpeechContext) -> None:
    """Turn the speech pipeline's active state off.

    Resets internal state first, then clears the context's active flag so
    downstream stages stop treating the input as an utterance.

    Args:
        context (SpeechContext): pipeline state to deactivate
    """
    self.reset()
    context.is_active = False