class PocketSphinxWUW(WUWInterface):
    """Wake-up-word detector built on Pocketsphinx keyphrase search.

    Audio is captured through PyAudio as mono 16-bit samples at
    ``_SAMPLE_RATE`` and handed to the decoder ``_FRAME_LENGTH`` frames at a
    time.
    """

    def __init__(self, keyword: str, kws_threshold: float):
        """Create the decoder and open the capture stream.

        :param keyword: keyphrase passed to Pocketsphinx as ``keyphrase``.
        :param kws_threshold: threshold passed to Pocketsphinx as
            ``kws_threshold``.
        """
        self._decoder = Pocketsphinx(keyphrase=keyword,
                                     lm=False,
                                     kws_threshold=kws_threshold)
        self._sound = pyaudio.PyAudio()
        try:
            self._audio_stream = self._sound.open(rate=_SAMPLE_RATE,
                                                  channels=1,
                                                  format=pyaudio.paInt16,
                                                  input=True,
                                                  frames_per_buffer=_FRAME_LENGTH)
        except Exception:
            # Opening the device failed: release the PyAudio handle instead of
            # leaking it, and leave the object in a state terminate() accepts.
            self._sound.terminate()
            self._sound = None
            self._audio_stream = None
            raise

    def prepare(self) -> None:
        """Start the capture stream and begin a new decoder utterance."""
        print("starting utterance")
        self._audio_stream.start_stream()
        self._decoder.start_utt()
        print("started utterance")

    def process(self) -> bool:
        """Read one buffer of audio and feed it to the decoder.

        :return: ``True`` when the decoder reports a hypothesis (the utterance
            is then ended and the stream stopped), ``False`` otherwise,
            including when the read returned no data.
        """
        buf = self._audio_stream.read(_FRAME_LENGTH)
        if not buf:
            return False

        self._decoder.process_raw(buf, False, False)

        # Query the decoder once and reuse the result for both the check and
        # the printout.
        hypothesis = self._decoder.hyp()
        if not hypothesis:
            return False

        print(hypothesis.hypstr)
        print("ending utterance")
        self._decoder.end_utt()
        self._audio_stream.stop_stream()
        print("ended utterance")
        return True

    def terminate(self) -> None:
        """Close the capture stream and release PyAudio.

        Safe to call more than once: the handles are cleared before they are
        released, so a repeated call finds nothing left to close.
        """
        stream, self._audio_stream = self._audio_stream, None
        sound, self._sound = self._sound, None
        try:
            if stream is not None:
                stream.close()
        finally:
            # Release PyAudio even if closing the stream raised.
            if sound is not None:
                sound.terminate()
def detect():
    """Block on the audio input until the wake keyphrase is spotted.

    Reads 2048-byte chunks from a 16 kHz ``Ad`` input and feeds them to a
    keyphrase-only Pocketsphinx decoder. On the first hypothesis the
    detection is logged, ``wake_statuses[system]`` is set to ``'detected'``
    and the function returns. It also returns if ``readinto`` reports a
    negative count.

    ``hmm``, ``dic``, ``keyphrase``, ``kws_threshold``, ``system`` and
    ``wake_statuses`` are defined outside this function.
    """
    from pocketsphinx import Ad, Pocketsphinx

    mic = Ad(None, 16000)  # default input
    spotter = Pocketsphinx(lm=False,
                           hmm=hmm,
                           dic=dic,
                           keyphrase=keyphrase,
                           kws_threshold=kws_threshold)
    chunk = bytearray(2048)

    with mic, spotter.start_utterance():
        while True:
            if mic.readinto(chunk) < 0:
                break

            spotter.process_raw(chunk, False, False)
            if not spotter.hyp():
                continue

            with spotter.end_utterance():
                logging.info('Wake word detected for %s' % system)
                wake_statuses[system] = 'detected'
                break
def decode(): nonlocal decoder, decoded_phrase # Dynamically load decoder if decoder is None: _LOGGER.debug('Loading decoder') hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LOADING, state_attrs) decoder = Pocketsphinx( hmm=acoustic_model, lm=language_model, dic=dictionary) hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs) # Do actual decoding with decoder.start_utterance(): decoder.process_raw(recorded_data, False, True) # full utterance hyp = decoder.hyp() if hyp: with decoder.end_utterance(): decoded_phrase = hyp.hypstr decoded_event.set()
class HotwordRecognizer:
    """Hotword (wake word) recognizer, a thin wrapper around |pocketsphinx|.

    The default hotwords are `'阿Q'` and `'R-cute'`.

    To define custom hotwords, see https://blog.51cto.com/feature09/2300352

    .. |pocketsphinx| raw:: html

        <a href='https://github.com/bambocher/pocketsphinx-python' target='blank'>pocketsphinx</a>

    .. |config| raw:: html

        <a href='https://github.com/bambocher/pocketsphinx-python#default-config' target='blank'>pocketsphinx Default config</a>

    :param hotword: hotword or list of hotwords, defaults to `['阿Q', 'R-cute']`
    :type hotword: str / list, optional
    :param hmm: see |config|
    :type hmm: str, optional
    :param lm: see |config|
    :type lm: str, optional
    :param dic: see |config|
    :type dic: str, optional
    """

    # Fallback byte rate used to time a chunk when the stream does not expose
    # ``frame_duration``: 16 kHz * 2 bytes per sample (the original code's
    # ``len(data) / 32000`` note).
    _BYTES_PER_SECOND = 32000

    def __init__(self, **kwargs):
        self._no_search = False
        self._full_utt = False
        # Defined here so cancel() and recognize() always find the attribute.
        self._cancel = False

        hotword = kwargs.pop('hotword', ['阿Q', 'R-cute'])
        self._hotwords = hotword if isinstance(hotword, list) else [hotword]

        model_path = get_model_path()
        opt = {
            'verbose': False,
            'hmm': os.path.join(model_path, 'en-us'),
            'lm': util.resource('sphinx/rcute.lm'),
            'dic': util.resource('sphinx/rcute.dic'),
        }
        # Remaining keyword arguments override the defaults above.
        opt.update(kwargs)
        self._rec = Pocketsphinx(**opt)

    def _chunk_duration(self, stream, data):
        """Return the duration in seconds to account for one read chunk."""
        duration = getattr(stream, 'frame_duration', None)
        if duration is None:
            duration = len(data) / self._BYTES_PER_SECOND
        return duration

    def recognize(self, stream, timeout=None):
        """Start recognizing.

        :param stream: audio source; must provide ``raw_read()``. Its
            ``frame_duration`` attribute, when present, is used to measure
            elapsed audio time for the timeout.
        :param timeout: maximum amount of audio to listen to, in seconds.
            Defaults to `None`, meaning no timeout: the call only returns
            once a hotword is recognized.
        :type timeout: float, optional
        :return: the recognized hotword, or `None` if the timeout elapsed
            without recognizing one
        :rtype: str
        :raises RuntimeError: if :meth:`cancel` is called while recognizing
        """
        self._cancel = False
        elapsed = 0.0
        in_speech = False

        with self._rec.start_utterance():
            while True:
                data = stream.raw_read()
                self._rec.process_raw(data, self._no_search, self._full_utt)

                # Only look at the hypothesis on a speech -> silence edge.
                if in_speech != self._rec.get_in_speech():
                    in_speech = not in_speech
                    if not in_speech and self._rec.hyp():
                        with self._rec.end_utterance():
                            hyp = self._rec.hypothesis()
                            if hyp in self._hotwords:
                                return hyp

                if self._cancel:
                    raise RuntimeError(
                        'Hotword detection cancelled by another thread')

                if timeout:
                    # The original read ``source.frame_duration`` here, but
                    # no ``source`` exists in this scope (the parameter is
                    # ``stream``), so every call with a timeout raised
                    # NameError.
                    elapsed += self._chunk_duration(stream, data)
                    if elapsed > timeout:
                        return None

    def cancel(self):
        """Stop recognizing."""
        self._cancel = True
def decode(): nonlocal decoder, decoded_phrase, data, filename # Check if WAV is in the correct format. # Convert with sox if not. with io.BytesIO(data) as wav_data: with wave.open(wav_data, mode='rb') as wav_file: rate, width, channels = wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels() _LOGGER.debug('rate=%s, width=%s, channels=%s.' % (rate, width, channels)) if (rate != 16000) or (width != 2) or (channels != 1): # Convert to 16-bit 16Khz mono (required by pocketsphinx acoustic models) _LOGGER.debug('Need to convert to 16-bit 16Khz mono.') if shutil.which('sox') is None: _LOGGER.error("'sox' command not found. Cannot convert WAV file to appropriate format. Expect poor performance.") else: temp_input_file = None if filename is None: # Need to write original WAV data out to a file for sox temp_input_file = tempfile.NamedTemporaryFile(suffix='.wav', mode='wb+') temp_input_file.write(data) temp_input_file.seek(0) filename = temp_input_file.name # sox <IN> -r 16000 -e signed-integer -b 16 -c 1 <OUT> with tempfile.NamedTemporaryFile(suffix='.wav', mode='wb+') as out_wav_file: subprocess.check_call(['sox', filename, '-r', '16000', '-e', 'signed-integer', '-b', '16', '-c', '1', out_wav_file.name]) out_wav_file.seek(0) # Use converted data with wave.open(out_wav_file, 'rb') as wav_file: data = wav_file.readframes(wav_file.getnframes()) if temp_input_file is not None: # Clean up temporary file del temp_input_file # Dynamically load decoder if decoder is None: _LOGGER.debug('Loading decoder') hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LOADING, state_attrs) decoder = Pocketsphinx( hmm=acoustic_model, lm=language_model, dic=dictionary) hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs) # Process WAV data as a complete utterance (best performance) with decoder.start_utterance(): decoder.process_raw(data, False, True) # full utterance if decoder.hyp(): with decoder.end_utterance(): decoded_phrase = decoder.hyp().hypstr 
decoded_event.set()