def __init__(self, profile, hmm=None, dict=None, lm=None, kws_threshold=None, keyphrase=None):
    """Build a pocketsphinx decoder for either keyphrase spotting or corpus LM decoding.

    When ``keyphrase`` is given, the keyphrase dictionary/LM defaults are used;
    otherwise the corpus ones.  Unset ``hmm`` falls back to the bundled en-us model.
    """
    self.profile = profile
    # Choose default model files based on the decoding mode.
    mode = 'keyphrase' if keyphrase else 'corpus'
    dict = dict or fullpath('config/' + mode + '.dic')
    lm = lm or fullpath('config/' + mode + '.lm')
    hmm = hmm or 'share/pocketsphinx/model/en-us/en-us'
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(SPHINX_ROOT, hmm))
    config.set_string('-dict', dict)
    config.set_string('-lm', lm)
    config.set_string('-logfn', fullpath('config/sphinx.log'))
    if keyphrase:
        config.set_string('-keyphrase', keyphrase)
    if kws_threshold:
        config.set_float('-kws_threshold', kws_threshold)
    self.decoder = Decoder(config)
    self.transcribe = self.transcribe_darwin
    self.hyp = None
def record(listen_time):
    """Record ``listen_time`` seconds of microphone audio and decode it.

    Returns the best hypothesis string, or "" when no chunk's volume score
    ever reached the detection threshold.
    """
    THRESHOLD = None
    WAVE_OUTPUT_FILENAME = "livewav.wav"
    p = pyaudio.PyAudio()
    if THRESHOLD is None:
        THRESHOLD = fetchThreshold()
    print(THRESHOLD)
    stream = p.open(format=FORMAT, channels=1, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
    print("* recording")
    frames = []
    detected = False
    for i in range(0, RATE / CHUNK * listen_time):
        data = stream.read(CHUNK)
        frames.append(data)
        if getScore(data) >= THRESHOLD:
            detected = True
    if not detected:
        # BUGFIX: release the audio device on the early-return path too;
        # previously the stream and PyAudio handle leaked here.
        stream.close()
        p.terminate()
        print("nothing detected")
        return ""
    print("* done recording")
    stream.stop_stream()  # BUGFIX: stop before closing (was commented out)
    stream.close()
    p.terminate()
    # write data to WAVE file
    data = ''.join(frames)
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()
    wavfile = os.path.join(os.getcwd(), "livewav.wav")
    speechRec = Decoder(hmm=hmdir, lm=lmdir, dict=dictd)
    # BUGFIX: close the wav handle when decoding is done.
    with open(wavfile, 'rb') as wavFile:
        speechRec.decode_raw(wavFile)
        result = speechRec.get_hyp()
    return result[0]
def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
    """Set up the pocketsphinx hotword decoder from this engine's config section."""
    super(PocketsphinxHotWord, self).__init__(key_phrase, config, lang)
    # Imported lazily: pocketsphinx is only required by this engine.
    from pocketsphinx import Decoder
    cfg = self.config
    self.phonemes = cfg.get("phonemes", "HH EY . M AY K R AO F T")
    self.num_phonemes = len(self.phonemes.split())
    self.threshold = cfg.get("threshold", 1e-90)
    self.sample_rate = self.listener_config.get("sample_rate", 1600)
    decoder_config = self.create_config(
        self.create_dict(self.key_phrase, self.phonemes),
        Decoder.default_config())
    self.decoder = Decoder(decoder_config)
def best_sphinx_speech_result(pyaudio, wav_name, profile):
    """Decode ``wav_name`` with pocketsphinx and return the best hypothesis.

    Compiles the dictionary/language model from profile["words"] on first use.

    Raises:
        KeyError: if the profile does not list the candidate words.
    """
    # BUGFIX: the global declaration must precede the first use of the name
    # (using it first is a SyntaxError in Python 3).
    global have_sphinx_dictionary
    if not have_sphinx_dictionary:
        # BUGFIX: dict.has_key() was removed in Python 3; use `in`.
        if "words" not in profile:
            raise KeyError("Pass the possible words in in profile")
        compile("sentences.txt", "dictionary.dic", "language_model.lm",
                profile["words"])
        have_sphinx_dictionary = True
    speechRec = Decoder(
        hmm="/usr/local/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k",
        lm="language_model.lm",
        dict="dictionary.dic")
    # BUGFIX: file() was removed in Python 3, and the handle was never closed.
    with open(wav_name, 'rb') as wav_file:
        speechRec.decode_raw(wav_file)
    results = speechRec.get_hyp()
    return results[0]
def prepareDecoder(self, pGramma):
    """Entry point: create the sphinx decoder on first call, otherwise swap grammar."""
    if self.decoder is not None:
        # Decoder already exists -- only the grammar changes.
        self.updateGrammar(self.decoder, pGramma)
        return
    self.config = self.createConfig(pGramma)
    self.decoder = Decoder(self.config)
def createConfig(self, pGramma):
    """Build a decoder config: LIEPA acoustic model, FSG grammar, shared dictionary."""
    print("[createConfig]+++")
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/liepa.cd_semi_200/'))
    # Grammar is selected per call; the dictionary is shared by all grammars.
    cfg.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
    cfg.set_string('-dict', os.path.join("../resource/", 'service.dict'))
    print("[createConfig]---")
    return cfg
def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000, lang="en-us"):
    """Wake-word recognizer: writes a phonetic dictionary, then builds the decoder."""
    self.lang = lang
    self.key_phrase = key_phrase
    self.sample_rate = sample_rate
    self.threshold = threshold
    self.phonemes = phonemes
    self.decoder = Decoder(
        self.create_config(self.create_dict(key_phrase, phonemes)))
def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
    """Configure the pocketsphinx hotword engine; warns on a mismatched module name."""
    super(PocketsphinxHotWord, self).__init__(key_phrase, config, lang)
    # Imported lazily by the hotword module.
    from pocketsphinx import Decoder
    module = self.config.get("module")
    if module != "pocketsphinx":
        LOG.warning(
            str(module) + " module does not match with "
            "Hotword class pocketsphinx")
    self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
    self.num_phonemes = len(self.phonemes.split())
    self.threshold = self.config.get("threshold", 1e-90)
    self.sample_rate = self.listener_config.get("sample_rate", 1600)
    ps_config = self.create_config(
        self.create_dict(key_phrase, self.phonemes),
        Decoder.default_config())
    self.decoder = Decoder(ps_config)
class PocketsphinxHotWord(HotWordEngine):
    """Wake word engine using PocketSphinx.

    PocketSphinx is very general purpose but has a somewhat high error rate.
    The key advantage is to be able to specify the wake word with phonemes.
    """

    def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
        super().__init__(key_phrase, config, lang)
        # Hotword module imports
        from pocketsphinx import Decoder
        # Hotword module params
        self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
        self.num_phonemes = len(self.phonemes.split())
        self.threshold = self.config.get("threshold", 1e-90)
        self.sample_rate = self.listener_config.get("sample_rate", 1600)
        dict_name = self.create_dict(self.key_phrase, self.phonemes)
        config = self.create_config(dict_name, Decoder.default_config())
        self.decoder = Decoder(config)

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary dictionary file mapping each word to its phonemes."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name, config):
        """If language config doesn't exist then we use default language
        (english) config as a fallback.
        """
        model_file = join(RECOGNIZER_DIR, 'model', self.lang, 'hmm')
        if not exists(model_file):
            # BUGFIX: this log string was split by a literal newline in the
            # source (SyntaxError); rebuilt as a single-line literal.
            LOG.error('PocketSphinx model not found at "{}". '.format(model_file) +
                      'Falling back to en-us model')
            model_file = join(RECOGNIZER_DIR, 'model', 'en-us', 'hmm')
        config.set_string('-hmm', model_file)
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; returns the pocketsphinx hypothesis (or None)."""
        start = time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded frame."""
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
def create_config(self, dict_name):
    """Assemble the pocketsphinx keyphrase-spotting configuration."""
    config = Decoder.default_config()
    # String-valued options first, in a data-driven pass.
    for option, value in (('-hmm', join(BASEDIR, 'model', self.lang, 'hmm')),
                          ('-dict', dict_name),
                          ('-keyphrase', self.key_phrase)):
        config.set_string(option, value)
    config.set_float('-kws_threshold', float(self.threshold))
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', '/dev/null')
    return config
def create_config(self, dict_name):
    """Build the keyphrase-spotting decoder configuration.

    Returns the populated pocketsphinx Config object.
    """
    config = Decoder.default_config()
    config.set_string('-hmm', join(BASEDIR, 'model', self.lang, 'hmm'))
    config.set_string('-dict', dict_name)
    config.set_string('-keyphrase', self.key_phrase)
    # BUGFIX: coerce to float -- thresholds loaded from config files arrive
    # as strings; every sibling variant of this method already coerces.
    config.set_float('-kws_threshold', float(self.threshold))
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', '/dev/null')
    return config
def __init__(self):
    """Initialise the decoder, TTS engine, audio source and prompt bookkeeping."""
    self.decoder = Decoder(get_decoder_config())
    self.speech = pyttsx3.init()
    # NOTE(review): audio_device/sampling_rate/buffer_size appear to be class
    # attributes defined elsewhere -- confirm before refactoring.
    self.audio = sphinxbase.Ad(self.audio_device, self.sampling_rate)
    self.buffer = bytearray(self.buffer_size)
    self.default_search = self.decoder.get_search()
    self.in_speech = False
    self.max_history = 100
    self.phrases = []
    self.prompts = {}
    self.next_prompt_id = 1
    self.current_prompt = None
    self.prompt_queue = queue.Queue()
def create_config(self, dict_name):
    """Build the keyphrase-spotting config against the bundled en-us model."""
    config = Decoder.default_config()
    for option, value in (('-hmm', os.path.join(MODELDIR, 'en-us')),
                          ('-dict', dict_name),
                          ('-keyphrase', self.key_phrase)):
        config.set_string(option, value)
    config.set_float('-kws_threshold', float(self.threshold))
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', '/dev/null')
    return config
def __init__(self, rt, on_activation: Callable):
    """Download the language model if needed, then build a keyphrase decoder."""
    super().__init__(rt, on_activation)
    lang = rt.config['lang']
    self.hmm_folder = join(rt.paths.user_config, 'models', lang)
    self.rate = self.rec_config['sample_rate']
    self.width = self.rec_config['sample_width']
    # Silence padding appended to flush the decoder at utterance end.
    self.padding = b'\0' * int(self.rate * self.width * self.SILENCE_SEC)
    self.buffer = b''
    download_extract_tar(self.url.format(lang=lang), self.hmm_folder)
    config = Decoder.default_config()
    for option, value in (
            ('-hmm', self.hmm_folder),
            ('-dict', self._create_dict(self.wake_word, self.config['phonemes'])),
            ('-keyphrase', self.wake_word)):
        config.set_string(option, value)
    config.set_float('-kws_threshold', float(self.config['threshold']))
    config.set_float('-samprate', self.rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', '/dev/null')
    self.ps = Decoder(config)
def onStart(self):
    """Load (downloading first if missing) the pocketsphinx model for the active language."""
    super().onStart()
    if not self.checkLanguage():
        self.downloadLanguage()
    # All model files live in the bundled pocketsphinx package inside the venv.
    model_dir = f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model'
    lang = self.LanguageManager.activeLanguageAndCountryCode.lower()
    self._config = Decoder.default_config()
    self._config.set_string('-hmm', f'{model_dir}/{lang}')
    self._config.set_string('-lm', f'{model_dir}/{lang}.lm.bin')
    self._config.set_string('-dict', f'{model_dir}/cmudict-{lang}.dict')
    self._decoder = Decoder(self._config)
def create_config(self, dict_name):
    """Assemble the keyphrase decoder configuration.

    NOTE(review): the log path is hard-coded to a developer machine
    (/home/sg/...) -- confirm before deploying elsewhere.
    """
    config = Decoder.default_config()
    string_options = [
        ('-hmm', join(BASEDIR, 'model', self.lang, 'hmm')),
        ('-dict', dict_name),
        ('-keyphrase', self.key_phrase),
    ]
    for name, value in string_options:
        config.set_string(name, value)
    config.set_float('-kws_threshold', float(self.threshold))
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', '/home/sg/mycroft-core/scripts/logs/pocket.log')
    return config
def __init__(self, file_name='aux.wav', raspi=False, local=True):
    """Set up audio capture, the pocketsphinx decoder and a TTS backend.

    ``local=True`` uses pyttsx3; otherwise Google Cloud TTS is configured.
    """
    ## load environment
    self.FILE_NAME = file_name
    self.audio = pyaudio.PyAudio()
    self.raspi = raspi
    self.local = local
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', os.path.join(self.MODELDIR, 'acoustic-model'))
    cfg.set_string(
        '-dict',
        os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
    cfg.set_string('-logfn', os.devnull)
    self.config = cfg
    self.decoder = Decoder(cfg)
    self.r = sr.Recognizer()
    print("adjunting...")
    with sr.Microphone() as source:
        self.r.adjust_for_ambient_noise(source)
    # tts
    if not self.local:
        # Google Cloud TTS client with a fixed Spanish female voice, MP3 out.
        self.tts_client = texttospeech.TextToSpeechClient()
        self.tts_voice = texttospeech.types.VoiceSelectionParams(
            language_code='es-ES',
            ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
        self.tts_audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3)
    else:
        self.tts = pyttsx3.init()
        self.tts.setProperty('rate', self.RATE)
        self.tts.setProperty('volume', self.VOLUME)
        self.tts.setProperty('voice', 'spanish-latin-am')
class LocalRecognizer(object):
    """Pocketsphinx-based keyphrase wake-word recognizer."""

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        dict_name = self.create_dict(key_phrase, phonemes)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        """Write a temp dictionary file mapping each word to its phoneme group."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        """Build the keyphrase-spotting decoder configuration.

        NOTE(review): log path is hard-coded to a developer machine.
        """
        config = Decoder.default_config()
        config.set_string('-hmm',
                          os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn',
                          '/home/sg/mycroft-core/scripts/logs/pocket.log')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; returns the pocketsphinx hypothesis (or None)."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase occurs in the decoded audio."""
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, frame_data):
        """Check a frame for the wake word.

        BUGFIX: the frame was previously transcribed twice back to back,
        doubling the decode cost for no benefit; one pass is enough.
        """
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
def createConfig(self, pGramma):
    """Create configuration with acoustic model path, grammar and dictionary."""
    print("[createConfig]+++")
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
    # The FSG grammar is chosen per call; the dictionary is shared.
    cfg.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
    cfg.set_string('-dict', os.path.join("../resource/", 'service.dict'))
    print("[createConfig]---")
    return cfg
class CMUSphinxRecognizer(BaseRecognizer):
    """Offline recognizer backed by a pocketsphinx Decoder."""

    def __init__(self):
        config = Decoder.default_config()
        config.set_string('-hmm', SPHINX_HMM)
        config.set_string('-lm', SPHINX_LM)
        config.set_string('-dict', SPHINX_DICT)
        self.decoder = Decoder(config)

    def recognize(self, raw_audio):
        """Decode raw audio bytes; returns (text, best_score, prob)."""
        file_path = self.__save_file(raw_audio)
        # BUGFIX: audio must be opened in binary mode ('rb'); text mode breaks
        # decode_raw with decode errors / newline translation.
        with open(file_path, 'rb') as wav_fp:
            self.decoder.decode_raw(wav_fp)
        hypothesis = self.decoder.hyp()
        return hypothesis.hypstr, hypothesis.best_score, hypothesis.prob

    @staticmethod
    def __save_file(data):
        """Persist the audio bytes to a temp file and return its path."""
        tmp_fp = NamedTemporaryFile(delete=False)
        tmp_fp.write(data)
        tmp_fp.close()
        return tmp_fp.name
class PocketsphinxRecognizer(LocalRecognizer):
    """Keyphrase wake-word recognizer built on pocketsphinx.

    CLEANUP: removed leftover '####' debug prints and commented-out dead code.
    """

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = str(lang)
        self.key_phrase = str(key_phrase)
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        dict_name = self.create_dict(key_phrase, phonemes)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        """Write a temp dictionary file mapping each word to its phoneme group."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        """Build the keyphrase decoder config.

        NOTE(review): log path is hard-coded to a developer machine.
        """
        config = Decoder.default_config()
        config.set_string('-hmm', join(BASEDIR, 'model', self.lang, 'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn',
                          '/home/sg/mycroft-core/scripts/logs/pocket.log')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; returns the pocketsphinx hypothesis (or None)."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded frame."""
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class PocketsphinxHotWord(HotWordEngine):
    """Pocketsphinx-backed hotword engine."""

    def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
        super(PocketsphinxHotWord, self).__init__(key_phrase, config, lang)
        # Hotword module imports
        from pocketsphinx import Decoder
        # Hotword module config
        module = self.config.get("module")
        if module != "pocketsphinx":
            LOG.warning(
                str(module) + " module does not match with "
                "Hotword class pocketsphinx")
        # Hotword module params
        self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
        self.num_phonemes = len(self.phonemes.split())
        self.threshold = self.config.get("threshold", 1e-90)
        self.sample_rate = self.listener_config.get("sample_rate", 1600)
        dict_name = self.create_dict(key_phrase, self.phonemes)
        config = self.create_config(dict_name, Decoder.default_config())
        self.decoder = Decoder(config)

    def create_dict(self, key_phrase, phonemes):
        """Write a temp dictionary file mapping each word to its phonemes."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name, config):
        """Populate the given pocketsphinx config for keyphrase spotting."""
        model_file = join(RECOGNIZER_DIR, 'model', self.lang, 'hmm')
        if not exists(model_file):
            LOG.error('PocketSphinx model not found at ' + str(model_file))
        config.set_string('-hmm', model_file)
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; returns the pocketsphinx hypothesis (or None)."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded frame.

        BUGFIX: the return expression was split by a raw line break after
        'in' (a SyntaxError); rejoined onto one statement.
        """
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
def record(listen_time):
    """Record ``listen_time`` seconds of audio; decode it if the volume ever
    reached the detection threshold.

    Returns the hypothesis string, or "" when nothing was detected.
    """
    THRESHOLD = None
    WAVE_OUTPUT_FILENAME = "livewav.wav"
    p = pyaudio.PyAudio()
    if THRESHOLD is None:
        THRESHOLD = fetchThreshold()
    print(THRESHOLD)
    stream = p.open(format=FORMAT, channels=1, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
    print("* recording")
    frames = []
    detected = False
    for i in range(0, RATE / CHUNK * listen_time):
        data = stream.read(CHUNK)
        frames.append(data)
        if getScore(data) >= THRESHOLD:
            detected = True
    if not detected:
        # BUGFIX: close the stream and terminate PyAudio on the early return
        # too, otherwise the audio device handle leaks.
        stream.close()
        p.terminate()
        print("nothing detected")
        return ""
    print("* done recording")
    stream.stop_stream()  # BUGFIX: stop before closing (was commented out)
    stream.close()
    p.terminate()
    # write data to WAVE file
    data = ''.join(frames)
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()
    wavfile = os.path.join(os.getcwd(), "livewav.wav")
    config = Decoder.default_config()
    config.set_string('-hmm', hmdir)
    config.set_string('-lm', lmdir)
    config.set_string('-dict', dictd)
    config.set_string('-logfn', '/dev/null')
    speechRec = Decoder(config)
    with open(wavfile, 'rb') as wavFile:
        speechRec.decode_raw(wavFile)
    return speechRec.hyp().hypstr
class PocketsphinxHotWord(HotWordEngine):
    """Pocketsphinx-backed hotword engine (RECOGNIZER_DIR model layout)."""

    def __init__(self, key_phrase="hey mycroft", config=None, lang="en-us"):
        super(PocketsphinxHotWord, self).__init__(key_phrase, config, lang)
        # Hotword module imports
        from pocketsphinx import Decoder
        # Hotword module config
        module = self.config.get("module")
        if module != "pocketsphinx":
            LOG.warning(
                str(module) + " module does not match with "
                "Hotword class pocketsphinx")
        # Hotword module params
        self.phonemes = self.config.get("phonemes", "HH EY . M AY K R AO F T")
        self.num_phonemes = len(self.phonemes.split())
        self.threshold = self.config.get("threshold", 1e-90)
        self.sample_rate = self.listener_config.get("sample_rate", 1600)
        dict_name = self.create_dict(self.key_phrase, self.phonemes)
        config = self.create_config(dict_name, Decoder.default_config())
        self.decoder = Decoder(config)

    def create_dict(self, key_phrase, phonemes):
        """Write a temp dictionary file mapping each word to its phonemes."""
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name, config):
        """Populate the given pocketsphinx config for keyphrase spotting."""
        model_file = join(RECOGNIZER_DIR, 'model', self.lang, 'hmm')
        if not exists(model_file):
            LOG.error('PocketSphinx model not found at ' + str(model_file))
        config.set_string('-hmm', model_file)
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Decode one utterance; returns the pocketsphinx hypothesis (or None)."""
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        """True when the key phrase appears in the decoded frame.

        BUGFIX: the return expression was split by a raw line break after
        'in' (a SyntaxError); rejoined onto one statement.
        """
        hyp = self.transcribe(frame_data)
        return hyp and self.key_phrase in hyp.hypstr.lower()
class PocketsphinxListener:
    """Pocketsphinx listener implementation used for comparison with Precise"""

    def __init__(self, key_phrase, dict_file, hmm_folder, threshold=1e-90,
                 chunk_size=-1):
        # Deferred import: pocketsphinx is only needed when this listener is used.
        from pocketsphinx import Decoder
        config = Decoder.default_config()
        config.set_string('-hmm', hmm_folder)
        config.set_string('-dict', dict_file)
        config.set_string('-keyphrase', key_phrase)
        config.set_float('-kws_threshold', float(threshold))
        config.set_float('-samprate', 16000)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        self.key_phrase = key_phrase
        # Rolling audio window, pre-filled with silence.
        # NOTE(review): `pr` is a module-level object (audio parameters)
        # defined elsewhere in this file -- confirm its fields before changing.
        self.buffer = b'\0' * pr.sample_depth * pr.buffer_samples
        self.pr = pr
        # -1 means "read whatever is available"; otherwise a fixed byte count.
        self.read_size = -1 if chunk_size == -1 else pr.sample_depth * chunk_size
        try:
            self.decoder = Decoder(config)
        except RuntimeError:
            options = dict(key_phrase=key_phrase, dict_file=dict_file,
                           hmm_folder=hmm_folder, threshold=threshold)
            raise RuntimeError('Invalid Pocketsphinx options: ' + str(options))

    def _transcribe(self, byte_data):
        # One complete utterance per call; returns the hypothesis (or None).
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        return self.decoder.hyp()

    def found_wake_word(self, frame_data):
        # Pad with silence (2 bytes/sample * 16000 Hz * 0.01 s = ~10 ms)
        # so the decoder flushes its final frames before we read the hyp.
        hyp = self._transcribe(frame_data + b'\0' * int(2 * 16000 * 0.01))
        return bool(hyp and self.key_phrase in hyp.hypstr.lower())

    def update(self, stream: Union[BinaryIO, np.ndarray, bytes]) -> float:
        # Accepts an ndarray, raw bytes, or a readable binary stream.
        if isinstance(stream, np.ndarray):
            chunk = audio_to_buffer(stream)
        else:
            if isinstance(stream, (bytes, bytearray)):
                chunk = stream
            else:
                chunk = stream.read(self.read_size)
            if len(chunk) == 0:
                raise EOFError
        # Slide the fixed-size window forward by the chunk length and rescore.
        self.buffer = self.buffer[len(chunk):] + chunk
        return float(self.found_wake_word(self.buffer))
def __init__(self, file_name='aux.wav', raspi=False):
    """Set up audio capture, the pocketsphinx decoder and the pyttsx3 TTS voice."""
    self.FILE_NAME = file_name
    self.audio = pyaudio.PyAudio()
    self.raspi = raspi
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', os.path.join(self.MODELDIR, 'acoustic-model'))
    cfg.set_string(
        '-dict',
        os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
    cfg.set_string('-logfn', os.devnull)
    self.config = cfg
    self.decoder = Decoder(cfg)
    self.r = sr.Recognizer()
    print("adjunting...")
    with sr.Microphone() as source:
        self.r.adjust_for_ambient_noise(source)
    # tts
    self.tts = pyttsx3.init()
    self.tts.setProperty('rate', self.RATE)
    self.tts.setProperty('volume', self.VOLUME)
    self.tts.setProperty('voice', 'spanish-latin-am')
def __init__(self, in_fs, out_fs, mute_period_length, kws_frame_length):
    """Daemon thread that spots the 'alexa' keyphrase in a raw audio stream."""
    threading.Thread.__init__(self)
    self.daemon = True
    self.exit_flag = False
    self.in_fs = in_fs
    self.out_fs = out_fs
    # Frame counts derived from the input sample rate.
    self.mute_period_frames_count = int(in_fs * mute_period_length)
    self.kws_frames_count = int(in_fs * kws_frame_length)
    model_path = get_model_path()
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))  # acoustic model path
    config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))  # dictionary path
    config.set_string('-keyphrase', 'alexa')
    config.set_float('-kws_threshold', 1e-20)
    config.set_string('-logfn', './logs/tmp')  # divert pocketsphinx INFO output to a file
    self.decoder = Decoder(config)
    self.decoder.start_utt()
    self.start()
def __init__(self):
    """Constructor: build the decoder with the 'code' grammar and open the mic."""
    print("[__init__]+++")
    # Create a decoder with certain model
    self.ai = Artificialintelligence()
    self.config = self.createConfig("code")
    self.decoder = Decoder(self.config)
    print("[__init__] created decoder")
    print("[__init__]---")
    pa = pyaudio.PyAudio()
    self.stream = pa.open(format=self.FORMAT,
                          channels=self.CHANNELS,
                          rate=self.RATE,
                          input=True,
                          frames_per_buffer=self.CHUNK)
    # Indicate listening for next utterance
    print("READY....")
def createConfig(self, pGramma):
    """Create configuration with acoustic model path, grammar and dictionary."""
    print("[createConfig]+++")
    decoder_config = Decoder.default_config()
    resource_dir = "../resource/"
    decoder_config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
    decoder_config.set_string('-fsg', os.path.join(resource_dir, pGramma + '.fsg'))
    decoder_config.set_string('-dict', os.path.join(resource_dir, 'service.dict'))
    print("[createConfig]---")
    return decoder_config
def init():
    """Create the global pocketsphinx decoder, PyAudio handle and SR recognizer."""
    global decoder, p, r
    # Decoder configured from the project's POCKET_* settings.
    config = DefaultConfig()
    config.set_string('-hmm', settings.POCKET_HMM_ACOUSTIC_MODEL)
    config.set_string('-lm', settings.POCKET_LANGUAGE_MODEL)
    config.set_string('-dict', settings.POCKET_DICTIONARY)
    decoder = Decoder(config)
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
def transcribe(decoder: pocketsphinx.Decoder, audio_data: bytes, nbest: int = 0) -> Dict[str, Any]:
    """Transcribes audio data to text."""
    # Feed the whole buffer to the decoder as a single utterance.
    start_time = time.time()
    decoder.start_utt()
    decoder.process_raw(audio_data, False, True)
    decoder.end_utt()
    end_time = time.time()
    logger.debug(f"Decoded audio in {end_time - start_time} second(s)")
    decode_seconds = end_time - start_time
    hyp = decoder.hyp()
    if hyp is not None:
        transcription = hyp.hypstr
        likelihood = decoder.get_logmath().exp(hyp.prob)
    else:
        transcription = ""
        likelihood = 0.0
    result = {
        "text": transcription,
        "transcribe_seconds": decode_seconds,
        "likelihood": likelihood,
    }
    if nbest > 0:
        # Include alternative transcriptions keyed by their text.
        result["nbest"] = {nb.hypstr: nb.score for nb in decoder.nbest()[:nbest]}
    return result
class LocalRecognizer(object):
    """Keyphrase wake-word recognizer backed by pocketsphinx."""

    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        self.decoder = Decoder(
            self.create_config(self.create_dict(key_phrase, phonemes)))

    def create_dict(self, key_phrase, phonemes):
        """Write a temporary dict file pairing each word with its phoneme group."""
        fd, file_name = tempfile.mkstemp()
        pairs = zip(key_phrase.split(), phonemes.split('.'))
        with os.fdopen(fd, 'w') as out:
            for word, phoneme in pairs:
                out.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        """Assemble the keyphrase-spotting decoder configuration."""
        config = Decoder.default_config()
        for option, value in (
                ('-hmm', os.path.join(BASEDIR, 'model', self.lang, 'hmm')),
                ('-dict', dict_name),
                ('-keyphrase', self.key_phrase)):
            config.set_string(option, value)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        """Run one full utterance through the decoder; returns the hypothesis."""
        started = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - started)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        """True when the key phrase occurs in the decoded audio."""
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, hypothesis):
        """Check an already-computed hypothesis for the key phrase."""
        if not hypothesis:
            return hypothesis
        return self.key_phrase in hypothesis.hypstr.lower()
def init():
    """Initialise global keyphrase decoder, PyAudio handle and SR recognizer."""
    global decoder, p, r
    config = DefaultConfig()
    config.set_string('-logfn', settings.POCKETSPHINX_LOG)
    # Bundled en-us acoustic model and dictionary; keyphrases come from settings.
    config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
    config.set_string('-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
    config.set_string('-kws', settings.KEYPHRASES)
    decoder = Decoder(config)
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
def get_decoder_config():
    """
    Get a populated configuration object for the pocketsphinx Decoder.
    """
    model_dir = get_model_path()
    config = Decoder.default_config()
    # Every option is a path inside the bundled model directory.
    option_paths = (
        ("-dict", "cmudict-en-us.dict"),
        ("-fdict", "en-us/noisedict"),
        ("-featparams", "en-us/feat.params"),
        ("-hmm", "en-us"),
        ("-lm", "en-us.lm.bin"),
        ("-mdef", "en-us/mdef"),
        ("-mean", "en-us/means"),
        ("-sendump", "en-us/sendump"),
        ("-tmat", "en-us/transition_matrices"),
        ("-var", "en-us/variances"),
    )
    for option, rel_path in option_paths:
        config.set_string(option, os.path.join(model_dir, rel_path))
    return config
def __init__(self, config=None):
    """Initialise the recognizer, validating and completing the given config.

    BUGFIX: the default used to be ``config=Decoder.default_config()``,
    which is evaluated once at import time and shared between every
    instance (mutable default argument); a fresh config is now created
    per call when none is supplied.
    """
    if config is None:
        config = Decoder.default_config()
    assert isinstance(config, Config)
    search_args_set = search_arguments_set(config)
    if len(search_args_set) == 0:
        # Use the language model by default if nothing else is set
        set_lm_path(config)
    elif len(search_args_set) > 1:
        raise ConfigError(
            "more than one search argument was set in the Config "
            "object")
    # Set the required config paths if they aren't already set
    if not (config.get_string("-hmm") and config.get_string("-dict")):
        set_hmm_and_dict_paths(config)
    self._speech_start_callback = None
    self._hypothesis_callback = None
    self._utterance_state = self._UTT_ENDED
    super(PocketSphinx, self).__init__(config)
def load_decoder(myid, model_config, out):
    """Create a pocketsphinx Decoder from the first section of ``model_config``.

    The log file is named ``{out}_{myid}.log``.  Exits the process with a
    distinct negative code when any model file is missing.
    """
    # Create a decoder with certain model
    pocketsphinx_config = DefaultConfig()
    model_name = model_config.sections()[0]
    hmm_path = model_config[model_name]['hmm']
    # BUGFIX: renamed local `dict` -- it shadowed the builtin.
    dict_path = model_config[model_name]['dict']
    lm_path = model_config[model_name]['lm']
    logfn = '{}_{}.log'.format(out, myid)
    # Validate every model file, preserving the original exit codes.
    for path, exit_code in ((hmm_path, -2), (lm_path, -4), (dict_path, -5)):
        if not os.path.exists(path):
            print('ERROR: {} does not exist'.format(path))
            sys.exit(exit_code)
    pocketsphinx_config.set_string('-hmm', hmm_path)
    pocketsphinx_config.set_string('-lm', lm_path)
    pocketsphinx_config.set_string('-dict', dict_path)
    pocketsphinx_config.set_string('-logfn', logfn)
    decoder_engine = Decoder(pocketsphinx_config)
    return decoder_engine
class PocketSphinxASR(ASR):
    """Offline speech-to-text module backed by CMU pocketsphinx 0.1.15."""

    NAME = 'Pocketsphinx ASR'
    DEPENDENCIES = {
        'system': [
            'swig',
            'libpulse-dev'
        ],
        'pip': [
            'pocketsphinx==0.1.15'
        ]
    }
    # Downloadable model bundles per language: acoustic model tar, LM binary,
    # pronunciation dictionary.
    LANGUAGE_PACKS = {
        'en': [
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/en-us/en-us.tar',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/en-us/en-us.lm.bin',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/en-us/cmudict-en-us.dict'
        ],
        'fr': [
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/fr-fr/fr-fr.tar',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/fr-fr/fr-fr.lm.bin',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/fr-fr/cmudict-fr-fr.dict'
        ],
        'de': [
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/de-de/de-de.tar',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/de-de/de-de.lm.bin',
            f'{constants.GITHUB_URL}/cmusphinx-models/blob/master/de-de/cmudict-de-de.dict'
        ]
    }

    def __init__(self):
        super().__init__()
        self._capableOfArbitraryCapture = True
        self._isOnlineASR = False
        self._decoder: Optional[Decoder] = None
        self._config = None

    def onStart(self):
        """Download models if needed, then build the decoder."""
        super().onStart()
        if not self.checkLanguage():
            self.downloadLanguage()

        # NOTE(review): model paths are hard-coded to a python3.7 venv
        # layout — this breaks under any other Python version; confirm.
        self._config = Decoder.default_config()
        self._config.set_string('-hmm', f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}')
        self._config.set_string('-lm', f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}.lm.bin')
        self._config.set_string('-dict', f'{self.Commons.rootDir()}/venv/lib/python3.7/site-packages/pocketsphinx/model/cmudict-{self.LanguageManager.activeLanguageAndCountryCode.lower()}.dict')
        self._decoder = Decoder(self._config)

    def checkLanguage(self) -> bool:
        """Return True when the acoustic model directory for the active language exists."""
        if not Path(self.Commons.rootDir(), f'venv/lib/python3.7/site-packages/pocketsphinx/model/{self.LanguageManager.activeLanguageAndCountryCode.lower()}').exists():
            self.logInfo('Missing language model')
            return False

        return True

    def timeout(self):
        """End the current utterance when a decode times out."""
        super().timeout()
        try:
            self._decoder.end_utt()
        except:
            # If this fails we don't care, at least we tried to close the utterance
            pass

    def downloadLanguage(self) -> bool:
        """Fetch and unpack the model bundle for the active language; return True."""
        self.logInfo(f'Downloading language model for "{self.LanguageManager.activeLanguage}"')

        venv = Path(self.Commons.rootDir(), 'venv/lib/python3.7/site-packages/pocketsphinx/')
        for url in self.LANGUAGE_PACKS[self.LanguageManager.activeLanguage]:
            filename = Path(url).name
            download = Path(venv, 'model', filename)
            self.Commons.downloadFile(url=f'{url}?raw=true', dest=str(download))
            if download.suffix == '.tar':
                # Tar bundles hold the acoustic model; replace any prior copy.
                dest = Path(venv, 'model', self.LanguageManager.activeLanguageAndCountryCode.lower())
                if dest.exists():
                    shutil.rmtree(dest)
                tar = tarfile.open(str(download))
                tar.extractall(str(dest))
                download.unlink()

        self.logInfo('Downloaded and installed')
        return True

    def decodeStream(self, session: DialogSession) -> Optional[ASRResult]:
        """Capture audio from the session recorder, decode it, and return the result (or None)."""
        super().decodeStream(session)

        result = None
        with Stopwatch() as processingTime:
            with Recorder(self._timeout) as recorder:
                self.ASRManager.addRecorder(session.siteId, recorder)
                self._decoder.start_utt()
                inSpeech = False
                for chunk in recorder:
                    if self._timeout.isSet():
                        break

                    self._decoder.process_raw(chunk, False, False)
                    # A True->False transition of get_in_speech() marks the
                    # end of the utterance.
                    if self._decoder.get_in_speech() != inSpeech:
                        inSpeech = self._decoder.get_in_speech()
                        if not inSpeech:
                            self._decoder.end_utt()
                            result = self._decoder.hyp() if self._decoder.hyp() else None
                            break

                self.end(recorder, session)

        return ASRResult(
            text=result.hypstr.strip(),
            session=session,
            likelihood=self._decoder.hyp().prob,
            processingTime=processingTime.time
        ) if result else None
import os
from os import path
from pocketsphinx import pocketsphinx
from pocketsphinx import Decoder
import speech_recognition as sr
from time import sleep

# Custom model directory holding the acoustic model, LM and dictionary.
MODELDIR = "BIOMEC_DICTIONARY"

config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'acoustic-model'))
config.set_string('-lm', path.join(MODELDIR, '4177.lm'))
config.set_string('-dict', path.join(MODELDIR, '4177.dict'))
# Silence pocketsphinx logging.
config.set_string("-logfn", os.devnull)
decoder = Decoder(config)

# Recognized control words, searched for as substrings of the hypothesis.
commands = ['DOWN', 'GO', 'LEFT', 'RIGHT', 'STOP', 'UP']


def getCommand(phrase, commands=commands):
    """Return the first command found as a substring of `phrase`, else None."""
    for i in range(len(commands)):
        if phrase.find(commands[i]) != -1:
            return (commands[i])


r = sr.Recognizer()
r.energy_threshold = 1000  # minimum audio energy to consider for recording
r.pause_threshold = 0.25  # seconds of non-speaking audio before a phrase is considered complete
r.phrase_threshold = 0.15  # minimum seconds of speaking audio before we consider it a phrase
r.non_speaking_duration = 0.25  # seconds of non-speaking audio to keep on both sides

# NOTE(review): the body of this `with` block is truncated in this chunk.
with sr.Microphone() as source:
def recognition_worker(audio_file, queue, event, max_no_speech=120, debug=False,
                       hmm='/usr/local/share/pocketsphinx/model/en-us/en-us',
                       lm='/usr/local/share/pocketsphinx/model/en-us/en-us.lm.bin',
                       cmudict='/usr/local/share/pocketsphinx/model/en-us/cmudict-en-us.dict'):
    '''
    Read audio from `audio_file` and feed it to pocketsphinx. Put recognized
    text in `queue`. Shut down if `event` is set. If no speech is detected
    for `max_no_speech` seconds, set `event` and quit.
    '''
    from pocketsphinx import Decoder

    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', cmudict)
    if not debug:
        # Suppress pocketsphinx's verbose logging unless debugging.
        config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    in_speech_bf = True
    no_speech_timer = None
    now_in_speech = False

    decoder.start_utt()
    try:
        with open(audio_file, 'rb') as f:
            # Skip the WAV header before streaming raw PCM to the decoder.
            # NOTE(review): a canonical RIFF/WAVE header is 44 bytes, not 40
            # — confirm against the actual producer of `audio_file`.
            f.read(40)  # read RIFF header
            # TODO: Probably should sanity check the audio format...
            while not event.is_set():
                buf = f.read(1024)
                if buf:
                    decoder.process_raw(buf, False, False)
                    now_in_speech = decoder.get_in_speech()
                    if debug and now_in_speech:
                        print('Found speech', file=sys.stderr)
                    # React only to speech/silence transitions.
                    if now_in_speech != in_speech_bf:
                        in_speech_bf = now_in_speech
                        if not in_speech_bf:
                            if debug:
                                print('Processing speech', file=sys.stderr)
                            # No speech, but there was speech before, so, process.
                            decoder.end_utt()
                            try:
                                speech = decoder.hyp().hypstr
                                if speech != '':
                                    if debug:
                                        print('Speech: ' + speech, file=sys.stderr)
                                    queue.put_nowait(speech)
                            except AttributeError:
                                # hyp() returned None — nothing recognized.
                                pass
                            decoder.start_utt()
                        else:
                            # Got some speech, reset timer.
                            no_speech_timer = None
                else:
                    if debug:
                        print('No audio', file=sys.stderr)
                    # Wait a bit...
                    event.wait(0.1)
                # Track how long we have gone without speech; give up and
                # signal shutdown after max_no_speech seconds.
                if not now_in_speech:
                    if no_speech_timer is None:
                        no_speech_timer = datetime.datetime.now()
                    elif (datetime.datetime.now() - no_speech_timer).total_seconds() > max_no_speech:
                        if debug:
                            print('No speech, timing out', file=sys.stderr)
                        event.set()
    except KeyboardInterrupt:
        pass
def __init__(self):
    """Build a pocketsphinx Decoder from the module-level model paths."""
    cfg = Decoder.default_config()
    for option, value in (('-hmm', SPHINX_HMM),
                          ('-lm', SPHINX_LM),
                          ('-dict', SPHINX_DICT)):
        cfg.set_string(option, value)
    self.decoder = Decoder(cfg)
'''
Created on Dec 29, 2013

@author: Mindaugas Greibus
'''
from os import path

from pocketsphinx import Decoder
#from sphinxbase import *

#MODELDIR = "../models"
MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"

# Create a decoder with certain model (Lithuanian acoustic model + JSGF grammar).
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'hmm/lt.cd_cont_200/'))
config.set_string('-jsgf', path.join(MODELDIR, 'lm/robotas.gram'))
config.set_string('-dict', path.join(MODELDIR, 'dict/robotas.dict'))
decoder = Decoder(config)

# FIX: the original passed an open file object to decode_raw() without ever
# closing it (resource leak); a context manager closes it deterministically.
with open(path.join(MODELDIR, '../test/audio/varyk_pirmyn-16k.wav'), 'rb') as audio_file:
    decoder.decode_raw(audio_file)

# Retrieve hypothesis.
hypothesis = decoder.hyp()
print ('Best hypothesis: ', hypothesis.best_score, hypothesis.hypstr)
#print 'Best hypothesis segments: ', [seg.word for seg in decoder.seg()]
from os import path
import pyaudio

# Audio capture parameters: 16 kHz, mono, 16-bit samples, 4096-frame buffers.
CHUNK = 4096
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

MODELDIR = "../models"
#MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"

# Create a decoder with certain model
# NOTE(review): `Decoder` is not imported in this snippet — presumably it is
# provided by an earlier `from pocketsphinx import Decoder`; verify.
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'hmm/lt.cd_cont_200/'))
config.set_string('-jsgf', path.join(MODELDIR, 'lm/robotas.gram'))
config.set_string('-dict', path.join(MODELDIR, 'dict/robotas.dict'))
decoder = Decoder(config)

# Open a blocking input stream on the default capture device.
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                frames_per_buffer=CHUNK)

#Indicate listening for next utterance
print ("READY....")
def decodepassive():
    """
    Decode the pre-recorded passive-listening WAV file and return the
    recognized text.

    Relies on module-level names: `hmdir`, `lmdir`, `dictd` (model paths)
    and `passivewav` (path to the WAV file).

    :return: best hypothesis string (first element of get_hyp()).
    """
    speechRec = Decoder(hmm=hmdir, lm=lmdir, dict=dictd)
    # FIX: the original rebound the name `passivewav` to the open file
    # object, shadowing the module-level path variable inside the block.
    with open(passivewav, 'rb') as wav_file:
        speechRec.decode_raw(wav_file)
        result = speechRec.get_hyp()
    return(result[0])
from os import environ, path

from sphinxbase import Config
from pocketsphinx import Decoder

MODELDIR = "pocketsphinx/model"
DATADIR = "pocketsphinx/test/data"

# Create a decoder with certain model
config = Decoder.default_config()
config.set_string('-hmm', path.join(MODELDIR, 'en-us/en-us'))
config.set_string('-lm', path.join(MODELDIR, 'en-us/en-us.lm.bin'))
config.set_string('-dict', path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))

# FIX: the original constructed the Decoder twice back-to-back and threw the
# first instance away; one instance is enough.
decoder = Decoder(config)

# Decode streaming data.
decoder.start_utt()
# FIX: use a context manager so the audio file is closed (it leaked before).
with open(path.join(DATADIR, 'goforward.raw'), 'rb') as stream:
    while True:
        buf = stream.read(1024)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            break
decoder.end_utt()
print ('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])
class stt:
    """Speech-to-text wrapper around a pocketsphinx Decoder.

    Runs in one of two modes: keyphrase spotting (when `keyphrase` is
    given, using the keyphrase dictionary/LM) or full language-model
    decoding (using the corpus dictionary/LM).
    """

    def __init__(self, profile, hmm=None, dict=None, lm=None, kws_threshold=None, keyphrase=None):
        # NOTE(review): parameter name `dict` shadows the builtin; kept
        # as-is for backward compatibility with existing callers.
        self.profile = profile
        # Pick default dictionary/LM files depending on the mode.
        if keyphrase:
            if not dict:
                dict = fullpath('config/keyphrase.dic')
            if not lm:
                lm = fullpath('config/keyphrase.lm')
        else:
            if not dict:
                dict = fullpath('config/corpus.dic')
            if not lm:
                lm = fullpath('config/corpus.lm')
        if not hmm:
            hmm = 'share/pocketsphinx/model/en-us/en-us'
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(SPHINX_ROOT, hmm))
        config.set_string('-dict', dict)
        config.set_string('-lm', lm)
        config.set_string('-logfn', fullpath('config/sphinx.log'))
        if keyphrase:
            config.set_string('-keyphrase', keyphrase)
        if kws_threshold:
            config.set_float('-kws_threshold', kws_threshold)
        self.decoder = Decoder(config)
        # Default transcription entry point; transcribe_linux uses the
        # older get_hyp()-based API instead.
        self.transcribe = self.transcribe_darwin
        self.hyp = None

    def transcribe_darwin(self, wav):
        """Decode one whole utterance from raw audio; return the hypothesis text or None."""
        self.decoder.start_utt()
        self.decoder.process_raw(wav, False, False)
        self.decoder.end_utt()
        self.hyp = self.decoder.hyp()
        if self.hyp:
            return self.hyp.hypstr

    def get_prob(self):
        # Return the probability of the last hypothesis (Python 2 print of
        # the raw score for debugging); None when nothing was decoded yet.
        if self.hyp:
            print self.hyp.best_score
            return self.hyp.prob

    def transcribe_linux(self, wav):
        """Like transcribe_darwin, but via the older get_hyp() tuple API."""
        self.decoder.start_utt()
        self.decoder.process_raw(wav, False, False)
        self.decoder.end_utt()
        result = self.decoder.get_hyp()
        if result:
            return result[0]
def run( self ):
    """Continuously capture microphone audio and decode it until
    `self._terminate` is set; each completed utterance is handed to
    `self.decode(...)`."""
    conf = Decoder.default_config()
    conf.set_string('-hmm', self.config.hmmPS)
    conf.set_string('-lm', self.config.lmPS)
    conf.set_string('-dict', self.config.dictPS)
    # Apply the speaker-adaptation (MLLR) transform if one exists on disk.
    if os.path.isfile(self.config.mllrPS):
        conf.set_string('-mllr', self.config.mllrPS)
    decoder = Decoder(conf)

    # 16 kHz mono 16-bit capture in 1024-frame buffers.
    p = pyaudio.PyAudio()
    stream = p.open( format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024 )
    stream.start_stream()
    self.samplewith = p.get_sample_size(pyaudio.paInt16)
    in_speech_bf = True
    # NOTE(review): start_utt('') with an utterance-id argument is the old
    # pocketsphinx API; newer releases take no argument.
    decoder.start_utt('')
    while not self._terminate:
        buf = stream.read(1024)
        if buf:
            if self.save:
                self.liSave.append(buf)
                self.numSave += 1
                if self.numSave > self.maxSave:
                    # guard against leaving the microphone switched on
                    self.activeSave(self.fichWAV)
            decoder.process_raw(buf, False, False)
            # A True->False transition of get_in_speech() marks the end of
            # an utterance: fetch the hypothesis and restart.
            if decoder.get_in_speech() != in_speech_bf:
                in_speech_bf = decoder.get_in_speech()
                if not in_speech_bf:
                    decoder.end_utt()
                    try:
                        if decoder.hyp().hypstr != '':
                            self.decode(decoder.hyp().hypstr)
                    except AttributeError:
                        # hyp() returned None — nothing recognized.
                        pass
                    decoder.start_utt('')
        else:
            break
    decoder.end_utt()
def record(THRESHOLD=None):
    """Record up to LISTEN_TIME seconds of microphone audio, stopping early
    once the rolling loudness average drops below 80% of THRESHOLD; write
    the audio to livewav.wav and return the decoded hypothesis text.

    Python 2 module: uses print statements and integer division.
    Relies on module-level RATE, CHUNK, fetchThreshold, getScore and the
    model paths hmdir/lmdir/dictd.
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    LISTEN_TIME = 4
    WAVE_OUTPUT_FILENAME = "livewav.wav"
    p = pyaudio.PyAudio()
    if THRESHOLD == None:
        THRESHOLD = fetchThreshold()
    print THRESHOLD
    stream = p.open(format=FORMAT, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNK)
    print "* recording"
    frames = []
    # Seed the 30-sample rolling window slightly above threshold so the
    # early-stop check does not fire before any audio is scored.
    lastN = [THRESHOLD * 1.2 for i in range(30)]
    for i in range(0, RATE / CHUNK * LISTEN_TIME):
        data = stream.read(CHUNK)
        frames.append(data)
        score = getScore(data)
        lastN.pop(0)
        lastN.append(score)
        average = sum(lastN) / float(len(lastN))
        #print average,THRESHOLD * 0.8
        # Stop once the speaker has gone quiet (rolling average below 80%
        # of the silence threshold).
        if average < THRESHOLD * 0.8:
            break
    print "* done recording"
    #stream.stop_stream()
    stream.close()
    p.terminate()

    # write data to WAVE file
    data = ''.join(frames)
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()

    # Decode the freshly written file with the old pocketsphinx 0.x API.
    sysdir = os.getcwd()
    wavfile = sysdir+"/livewav.wav"
    #decoded=decodepassive()
    speechRec = Decoder(hmm=hmdir, lm=lmdir, dict=dictd)
    with open(wavfile, 'rb') as wavFile:
        speechRec.decode_raw(wavFile)
        result = speechRec.get_hyp()
    return(result[0])
class ContinuousPocketsphinx(object):
    '''
    Continuous Lithuanian speech recognizer: captures microphone audio,
    detects speech segments via the decoder's VAD, and drives an
    Artificialintelligence dialogue with the recognized text.
    '''
    # Audio capture parameters: 16 kHz, mono, 16-bit, 4096-frame chunks.
    CHUNK = 4096
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    #MODELDIR = "../models"
    MODELDIR = "/home/mgreibus/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    decoder = None
    stream = None
    config = None
    ai = None

    def __init__(self):
        '''
        Constructor: build the AI, the decoder (with the "code" grammar)
        and open the microphone stream.
        '''
        print ("[__init__]+++")
        # Create a decoder with certain model
        self.ai = Artificialintelligence()
        self.config = self.createConfig("code");
        self.decoder = Decoder(self.config);
        print ("[__init__] created decoder")
        #self.updateGrammar(self.decoder, "confirmation");
        print ("[__init__]---")
        p = pyaudio.PyAudio()
        self.stream = p.open(format=self.FORMAT, channels=self.CHANNELS, rate=self.RATE, input=True, frames_per_buffer=self.CHUNK)
        #Indicate listening for next utterance
        print ("READY....")

    def updateGrammar(self, pDecoder, pGramma):
        ''' Update decoder language model from fsg file '''
        print ("[updateGrammar]+++" + pGramma)
        logmath = pDecoder.get_logmath();
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma+'.fsg'), logmath, 7.5)
        #pDecoder.readfile(os.path.join("../resource/", pGramma+'.fsg'), logmath)
        pDecoder.set_fsg("default",fsg);
        pDecoder.set_search("default");
        print ("[updateGrammar]---")

    def createConfig(self, pGramma):
        ''' Build a decoder config: acoustic model, FSG grammar, dictionary. '''
        print ("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/liepa.cd_semi_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma+'.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print ("[createConfig]---")
        return config;

    def speak(self, text):
        ''' Speak `text` aloud via an external TTS binary (blocking). '''
        print("Speak: ", text)
        if text is not None:
            aProcess = subprocess.Popen(['/home/mgreibus/bin/tark-win-lt', text], stderr=subprocess.STDOUT)
            out = aProcess.communicate()[0];
            time.sleep (0.100)
            print("ended Speak: ", out)

    def said(self, aiContext, text):
        ''' Forward recognized `text` to the AI, speak its response, and
        recurse while the AI reports a non-interactive step. '''
        print ("[said]+++", text)
        aiContext = self.ai.said(text, aiContext)
        print ('AI response: ', aiContext.state, aiContext.response)
        self.speak(aiContext.response)
        if aiContext.interactiveStep is False :
            self.said(aiContext, text);
        print ("[said]---")
        return aiContext

    def recognized(self, pStream, pDecoder, aiContext):
        ''' Handle the end of a speech segment: pause capture, fetch the
        hypothesis, feed it to the AI, possibly switch grammar, then resume
        listening. Re-speaks the last response if the state is stuck >10s. '''
        print ("[recognized]+++")
        pStream.stop_stream()
        pDecoder.end_utt()
        # Retrieve hypothesis.
        hypothesis = pDecoder.hyp()
        if hypothesis is not None:
            print ('Best hypothesis: ', hypothesis.uttid, hypothesis.best_score, hypothesis.hypstr)
            self.said(aiContext, hypothesis.hypstr.decode('utf-8'))
            # Switch grammar when the new dialogue state defines one.
            if aiContext.state in aiContext.GRAM:
                self.updateGrammar(pDecoder, aiContext.GRAM[aiContext.state]);
        elif (time.time() - aiContext.stateStarted) > 10:
            self.speak(aiContext.response)
            aiContext.stateStarted = time.time()
        print ("Time: ", (time.time() - aiContext.stateStarted))
        print("AI response ", aiContext.response)
        time.sleep (0.100)
        #Indicate listening for next utterance
        pStream.start_stream()
        pDecoder.start_utt(None)
        print ("READY....")
        print ("[recognized]---")
        return aiContext

    def run(self):
        ''' Executor: main capture/decode loop driven by VAD transitions;
        exits when the dialogue reaches STATE_THANKS. '''
        print("* start recording")
        self.decoder.start_utt(None)
        cur_vad_state = 0
        aiContext = self.ai.createContext();
        self.said(aiContext, None);
        while True:
            data = self.stream.read(self.CHUNK)
            time.sleep (0.100)
            #frames.append(data)
            self.decoder.process_raw(data, False, False)
            vad_state = self.decoder.get_vad_state()
            if vad_state and not cur_vad_state:
                #silence -> speech transition,
                #let user know that we heard
                print("Listening...\n")
            if not vad_state and cur_vad_state:
                #speech -> silence transition,
                #time to start new utterance
                aiContext = self.recognized(self.stream,self.decoder, aiContext);
                if aiContext.state == aiContext.STATE_THANKS:
                    break
            cur_vad_state = vad_state
class SphinxWrapper(object):
    '''
    Thin wrapper around a pocketsphinx Decoder.

    Feed audio with `process_raw(...)`; it also refreshes the VAD state
    (whether voice is present in the signal). Before feeding, call
    `startListening()` so the decoder expects a new utterance. When VAD
    reports that the speech segment ended, call `stopListening()`; only
    then may `calculateHypothesis()` be queried for what was said.
    '''
    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"
    decoder = None
    config = None
    previousVadState = 0
    currentVadState = 0

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where sphinx decoder is initialized or grammar updated
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            # FIX: the original called self.updateGrammar(self.decoder,
            # pGramma) — an extra positional argument that raises TypeError,
            # since updateGrammar() only accepts the grammar name.
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create configuration with acoustic model path, grammar and dictionary
        '''
        print ("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma+'.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print ("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update decoder language model from fsg file
        '''
        print ("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma+'.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print ("[updateGrammar]---")

    def startListening(self):
        """ Instruct decoder that new utterace should be expected """
        self.decoder.start_utt(None)

    def stopListening(self):
        """ Instruct decoder that new utterace should is not expected
        any more
        """
        self.decoder.end_utt()

    def process_raw(self, data):
        """ Feed decoder with raw audio data. After data is updating
        refresh VAD state
        """
        #print("process_raw...\n")
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()
        #print("process_raw", self.currentVadState and True, self.previousVadState and True)

    def calculateHypothesis(self):
        # Latest decoder hypothesis (None if nothing was recognized).
        return self.decoder.hyp()

    def calculateVadState(self):
        # FIX: the original returned the bound method object
        # (`self.decoder.get_vad_state`) instead of calling it.
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        '''
        silence -> speech transition,
        '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        '''
        speech -> silence transition,
        '''
        return not self.currentVadState and self.previousVadState
class VoiceService(object):
    """Prompt-and-listen voice service: plays prompts (TTS or audio URL),
    then listens for a keyphrase with pocketsphinx, tracking prompt state
    and a bounded history of recognized phrases."""

    # Capture settings; audio_device=None selects the default input device.
    audio_device = None
    buffer_size = 2048
    sampling_rate = 16000

    def __init__(self):
        config = get_decoder_config()
        self.decoder = Decoder(config)
        self.speech = pyttsx3.init()

        self.audio = sphinxbase.Ad(self.audio_device, self.sampling_rate)
        self.buffer = bytearray(self.buffer_size)

        # Remember the decoder's initial search mode.
        self.default_search = self.decoder.get_search()
        self.in_speech = False

        self.max_history = 100
        self.phrases = []

        self.prompts = {}
        self.next_prompt_id = 1

        self.current_prompt = None
        self.prompt_queue = queue.Queue()

    def create_prompt(self, message=None, message_url=None, search="enable", timeout=15):
        """
        Create a new prompt and add it to the queue.

        Currently, only one type of prompt is supported. We play a message,
        then wait for someone to say a specific word (the search word)
        within the alloted amount of time.

        The status of the prompt can be retrieved by calling get_prompt
        with the appropriate id.

        timeout: prompt timeout in seconds, expected to be either None or
        numeric.
        """
        if timeout is not None:
            # Be forgiving of caller who may have passed timeout as a string.
            timeout = float(timeout)

        prompt = {
            "created_time": time.time(),
            "detected": False,
            "detected_time": None,
            "id": self.get_next_prompt_id(),
            "message": message,
            "message_url": message_url,
            "search": search,
            "search_started": False,
            "search_started_time": None,
            "played": False,
            "played_time": None,
            "timeout": timeout,
            "timed_out": False
        }
        self.prompts[str(prompt['id'])] = prompt
        self.prompt_queue.put(prompt)
        return prompt

    def get_next_prompt_id(self):
        """ Get a unique ID for a prompt. """
        tmp = self.next_prompt_id
        self.next_prompt_id += 1
        return tmp

    def get_phrases(self):
        """ Get the history of detected phrases. """
        return self.phrases

    def get_prompt(self, prompt_id):
        """ Get information about a prompt. """
        return self.prompts[str(prompt_id)]

    def get_status(self):
        """ Get the system status. """
        status = {
            "current_prompt": self.current_prompt,
            "in_speech": self.decoder.get_in_speech(),
            "queue_length": self.prompt_queue.qsize(),
            "search": self.decoder.get_search()
        }
        return status

    def play_prompt(self, prompt):
        """Play a prompt: an audio URL via mplayer, else spoken text via pyttsx3."""
        prompt['played_time'] = time.time()
        if prompt.get("message_url", None) is not None:
            cmd = ["mplayer", "-ao", "pulse", prompt['message_url']]
            subprocess.call(cmd)
        elif prompt.get("message", None) is not None:
            self.speech.say(prompt['message'])
            self.speech.runAndWait()
        prompt['played'] = True

    def process_hypothesis(self, hypothesis):
        """Record a recognized phrase, keeping at most max_history entries."""
        print("SPEECH {}".format(hypothesis.hypstr))
        phrase = {
            "search": self.decoder.get_search(),
            "time": time.time(),
            "text": hypothesis.hypstr
        }
        self.phrases.append(phrase)
        # Trim history from the front so only the newest entries remain.
        del self.phrases[:-self.max_history]

    def run_next_prompt(self):
        """Dequeue (or synthesize) the next prompt, play it, and start its search."""
        # Fall back to an open-ended "paradrop" keyphrase search when idle.
        if self.prompt_queue.empty():
            self.create_prompt(None, search="paradrop", timeout=None)
        self.current_prompt = self.prompt_queue.get_nowait()

        self.decoder.set_search(self.current_prompt['search'])

        # Pause recording while the prompt plays so the decoder does not
        # hear our own output.
        self.audio.stop_recording()
        self.play_prompt(self.current_prompt)
        self.audio.start_recording()

        self.current_prompt['search_started_time'] = time.time()
        self.current_prompt['search_started'] = True

    def detect_timeout(self):
        """ Check if the current prompt has timed out. """
        if self.current_prompt is None:
            # No active prompt to timeout.
            return False
        if self.decoder.get_in_speech():
            # Defer timeout if decoder reports that speech is in progress. A
            # person may be speaking the target phrase currently.
            return False
        if self.current_prompt['timeout'] is None:
            # If timeout is None, then only timeout when there is another item
            # in the queue.
            return not self.prompt_queue.empty()
        else:
            diff = time.time() - self.current_prompt['search_started_time']
            return diff >= self.current_prompt['timeout']

    def run(self):
        """Main loop: register keyphrase searches, then capture and decode forever."""
        self.decoder.set_keyphrase("activate", "activate")
        self.decoder.set_keyphrase("allow", "allow")
        self.decoder.set_keyphrase("enable", "enable")
        self.decoder.set_keyphrase("paradrop", "para drop")

        self.audio.start_recording()
        while True:
            if self.current_prompt is None:
                self.run_next_prompt()
                self.decoder.start_utt()

            self.audio.readinto(self.buffer)
            self.decoder.process_raw(self.buffer, False, False)

            # A True->False transition of in_speech marks the end of an utterance.
            if self.in_speech and not self.decoder.get_in_speech():
                self.decoder.end_utt()
                hypothesis = self.decoder.hyp()
                if hypothesis is not None:
                    self.process_hypothesis(hypothesis)
                    self.current_prompt['detected'] = True
                    self.current_prompt['detected_time'] = time.time()
                    self.current_prompt = None
                else:
                    self.decoder.start_utt()

            if self.detect_timeout():
                self.decoder.end_utt()
                self.current_prompt['timed_out'] = True
                self.current_prompt = None

            self.in_speech = self.decoder.get_in_speech()
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # ==================================================================== from os import environ, path from itertools import izip from sphinxbase import Config from pocketsphinx import Decoder #some dumb test for checking during developent MODELDIR = "../../../model" config = Decoder.default_config() intval = 256 floatval = 8000.0 stringval = "~/pocketsphinx" boolval = True # Check values that was previously set. s = config.get_float("-samprate") print "Float: ",floatval ," ", s config.set_float("-samprate", floatval) s = config.get_float("-samprate") print "Float: ",floatval ," ", s s = config.get_int("-nfft") print "Int:",intval, " ", s
import os
from pocketsphinx import DefaultConfig, Decoder, get_model_path, get_data_path

model_path = get_model_path()
data_path = 'C:/project/accent/accent-poc/src/Audio/'

# Point the decoder at the bundled US-English acoustic model, LM and dictionary.
config = DefaultConfig()
for option, filename in (('-hmm', 'en-us'),
                         ('-lm', 'en-us.lm.bin'),
                         ('-dict', 'cmudict-en-us.dict')):
    config.set_string(option, os.path.join(model_path, filename))
decoder = Decoder(config)

# Stream the audio file through the decoder in fixed-size chunks.
chunk = bytearray(1024)
audio_path = os.path.join(data_path, 'speaker2.wav')
with open(audio_path, 'rb') as audio:
    decoder.start_utt()
    while audio.readinto(chunk):
        decoder.process_raw(chunk, False, False)
    decoder.end_utt()

print('Best hypothesis segments:', [seg.word for seg in decoder.seg()])
p.terminate() # write data to WAVE file data = ''.join(all) wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb') wf.setnchannels(CHANNELS) wf.setsampwidth(p.get_sample_size(FORMAT)) wf.setframerate(RATE) wf.writeframes(data) wf.close() if __name__ == "__main__": hmdir = "/usr/share/pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k" lmdir = "/usr/share/pocketsphinx/model/lm/en_US/hub4.5000.DMP" dictd = "/usr/share/pocketsphinx/model/lm/en_US/cmu07a.dic" record() wavfile = "/home/shridhar/pocketsphinxtest/livewav.wav" speechRec = Decoder(hmm=hmdir, lm=lmdir, dict=dictd) wavFile = file(wavfile, 'rb') speechRec.decode_raw(wavFile) result = speechRec.get_hyp() print "Recognised text from the converted video file" print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%" print result[0] print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
'''
Created on Dec 29, 2013

@author: Mindaugas Greibus
'''
import sys, os

from pocketsphinx import Decoder

MODELDIR = "../models"

# Create a decoder with certain model (Lithuanian acoustic model + JSGF grammar).
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, 'hmm/lt.cd_cont_200/'))
config.set_string('-jsgf', os.path.join(MODELDIR, 'lm/robotas.gram'))
config.set_string('-dict', os.path.join(MODELDIR, 'dict/robotas.dict'))
decoder = Decoder(config)

# FIX: the original passed an open file object to decode_raw() without ever
# closing it (resource leak); a context manager closes it deterministically.
with open(os.path.join(MODELDIR, '../test/audio/varyk_pirmyn-16k.wav'), 'rb') as audio_file:
    decoder.decode_raw(audio_file)

# Retrieve hypothesis.
hypothesis = decoder.hyp()
print('Best hypothesis: ', hypothesis.best_score, hypothesis.hypstr)
print('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])
'''
Created on Dec 29, 2013

@author: Mindaugas Greibus
'''
import sys, os

from pocketsphinx import Decoder

MODELDIR = "../models"

# Create a decoder with certain model (Lithuanian acoustic model + JSGF grammar).
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, 'hmm/lt.cd_cont_200/'))
config.set_string('-jsgf', os.path.join(MODELDIR, 'lm/robotas.gram'))
config.set_string('-dict', os.path.join(MODELDIR, 'dict/robotas.dict'))
decoder = Decoder(config)

# FIX: the original passed an open file object to decode_raw() without ever
# closing it (resource leak); a context manager closes it deterministically.
with open(os.path.join(MODELDIR, '../test/audio/varyk_pirmyn-16k.wav'), 'rb') as audio_file:
    decoder.decode_raw(audio_file)

# Retrieve hypothesis.
hypothesis = decoder.hyp()
print ('Best hypothesis: ', hypothesis.best_score, hypothesis.hypstr)
print ('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])