def process_file(self, audiofile):
    """Decode an audio file and return the recognized text.

    :param audiofile: path to a raw PCM audio file, read in 1 KiB chunks.
    :return: best hypothesis string, or None when nothing was recognized.
    """
    # Use a distinct name for the handle so the path argument is not shadowed.
    with open(audiofile, 'rb') as audio_stream:
        decoder = Decoder(self.config)
        decoder.start_utt()
        while True:
            buf = audio_stream.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)
        decoder.end_utt()
    hyp = decoder.hyp()
    print("Hyp:", hyp)
    if hyp is not None:
        print("Hyp Score", (hyp.prob, hyp.best_score))
        average_score = 0
        seg_count = 0
        for seg in decoder.seg():
            # <sil> segments carry no lexical content; exclude from the average.
            if seg.word != "<sil>":
                seg_count += 1
                average_score += seg.ascore
            print(seg.word, seg.ascore, seg.lscore)
        print("hyp:", hyp.hypstr)
        if seg_count:  # guard: silence-only results would divide by zero
            print(average_score / seg_count)
        return hyp.hypstr
    return None
def build_decoder(self):
    """Create a Decoder wired to the bundled en-us model files under MODEL_DIR."""
    config = Decoder.default_config()
    # Option name -> model file, relative to MODEL_DIR.
    model_options = (
        ("-dict", "cmudict-en-us.dict"),
        ("-fdict", "en-us/noisedict"),
        ("-featparams", "en-us/feat.params"),
        ("-tmat", "en-us/transition_matrices"),
        ("-hmm", "en-us"),
        ("-lm", "en-us.lm.bin"),
        ("-mdef", "en-us/mdef"),
        ("-mean", "en-us/means"),
        ("-sendump", "en-us/sendump"),
        ("-var", "en-us/variances"),
    )
    for option, relative in model_options:
        config.set_string(option, os.path.join(self.MODEL_DIR, relative))
    # Route pocketsphinx's logging to the platform's null device.
    null_path = "NUL" if sys.platform == "win32" else "/dev/null"
    config.set_string("-logfn", null_path)
    return Decoder(config)
def create_decoder():
    """Build a Russian-language pocketsphinx Decoder (8 kHz semi-continuous model)."""
    base = os.path.join(root(), 'pocketsphinx', 'zero_ru_cont_8k_v3')
    hmm = os.path.join(base, 'zero_ru.cd_semi_4000')  # - mobile?
    # hmm = os.path.join(base, 'zero_ru.cd_cont_4000')
    # hmm = os.path.join(base, 'zero_ru.cd_ptm_4000') - mobile?
    # Renamed from `dict` so the builtin is not shadowed.
    dict_path = os.path.join(base, 'ru.dic.orig')
    # dict_path = os.path.join(base, 'ru.dic')
    lm = os.path.join(base, 'ru.lm.orig')
    # kws = os.path.join(base, 'ru.dic.orig.keywords')
    kws = os.path.join(base, 'keywords.mini')  # kept for the commented '-kws' option below
    decoder_config = Decoder.default_config()
    decoder_config.set_string('-hmm', hmm)
    decoder_config.set_string("-lm", lm)
    # decoder_config.set_string('-keyphrase', 'алекса')
    # decoder_config.set_float('-kws_threshold', 1e-20)
    # decoder_config.set_string('-kws', kws)
    decoder_config.set_string('-dict', dict_path)
    decoder_config.set_boolean('-remove_noise', False)
    decoder_config.set_float('-samprate', 8000)
    decoder_config.set_string('-logfn', os.devnull)  # silence pocketsphinx logging
    decoder = Decoder(decoder_config)
    return decoder
def audio2phoneme(audio_file):
    """Decode *audio_file* and return [(phoneme, start_sec, end_sec), ...].

    Relies on a module-level pocketsphinx `config`.
    :param audio_file: path to a WAV file (header read for duration, body fed raw).
    """
    wave_read = wave.open(audio_file, 'rb')
    length = wave_read.getnframes() / wave_read.getframerate()
    wave_read.close()
    # Decode streaming data.
    decoder = Decoder(config)
    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        n = f.readinto(buf)
        while n:
            # BUG FIX: only feed the bytes actually read; the final chunk may
            # be shorter than the buffer and would otherwise replay stale data.
            decoder.process_raw(buf[:n], False, False)
            n = f.readinto(buf)
        decoder.end_utt()
    nframes = decoder.n_frames()
    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            offset = seg.start_frame  # normalize so the first segment starts at 0
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        # Convert frame indices to seconds proportionally to the clip duration.
        phonemes.append((seg.word,
                         start_frame / nframes * length,
                         end_frame / nframes * length))
    return phonemes
def setup(self):
    """Configure a pocketsphinx keyword-spotting detector for the trigger phrase."""
    ps_config = Decoder.default_config()
    model_path = get_model_path()
    # US-English acoustic model and pronunciation dictionary.
    ps_config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    ps_config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    # Key phrase to spot, with its detection threshold.
    ps_config.set_string('-keyphrase', self._tconfig['phrase'])
    ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))
    # Outside debug mode, discard pocketsphinx's very verbose log output.
    if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
        if platform.system() == 'Windows':
            null_path = 'nul'
        else:
            null_path = '/dev/null'
        ps_config.set_string('-logfn', null_path)
    # Audio is processed chunk by chunk; on detection the caller restarts the search.
    self._detector = Decoder(ps_config)
def speech_recog(self, model):
    """Decode speech from self.stream_in using the given language model.

    :param model: basename of the .lm/.dict files under MODELDIR.
    :return: accumulated recognized text (returned as soon as it exceeds one char).
    """
    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    # Pruning knobs traded toward speed (downsampling, fewer densities/words).
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    #config.set_string('-kws', MODELDIR + model + '.txt')
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)
    decoder.start_utt()
    recog_text = ''
    with self.stream_in as stream:
        audio_generator = stream.generator()
        for content in audio_generator:
            decoder.process_raw(content, False, False)
            # NOTE(review): hypstr is re-appended on every chunk while the
            # utterance is open, so partial hypotheses can repeat inside
            # recog_text — confirm this accumulation is intended.
            if decoder.hyp() and decoder.hyp().hypstr != '':
                recog_text += decoder.hyp().hypstr
            if len(recog_text) > 1:
                # Enough text collected: close the utterance and return early.
                decoder.end_utt()
                logging.info("recog text: %s", recog_text)
                return recog_text
    return recog_text
def setup(self):
    """Configure pocketsphinx for multi-hotword spotting from a keyphrase list."""
    ps_config = Decoder.default_config()
    model_path = get_model_path()
    # Acoustic model and dictionary are selected by the trigger configuration.
    ps_config.set_string('-hmm', os.path.join(model_path, self._tconfig['language']))
    ps_config.set_string('-dict', os.path.join(model_path, self._tconfig['dictionary']))
    # Specify recognition key phrase
    #ps_config.set_string('-keyphrase', self._tconfig['phrase'])
    #ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))
    ### Multiple Hotwords
    #ps_config.set_string('-inmic', 'yes')
    ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')
    # Hide the VERY verbose logging information when not in debug.
    if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
        ps_config.set_string('-logfn', '/dev/null')
    # Audio is processed chunk by chunk; on detection the caller restarts the search.
    self._decoder = Decoder(ps_config)
def create_decoder():
    """Build a Decoder whose paths are overridable via POCKETSPHINX_* env vars."""
    path = os.path.dirname(os.path.realpath(__file__))
    pocketsphinx_data = os.getenv('POCKETSPHINX_DATA', os.path.join(path, 'pocketsphinx'))
    hmm = os.getenv('POCKETSPHINX_HMM', os.path.join(pocketsphinx_data, 'tdt_sc_8k'))
    # Renamed from `dict` so the builtin is not shadowed.
    dict_path = os.getenv('POCKETSPHINX_DIC', os.path.join(pocketsphinx_data, 'keywords.dic'))
    kws = os.getenv('POCKETSPHINX_KWS', os.path.join(pocketsphinx_data, 'keywords.kws'))  # for the commented '-kws' option
    lm = os.getenv('POCKETSPHINX_LM', os.path.join(pocketsphinx_data, 'keywords.lm'))
    log = os.getenv('POCKETSPHINX_LOG', os.path.join(pocketsphinx_data, 'log'))
    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dict_path)
    # config.set_string('-kws', kws)
    # config.set_int('-samprate', SAMPLE_RATE)  # uncomment if rate is not 16000. use config.set_float() on ubuntu
    config.set_int('-nfft', 512)
    #config.set_float('-vad_threshold', 2.7)
    config.set_string('-logfn', log)
    return Decoder(config)
def __init__(self, phrase, threshold, device_index=0):
    """Keyword-spotting listener.

    :param phrase: key phrase to detect.
    :param threshold: kws detection threshold (coerced to float).
    :param device_index: PyAudio input device index.
    """
    self._decoder = None
    self._pa = None
    self._device_no = device_index
    self._phrase = phrase
    self._threshold = float(threshold)
    # PocketSphinx configuration
    logging.info('Phrase: %s Threshold: %s', phrase, threshold)  # lazy formatting
    ps_config = Decoder.default_config()
    # Set recognition model to US
    ps_config.set_string('-hmm', os.path.join(get_model_path_keyword(), 'en-us'))
    ps_config.set_string('-dict', os.path.join(get_model_path_keyword(), 'cmudict-en-us.dict'))
    # Specify recognition key phrase
    ps_config.set_string('-keyphrase', self._phrase)
    ps_config.set_float('-kws_threshold', self._threshold)
    # os.devnull also works on Windows, unlike a hard-coded '/dev/null'.
    ps_config.set_string('-logfn', os.devnull)
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._decoder = Decoder(ps_config)
    self._pa = pyaudio.PyAudio()
def __init__(self, keyword, sensitivity):
    """Keyword spotter.

    :param keyword: keyword to detect ('snowboy' is respelled 'snow boy' so
        each token exists in the CMU dictionary).
    :param sensitivity: larger values detect more eagerly (threshold 10**-sensitivity).
    """
    config = Decoder.default_config()
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
    config.set_string('-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
    config.set_string('-keyphrase', keyword if keyword != 'snowboy' else 'snow boy')
    config.set_float('-kws_threshold', 10 ** -sensitivity)
    self._decoder = Decoder(config)
    self._decoder.start_utt()
def start_listening(self):
    ''' Starts streaming. Pauses until self.resume has been called '''
    config = Decoder.default_config()
    config.set_string('-hmm', path.join(self.model_dir, self.hmm))
    config.set_string('-lm', path.join(self.model_dir, self.lm))
    config.set_string('-dict', path.join(self.model_dir, self.dictionary))
    config.set_string('-logfn', self.logfn)
    # This takes a while
    decoder = Decoder(config)
    p = pyaudio.PyAudio()
    print(self.input_source_index)
    # 16 kHz mono 16-bit capture in 1024-frame chunks.
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                    input_device_index=self.input_source_index, frames_per_buffer=1024)
    stream.start_stream()
    in_speech_bf = False
    decoder.start_utt()
    self.wait_to_resume_lock.acquire()
    while self.is_running:
        # NOTE(review): busy-wait while paused burns a full CPU core —
        # consider a condition variable or a short sleep.
        while self.paused:
            pass
        buf = stream.read(1024, exception_on_overflow=False)
        if buf:
            decoder.process_raw(buf, False, False)
            # Only act on speech/silence transitions, not every chunk.
            if decoder.get_in_speech() != in_speech_bf:
                in_speech_bf = decoder.get_in_speech()
                if not in_speech_bf:
                    # Speech just ended: finalize and collect the hypothesis.
                    decoder.end_utt()
                    # if self.wait_to_resume:
                    #     stream.stop_stream()
                    phrase = decoder.hyp().hypstr
                    if phrase != "":
                        self.all_speech_data.append(phrase)
                    # if self.wait_to_resume:
                    #     # print("waiting")
                    #     self.wait_to_resume_lock.acquire()
                    #     # print("resuming")
                    # if self.wait_to_resume:
                    #     stream.start_stream()
                    # Open a fresh utterance for the next stretch of speech.
                    decoder.start_utt()
        else:
            break
    decoder.end_utt()
def __init__(self):
    """Set up a full-vocabulary en-us decoder.

    Based on https://github.com/cmusphinx/pocketsphinx-python/blob/master/example.py
    """
    config = Decoder.default_config()
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
    config.set_string('-lm', os.path.join(get_model_path(), 'en-us.lm.bin'))
    config.set_string('-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
    self._decoder = Decoder(config)
def configure(self):
    """Configure the wake-word decoder for self.key_phrase in self.lang."""
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang, 'hmm'))
    config.set_string('-dict', os.path.join(BASEDIR, 'model', self.lang, 'mycroft-en-us.dict'))
    config.set_string('-keyphrase', self.key_phrase)
    # Same value as float('1e-45'); drop the needless string round-trip.
    config.set_float('-kws_threshold', 1e-45)
    config.set_float('-samprate', self.sample_rate)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    self.decoder = Decoder(config)
def main():
    """Continuously decode microphone audio, printing each finished utterance."""
    abspath = os.path.dirname(os.path.abspath(__file__))
    abspath = os.path.join(abspath, '..')
    model_dir = os.path.join(abspath, 'model')
    hmm = os.path.join(model_dir, HMM)
    lm = os.path.join(model_dir, LM)
    dic = os.path.join(model_dir, DIC)
    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dic)
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    decoder = Decoder(config)
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                    frames_per_buffer=BUFFER)
    stream.start_stream()
    in_speech_bf = True
    decoder.start_utt()
    while True:
        buf = stream.read(BUFFER)
        if buf:
            decoder.process_raw(buf, False, False)
            if decoder.get_in_speech():
                # Progress dots while speech is being captured.
                sys.stdout.write('.')
                sys.stdout.flush()
            if decoder.get_in_speech() == in_speech_bf:
                continue  # no speech/silence transition
            in_speech_bf = decoder.get_in_speech()
            if in_speech_bf:
                continue  # speech just started; keep collecting
            # Speech just ended: finalize, report, and restart the utterance.
            decoder.end_utt()
            try:
                if decoder.hyp().hypstr != '':
                    print('You said:', decoder.hyp().hypstr)
            except AttributeError:
                pass  # hyp() is None when nothing was recognized
            decoder.start_utt()
        else:
            break
    decoder.end_utt()
    # BUG FIX: hyp() may be None here, which would raise while reporting;
    # also fixed the "occured" typo.
    hyp = decoder.hyp()
    if hyp is not None:
        print('An Error occurred:', hyp.hypstr)
def process_stream(self, stream, callback):
    """Continuously process an audio stream, invoking *callback* with each
    recognized hypothesis string."""
    decoder = Decoder(self.config)
    decoder.start_utt()
    while True:
        decoder.process_raw(stream.read(1024), False, False)
        partial = decoder.hyp()
        if partial is not None and partial.hypstr is not None:
            # Close the utterance to finalize the hypothesis, report it,
            # then open a fresh utterance for the next stretch of audio.
            decoder.end_utt()
            callback(decoder.hyp().hypstr)
            decoder.start_utt()
def setup_pocketsphinx(self) -> None:
    """Initialize the Spanish pocketsphinx decoder and open the first utterance."""
    self.logger.info("Setting up PocketSphinx.")
    self.MODELDIR = "resources/model"
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(self.MODELDIR, 'es-es'))
    config.set_string('-lm', os.path.join(self.MODELDIR, 'es-es.lm'))
    config.set_string('-dict', os.path.join(self.MODELDIR, 'es.dict'))
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    self.decoder = Decoder(config)
    self.prev_buf_is_speech = False
    self.decoder.start_utt()
    self.logger.info("Done setting up PocketSphinx.")
def init():
    """Initialize the global wake-word decoder, PyAudio handle and recognizer."""
    global decoder, p, r
    # Decoder options come entirely from application settings.
    cfg = Decoder.default_config()
    for option, value in (('-logfn', settings.POCKETSPHINX_LOG),
                          ('-hmm', settings.ACOUSTIC_MODEL),
                          ('-lm', settings.LANGUAGE_MODEL),
                          ('-dict', settings.POCKET_DICT)):
        cfg.set_string(option, value)
    decoder = Decoder(cfg)
    # Keyword-spotting search for the wake word.
    decoder.set_keyphrase('wakeup', settings.WAKE_UP_WORD)
    decoder.set_search('wakeup')
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
def start_recognizer(self):
    """Build the language-model decoder and begin consuming audio-ready messages."""
    cfg = Decoder.default_config()
    rospy.loginfo("Done initializing pocketsphinx")
    # Decoder options were provided as node parameters.
    cfg.set_string('-dict', self.dict)
    cfg.set_string('-lm', self.class_lm)
    cfg.set_string('-hmm', self.hmm)
    self.decoder = Decoder(cfg)
    # Open the first utterance so audio can be fed immediately.
    self.decoder.start_utt()
    rospy.loginfo("Decoder started successfully")
    # Hand control to ROS: process_audio fires whenever audio is ready.
    rospy.Subscriber("recognizer/audio_ready", Bool, self.process_audio)
    rospy.spin()
def create_decoder():
    """Build a keyword-spotting Decoder; paths overridable via POCKETSPHINX_* env vars."""
    from pocketsphinx.pocketsphinx import Decoder
    path = os.path.dirname(os.path.realpath(__file__))
    pocketsphinx_data = os.getenv('POCKETSPHINX_DATA', os.path.join(path, 'pocketsphinx-data'))
    hmm = os.getenv('POCKETSPHINX_HMM', os.path.join(pocketsphinx_data, 'hmm'))
    # Renamed from `dict` so the builtin is not shadowed.
    dict_path = os.getenv('POCKETSPHINX_DIC', os.path.join(pocketsphinx_data, 'dictionary.txt'))
    kws = os.getenv('POCKETSPHINX_KWS', os.path.join(pocketsphinx_data, 'keywords.txt'))
    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-dict', dict_path)
    config.set_string('-kws', kws)
    # config.set_int('-samprate', SAMPLE_RATE)  # uncomment if rate is not 16000. use config.set_float() on ubuntu
    config.set_int('-nfft', 512)
    config.set_float('-vad_threshold', 2.7)
    config.set_string('-logfn', os.devnull)
    return Decoder(config)
def __init__(self, keyword, sensitivity):
    """
    Constructor.

    :param keyword: keyword to be detected.
    :param sensitivity: detection sensitivity (kws threshold).
    """
    # Set the configuration.
    config = Decoder.default_config()
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    # Set recognition model to US
    config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
    config.set_string('-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
    config.set_string('-keyphrase', keyword)
    config.set_float('-kws_threshold', sensitivity)
    self._decoder = Decoder(config)
    self._decoder.start_utt()
def load_models(pipe, config, models):
    """Internal worker method to load the language model.

    Note:
        Some languages take a long time to load; English is by far the fastest.

    Arguments:
        pipe (:obj: socket): The response pipe to send to the parent process
        config: pocketsphinx config object to populate
        models (dict): The language and nltk models developed by the parent process

    Returns:
        (Decoder, nltk model) tuple, or None when a model is invalid.
    """
    lang_model = models["language_model"]
    nltk_model = models["nltk_model"]
    # Both models must validate before configuring the decoder.
    models_ok = False not in [
        lang_model.is_valid_model(), nltk_model.is_valid_model()
    ]
    if not models_ok:
        l_log.error("The language model %s is invalid!" % str(lang_model.name))
        send_error(pipe, "Failed loading language model!")
        return
    # Load the model configurations into pocketsphinx.
    config.set_string('-hmm', str(lang_model.hmm))
    config.set_string('-lm', str(lang_model.lm))
    config.set_string('-dict', str(lang_model.dict))
    decoder = Decoder(config)
    send_json(pipe, {"success": True})  # tell the client we succeeded
    l_log.debug("Set the language model to %s" % str(lang_model.name))
    return decoder, nltk_model  # the new decoder and nltk model
def __init__(self, device_index=0, model_path=None):
    """JSGF-grammar listener.

    :param device_index: PyAudio input device index.
    :param model_path: directory holding the HMM/DIC/GRAMMAR files.
    """
    self._decoder = None
    self._pa = None
    self._device_no = device_index
    self._model_path = model_path
    # PocketSphinx configuration
    logging.info('Grammar file:%s', os.path.join(model_path, self.GRAMMAR))  # lazy formatting
    ps_config = Decoder.default_config()
    # Set recognition model to ...
    ps_config.set_string('-hmm', os.path.join(model_path, self.HMM))
    ps_config.set_string('-dict', os.path.join(model_path, self.DIC))
    ps_config.set_string('-jsgf', os.path.join(model_path, self.GRAMMAR))
    ps_config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    self._decoder = Decoder(ps_config)
    self._pa = pyaudio.PyAudio()
def __init__(self, engine_type, keyword, sensitivity):
    """Initializer.

    :param engine_type: type of the engine.
    :param keyword: keyword being used for detection.
    :param sensitivity: sensitivity passed to the engine.
    """
    super().__init__(engine_type, keyword, sensitivity)
    # Set the configuration.
    config = Decoder.default_config()
    config.set_string('-logfn', os.devnull)  # portable null device (was '/dev/null')
    # Set recognition model to US
    config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
    config.set_string('-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))
    config.set_string('-keyphrase', keyword)
    config.set_float('-kws_threshold', sensitivity)
    self._decoder = Decoder(config)
    self._decoder.start_utt()
def __init__(self):
    """Set up a Russian JSGF-grammar decoder plus speech_recognition I/O objects."""
    self.MODELDIR = 'speech/'
    self.wav_name = 'media/temp.wav'
    self.raw_name = 'media/temp.raw'
    config = Decoder.default_config()
    config.set_string('-hmm', self.MODELDIR + 'ru_ru/')
    config.set_string('-dict', self.MODELDIR + 'ru.dic')
    self.decoder = Decoder(config)
    # Compile the JSGF rule into an FSG search and make it the active search.
    grammar = Jsgf(self.MODELDIR + 'gr.gram')
    grammar_rule = grammar.get_rule('gr.rule')
    fsg = grammar.build_fsg(grammar_rule, self.decoder.get_logmath(), 7.5)
    fsg.writefile('gr.fsg')  # NOTE: written into the current working directory
    self.decoder.set_fsg('gr', fsg)
    self.decoder.set_search('gr')
    self.rec = Recognizer()
    self.mic = Microphone()
def recognize_phonemes(segments_path, phonemes_result_path):
    """Run phoneme recognition over an audio file and dump the serialized results.

    :param segments_path: raw audio file to decode.
    :param phonemes_result_path: output path for the DecoderOutputSchema dump.
    """
    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-hmm', join(model_dir, decoder_hmm))
    config.set_string('-allphone', join(model_dir, decoder_allphone))
    config.set_string('-dict', join(model_dir, decoder_dict))
    config.set_float('-lw', decoder_lw)
    config.set_float('-pip', decoder_pip)
    config.set_float('-beam', decoder_beam)
    config.set_float('-pbeam', decoder_pbeam)
    config.set_boolean('-mmap', decoder_mmap)
    hyps = []
    segs = []
    # BUG FIX: this is a plain function, so `self` does not exist here;
    # the original `self.decoder = ...` raised NameError. Use a local decoder.
    decoder = Decoder(config)
    with open(segments_path, 'rb') as stream:
        in_speech_buffer = False
        decoder.start_utt()
        while True:
            buf = stream.read(decoder_stream_buf_size)
            if buf:
                decoder.process_raw(buf, False, False)
                if decoder.get_in_speech() != in_speech_buffer:
                    in_speech_buffer = decoder.get_in_speech()
                    if not in_speech_buffer:
                        # Speech just ended: collect results and restart.
                        hyp_result, segment = _get_decoder_results()
                        segs += segment
                        hyps.append(hyp_result)
                        decoder.start_utt()
            else:
                # EOF: flush any utterance still in progress.
                if in_speech_buffer:
                    hyp_result, segment = _get_decoder_results()
                    segs += segment
                    hyps.append(hyp_result)
                break
    phonemes_dict = dict(hypotheses=hyps, segment_info=segs)
    phonemes_result = DecoderOutputSchema().dumps(phonemes_dict)
    with open(phonemes_result_path, 'w') as f:
        f.write(phonemes_result)
def __init__(self, gui):
    """Decoder thread: acoustic model, lexicon and JSGF grammar from settings."""
    QThread.__init__(self, gui)
    # Fall back to the default en-us acoustic model when none is configured.
    custom_model_dir = settings.sphinx_acoustic_model_dir
    acoustic_model_directory = (path.join(get_model_path(), 'en-us')
                                if custom_model_dir == '' else custom_model_dir)
    config = Decoder.default_config()
    config.set_string('-hmm', acoustic_model_directory)  # acoustic model
    config.set_string('-dict', settings.prepared_lexicon_file)  # lexicon pronunciation
    config.set_string('-jsgf', settings.prepared_grammar_file)  # language model from grammar
    config.set_string('-logfn',
                      settings.outputFileName(sphinx_decoder_log_file_base_name, ext='log'))
    self.listen = False
    self.decoder = Decoder(config)
    self.audio = None
    self.device = None
def speech_recog(self, model):
    """Blocking recognition from self.stream_in using the given model.

    :param model: basename of the .lm/.dict files under MODELDIR.
    :return: first non-empty recognized text.
    """
    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    #config.set_string('-kws', MODELDIR + model + '.txt')
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)
    decoder.start_utt()
    tstamp = time.time()
    recog_text = ''
    while len(recog_text) < 1:
        try:
            buf = self.stream_in.read(CHUNK_SIZE)
            logging.info("actual voice")
            decoder.process_raw(buf, False, False)
            if decoder.hyp().hypstr != '':
                recog_text += decoder.hyp().hypstr
                print("text: " + decoder.hyp().hypstr)  # py3 print (was a py2 print statement)
                tstamp = time.time()
        except IOError as ex:
            # BUG FIX: `ex[1]` only works on Python 2; use .args for 2/3 compat.
            if ex.args[1] != pyaudio.paInputOverflowed:
                raise
            buf = '\x00' * CHUNK_SIZE  # white noise
            logging.info("white noise")
        except AttributeError:
            pass  # decoder.hyp() returns None until a hypothesis exists
    decoder.end_utt()
    logging.info("recog text: %s", recog_text)  # lazy formatting
    return recog_text
def init():
    """Initialize the global passive-listening decoder, PyAudio handle and recognizer."""
    global decoder, p, r
    model_dir = settings.MODEL_DIR
    # Decoder configured from the application's en-US model files.
    cfg = Decoder.default_config()
    cfg.set_string('-logfn', os.path.join(settings.LOGS_DIR, 'passive-listen.log'))
    cfg.set_string('-hmm', os.path.join(model_dir, 'en-US/acoustic-model'))
    cfg.set_string('-lm', os.path.join(model_dir, 'en-US/language-model.lm.bin'))
    cfg.set_string('-dict', os.path.join(model_dir, 'en-US/pronounciation-dictionary.dict'))
    decoder = Decoder(cfg)
    # Keyword-spotting search for the wake word.
    decoder.set_keyphrase('wakeup', settings.WAKE_UP_WORD)
    decoder.set_search('wakeup')
    p = pyaudio.PyAudio()
    r = speech_recognition.Recognizer()
def _create_decoder(config) -> Decoder:
    """Build a Decoder from a hotword/LM config object.

    Search priority: explicit language model > single keyphrase > generated
    keyword file (one hotword per line with the shared threshold).

    :param config: object exposing hmm, dict, remove_noise, sample_rate,
        lm, hotwords and threshold attributes.
    """
    decoder_config = Decoder.default_config()
    decoder_config.set_string('-hmm', config.hmm)
    decoder_config.set_string('-dict', config.dict)
    decoder_config.set_boolean('-remove_noise', config.remove_noise)
    decoder_config.set_float('-samprate', config.sample_rate)
    decoder_config.set_string('-logfn', devnull)
    if config.lm is not None:
        decoder_config.set_string("-lm", config.lm)
    elif len(config.hotwords) == 1:
        decoder_config.set_string('-keyphrase', config.hotwords[0])
        decoder_config.set_float('-kws_threshold', config.threshold)
    else:
        import os
        from tempfile import gettempdir
        path = os.path.join(gettempdir(), 'keywords.mini')
        # BUG FIX: the handle was flushed but never closed (leak); `with`
        # guarantees the data is on disk before pocketsphinx reads it.
        with open(path, 'w') as f:
            f.writelines(['{} /{}/\n'.format(w, config.threshold)
                          for w in config.hotwords])
        decoder_config.set_string('-kws', path)
    return Decoder(decoder_config)
def get_decoder():
    """Create a keyword-spotting Decoder using the bundled respeaker model files."""
    from pocketsphinx.pocketsphinx import Decoder
    script_dir = os.path.dirname(os.path.realpath(__file__))
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(script_dir, 'model/hmm/en'))
    config.set_string('-dict', os.path.join(script_dir, 'model/respeaker.dic'))
    config.set_string('-kws', os.path.join(script_dir, 'model/keywords.txt'))
    # config.set_string('-keyphrase', 'respeaker')
    # config.set_float('-kws_threshold', 1e-43)
    config.set_int('-samprate', SAMPLE_RATE)
    config.set_int('-nfft', 2048)
    config.set_string('-logfn', os.devnull)
    try:
        return Decoder(config)
    except Exception as e:
        print(
            "Maybe replace config.set_int('-samprate', SAMPLE_RATE) with config.set_float('-samprate', SAMPLE_RATE)"
        )
        raise e