def parse(self, raw_audio, grammar=None):
    """Decode a complete raw audio buffer with PocketSphinx.

    Arguments:
        raw_audio -- raw PCM audio bytes (presumably 16-bit mono 16 kHz,
            matching the bundled en-us model -- TODO confirm with callers)
        grammar -- optional JSGF grammar file name, resolved relative to
            this module's directory; when None, the bundled language model
            is used instead

    Returns:
        The best hypothesis string, or None if nothing was recognized.

    Raises:
        IOError -- if the requested grammar file does not exist.
    """
    root = os.path.dirname(os.path.normpath(__file__))
    model_dir = os.path.join(root, "pocketsphinx")
    hmm = os.path.join(model_dir, "en-us")
    lm = os.path.join(model_dir, "en-us.lm")
    # renamed from `dict` to avoid shadowing the builtin
    dictionary = os.path.join(model_dir, "cmudict.dict")
    config = pocketsphinx.Decoder.default_config()
    config.set_string("-hmm", hmm)
    config.set_string("-dict", dictionary)
    config.set_string("-logfn", os.devnull)  # keep the console quiet
    if grammar is not None:
        grammar_file = os.path.join(root, grammar)
        if not os.path.isfile(grammar_file):
            raise IOError("missing grammar file")
        # BUG FIX: pass the resolved path, not the bare relative name, so
        # the decoder finds the grammar regardless of the process CWD.
        config.set_string("-jsgf", grammar_file)
    else:
        config.set_string("-lm", lm)
    decoder = pocketsphinx.Decoder(config)
    decoder.start_utt()
    decoder.process_raw(raw_audio, False, True)  # one full utterance
    decoder.end_utt()
    hypothesis = decoder.hyp()
    if hypothesis is None:
        return None
    return hypothesis.hypstr
def recognition(keyphrase_function, key_phrase):
    """Listen on the microphone until *key_phrase* is detected, then call
    ``keyphrase_function`` and return True.

    On a Raspberry Pi the Google Voice HAT microphone (device index 1) is
    used; elsewhere the default input device.

    Arguments:
        keyphrase_function -- zero-argument callback invoked once, after the
            audio stream has been released
        key_phrase -- phrase for the pocketsphinx keyword search

    Returns:
        True when the phrase was detected; None if the stream ended first.
    """
    # Start a pyaudio instance
    p = pyaudio.PyAudio()
    if os.uname()[1] == 'raspberrypi':
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                        input=True, input_device_index=1,
                        frames_per_buffer=1024)
        print('stream started on rpi')
    else:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                        input=True, frames_per_buffer=1024)
        print('stream started')
    stream.start_stream()
    modeldir = "data/files/sphinx/models"
    # Use the mobile voice model (en-us-ptm) for performance constrained systems
    config = pocketsphinx.Decoder.default_config()
    if os.uname()[1] == 'raspberrypi':
        config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us-ptm'))
    else:
        config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us'))
    config.set_string('-dict', os.path.join(modeldir, 'en-us/cmudict-en-us.dict'))
    config.set_string('-keyphrase', key_phrase)
    config.set_string('-logfn', 'data/files/sphinx.log')
    config.set_float('-kws_threshold', 1)
    # Process audio chunk by chunk; on keyword detection release the mic,
    # run the callback and return.
    decoder = pocketsphinx.Decoder(config)
    decoder.start_utt()
    while True:
        buf = stream.read(1024, exception_on_overflow=False)
        if not buf:
            break
        decoder.process_raw(buf, False, False)
        if decoder.hyp() is not None:
            decoder.end_utt()
            stream.stop_stream()
            stream.close()
            # BUG FIX: release PortAudio resources (PyAudio instance leaked)
            p.terminate()
            keyphrase_function()
            return True
    # BUG FIX: also clean up when the stream ends without a detection
    stream.stop_stream()
    stream.close()
    p.terminate()
def stream_decode(self, raw):
    """Decode an audio file chunk by chunk, collecting word segments.

    Populates ``self.segs`` with ``[word, start_sec, end_sec]`` entries
    (frame counts divided by 100, i.e. assuming 100 frames/sec -- the
    standard Sphinx frame rate).

    Arguments:
        raw -- path to a .raw file, or a .wav file which is converted via
            ``self.convert2raw`` first.
    """
    if raw.endswith('.wav'):
        converted = raw.replace('.wav', '.raw')
        if not os.path.isfile(converted):
            logging.debug('converting %s to raw', raw)
            self.convert2raw(raw)
        # BUG FIX: always switch to the .raw path for a .wav input; the
        # original only did so when the conversion ran, so an existing
        # .raw file caused the .wav (header included) to be decoded.
        raw = converted
    self.segs = []
    decoder = ps.Decoder(self.config)
    in_speech_bf = False
    decoder.start_utt()
    # BUG FIX: close the input file when done (handle was leaked).
    with open(raw, 'rb') as stream:
        while True:
            buf = stream.read(1024)
            if buf:
                decoder.process_raw(buf, False, False)
                if decoder.get_in_speech() != in_speech_bf:
                    in_speech_bf = decoder.get_in_speech()
                    if not in_speech_bf:
                        # Speech segment just ended: harvest its words and
                        # restart the utterance for the next segment.
                        decoder.end_utt()
                        for seg in decoder.seg():
                            self.segs.append([
                                seg.word,
                                seg.start_frame / 100,
                                seg.end_frame / 100
                            ])
                        decoder.start_utt()
            else:
                # the last buffered stream
                for seg in decoder.seg():
                    self.segs.append(
                        [seg.word, seg.start_frame / 100, seg.end_frame / 100])
                break
    decoder.end_utt()
def __init__(self, language="en-US"):
    """Locate the bundled PocketSphinx model data for *language* and build
    a ``Decoder`` on ``self.decoder``.

    Keyword Arguments:
        language {str} -- IETF language tag naming a subdirectory of
            ``pocketsphinx-data`` next to this module (default: {"en-US"})

    Raises:
        RequestError -- if any required model directory or file is missing.
    """
    assert isinstance(language, str), "``language`` must be a string"
    language_directory = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "pocketsphinx-data", language)
    # BUG FIX: the error messages below were built with backslash line
    # continuations *inside* the string literals, leaving a stray "\ " and
    # run-on whitespace in the text shown to the user.
    if not os.path.isdir(language_directory):
        raise RequestError(
            "missing PocketSphinx language data directory: "
            "\"{0}\"".format(language_directory))
    acoustic_parameters_directory = os.path.join(language_directory,
                                                 "acoustic-model")
    if not os.path.isdir(acoustic_parameters_directory):
        raise RequestError(
            "missing PocketSphinx language model parameters directory: "
            "\"{0}\"".format(acoustic_parameters_directory))
    language_model_file = os.path.join(language_directory,
                                       "language-model.lm.bin")
    if not os.path.isfile(language_model_file):
        raise RequestError(
            "missing PocketSphinx language model file: "
            "\"{0}\"".format(language_model_file))
    phoneme_dictionary_file = os.path.join(
        language_directory, "pronounciation-dictionary.dict")
    if not os.path.isfile(phoneme_dictionary_file):
        raise RequestError(
            "missing PocketSphinx phoneme dictionary file: "
            "\"{0}\"".format(phoneme_dictionary_file))
    # create decoder object
    config = pocketsphinx.Decoder.default_config()
    # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    # disable logging (logging causes unwanted output in terminal)
    config.set_string("-logfn", os.devnull)
    self.decoder = pocketsphinx.Decoder(config)
def recognize_sphinx(self, audio_data, language = "en-US", show_all = False):
    """Transcribe ``audio_data`` (an ``AudioData`` instance) with CMU Sphinx.

    ``language`` is an IETF tag such as ``"en-US"``; only ``en-US`` ships by
    default (see ``reference/pocketsphinx.rst`` for installing others).

    Returns the best transcription string, or the raw
    ``pocketsphinx.pocketsphinx.Decoder`` when ``show_all`` is true.

    Raises ``speech_recognition.UnknownValueError`` when nothing was
    recognized, and ``speech_recognition.RequestError`` for installation or
    model-data problems.
    """
    assert isinstance(audio_data, AudioData), "`audio_data` must be audio data"
    assert isinstance(language, str), "`language` must be a string"

    # Pull in PocketSphinx lazily so a missing install surfaces as a
    # RequestError rather than an import-time failure.
    try:
        from pocketsphinx import pocketsphinx
        from sphinxbase import sphinxbase
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")

    # Resolve and validate the bundled model files for this language.
    lang_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "pocketsphinx-data", language)
    if not os.path.isdir(lang_dir):
        raise RequestError("missing PocketSphinx language data directory: \"{0}\"".format(lang_dir))
    hmm_dir = os.path.join(lang_dir, "acoustic-model")
    if not os.path.isdir(hmm_dir):
        raise RequestError("missing PocketSphinx language model parameters directory: \"{0}\"".format(hmm_dir))
    lm_file = os.path.join(lang_dir, "language-model.lm.bin")
    if not os.path.isfile(lm_file):
        raise RequestError("missing PocketSphinx language model file: \"{0}\"".format(lm_file))
    dict_file = os.path.join(lang_dir, "pronounciation-dictionary.dict")
    if not os.path.isfile(dict_file):
        raise RequestError("missing PocketSphinx phoneme dictionary file: \"{0}\"".format(dict_file))

    # Build the decoder: acoustic model, language model, dictionary, and
    # logging routed to the null device to keep the terminal clean.
    config = pocketsphinx.Decoder.default_config()
    config.set_string("-hmm", hmm_dir)
    config.set_string("-lm", lm_file)
    config.set_string("-dict", dict_file)
    config.set_string("-logfn", os.devnull)
    decoder = pocketsphinx.Decoder(config)

    # The bundled models expect 16-bit mono 16 kHz little-endian audio.
    raw_data = audio_data.get_raw_data(convert_rate = 16000, convert_width = 2)

    # Feed the whole buffer as one utterance with recognition enabled.
    decoder.start_utt()
    decoder.process_raw(raw_data, False, True)
    decoder.end_utt()

    if show_all:
        return decoder
    hyp = decoder.hyp()
    if hyp is None:
        raise UnknownValueError()  # no transcriptions available
    return hyp.hypstr
def get_decoder(libdir=None, modeldir=None, lang='en-us'):
    """Create a decoder with the requested language model.

    Keyword Arguments:
        libdir -- optional library root; ``<libdir>/model`` is used as the
            model directory when *modeldir* is not supplied
        modeldir -- directory containing the model files; falls back to the
            module-level MODELDIR
        lang -- model basename, e.g. 'en-us'

    Returns:
        A configured ``ps.Decoder``.
    """
    modeldir = modeldir or (os.path.join(libdir, 'model') if libdir else MODELDIR)
    config = ps.Decoder.default_config()
    config.set_string('-hmm', os.path.join(modeldir, lang))
    config.set_string('-lm', os.path.join(modeldir, lang + '.lm.bin'))
    config.set_string('-dict',
                      os.path.join(modeldir, 'cmudict-' + lang + '.dict'))
    # CLEANUP: removed the leftover debug print(config) and the unused
    # reassignment of libdir from the model directory.
    return ps.Decoder(config)
def start_keyphrase_recognition(keyphrase_function, key_phrase):
    """Listen forever on the microphone for a specific key phrase.

    Each time the phrase is spotted, ``keyphrase_function`` is invoked on
    the current thread, so detection is paused until it returns.

    :param keyphrase_function: function that is called when the phrase is recognized
    :param key_phrase: a string for the key phrase
    """
    modeldir = "files/sphinx/models"
    # Keyword-spotting decoder configuration; the mobile (en-us-ptm)
    # acoustic model keeps CPU usage low on constrained hardware.
    config = pocketsphinx.Decoder.default_config()
    config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us-ptm'))
    config.set_string('-dict',
                      os.path.join(modeldir, 'en-us/cmudict-en-us.dict'))
    config.set_string('-keyphrase', key_phrase)
    config.set_string('-logfn', 'files/sphinx.log')
    config.set_float('-kws_threshold', 1)
    # 16 kHz mono microphone input, read in 1024-sample chunks.
    audio = pyaudio.PyAudio()
    mic = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                     input=True, frames_per_buffer=1024)
    mic.start_stream()
    decoder = pocketsphinx.Decoder(config)
    decoder.start_utt()
    while True:
        chunk = mic.read(1024)
        if not chunk:
            break
        decoder.process_raw(chunk, False, False)
        # A non-None hypothesis means the key phrase was heard.
        if decoder.hyp() is not None:
            keyphrase_function()
            # Reset the decoder and resume the keyword search.
            decoder.end_utt()
            decoder.start_utt()
def build_decoder(self):
    """Build a keyword-spotting decoder on ``self.decoder`` and start it."""
    model_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              'files/sphinx')
    file_name = 'alexa'
    dict_file = '{}.dict'.format(file_name)
    # NOTE(review): lm_file is built but never applied to the config --
    # confirm whether a '-lm' setting was intended.
    lm_file = '{}.lm'.format(file_name)
    ps_config = pocketsphinx.Decoder.default_config()
    ps_config.set_string('-hmm', os.path.join(model_path, 'acoustic-model'))
    ps_config.set_string('-dict', os.path.join(model_path, dict_file))
    # NOTE(review): keyphrase "JARVIS" paired with the 'alexa' dictionary
    # looks inconsistent -- confirm which wake word is intended.
    ps_config.set_string('-keyphrase', "JARVIS")
    ps_config.set_float('-kws_threshold', 1e-10)
    # BUG FIX: '/dev/null' is POSIX-only; os.devnull is portable and matches
    # the logging setup used elsewhere in this codebase.
    ps_config.set_string('-logfn', os.devnull)
    self.decoder = pocketsphinx.Decoder(ps_config)
    self.decoder.start_utt()
def start_recognizer(self):
    """Build the pocketsphinx decoder from the node's parameters, load the
    JSGF grammar rule if one is configured, and start the utterance."""
    config = pocketsphinx.Decoder.default_config()
    # Setup decoder config
    if self._hmm is None:
        rospy.logwarn("Using default hmm")
    else:
        rospy.loginfo("hmm file: %s", self._hmm)
        config.set_string('-hmm', self._hmm)
    if self._dict is None:
        rospy.logwarn("Using default dict")
    else:
        rospy.loginfo("Dict file: %s", self._dict)
        config.set_string('-dict', self._dict)
    config.set_string('-dither', "no")
    # NOTE(review): this raises if self._hmm is None even though the hmm
    # branch above tolerates it -- confirm a hmm path is always provided.
    config.set_string('-featparams', os.path.join(self._hmm, "feat.params"))
    # config.set_boolean('-bestpath', True)
    if self._kws is not None:
        config.set_string('-kws', self._kws)
    # BUG FIX: values were read from self.keyphrase / self.kws_threshold,
    # attributes that are never set; the guards (and the rest of this
    # class) use the underscore-prefixed names.
    if self._keyphrase is not None:
        config.set_string('-keyphrase', self._keyphrase)
    if self._threshold is not None:
        config.set_float('-kws_threshold', self._threshold)
    # Set required configuration for decoder
    self.decoder = pocketsphinx.Decoder(config)
    if self._gram and self._grammar and self._rule:
        jsgf = Jsgf(self._gram)
        self.get_list_of_public_jsgf_rules(self._gram)
        if isinstance(self._rule, str):
            rule = jsgf.get_rule(self._grammar + '.' + self._rule)
            # rospy.logwarn(rule)
            if rule is not None:
                rospy.logwarn("LOAD: Rule <" + self._rule +
                              "> from grammar <" + self._grammar + ">")
                fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
                fsg.writefile(self._gram + '.fsg')
                self.decoder.set_fsg(self._gram, fsg)
                self.decoder.set_search(self._gram)
                # Start processing input audio
                self.decoder.start_utt()
                rospy.loginfo("Decoder is successfully started")
            else:
                rospy.logwarn("LOAD FAILED: No rule <" + self._rule +
                              "> in grammar <" + self._grammar + ">")
        else:
            rospy.logerr("LOAD FAILED: rule name must be string")
            self._rule = None
def pocketsphinx_init(self):
    """Initialize pocketsphinx stt engine"""
    base_dir = self.args["pocketsphinx_dir"]
    hmm_dir = os.path.join(base_dir, "acoustic-model")
    lm_file = os.path.join(base_dir, "language-model.lm.bin")
    dict_file = os.path.join(base_dir, "pronounciation-dictionary.dict")
    config = pocketsphinx.Decoder.default_config()
    # Acoustic model, language model, dictionary; log output discarded.
    for flag, value in (("-hmm", hmm_dir),
                        ("-lm", lm_file),
                        ("-dict", dict_file),
                        ("-logfn", os.devnull)):
        config.set_string(flag, value)
    self.sphinx_decoder = pocketsphinx.Decoder(config)
    self.log("Pocketsphinx init done")
def decodeSpeech(decoder_config, audio_file): # pocketsphinx wav recognition process. Do not modify ! speechRec = ps.Decoder(decoder_config) subprocess.call(record, shell=True) stream = open(audio_file, 'rb') in_speech_bf = True speechRec.start_utt() while True: buf = stream.read(1024) #audio_file2 = file(audio_file,'rb') #audio_file2.seek(44) #if audio_file2: # speechRec.process_raw(audio_file2,False,False) # #speechRec.decode_raw(audio_file2) if buf: speechRec.process_raw(buf,False,False) try: if speechRec.hyp().hypstr != '': print "**************PARTIAL decoding reult:", speechRec.hyp().hypstr except AttributeError: pass if speechRec.get_in_speech(): sys.stdout.write('.') sys.stdout.flush() if speechRec.get_in_speech() != in_speech_bf: in_speech_bf = speechRec.get_in_speech() if not in_speech_bf: speechRec.end_utt() try: if speechRec.hyp().hypstr != '': print 'Stream decoding result:', speechRec.hyp().hypstr except AttributeError: pass speechRec.start_utt() else: break #result = speechRec.hyp().hypstr result = speechRec.hyp() speechRec.end_utt() return result
def __init__(self, language="en-US", language_directory=None,
             acoustic_parameters_directory=None, language_model_file=None,
             phoneme_dictionary_file=None):
    """Locate the PocketSphinx model files for *language* and build a decoder.

    Any of the path arguments may be given explicitly; otherwise they are
    derived from the bundled ``recognizer/model/<language>`` directory.

    Raises:
        RequestError: if a required directory or file cannot be found.
    """
    super(PS_Recognizer, self).__init__()
    language = language.lower()
    if language_directory is None:
        language_directory = join(dirname(dirname(__file__)),
                                  "recognizer/model", language)
    if not isdir(language_directory):
        raise RequestError(
            "missing PocketSphinx language data directory: \"{}\"".format(
                language_directory))
    if acoustic_parameters_directory is None:
        acoustic_parameters_directory = join(language_directory, "hmm")
    if not isdir(acoustic_parameters_directory):
        raise RequestError(
            "missing PocketSphinx language model parameters directory: "
            "\"{}\"".format(acoustic_parameters_directory))
    if language_model_file is None:
        language_model_file = join(language_directory, language + ".lm")
    if not isfile(language_model_file):
        # Fall back to the binary form of the language model.
        language_model_file += ".bin"
        if not isfile(language_model_file):
            raise RequestError(
                "missing PocketSphinx language model file: \"{}\"".format(
                    language_model_file))
    if phoneme_dictionary_file is None:
        phoneme_dictionary_file = join(language_directory,
                                       language + ".dict")
    if not isfile(phoneme_dictionary_file):
        raise RequestError(
            "missing PocketSphinx phoneme dictionary file: \"{}\"".format(
                phoneme_dictionary_file))
    # Build the decoder with logging routed to the null device.
    config = pocketsphinx.Decoder.default_config()
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    config.set_string("-logfn", os.devnull)
    self.decoder = pocketsphinx.Decoder(config)
    self.lang = language
def __init__(self):
    """Load the Rasa interpreter, open the microphone stream, configure a
    pocketsphinx wake-word decoder and start listening."""
    # BUG FIX: super().__init__(self) passed the instance twice (implicitly
    # and explicitly), handing the parent __init__ a spurious argument.
    super().__init__()
    self.interpreter = Interpreter.load(settings.RASA_MODEL_DIR)
    # 16 kHz mono input, 1024-sample chunks.
    self.stream = PyAudio().open(format=paInt16,
                                 channels=1,
                                 rate=16000,
                                 input=True,
                                 frames_per_buffer=1024,
                                 output_device_index=0)
    self.config = pocketsphinx.Decoder.default_config()
    self.config.set_string(
        '-hmm', path.join(settings.SPHINX_MODEL_DIR, 'en-us/en-us'))
    self.config.set_string(
        '-dict',
        path.join(settings.SPHINX_MODEL_DIR, 'en-us/cmudict-en-us.dict'))
    self.config.set_string('-keyphrase', settings.WAKE_PHRASE)
    # NOTE(review): 1e+20 is an extremely insensitive kws_threshold (docs
    # recommend 1e-50..1e-5 style values) -- confirm this is intentional.
    self.config.set_float('-kws_threshold', 1e+20)
    self.config.set_string('-logfn', 'text.log')
    self.decoder = pocketsphinx.Decoder(self.config)
    self.listen_for_wake()
def recognition(keyphrase_function, key_phrase, loop):
    """Listen on the microphone for *key_phrase*; call ``keyphrase_function``
    on every detection.

    Arguments:
        keyphrase_function -- zero-argument callback run on each detection
        key_phrase -- phrase for the pocketsphinx keyword search
        loop -- when truthy, keep listening after a detection; otherwise
            return True after the first one
    """
    modeldir = "data/files/sphinx/models"
    # Use the mobile voice model (en-us-ptm) for performance constrained systems
    config = pocketsphinx.Decoder.default_config()
    if os.uname()[1] == 'raspberrypi':
        config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us-ptm'))
    else:
        config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us'))
    config.set_string('-dict', os.path.join(modeldir, 'en-us/cmudict-en-us.dict'))
    config.set_string('-keyphrase', key_phrase)
    config.set_string('-logfn', 'data/files/sphinx.log')
    config.set_float('-kws_threshold', 1)
    # BUG FIX: `stream` was read below but never created, so this function
    # raised NameError immediately. Open the microphone exactly as the
    # sibling recognition() helper does (voice-hat mic index 1 on the pi).
    p = pyaudio.PyAudio()
    if os.uname()[1] == 'raspberrypi':
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                        input=True, input_device_index=1,
                        frames_per_buffer=1024)
    else:
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                        input=True, frames_per_buffer=1024)
    stream.start_stream()
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    decoder = pocketsphinx.Decoder(config)
    decoder.start_utt()
    while True:
        buf = stream.read(1024, exception_on_overflow=False)
        if not buf:
            break
        decoder.process_raw(buf, False, False)
        if decoder.hyp() is not None:
            decoder.end_utt()
            keyphrase_function()
            if loop:
                # Reinitialize the decoder and keep listening.
                decoder.start_utt()
            else:
                # Release the microphone before returning.
                stream.stop_stream()
                stream.close()
                p.terminate()
                return True
    # Clean up if the stream ended without a (final) detection.
    stream.stop_stream()
    stream.close()
    p.terminate()
def start_keyphrase_detection(self, keyphrase_function, key_phrase):
    """Listen on the microphone and call ``keyphrase_function(hypstr)``
    whenever a keyword from the 'keylist' file is spotted.

    Arguments:
        keyphrase_function -- callback receiving the recognized string
        key_phrase -- currently unused; detection is driven by the '-kws'
            keyword list file instead -- TODO confirm intent
    """
    modeldir = "models"
    config = pocketsphinx.Decoder.default_config()
    config.set_string('-hmm', os.path.join(modeldir, 'en-us/en-us-ptm'))
    config.set_string('-dict',
                      os.path.join(modeldir, 'en-us/cmudict-en-us.dict'))
    config.set_string('-kws', 'keylist')
    # config.set_string('-keyphrase', key_phrase)
    config.set_string('-logfn', './log')
    config.set_float('-kws_threshold', 1e10)
    # Start a pyaudio instance
    p = pyaudio.PyAudio()
    # Create an input stream with pyaudio
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000,
                    input=True, frames_per_buffer=1024)
    stream.start_stream()
    # Process audio chunk by chunk. On keyword detected perform action and restart search
    decoder = pocketsphinx.Decoder(config)
    decoder.start_utt()
    print('start listening...')
    while True:
        buf = stream.read(1024)
        if not buf:
            break
        # BUG FIX: was process_raw(buf, True, True). no_search=True disables
        # recognition entirely and full_utt=True is wrong for streaming
        # chunks, so no keyword could ever be detected. Stream with
        # (False, False) like every other decoder loop in this module.
        decoder.process_raw(buf, False, False)
        hyp = decoder.hyp()
        if hyp is not None:
            keyphrase_function(hyp.hypstr)
            # Stop and reinitialize the decoder
            decoder.end_utt()
            decoder.start_utt()
def start(self): # Create a decoder with certain model config = pocketsphinx.Decoder.default_config() config.set_string('-hmm', self.hmm_directory) config.set_string('-dict', self.dictionary_file) config.set_string("-logfn", os.devnull) decoder = pocketsphinx.Decoder(config) decoder.set_lm_file("lm", self.language_model_file) decoder.set_keyphrase("kws", "hey emma") decoder.set_search("kws") p = pyaudio.PyAudio() stream = p.open(format=FORMAT, channels=1, rate=RATE, input=True, output=True, frames_per_buffer=BUFFER_SIZE) stream.start_stream() # Process audio chunk by chunk. On keyword detected perform action and restart search decoder.start_utt() while True: buf = stream.read(BUFFER_SIZE) if buf: decoder.process_raw(buf, False, False) else: break if decoder.hyp() != None: print decoder.hyp().hypstr stream.stop_stream() decoder.end_utt() # self.houndClient.query() stream.start_stream() decoder.start_utt()
def recognize_sphinx(self, audio_data, language="en-US", keyword_entries=None, grammar=None, show_all=False):
    """Transcribe ``audio_data`` (an ``AudioData`` instance) with CMU Sphinx.

    ``language`` is either an RFC5646 tag (``"en-US"`` ships by default; see
    ``reference/pocketsphinx.rst`` for installing others) or a 3-tuple of
    paths ``(acoustic_parameters_directory, language_model_file,
    phoneme_dictionary_file)`` for loading arbitrary models.

    ``keyword_entries``, when given, is an iterable of ``(keyword,
    sensitivity)`` pairs with sensitivity in [0, 1]; Sphinx then searches
    only for those phrases, which is more accurate than scanning a free-form
    transcription. ``grammar`` is a path to an FSG or JSGF grammar (a JSGF
    grammar is compiled to an FSG file beside it for faster reuse); it is
    ignored when ``keyword_entries`` is given.

    Returns the best transcription string, or the
    ``pocketsphinx.pocketsphinx.Decoder`` itself when ``show_all`` is true.
    Raises ``speech_recognition.UnknownValueError`` when nothing was
    recognized and ``speech_recognition.RequestError`` for installation or
    model-data problems.
    """
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert isinstance(language, str) or (isinstance(language, tuple) and len(language) == 3), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
    assert keyword_entries is None or all(isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1 for keyword, sensitivity in keyword_entries), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    # Import PocketSphinx lazily so install problems surface as RequestError.
    try:
        from pocketsphinx import pocketsphinx, Jsgf, FsgModel
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
    except ValueError:
        raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
    if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"):
        raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")

    # Resolve the model file locations from either form of ``language``.
    if isinstance(language, str):  # directory containing language data
        lang_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "pocketsphinx-data", language)
        if not os.path.isdir(lang_dir):
            raise RequestError("missing PocketSphinx language data directory: \"{}\"".format(lang_dir))
        acoustic_parameters_directory = os.path.join(lang_dir, "acoustic-model")
        language_model_file = os.path.join(lang_dir, "language-model.lm.bin")
        phoneme_dictionary_file = os.path.join(lang_dir, "pronounciation-dictionary.dict")
    else:  # 3-tuple of Sphinx data file paths
        acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
    if not os.path.isdir(acoustic_parameters_directory):
        raise RequestError("missing PocketSphinx language model parameters directory: \"{}\"".format(acoustic_parameters_directory))
    if not os.path.isfile(language_model_file):
        raise RequestError("missing PocketSphinx language model file: \"{}\"".format(language_model_file))
    if not os.path.isfile(phoneme_dictionary_file):
        raise RequestError("missing PocketSphinx phoneme dictionary file: \"{}\"".format(phoneme_dictionary_file))

    # Build the decoder; logging is routed to the null device to keep the
    # terminal clean.
    config = pocketsphinx.Decoder.default_config()
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    config.set_string("-logfn", os.devnull)
    decoder = pocketsphinx.Decoder(config)

    # The included language models require 16-bit mono 16 kHz little-endian audio.
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)

    if keyword_entries is not None:  # explicitly specified set of keywords
        # Sphinx documentation recommends sensitivities between 1e-50 and
        # 1e-5; the keywords file lives inside the context manager so it
        # isn't deleted until decoding finishes.
        with PortableNamedTemporaryFile("w") as f:
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110) for keyword, sensitivity in keyword_entries)
            f.flush()
            decoder.set_kws("keywords", f.name)
            decoder.set_search("keywords")
            decoder.start_utt()
            decoder.process_raw(raw_data, False, True)  # full utterance, search enabled
            decoder.end_utt()
    elif grammar is not None:  # a path to a FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError("Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if os.path.exists(fsg_path):
            fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
        else:
            # Create the FSG grammar from the JSGF source and cache it.
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)
        decoder.set_fsg(grammar_name, fsg)
        decoder.set_search(grammar_name)
        decoder.start_utt()
        decoder.process_raw(raw_data, False, True)
        decoder.end_utt()
    else:  # no keywords, perform freeform recognition
        decoder.start_utt()
        decoder.process_raw(raw_data, False, True)
        decoder.end_utt()

    if show_all:
        return decoder
    hyp = decoder.hyp()
    if hyp is None:
        raise UnknownValueError()  # no transcriptions available
    return hyp.hypstr
def __init__(self, *args, **kwargs):
    """Set up the Pocketsphinx keyword-spotting STT plugin.

    Compiles the keyword vocabulary, writes the kws thresholds file,
    sanity-checks the acoustic model directory and builds the decoder.
    """
    plugin.STTPlugin.__init__(self, *args, **kwargs)
    self._vocabulary_name = "keywords"
    keywords = profile.get(['keyword'], ['Naomi'])
    if isinstance(keywords, str):
        keywords = [keywords]
    keywords = [keyword.upper() for keyword in keywords]
    self._vocabulary_phrases = keywords
    self._logger.info(
        "Adding vocabulary {} containing phrases {}".format(
            self._vocabulary_name,
            self._vocabulary_phrases
        )
    )
    vocabulary_path = self.compile_vocabulary(
        sphinxvocab.compile_vocabulary
    )
    dict_path = sphinxvocab.get_dictionary_path(vocabulary_path)
    lm_path = sphinxvocab.get_languagemodel_path(vocabulary_path)
    thresholds_path = sphinxvocab.get_thresholds_path(vocabulary_path)
    msg = " ".join([
        "Creating thresholds file '{}'",
        "See README.md for more information."
    ]).format(thresholds_path)
    print(msg)
    with open(thresholds_path, 'w') as f:
        for keyword in keywords:
            threshold = profile.get(
                ['Pocketsphinx_KWS', 'thresholds', keyword],
                80
            )
            # Negative exponents already carry their sign; positive ones
            # need an explicit '+' in the /1e.../ kws threshold syntax.
            if threshold < 0:
                f.write("{}\t/1e{}/\n".format(keyword, threshold))
            else:
                f.write("{}\t/1e+{}/\n".format(keyword, threshold))
    hmm_dir = profile.get(['pocketsphinx', 'hmm_dir'])
    # Perform some checks on the hmm_dir so that we can display more
    # meaningful error messages if necessary
    if not os.path.exists(hmm_dir):
        msg = " ".join([
            "hmm_dir '{}' does not exist! Please make sure that you",
            "have set the correct hmm_dir in your profile."
        ]).format(hmm_dir)
        self._logger.error(msg)
        raise RuntimeError(msg)
    # Check that all required model files are present. Refer to:
    # http://cmusphinx.sourceforge.net/wiki/acousticmodelformat
    # for details
    missing_hmm_files = []
    for fname in ('mdef', 'feat.params', 'means', 'noisedict',
                  'transition_matrices', 'variances'):
        if not os.path.exists(os.path.join(hmm_dir, fname)):
            missing_hmm_files.append(fname)
    mixweights = os.path.exists(os.path.join(hmm_dir, 'mixture_weights'))
    sendump = os.path.exists(os.path.join(hmm_dir, 'sendump'))
    if not mixweights and not sendump:
        # We only need mixture_weights OR sendump
        missing_hmm_files.append('mixture_weights or sendump')
    if missing_hmm_files:
        # BUG FIX: the original mixed %s placeholders with str.format(), so
        # the paths were never substituted into the warning. Use lazy
        # %-style logger arguments instead.
        self._logger.warning(
            "hmm_dir '%s' is missing files: %s. Please make sure that "
            "you have set the correct hmm_dir in your profile.",
            hmm_dir, ', '.join(missing_hmm_files)
        )
    with tempfile.NamedTemporaryFile(
        prefix='psdecoder_', suffix='.log', delete=False
    ) as f:
        self._logfile = f.name
    self._logger.info('Pocketsphinx log file: {}'.format(self._logfile))
    # Pocketsphinx v5
    config = pocketsphinx.Decoder.default_config()
    config.set_string('-hmm', hmm_dir)
    config.set_string('-kws', thresholds_path)
    config.set_string('-lm', lm_path)
    config.set_string('-dict', dict_path)
    config.set_string('-logfn', self._logfile)
    self._ps = pocketsphinx.Decoder(config)
grammar_path = model_dir + '/grammars' config = pocketsphinx.Decoder.default_config() config.set_string('-hmm', model_dir + '/accoustic-model') config.set_string('-lm', model_dir + '/language-model.bin') config.set_string('-dict', model_dir + '/pronounciation-dictionary.dict') config.set_string("-logfn", os.devnull) jsgf = Jsgf(grammar_path) grammar_decoders = [] pattern = re.compile('public <(.*?)> =') with open(grammar_path, 'rt') as in_file: for linenum, line in enumerate(in_file): grammar_key = pattern.findall(line) if grammar_key != []: decoder = pocketsphinx.Decoder(config) ruleGrammar = jsgf.get_rule( ('structure.' + grammar_key[0]).format(grammar_path)) fsgNext = jsgf.build_fsg(ruleGrammar, decoder.get_logmath(), 7.5) decoder.set_fsg(grammar_key[0], fsgNext) decoder.set_search(grammar_key[0]) grammar_decoders.append(decoder) class Text2Speech: CHANNEL = 'text2speech' CHANNEL_TYPE = 'brain' @staticmethod def id():
def setup_decoder(audio_file, keyword_entries):
    """Locate the PocketSphinx model data and build a decoder for a WAV file.

    Arguments:
        audio_file -- file name (resolved relative to ``../data/``) of the
            audio to decode; only .wav input is accepted.
        keyword_entries -- ``None``, or an iterable of
            ``(keyword, sensitivity)`` pairs with ``0 <= sensitivity <= 1``.

    Returns:
        ``(audio_data_path, decoder)`` -- the path of the audio file and a
        configured ``pocketsphinx.Decoder``.

    Raises:
        speech_recognition.RequestError -- for a non-WAV file, or a missing /
            broken PocketSphinx installation or model data directory.
    """
    language = "en-US"
    # BUG FIX: the original ``audio_file.split(".")[1]`` took the suffix
    # after the *first* dot (so "clip.v2.wav" was rejected) and raised
    # IndexError for names with no dot at all. splitext returns the last
    # suffix, or "" when there is none, and lower() tolerates ".WAV".
    audio_file_type = os.path.splitext(audio_file)[1].lstrip(".").lower()
    if audio_file_type == 'wav':
        curr_dir = os.getcwd()
        data_dir = os.path.join(curr_dir, '../data/')
        # NOTE(review): hard-coded system-wide Python 2.7 site-packages path;
        # presumably where the speech_recognition package lives -- verify.
        speech_recognition_directory = '/Library/Python/2.7/site-packages/speech_recognition/'
        audio_data_path = os.path.join(data_dir, audio_file)
    else:
        raise speech_recognition.RequestError("file type must be .wav")
    assert isinstance(language, str), "``language`` must be a string"
    assert keyword_entries is None or all(
        isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1
        for keyword, sensitivity in keyword_entries), \
        "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers " \
        "between 0 and 1"
    # import the PocketSphinx speech recognition module
    try:
        from pocketsphinx import pocketsphinx
    except ImportError:
        raise speech_recognition.RequestError(
            "missing PocketSphinx module: ensure that PocketSphinx is set up correctly."
        )
    except ValueError:
        raise speech_recognition.RequestError(
            "bad PocketSphinx installation detected; make sure you have PocketSphinx version 0.0.9 or better."
        )
    language_directory = os.path.join(
        os.path.dirname(speech_recognition_directory),
        "pocketsphinx-data", language)
    if not os.path.isdir(language_directory):
        raise speech_recognition.RequestError(
            "missing PocketSphinx language data directory: \"{}\"".format(
                language_directory))
    acoustic_parameters_directory = os.path.join(language_directory,
                                                 "acoustic-model")
    if not os.path.isdir(acoustic_parameters_directory):
        raise speech_recognition.RequestError(
            "missing PocketSphinx language model parameters directory: \"{}\""
            .format(acoustic_parameters_directory))
    language_model_file = os.path.join(language_directory,
                                       "language-model.lm.bin")
    if not os.path.isfile(language_model_file):
        raise speech_recognition.RequestError(
            "missing PocketSphinx language model file: \"{}\"".format(
                language_model_file))
    # NOTE: "pronounciation" is presumably spelled this way on purpose to
    # match the file shipped in pocketsphinx-data -- do not "fix" it.
    phoneme_dictionary_file = os.path.join(
        language_directory, "pronounciation-dictionary.dict")
    if not os.path.isfile(phoneme_dictionary_file):
        raise speech_recognition.RequestError(
            "missing PocketSphinx phoneme dictionary file: \"{}\"".format(
                phoneme_dictionary_file))
    # create decoder object
    config = pocketsphinx.Decoder.default_config()
    # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    # disable logging (logging causes unwanted output in terminal)
    config.set_string("-logfn", os.devnull)
    decoder = pocketsphinx.Decoder(config)
    return audio_data_path, decoder
def __init__(self, *args, **kwargs):
    """
    Initiates the pocketsphinx instance.

    Arguments:
        vocabulary -- a PocketsphinxVocabulary instance
        hmm_dir -- the path of the Hidden Markov Model (HMM)

    Raises:
        ImportError -- when the pocketsphinx package is unavailable.
        RuntimeError -- when the configured hmm_dir does not exist.
    """
    plugin.STTPlugin.__init__(self, *args, **kwargs)
    if not pocketsphinx_available:
        raise ImportError("Pocketsphinx not installed!")
    # Compile the vocabulary into a language model + dictionary pair.
    vocabulary_path = self.compile_vocabulary(
        sphinxvocab.compile_vocabulary
    )
    lm_path = sphinxvocab.get_languagemodel_path(vocabulary_path)
    dict_path = sphinxvocab.get_dictionary_path(vocabulary_path)
    hmm_dir = profile.get(['pocketsphinx', 'hmm_dir'])
    self._logger.debug(
        "Initializing PocketSphinx Decoder with hmm_dir '{}'".format(
            hmm_dir
        )
    )
    # Perform some checks on the hmm_dir so that we can display more
    # meaningful error messages if necessary
    if not os.path.exists(hmm_dir):
        msg = " ".join([
            "hmm_dir '{}' does not exist! Please make sure that you",
            "have set the correct hmm_dir in your profile."
        ]).format(hmm_dir)
        self._logger.error(msg)
        raise RuntimeError(msg)
    # Let's check if all required files are there. Refer to:
    # http://cmusphinx.sourceforge.net/wiki/acousticmodelformat
    # for details
    missing_hmm_files = []
    for fname in ('mdef', 'feat.params', 'means', 'noisedict',
                  'transition_matrices', 'variances'):
        if not os.path.exists(os.path.join(hmm_dir, fname)):
            missing_hmm_files.append(fname)
    mixweights = os.path.exists(os.path.join(hmm_dir, 'mixture_weights'))
    sendump = os.path.exists(os.path.join(hmm_dir, 'sendump'))
    if not mixweights and not sendump:
        # We only need mixture_weights OR sendump
        missing_hmm_files.append('mixture_weights or sendump')
    if missing_hmm_files:
        # BUG FIX: the template used %s placeholders but was rendered with
        # str.format(), so the warning printed literal "%s" and silently
        # dropped hmm_dir and the file list. Use {} replacement fields.
        self._logger.warning(
            " ".join([
                "hmm_dir '{}' is missing files: {}.",
                "Please make sure that you have set the correct",
                "hmm_dir in your profile."
            ]).format(hmm_dir, ', '.join(missing_hmm_files))
        )
    self._pocketsphinx_v5 = hasattr(pocketsphinx.Decoder, 'default_config')
    # Keep the decoder log in a persistent temp file (delete=False) so it
    # can be inspected after a failure.
    with tempfile.NamedTemporaryFile(prefix='psdecoder_',
                                     suffix='.log',
                                     delete=False) as f:
        self._logfile = f.name
    if self._pocketsphinx_v5:
        # Pocketsphinx v5
        config = pocketsphinx.Decoder.default_config()
        config.set_string('-hmm', hmm_dir)
        config.set_string('-lm', lm_path)
        config.set_string('-dict', dict_path)
        config.set_string('-logfn', self._logfile)
        self._decoder = pocketsphinx.Decoder(config)
    else:
        # Pocketsphinx v4 or sooner
        self._decoder = pocketsphinx.Decoder(
            hmm=hmm_dir,
            logfn=self._logfile,
            lm=lm_path,
            dict=dict_path
        )
# Face + speech authentication setup: configures a Russian-language
# PocketSphinx decoder and dlib face-recognition models, then loads known
# users' face descriptors from per-user pickle files listed in users.csv.
# NOTE(review): relies on ``ps``, ``dlib`` and ``pickle`` being imported
# earlier in the file (not visible in this chunk).
import cv2
import csv

FILENAME = 'users.csv'
MODELDIR = "../../../model"  # NOTE(review): unused below -- paths are absolute

# Speech decoder: Russian acoustic model, custom dictionary and JSGF grammar.
config = ps.Decoder.default_config()
config.set_string(
    '-hmm',
    '/home/anna/diplom/test_speech/zero_ru_cont_8k_v3/zero_ru.cd_cont_4000/')
config.set_string('-dict', '/home/anna/diplom/comb_1/speech/vocabular.dict')
config.set_string('-jsgf', '/home/anna/diplom/comb_1/speech/sp.jsgf')
# Silence pocketsphinx logging (POSIX-only null device path).
config.set_string('-logfn', '/dev/null')
#config.set_string('-lm', '/home/anna/diplom/test_speech/zero_ru_cont_8k_v3/ru.lm')
config.set_int('-nfft', 512)  # FFT size paired with the 8 kHz sample rate
config.set_float('-samprate', 8000.0)
decoder = ps.Decoder(config)

# create models for looking face
sp = dlib.shape_predictor('shape_predictor_68_face_landmarks.dat')
facerec = dlib.face_recognition_model_v1(
    'dlib_face_recognition_resnet_model_v1.dat')
detector = dlib.get_frontal_face_detector()

# Known users: maps pickled face descriptor -> user name. Each row of
# users.csv begins with a user name; '<name>.pickle' holds that user's
# descriptor.
# NOTE(review): dict keys must be hashable -- presumably the pickle contains
# a hashable type (e.g. a tuple); a raw numpy array / dlib vector would
# raise TypeError here. Confirm what the pickle files contain.
users = {}
with open(FILENAME, "r", newline="") as file:
    reader = csv.reader(file)
    for row in reader:
        name = row[0]
        with open(name + '.pickle', 'rb') as f:
            face_descriptor = pickle.load(f)
        users[face_descriptor] = name
def prepare_sphinx2(self, language="en-US", keyword_entries=None):
    """Configure ``self.decoder`` for PocketSphinx recognition.

    Arguments:
        language -- a language tag (a matching "pocketsphinx-data"
            directory must sit next to this file), or a 3-tuple of explicit
            Sphinx data paths
            ``(acoustic_parameters, language_model, phoneme_dictionary)``.
        keyword_entries -- ``None`` for ordinary language-model decoding, or
            a list of ``(keyword, sensitivity)`` pairs
            (``0 <= sensitivity <= 1``) to switch the decoder to a
            keyword-spotting search instead.

    Raises:
        sr.RequestError -- when any required Sphinx data file is missing.
    """
    assert isinstance(language, str) or (
        isinstance(language, tuple) and len(language) == 3
    ), "``language`` must be a string or 3-tuple of Sphinx data file paths of the form ``(acoustic_parameters, language_model, phoneme_dictionary)``"
    assert keyword_entries is None or all(
        isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1
        for keyword, sensitivity in keyword_entries
    ), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"
    if isinstance(language, str):  # directory containing language data
        language_directory = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "pocketsphinx-data", language)
        if not os.path.isdir(language_directory):
            raise sr.RequestError(
                "missing PocketSphinx language data directory: \"{}\"".
                format(language_directory))
        acoustic_parameters_directory = os.path.join(
            language_directory, "acoustic-model")
        language_model_file = os.path.join(language_directory,
                                           "language-model.lm.bin")
        # NOTE: "pronounciation" presumably matches the actual file name
        # shipped in pocketsphinx-data -- do not "fix" the spelling.
        phoneme_dictionary_file = os.path.join(
            language_directory, "pronounciation-dictionary.dict")
    else:  # 3-tuple of Sphinx data file paths
        acoustic_parameters_directory, language_model_file, phoneme_dictionary_file = language
    if not os.path.isdir(acoustic_parameters_directory):
        raise sr.RequestError(
            "missing PocketSphinx language model parameters directory: \"{}\""
            .format(acoustic_parameters_directory))
    if not os.path.isfile(language_model_file):
        raise sr.RequestError(
            "missing PocketSphinx language model file: \"{}\"".format(
                language_model_file))
    if not os.path.isfile(phoneme_dictionary_file):
        raise sr.RequestError(
            "missing PocketSphinx phoneme dictionary file: \"{}\"".format(
                phoneme_dictionary_file))
    # create decoder object
    config = pocketsphinx.Decoder.default_config()
    # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    # disable logging (logging causes unwanted output in terminal)
    config.set_string("-logfn", os.devnull)
    self.decoder = pocketsphinx.Decoder(config)
    # BUG FIX: the keywords file was written and set_kws()/set_search()
    # called even when keyword_entries was None, so the documented default
    # crashed with TypeError (f.writelines iterating None). Only switch to
    # keyword spotting when keywords were actually supplied; otherwise the
    # decoder keeps its default language-model search.
    if keyword_entries is not None:
        with open("sphinx.txt", "w") as f:
            # generate a keywords file - Sphinx documentation recommends
            # sensitivities between 1e-50 and 1e-5
            f.writelines("{} /{}/\n".format(keyword, sensitivity)
                         for keyword, sensitivity in keyword_entries)
        # perform the speech recognition with the keywords file
        self.decoder.set_kws("keywords", "sphinx.txt")
        self.decoder.set_search("keywords")
    return