def transform_audio_to_text(filename):
    """Decode an audio file using the generated language model and dictionary.

    Args:
        filename: Audio file name, resolved relative to ``get_data_path()``.

    Returns:
        The value of ``Pocketsphinx.hypothesis()`` for the decoded audio.
    """
    resource_dir = expanduser("~") + "/DTAI_Internship/src/speech_recognizer_node/data/"
    lm_file = resource_dir + "generated_language_model.lm"
    dict_file = resource_dir + "generated_dictionary.dic"

    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'hmm': os.path.join(model_path, 'en-us'),
        # lm_file / dict_file are rooted at the home directory; when that is
        # an absolute path, os.path.join() drops model_path and uses them as-is.
        'lm': os.path.join(model_path, lm_file),
        'dict': os.path.join(model_path, dict_file),
    }

    ps = Pocketsphinx(**config)
    ps.decode(
        audio_file=os.path.join(data_path, filename),
        buffer_size=2048,
        no_search=False,
        full_utt=False)

    text = ps.hypothesis()
    print(text)
    return text
def pocket():
    """Decode one sample WAV twice: with a raw ``Decoder`` and with ``Pocketsphinx``.

    Relies on the names ``s_dir`` (sample directory), ``sr`` and ``r`` being
    defined by the surrounding module. Both results are printed.
    """
    ps = Pocketsphinx()

    language_directory = os.path.dirname(os.path.realpath(__file__))
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(language_directory)

    acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
    language_model_file = os.path.join(language_directory, "language-model.lm.bin")
    phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")

    config = Decoder.default_config()
    # set the path of the hidden Markov model (HMM) parameter files
    config.set_string("-hmm", acoustic_parameters_directory)
    config.set_string("-lm", language_model_file)
    config.set_string("-dict", phoneme_dictionary_file)
    decoder = Decoder(config)

    wav_name = ('a bad situation could become dramatically worse. '
                '/a bad situation could become dramatically worse. .wav')

    with sr.AudioFile(s_dir + "/" + wav_name) as source:
        audio_data = r.record(source)

    decoder.start_utt()
    # process_raw() consumes a byte buffer, not the AudioData wrapper object.
    decoder.process_raw(audio_data.get_raw_data(), False, True)
    decoder.end_utt()
    print(decoder.hyp())

    ps.decode(
        audio_file=os.path.join(s_dir, wav_name),
        buffer_size=2048,
        no_search=False,
        full_utt=False)
    print(ps.hypothesis())  # => ['<s>', '<sil>', 'go', 'forward', 'ten', 'meters', '</s>']


# pocket()
def test_lm(self):
    """Exercise switching the language model and adding dictionary words."""
    data_dir = 'deps/pocketsphinx/test/data/'
    decoder = Pocketsphinx(dic=data_dir + 'defective.dic', mmap=False)

    # With the 'defective' dictionary the hypothesis is expected to be empty.
    decoder.decode()
    self.assertEqual(decoder.hypothesis(), '')

    # Register the 'turtle' language model and select it as the search.
    turtle_model = NGramModel(
        decoder.get_config(), decoder.get_logmath(), data_dir + 'turtle.lm.bin')
    decoder.set_lm('turtle', turtle_model)
    decoder.set_search('turtle')

    # Still expected to be empty with the 'turtle' language model.
    decoder.decode()
    self.assertEqual(decoder.hypothesis(), '')

    # The word 'meters' isn't in the loaded dictionary, so add words manually.
    manual_words = (
        ('foobie', 'F UW B IY', False),
        ('meters', 'M IY T ER Z', True),
    )
    for word, phones, update in manual_words:
        decoder.add_word(word, phones, update)

    decoder.decode()
    self.assertEqual(decoder.hypothesis(), 'foobie meters meters')
def main(args):
    """Run ``process`` on the input named by the first argument.

    Results are appended to ``result.txt`` in the current directory.

    Args:
        args: Sequence whose first element is handed to ``process`` as input.
    """
    config = getConfig()
    ps = Pocketsphinx(**config)

    # NOTE: a batch "--test" mode that walked ./../tests and wrote per-directory
    # result files used to live here as commented-out code; only the
    # single-input path is active.
    withGraphics = True

    # The context manager closes result.txt even if process() raises.
    with open('result.txt', 'a') as f:
        process(args[0], ps, f, withGraphics)
def __init__(self,
             hmm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/modelo',
             lm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/leng.lm.bin',
             dict='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/dicc.dic',
             grammar='data/gramatica-tp2.gram',
             dataPath='tmp/'):
    """Create the grammar-constrained recogniser and the text-to-speech client.

    Args:
        hmm: Path of the acoustic model passed to Pocketsphinx.
        lm: Path of the language model passed to Pocketsphinx.
        dict: Path of the pronunciation dictionary (name kept for callers).
        grammar: Path of the JSGF grammar providing the 'tp2.grammar' rule.
        dataPath: Stored as ``self.data_path``.
    """
    import os  # local import: only needed for the optional API-key override

    self.data_path = dataPath

    config = {
        'hmm': hmm,
        'lm': lm,
        'dict': dict
    }
    self.ps = Pocketsphinx(**config)

    # Switch to JSGF grammar
    jsgf = Jsgf(grammar)
    rule = jsgf.get_rule('tp2.grammar')
    fsg = jsgf.build_fsg(rule, self.ps.get_logmath(), 7.5)
    self.ps.set_fsg('tp2', fsg)
    self.ps.set_search('tp2')

    # Speech synthesis
    # NOTE(review): a credential is committed in source. IBM_TTS_APIKEY, when
    # set, takes precedence; the literal remains only as a backward-compatible
    # fallback and should be rotated and removed.
    api_key = os.environ.get(
        'IBM_TTS_APIKEY', 'cq9_4YcCXxClw2AfgUhbokFktZ-xSRT4kcHS2akcZ05J')
    self.tts_authenticator = IAMAuthenticator(api_key)
    self.tts = TextToSpeechV1(authenticator=self.tts_authenticator)
    self.tts.set_service_url('https://stream.watsonplatform.net/text-to-speech/api')
def __init__(self, mode):
    """Initialise buffers and ROS topics, then decode ``/tmp/input.wav``.

    Args:
        mode: One of "echo", "record" or "record4"; any other value is
            reported through ``error``.
    """
    # state
    self.micbuf = np.zeros((0, 4), 'uint16')
    self.outbuf = None
    self.buffer_stuff = 0
    self.mode = mode
    self.playchan = 0
    self.playsamp = 0

    # check mode
    if mode not in ("echo", "record", "record4"):
        error("argument not recognised")

    # robot name
    topic_base_name = "/" + os.getenv("MIRO_ROBOT_NAME")

    # publish
    topic = topic_base_name + "/control/stream"
    print("publish", topic)
    self.pub_stream = rospy.Publisher(topic, Int16MultiArray, queue_size=0)

    # subscribe
    topic = topic_base_name + "/sensors/stream"
    print("subscribe", topic)
    self.sub_stream = rospy.Subscriber(
        topic, UInt16MultiArray, self.callback_stream,
        queue_size=1, tcp_nodelay=True)

    # subscribe
    topic = topic_base_name + "/sensors/mics"
    print("subscribe", topic)
    self.sub_mics = rospy.Subscriber(
        topic, Int16MultiArray, self.callback_mics,
        queue_size=5, tcp_nodelay=True)

    # report
    # A single pre-formatted argument prints the same text on Python 2 and 3
    # (the original print statement was a SyntaxError on Python 3).
    print("recording from 4 microphones for %s seconds..." % (RECORD_TIME,))

    ####### Speech recognition using Pocket-Sphinx #########
    model_path = get_model_path()

    config = {
        'hmm': os.path.join(model_path, 'en-us'),  # acoustic (hidden Markov) model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),  # language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
    }

    ps = Pocketsphinx(**config)
    ps.decode(
        audio_file="/tmp/input.wav",  # temporary input recording
        buffer_size=2048,
        no_search=False,
        full_utt=False)

    print("Recognized: ")
    print(ps.hypothesis())  # output
    print("END")
def test_lattice(self):
    """Write the decoded lattice in both formats; each writer should return None."""
    decoder = Pocketsphinx()
    decoder.decode()

    exports = (
        ('write', 'tests/goforward.lat'),
        ('write_htk', 'tests/goforward.htk'),
    )
    for writer_name, target in exports:
        # A fresh lattice is fetched for every export, as in the original flow.
        writer = getattr(decoder.get_lattice(), writer_name)
        self.assertEqual(writer(target), None)
def __init__(self, keyword: str, kws_threshold: float):
    """Create a keyphrase decoder and open a single-channel input stream.

    Args:
        keyword: Keyphrase handed to Pocketsphinx.
        kws_threshold: Keyword-spotting threshold handed to Pocketsphinx.
    """
    decoder_options = {
        'keyphrase': keyword,
        'lm': False,
        'kws_threshold': kws_threshold,
    }
    self._decoder = Pocketsphinx(**decoder_options)

    self._sound = pyaudio.PyAudio()
    stream_options = {
        'rate': _SAMPLE_RATE,
        'channels': 1,
        'format': pyaudio.paInt16,
        'input': True,
        'frames_per_buffer': _FRAME_LENGTH,
    }
    self._audio_stream = self._sound.open(**stream_options)
def test_cep_decoder_hypothesis(self):
    """Feed stored cepstral data to the decoder and check its three results."""
    decoder = Pocketsphinx()

    mfc_path = 'deps/pocketsphinx/test/data/goforward.mfc'
    with open(mfc_path, 'rb') as mfc_file, decoder.start_utterance():
        mfc_file.read(4)  # consume the first 4 bytes before the payload
        features = mfc_file.read(13780)
        decoder.process_cep(features, False, True)

    expectations = (
        ('hypothesis', 'go forward ten meters'),
        ('score', -7095),
        ('probability', -32715),
    )
    for accessor, expected in expectations:
        self.assertEqual(getattr(decoder, accessor)(), expected)
def run(self):
    """Build the decoder from the bundled s2m/core/sphinx resources and flag readiness."""
    print_important("Info! Thread sphinx started.")

    sphinx_dir = os.path.join('s2m', 'core', 'sphinx')
    resources = (
        ('hmm', 'fr'),
        ('lm', 'fr.lm.dmp'),
        ('dict', 's2m.dict'),
        ('jsgf', 's2m.jsgf'),
    )

    options = {'verbose': True}
    for option, resource in resources:
        options[option] = os.path.join(sphinx_dir, resource)

    # Publish the finished mapping in one assignment.
    self.config = options
    self.pocketsphinx = Pocketsphinx(**self.config)
    self.ready = True
def getPockerSphinxDecoder():
    """Return a non-verbose Pocketsphinx decoder built from the bundled en-us files.

    Returns:
        A ``Pocketsphinx`` instance configured with the 'en-us' acoustic
        model, 'en-us.lm.bin' language model and 'cmudict-en-us.dict'
        dictionary found under ``get_model_path()``.
    """
    model_path = get_model_path()
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict'),
    }
    return Pocketsphinx(**config)
def __init__(self, *args, **kwargs):
    """Build the all-phone decoder, decode once, then initialise the base class."""
    phoneme_options = dict(
        lm=False,
        dic=False,
        allphone='deps/pocketsphinx/model/en-us/en-us-phone.lm.bin',
        lw=2.0,
        pip=0.3,
        beam=1e-200,
        pbeam=1e-20,
        mmap=False,
    )
    self.ps = Pocketsphinx(**phoneme_options)
    self.ps.decode()

    # Base-class initialisation stays last, after the decoder has run.
    super(TestPhoneme, self).__init__(*args, **kwargs)
def getPockerSphinxDecoder():
    """Return a non-verbose Pocketsphinx decoder built from the bundled en-us files.

    Returns:
        A ``Pocketsphinx`` instance configured with the 'en-us' acoustic
        model, 'en-us.lm.bin' language model and 'cmudict-en-us.dict'
        dictionary found under ``get_model_path()``.
    """
    model_path = get_model_path()
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict'),
        # Optional tuning knobs, currently disabled:
        # 'topn': 2,
        # 'ds':2,
        # 'maxwpf': 5,
        # 'maxhmmpf': 3000
    }
    return Pocketsphinx(**config)
def __init__(self, **kwargs):
    """Configure the recogniser; ``hotword`` may be one value or a list.

    Remaining keyword arguments override the default Pocketsphinx options.
    """
    # signal.signal(signal.SIGINT, self.stop)
    self._no_search = False
    self._full_utt = False

    hotword = kwargs.pop('hotword', ['阿Q', 'R-cute'])
    if isinstance(hotword, list):
        self._hotwords = hotword
    else:
        # Wrap a single hotword so the attribute is always a list.
        self._hotwords = [hotword]

    model_path = get_model_path()
    options = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': util.resource('sphinx/rcute.lm'),
        'dic': util.resource('sphinx/rcute.dic'),
    }
    # Caller-supplied options take precedence over the defaults above.
    options.update(kwargs)
    self._rec = Pocketsphinx(**options)
def test_jsgf(self):
    """Expect the same hypothesis from the 'turtle' model and the JSGF grammar."""
    data_dir = 'deps/pocketsphinx/test/data/'
    expected = 'go forward ten meters'

    decoder = Pocketsphinx(
        lm=data_dir + 'turtle.lm.bin',
        dic=data_dir + 'turtle.dic')

    # First pass: 'turtle' language model.
    decoder.decode()
    self.assertEqual(decoder.hypothesis(), expected)

    # Build a finite-state grammar from the JSGF rule and select it.
    grammar = Jsgf(data_dir + 'goforward.gram')
    move_rule = grammar.get_rule('goforward.move2')
    move_fsg = grammar.build_fsg(move_rule, decoder.get_logmath(), 7.5)
    decoder.set_fsg('goforward', move_fsg)
    decoder.set_search('goforward')

    # Second pass: 'goforward' grammar.
    decoder.decode()
    self.assertEqual(decoder.hypothesis(), expected)
def __init__(self):
    """Create the decoder, reset audio/illumination state and wire up ROS topics."""
    model_path = get_model_path()
    print(model_path)

    config = {
        'hmm': os.path.join(model_path, 'en-us'),  # acoustic (hidden Markov) model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),  # language model
        'dict': os.path.join(model_path, 'testdict.dict')  # pronunciation dictionary
    }

    # Start the PocketSphinx decoder
    self.ps = Pocketsphinx(**config)

    # Variables for audio
    self.micbuf = np.zeros((0, 4), 'uint16')
    self.outbuf = None
    self.buffer_stuff = 0
    self.audio_level = 0
    self.timeofclap = 0
    self.playchan = 0
    self.playsamp = 0
    self.startTime = 0
    self.TimeSinceLast = 0
    self.DemoPause = False
    self.PID = ''
    self.velocity = TwistStamped()

    # Variables for illumination
    self.illum = UInt32MultiArray()
    self.illum.data = [0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
                       0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF]
    self.illumInt = 0
    self.illumState = 0

    # robot name
    topic_base_name = "/" + os.getenv("MIRO_ROBOT_NAME")

    # Publisher for illum, to control the LEDs while requests are processed
    topic = topic_base_name + "/control/illum"
    self.pub_illum = rospy.Publisher(topic, UInt32MultiArray, queue_size=0)
    self.velocity_pub = rospy.Publisher(
        topic_base_name + "/control/cmd_vel", TwistStamped, queue_size=0)

    # subscribe
    topic = topic_base_name + "/sensors/mics"
    self.sub_mics = rospy.Subscriber(
        topic, Int16MultiArray, self.callback_mics,
        queue_size=1, tcp_nodelay=True)
def detect():
    """Read the default audio input until the keyphrase produces a hypothesis.

    On detection the enclosing ``wake_statuses[system]`` is set to 'detected'.
    """
    from pocketsphinx import Pocketsphinx, Ad

    ad = Ad(None, 16000)  # default input
    decoder = Pocketsphinx(
        lm=False,
        hmm=hmm,
        dic=dic,
        keyphrase=keyphrase,
        kws_threshold=kws_threshold)
    chunk = bytearray(2048)

    with ad, decoder.start_utterance():
        while True:
            if ad.readinto(chunk) < 0:
                break
            decoder.process_raw(chunk, False, False)
            if not decoder.hyp():
                continue
            with decoder.end_utterance():
                logging.info('Wake word detected for %s' % system)
                wake_statuses[system] = 'detected'
            break
def decode(): nonlocal decoder, decoded_phrase # Dynamically load decoder if decoder is None: _LOGGER.debug('Loading decoder') hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LOADING, state_attrs) decoder = Pocketsphinx( hmm=acoustic_model, lm=language_model, dic=dictionary) hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs) # Do actual decoding with decoder.start_utterance(): decoder.process_raw(recorded_data, False, True) # full utterance hyp = decoder.hyp() if hyp: with decoder.end_utterance(): decoded_phrase = hyp.hypstr decoded_event.set()
def __init__(self, mode='from_microphone', name_dataset='plays_ru'):
    """Prepare a recogniser for the chosen work mode and dataset.

    Args:
        mode: 'from_file' creates ``self.speech_from_file``; 'from_microphone'
            creates ``self.speech_from_microphone``; anything else only
            prints an error message.
        name_dataset: 'plays_ru', 'subtitles_ru' or 'conversations_ru'; any
            other value prints an error message and returns early.
    """
    self.current_dirname = os.path.dirname(os.path.realpath(__file__))
    self.work_mode = mode
    model_path = get_model_path()

    if name_dataset not in ('plays_ru', 'subtitles_ru', 'conversations_ru'):
        print(
            '\n[E] Неверное значение name_dataset. Возможные варианты: plays_ru, subtitles_ru или conversations_ru\n'
        )
        return

    def model_file(name):
        # Resolve a model resource inside the Pocketsphinx model directory.
        return os.path.join(model_path, name)

    if self.work_mode == 'from_microphone':
        self.speech_from_microphone = LiveSpeech(
            verbose=False,
            sampling_rate=16000,
            buffer_size=2048,
            no_search=False,
            full_utt=False,
            hmm=model_file('zero_ru.cd_cont_4000'),
            lm=model_file('ru_bot_' + name_dataset + '.lm'),
            dic=model_file('ru_bot_' + name_dataset + '.dic'))
    elif self.work_mode == 'from_file':
        file_options = {
            'hmm': model_file('zero_ru.cd_cont_4000'),
            'lm': model_file('ru_bot_' + name_dataset + '.lm'),
            'dict': model_file('ru_bot_' + name_dataset + '.dic'),
        }
        self.speech_from_file = Pocketsphinx(**file_options)
    else:
        print(
            '[E] Неподдерживаемый режим работы, проверьте значение аргумента mode.'
        )
def async_setup(hass, config):
    """Set up the hotword component and register its listen service.

    Args:
        hass: Home Assistant instance (its ``states``, ``bus`` and
            ``services`` are used).
        config: Configuration mapping; options are read from ``config[DOMAIN]``.

    Returns:
        True once the service and the stop listener are registered.
    """
    name = config[DOMAIN].get(CONF_NAME, DEFAULT_NAME)
    hotword = config[DOMAIN].get(CONF_HOTWORD)
    acoustic_model = os.path.expanduser(config[DOMAIN].get(
        CONF_ACOUSTIC_MODEL, DEFAULT_ACOUSTIC_MODEL))
    dictionary = os.path.expanduser(config[DOMAIN].get(
        CONF_DICTIONARY, DEFAULT_DICTIONARY))
    threshold = config[DOMAIN].get(CONF_THRESHOLD, DEFAULT_THRESHOLD)
    audio_device_str = config[DOMAIN].get(CONF_AUDIO_DEVICE, DEFAULT_AUDIO_DEVICE)
    sample_rate = config[DOMAIN].get(CONF_SAMPLE_RATE, DEFAULT_SAMPLE_RATE)
    buffer_size = config[DOMAIN].get(CONF_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)

    detected_event = threading.Event()
    detected_phrase = None
    terminated = False

    from pocketsphinx import Pocketsphinx, Ad
    decoder = Pocketsphinx(hmm=acoustic_model,
                           lm=False,
                           dic=dictionary,
                           keyphrase=hotword,
                           kws_threshold=threshold)
    audio_device = Ad(audio_device_str, sample_rate)

    state_attrs = {'friendly_name': 'Hotword', 'icon': 'mdi:microphone'}

    @asyncio.coroutine
    def async_listen(call):
        """Listen on the audio device until the hotword is detected or shutdown."""
        nonlocal terminated, detected_phrase
        terminated = False
        detected_phrase = None

        hass.states.async_set(OBJECT_DECODER, STATE_LISTENING, state_attrs)

        def listen():
            # Without this declaration the assignment below creates a variable
            # local to listen() and the shared detected_phrase never changes.
            nonlocal detected_phrase
            buf = bytearray(buffer_size)
            try:
                with audio_device:
                    with decoder.start_utterance():
                        while not terminated and audio_device.readinto(buf) >= 0:
                            decoder.process_raw(buf, False, False)
                            hyp = decoder.hyp()
                            if hyp:
                                with decoder.end_utterance():
                                    # Make sure the hotword is matched
                                    detected_phrase = hyp.hypstr
                                    if detected_phrase == hotword:
                                        break
            finally:
                # Always release the waiter, even if the audio device or the
                # decoder raised; otherwise the coroutine would wait forever.
                detected_event.set()

        # Listen asynchronously
        detected_event.clear()
        thread = threading.Thread(target=listen, daemon=True)
        thread.start()
        yield from asyncio.get_event_loop().run_in_executor(
            None, detected_event.wait)

        if not terminated:
            thread.join()
            hass.states.async_set(OBJECT_DECODER, STATE_IDLE, state_attrs)

            # Fire detected event
            hass.bus.async_fire(
                EVENT_HOTWORD_DETECTED,
                {
                    'name': name  # name of the component
                })

    hass.services.async_register(DOMAIN, SERVICE_LISTEN, async_listen)
    hass.states.async_set(OBJECT_DECODER, STATE_IDLE, state_attrs)

    # Make sure the listener terminates properly when Home Assistant stops
    @asyncio.coroutine
    def async_terminate(event):
        """Flag termination and wake any pending wait on ``detected_event``."""
        nonlocal terminated
        terminated = True
        detected_event.set()

    hass.bus.async_listen(EVENT_HOMEASSISTANT_STOP, async_terminate)

    _LOGGER.info('Started')
    return True
def decode(): nonlocal decoder, decoded_phrase, data, filename # Check if WAV is in the correct format. # Convert with sox if not. with io.BytesIO(data) as wav_data: with wave.open(wav_data, mode='rb') as wav_file: rate, width, channels = wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels() _LOGGER.debug('rate=%s, width=%s, channels=%s.' % (rate, width, channels)) if (rate != 16000) or (width != 2) or (channels != 1): # Convert to 16-bit 16Khz mono (required by pocketsphinx acoustic models) _LOGGER.debug('Need to convert to 16-bit 16Khz mono.') if shutil.which('sox') is None: _LOGGER.error("'sox' command not found. Cannot convert WAV file to appropriate format. Expect poor performance.") else: temp_input_file = None if filename is None: # Need to write original WAV data out to a file for sox temp_input_file = tempfile.NamedTemporaryFile(suffix='.wav', mode='wb+') temp_input_file.write(data) temp_input_file.seek(0) filename = temp_input_file.name # sox <IN> -r 16000 -e signed-integer -b 16 -c 1 <OUT> with tempfile.NamedTemporaryFile(suffix='.wav', mode='wb+') as out_wav_file: subprocess.check_call(['sox', filename, '-r', '16000', '-e', 'signed-integer', '-b', '16', '-c', '1', out_wav_file.name]) out_wav_file.seek(0) # Use converted data with wave.open(out_wav_file, 'rb') as wav_file: data = wav_file.readframes(wav_file.getnframes()) if temp_input_file is not None: # Clean up temporary file del temp_input_file # Dynamically load decoder if decoder is None: _LOGGER.debug('Loading decoder') hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LOADING, state_attrs) decoder = Pocketsphinx( hmm=acoustic_model, lm=language_model, dic=dictionary) hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs) # Process WAV data as a complete utterance (best performance) with decoder.start_utterance(): decoder.process_raw(data, False, True) # full utterance if decoder.hyp(): with decoder.end_utterance(): decoded_phrase = decoder.hyp().hypstr 
decoded_event.set()
from pocketsphinx import Pocketsphinx

# Minimal example: decode with all defaults and print the returned object.
print(Pocketsphinx().decode())  # => "go forward ten meters"
def __init__(self):
    """Initialise buffers and ROS topics, record a phrase and print its transcript."""
    # state
    self.micbuf = np.zeros((0, 4), 'uint16')
    self.spkrbuf = None
    self.buffer_stuff = 0

    # robot name
    topic_base = "/" + os.getenv("MIRO_ROBOT_NAME") + "/"

    # publish
    topic = topic_base + "control/stream"
    print("publish", topic)
    self.pub_stream = rospy.Publisher(topic, Int16MultiArray, queue_size=0)

    # subscribe
    topic = topic_base + "sensors/stream"
    print("subscribe", topic)
    self.sub_stream = rospy.Subscriber(topic, UInt16MultiArray, self.callback_stream)

    # subscribe
    topic = topic_base + "sensors/mics"
    print("subscribe", topic)
    self.sub_mics = rospy.Subscriber(topic, Int16MultiArray, self.callback_mics)

    # report (function form: the print statement is a SyntaxError on Python 3)
    print("recording on 4 microphone channels...")

    ####### Speech recognition using Pocket-Sphinx #########
    # obtain audio from microphone
    r = sr.Recognizer()
    # NOTE(review): `sr.callback_mics` is kept from the original. If `sr` is the
    # SpeechRecognition package it has no such attribute (`sr.Microphone()` may
    # have been intended) — confirm before relying on this path.
    with sr.callback_mics() as source:
        print("Say Hello")
        audio = r.listen(source)

    # write audio as a wav file
    wav_path = "./tmp/input.wav"
    with open(wav_path, "wb") as f:
        f.write(audio.get_wav_data())

    model_path = get_model_path()
    config = {
        'hmm': os.path.join(model_path, 'en-us'),  # acoustic (hidden Markov) model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),  # language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
    }

    ps = Pocketsphinx(**config)
    # Decode the file that was just written. The original joined the relative
    # path onto the Pocketsphinx data directory (a different location) and was
    # missing a comma after buffer_size.
    ps.decode(
        audio_file=wav_path,
        buffer_size=2048,
        no_search=False,
        full_utt=False)

    print(ps.hypothesis())  # output
def transcribe(audiofile):
    """Decode ``audiofile`` with a default Pocketsphinx and return its hypothesis."""
    recognizer = Pocketsphinx()
    decoded = recognizer.decode(audio_file=audiofile)
    return decoded.hypothesis()
def __init__(self, vocabulary, hmm_dir="/usr/local/share/" +
             "pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"):
    """
    Initiates the pocketsphinx instance.

    Arguments:
        vocabulary -- a PocketsphinxVocabulary instance
        hmm_dir -- the path of the Hidden Markov Model (HMM)

    Raises RuntimeError when hmm_dir does not exist.
    """
    self._logger = logging.getLogger(__name__)

    # quirky bug where first import doesn't work
    # try:
    #     import pocketsphinx as ps
    # except Exception:
    #     import pocketsphinx as ps
    from pocketsphinx import Pocketsphinx

    # Reserve a log file name; delete=False keeps the file after the handle closes.
    with tempfile.NamedTemporaryFile(prefix='psdecoder_', suffix='.log',
                                     delete=False) as log_handle:
        self._logfile = log_handle.name

    self._logger.debug(
        "Initializing PocketSphinx Decoder with hmm_dir '%s'", hmm_dir)

    # Check hmm_dir up front so failures produce a meaningful message.
    if not os.path.exists(hmm_dir):
        msg = ("hmm_dir '%s' does not exist! Please make sure that you "
               "have set the correct hmm_dir in your profile.") % hmm_dir
        self._logger.error(msg)
        raise RuntimeError(msg)

    # Check that all required files are there. Refer to:
    # http://cmusphinx.sourceforge.net/wiki/acousticmodelformat
    # for details
    required_files = ('mdef', 'feat.params', 'means', 'noisedict',
                      'transition_matrices', 'variances')
    missing_hmm_files = [
        fname for fname in required_files
        if not os.path.exists(os.path.join(hmm_dir, fname))
    ]

    mixweights = os.path.exists(os.path.join(hmm_dir, 'mixture_weights'))
    sendump = os.path.exists(os.path.join(hmm_dir, 'sendump'))
    if not (mixweights or sendump):
        # We only need mixture_weights OR sendump
        missing_hmm_files.append('mixture_weights or sendump')

    if missing_hmm_files:
        self._logger.warning(
            "hmm_dir '%s' is missing files: %s. Please "
            "make sure that you have set the correct "
            "hmm_dir in your profile.",
            hmm_dir, ', '.join(missing_hmm_files))

    # self._decoder = ps.Decoder(hmm=hmm_dir, logfn=self._logfile,
    #                            **vocabulary.decoder_kwargs)
    decoder_options = {'hmm': hmm_dir, 'logfn': self._logfile}
    decoder_options.update(**vocabulary.decoder_kwargs)
    recognizer = Pocketsphinx(**decoder_options)
    self._decoder = recognizer.decode()
def async_setup(hass, config):
    """Set up the speech-to-text component and register its services.

    Registers a "listen" service (record from the microphone using voice
    activity detection, then decode) and a "decode" service (decode WAV data
    from a file or from the service call).

    Args:
        hass: Home Assistant instance (its ``states``, ``bus``, ``services``
            and ``http`` are used).
        config: Configuration mapping; options are read from ``config[DOMAIN]``.

    Returns:
        True once the services and the stop listener are registered.
    """
    name = config[DOMAIN].get(CONF_NAME, DEFAULT_NAME)
    acoustic_model = os.path.expanduser(config[DOMAIN].get(
        CONF_ACOUSTIC_MODEL, DEFAULT_ACOUSTIC_MODEL))
    language_model = os.path.expanduser(config[DOMAIN].get(
        CONF_LANGUAGE_MODEL, DEFAULT_LANGUAGE_MODEL))
    dictionary = os.path.expanduser(config[DOMAIN].get(
        CONF_DICTIONARY, DEFAULT_DICTIONARY))

    audio_device_index = config[DOMAIN].get(CONF_AUDIO_DEVICE, DEFAULT_AUDIO_DEVICE)
    if (audio_device_index is not None) and (audio_device_index < 0):
        audio_device_index = None  # default device

    sample_width = 2  # 16-bit
    channels = 1  # mono
    sample_rate = config[DOMAIN].get(CONF_SAMPLE_RATE, DEFAULT_SAMPLE_RATE)
    buffer_size = config[DOMAIN].get(CONF_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)

    # Set up voice activity detection (VAD)
    import webrtcvad
    vad_mode = config[DOMAIN].get(CONF_VAD_MODE, DEFAULT_VAD_MODE)
    assert 0 <= vad_mode <= 3, 'VAD mode must be in [0-3]'
    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)  # aggressiveness (0-3)

    # Controls how phrase is recorded
    min_sec = config[DOMAIN].get(CONF_MIN_SEC, DEFAULT_MIN_SEC)
    silence_sec = config[DOMAIN].get(CONF_SILENCE_SEC, DEFAULT_SILENCE_SEC)
    timeout_sec = config[DOMAIN].get(CONF_TIMEOUT_SEC, DEFAULT_TIMEOUT_SEC)
    seconds_per_buffer = buffer_size / sample_rate

    # Create speech-to-text decoder
    from pocketsphinx import Pocketsphinx
    decoder = Pocketsphinx(hmm=acoustic_model,
                           lm=language_model,
                           dic=dictionary)

    import pyaudio
    data_format = pyaudio.get_format_from_width(sample_width)

    # Events for asynchronous recording/decoding
    recorded_event = threading.Event()
    decoded_event = threading.Event()
    decoded_phrase = None
    terminated = False

    # -------------------------------------------------------------------------

    state_attrs = {
        'friendly_name': 'Speech to Text',
        'icon': 'mdi:comment-text',
        'text': ''
    }

    @asyncio.coroutine
    def async_listen(call):
        """Record one phrase from the microphone and decode it."""
        nonlocal decoded_phrase, terminated
        decoded_phrase = None
        terminated = False

        hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LISTENING, state_attrs)

        # Recording state
        max_buffers = int(math.ceil(timeout_sec / seconds_per_buffer))
        silence_buffers = int(math.ceil(silence_sec / seconds_per_buffer))
        min_phrase_buffers = int(math.ceil(min_sec / seconds_per_buffer))
        in_phrase = False
        after_phrase = False
        finished = False
        recorded_data = bytearray()

        # PyAudio callback for each buffer from audio device
        def stream_callback(buf, frame_count, time_info, status):
            nonlocal max_buffers, silence_buffers, min_phrase_buffers
            nonlocal in_phrase, after_phrase
            nonlocal recorded_data, finished

            # Check maximum number of seconds to record
            max_buffers -= 1
            if max_buffers <= 0:
                # Timeout
                finished = True

                # Reset
                in_phrase = False
                after_phrase = False

            # Detect speech in buffer
            is_speech = vad.is_speech(buf, sample_rate)
            if is_speech and not in_phrase:
                # Start of phrase
                in_phrase = True
                after_phrase = False
                recorded_data += buf
                min_phrase_buffers = int(
                    math.ceil(min_sec / seconds_per_buffer))
            elif in_phrase and (min_phrase_buffers > 0):
                # In phrase, before minimum seconds
                recorded_data += buf
                min_phrase_buffers -= 1
            elif in_phrase and is_speech:
                # In phrase, after minimum seconds
                recorded_data += buf
            elif not is_speech:
                # Outside of speech
                if after_phrase and (silence_buffers > 0):
                    # After phrase, before stop
                    recorded_data += buf
                    silence_buffers -= 1
                elif after_phrase and (silence_buffers <= 0):
                    # Phrase complete
                    recorded_data += buf
                    finished = True

                    # Reset
                    in_phrase = False
                    after_phrase = False
                elif in_phrase and (min_phrase_buffers <= 0):
                    # Transition to after phrase
                    after_phrase = True
                    silence_buffers = int(
                        math.ceil(silence_sec / seconds_per_buffer))

            if finished:
                recorded_event.set()

            return (buf, pyaudio.paContinue)

        # Open microphone device
        audio = pyaudio.PyAudio()
        mic = audio.open(format=data_format,
                         channels=channels,
                         rate=sample_rate,
                         input_device_index=audio_device_index,
                         input=True,
                         stream_callback=stream_callback,
                         frames_per_buffer=buffer_size)

        loop = asyncio.get_event_loop()

        # Wait for recording to complete
        recorded_event.clear()
        mic.start_stream()
        yield from loop.run_in_executor(None, recorded_event.wait)

        # Stop audio
        mic.stop_stream()
        mic.close()
        audio.terminate()

        if not terminated:
            # Fire recorded event
            hass.bus.async_fire(
                EVENT_SPEECH_RECORDED,
                {
                    'name': name,  # name of the component
                    'size': len(recorded_data)  # bytes of recorded audio data
                })

            hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs)

            def decode():
                nonlocal decoded_phrase
                try:
                    with decoder.start_utterance():
                        decoder.process_raw(recorded_data, False, True)  # full utterance
                        hyp = decoder.hyp()
                        if hyp:
                            with decoder.end_utterance():
                                decoded_phrase = hyp.hypstr
                finally:
                    # Release the waiter even if the decoder raised.
                    decoded_event.set()

            # Decode in separate thread
            decoded_event.clear()
            thread = threading.Thread(target=decode, daemon=True)
            thread.start()
            yield from loop.run_in_executor(None, decoded_event.wait)

            if not terminated:
                thread.join()
                state_attrs['text'] = decoded_phrase
                hass.states.async_set(OBJECT_POCKETSPHINX, STATE_IDLE, state_attrs)

                # Fire decoded event
                hass.bus.async_fire(
                    EVENT_SPEECH_TO_TEXT,
                    {
                        'name': name,  # name of the component
                        'text': decoded_phrase
                    })

    # -------------------------------------------------------------------------

    @asyncio.coroutine
    def async_decode(call):
        """Decode WAV audio given as a file name or as raw bytes in the call."""
        nonlocal decoded_phrase, terminated
        decoded_phrase = None
        terminated = False

        if ATTR_FILENAME in call.data:
            # Use WAV file. Keep the complete container: decode() parses the
            # WAV header itself, so handing it bare frames would make
            # wave.open() fail.
            filename = call.data[ATTR_FILENAME]
            with open(filename, mode='rb') as wav_file:
                data = wav_file.read()
        else:
            # Use data directly from JSON
            filename = None
            data = bytearray(call.data[ATTR_DATA])

        hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs)

        def decode():
            nonlocal decoded_phrase, data, filename

            try:
                # Check if WAV is in the correct format.
                with io.BytesIO(data) as wav_data:
                    with wave.open(wav_data, mode='rb') as wav_file:
                        rate = wav_file.getframerate()
                        width = wav_file.getsampwidth()
                        channels = wav_file.getnchannels()
                        # Only the audio frames go to the decoder; the
                        # container header is not audio.
                        pcm = wav_file.readframes(wav_file.getnframes())

                _LOGGER.debug('rate=%s, width=%s, channels=%s.' %
                              (rate, width, channels))

                if (rate != 16000) or (width != 2) or (channels != 1):
                    # Convert to 16-bit 16Khz mono (required by pocketsphinx acoustic models)
                    _LOGGER.debug('Need to convert to 16-bit 16Khz mono.')
                    if shutil.which('sox') is None:
                        _LOGGER.error(
                            "'sox' command not found. Cannot convert WAV file to appropriate format. Expect poor performance."
                        )
                    else:
                        temp_input_file = None
                        try:
                            if filename is None:
                                # Need to write original WAV data out to a file for sox
                                temp_input_file = tempfile.NamedTemporaryFile(
                                    suffix='.wav', mode='wb+')
                                temp_input_file.write(data)
                                temp_input_file.seek(0)
                                filename = temp_input_file.name

                            # sox <IN> -r 16000 -e signed-integer -b 16 -c 1 <OUT>
                            with tempfile.NamedTemporaryFile(
                                    suffix='.wav', mode='wb+') as out_wav_file:
                                subprocess.check_call([
                                    'sox', filename,
                                    '-r', '16000',
                                    '-e', 'signed-integer',
                                    '-b', '16',
                                    '-c', '1',
                                    out_wav_file.name
                                ])
                                out_wav_file.seek(0)

                                # Use converted data
                                with wave.open(out_wav_file, 'rb') as converted:
                                    pcm = converted.readframes(
                                        converted.getnframes())
                        finally:
                            if temp_input_file is not None:
                                # Closing removes the temporary file now
                                # instead of relying on garbage collection.
                                temp_input_file.close()

                data = pcm

                # Process WAV data as a complete utterance (best performance)
                with decoder.start_utterance():
                    decoder.process_raw(data, False, True)  # full utterance
                    if decoder.hyp():
                        with decoder.end_utterance():
                            decoded_phrase = decoder.hyp().hypstr
            finally:
                # Release the waiter even if parsing, sox or decoding failed.
                decoded_event.set()

        loop = asyncio.get_event_loop()

        # Decode in separate thread
        decoded_event.clear()
        thread = threading.Thread(target=decode, daemon=True)
        thread.start()
        yield from loop.run_in_executor(None, decoded_event.wait)

        if not terminated:
            thread.join()
            state_attrs['text'] = decoded_phrase
            hass.states.async_set(OBJECT_POCKETSPHINX, STATE_IDLE, state_attrs)

            # Fire decoded event
            hass.bus.async_fire(
                EVENT_SPEECH_TO_TEXT,
                {
                    'name': name,  # name of the component
                    'text': decoded_phrase
                })

    # -------------------------------------------------------------------------

    hass.http.register_view(ExternalSpeechView)

    # Service to record commands
    hass.services.async_register(DOMAIN, SERVICE_LISTEN, async_listen)

    # Service to do speech to text
    hass.services.async_register(DOMAIN, SERVICE_DECODE, async_decode,
                                 schema=SCHEMA_SERVICE_DECODE)

    hass.states.async_set(OBJECT_POCKETSPHINX, STATE_IDLE, state_attrs)

    # Make sure everything terminates properly when Home Assistant stops
    @asyncio.coroutine
    def async_terminate(event):
        """Flag termination and wake any pending waits on both events."""
        nonlocal terminated
        terminated = True
        recorded_event.set()
        decoded_event.set()

    hass.bus.async_listen(EVENT_HOMEASSISTANT_STOP, async_terminate)

    _LOGGER.info('Started')
    return True
def __init__(self, *args, **kwargs):
    """Decode once with a default decoder, then initialise the base class."""
    default_decoder = Pocketsphinx()
    self.ps = default_decoder
    default_decoder.decode()

    # Base-class initialisation stays last, after the decoder has run.
    super(TestRawDecoder, self).__init__(*args, **kwargs)
from pocketsphinx import Pocketsphinx

# Decode with default arguments, with verbose output enabled.
ps = Pocketsphinx(verbose=True)
ps.decode()

# Show the decoder's hypothesis.
print(ps.hypothesis())
# Code retested by KhalsaLabs # You can use your own audio file in code # Raw or wav files would work perfectly # For mp3 files, you need to modify code (add codex) from __future__ import print_function import os from pocketsphinx import Pocketsphinx, get_model_path, get_data_path model_path = get_model_path() data_path = get_data_path() config = { 'hmm': os.path.join(model_path, 'en-us'), 'lm': os.path.join(model_path, 'en-us.lm.bin'), 'dict': os.path.join(model_path, 'cmudict-en-us.dict') } ps = Pocketsphinx(**config) ps.decode( audio_file=os.path.join(data_path, 'test1.wav'), # add your audio file here buffer_size=2048, no_search=False, full_utt=False) print(ps.hypothesis())
def loop(self):
    """Poll for a finished recording, transcribe it and answer known phrases.

    Runs until ROS shuts down. When ``self.outbuf`` is set, it is written to
    /tmp/input.wav, converted with ffmpeg, decoded with Pocketsphinx and the
    recording state is reset.
    """
    # loop
    while not rospy.core.is_shutdown():

        # if recording finished
        if self.outbuf is not None:

            # write output file
            print("writing output file")
            outfilename = '/tmp/input.wav'
            wav_out = wave.open(outfilename, 'wb')
            try:
                # NOTE(review): sample width is declared as 4 bytes while the
                # samples are packed as 2-byte '<h' values and channel 0 is
                # selected twice — kept exactly as in the original; confirm
                # this pairing is intentional.
                wav_out.setparams((1, 4, 20000, 0, 'NONE', 'not compressed'))
                print("Starting Reshape")
                x = np.reshape(self.outbuf[:, [0, 0]], (-1))
                print("writing frames")
                print(len(x))
                value_str = b''.join(struct.pack('<h', s) for s in x)
                wav_out.writeframes(value_str)
                print("Closing file")
            finally:
                # Close the file even if packing or writing fails.
                wav_out.close()

            model_path = get_model_path()
            config = {
                'hmm': os.path.join(model_path, 'en-us'),  # acoustic (hidden Markov) model
                'lm': os.path.join(model_path, 'en-us.lm.bin'),  # language model
                'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
                # 'samprate' : 16000
            }

            # Alternative conversion commands tried previously:
            # cmd= "ffmpeg -y -i /tmp/output.wav -ar 8000 -af asetrate=16000*" + pitch + ",aresample=16000,atempo=" + tempo + " -ac 1 /tmp/outputConv.wav"
            # cmd = "ffmpeg -y -i /tmp/input.wav -f s32le -acodec pcm_s32le -ar 16000 -ac 1 /tmp/inputConv.wav"
            # cmd = "sox /tmp/input.wav -r 16000 inputConv.wav"
            # cmd = "ffmpeg -i /tmp/input.wav -ar 16000 /tmp/inputConv.wav"
            print("Converting via FFMPEG")
            cmd = "ffmpeg -y -i /tmp/input.wav -f s16le -acodec pcm_s16le -ar 16000 -af 'aresample=20000' -ac 1 /tmp/inputConv.wav -loglevel quiet"
            os.system(cmd)

            print("Decoding Via Pocketsphinx")
            ps = Pocketsphinx(**config)
            ps.decode(
                audio_file="/tmp/inputConv.wav",  # converted recording
                buffer_size=8192,
                no_search=False,
                full_utt=False)

            # Fetch the hypothesis once and reuse it for output and matching.
            text = ps.hypothesis()
            print("Recognized: ")
            print(text)  # output

            # Speech analysis
            if text == "hello":
                mml.say("Hello there human")  # Change this to whatever
            elif "how are you" in text:
                mml.say("I'm always good")

            print("END")

            # Reset recording state for the next capture
            self.micbuf = np.zeros((0, 4), 'uint16')
            self.outbuf = None
            self.buffer_stuff = 0
            self.playchan = 0
            self.playsamp = 0

        # state
        time.sleep(0.02)