Example #1
def transform_audio_to_text(filename):

    user = expanduser("~")
    path = user + "/DTAI_Internship/src/speech_recognizer_node/data/"

    lm_file = path + "generated_language_model.lm"
    dict_file = path + "generated_dictionary.dic"

    hmm_file = user + "/.local/lib/python2.7/site-packages/pocketsphinx/model/en-us"

    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': lm_file,      # absolute path to the generated language model
        'dict': dict_file   # absolute path to the generated dictionary
    }

    ps = Pocketsphinx(**config)
    ps.decode(audio_file=os.path.join(data_path, filename),
              buffer_size=2048,
              no_search=False,
              full_utt=False)

    text = ps.hypothesis()

    print(text)

    return text
Example #2
def pocket():

	ps = Pocketsphinx()


	language_directory = os.path.dirname(os.path.realpath(__file__))
	
	print(language_directory)

	acoustic_parameters_directory = os.path.join(language_directory, "acoustic-model")
	language_model_file = os.path.join(language_directory, "language-model.lm.bin")
	phoneme_dictionary_file = os.path.join(language_directory, "pronounciation-dictionary.dict")
    
	config = Decoder.default_config()
	config.set_string("-hmm", acoustic_parameters_directory)  # set the path of the hidden Markov model (HMM) parameter files
	config.set_string("-lm", language_model_file)
	config.set_string("-dict", phoneme_dictionary_file)

	decoder = Decoder(config)

	with sr.AudioFile(s_dir + "/a bad situation could become dramatically worse. /a bad situation could become dramatically worse. .wav") as source:
		audio_data = r.record(source)  # `r` (sr.Recognizer) and `s_dir` are defined elsewhere in this module
	decoder.start_utt()
	decoder.process_raw(audio_data.get_raw_data(), False, True)  # process_raw expects raw bytes, not an AudioData object
	decoder.end_utt()

	print(decoder.hyp())

	ps.decode(
	    audio_file=os.path.join(s_dir, 'a bad situation could become dramatically worse. /a bad situation could become dramatically worse. .wav'),
	    buffer_size=2048,
	    no_search=False,
	    full_utt=False)
	print(ps.hypothesis())  # recognized text as a string
#pocket()
Example #3
    def test_lm(self):
        ps = Pocketsphinx(dic='deps/pocketsphinx/test/data/defective.dic',
                          mmap=False)

        # Decoding with 'defective' dictionary
        ps.decode()
        self.assertEqual(ps.hypothesis(), '')

        # Switch to 'turtle' language model
        turtle_lm = 'deps/pocketsphinx/test/data/turtle.lm.bin'
        lm = NGramModel(ps.get_config(), ps.get_logmath(), turtle_lm)
        ps.set_lm('turtle', lm)
        ps.set_search('turtle')

        # Decoding with 'turtle' language model
        ps.decode()
        self.assertEqual(ps.hypothesis(), '')

        # The word 'meters' isn't in the loaded dictionary
        # Let's add it manually
        ps.add_word('foobie', 'F UW B IY', False)
        ps.add_word('meters', 'M IY T ER Z', True)

        # Decoding with 'turtle' language model
        ps.decode()
        self.assertEqual(ps.hypothesis(), 'foobie meters meters')
Example #4
def main(args):
    # if not args:
    #     print("args are required")
    #     exit(0)

    config = getConfig()
    ps = Pocketsphinx(**config)

    # if (args[0] == '--test'):
    #     withGraphics = False
    #     testsRootDir = "./../tests"
    #     resultsDir = "./../testResults"

    #     for dirName in os.listdir(testsRootDir):

    #         for filename in os.listdir(testsRootDir + "/" + dirName):
    #             path = testsRootDir + "/" + dirName + "/" + filename

    #             if(dirName == "indfrdic"):
    #                 for deepFile in os.listdir(path):
    #                     print(f"I'm in if {path}/{deepFile} and file name: {filename}")
    #                     f = open(resultsDir + f"/{dirName}_{filename}_w_filter_results.txt",'a')
    #                     process(path + "/" + deepFile,ps,f, withGraphics)
    #             else:
    #                 print(f"I'm in else {path}")
    #                 f = open(resultsDir + f"/{dirName}_w_filter_results.txt",'a')
    #                 process(path,ps,f, withGraphics)
    # elif (args[0] == '-P'):
    withGraphics = True
    with open('result.txt', 'a') as f:
        process(args[0], ps, f, withGraphics)
Example #5
    def __init__(self, hmm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/modelo',
                       lm='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/leng.lm.bin',
                       dict='data/spanish/CIEMPIESS_Spanish_Models_581h/Models/dicc.dic',
                       grammar='data/gramatica-tp2.gram', dataPath='tmp/'):
        self.data_path = dataPath
        config = {
            'hmm': hmm,
            'lm': lm,
            'dict': dict
        }
        #model_path = get_model_path()

        self.ps = Pocketsphinx(**config)
        
        # Switch to JSGF grammar
        jsgf = Jsgf(grammar)
        rule = jsgf.get_rule('tp2.grammar')
        fsg = jsgf.build_fsg(rule, self.ps.get_logmath(), 7.5)
        self.ps.set_fsg('tp2', fsg)
        self.ps.set_search('tp2')

        # Síntesis
        self.tts_authenticator = IAMAuthenticator('<IBM_WATSON_API_KEY>')  # use your own IAM API key; never hardcode real credentials
        self.tts = TextToSpeechV1(authenticator=self.tts_authenticator)
        self.tts.set_service_url('https://stream.watsonplatform.net/text-to-speech/api')
Example #6
	def __init__(self, mode):

		# state
		self.micbuf = np.zeros((0, 4), 'uint16')
		self.outbuf = None
		self.buffer_stuff = 0
		self.mode = mode
		self.playchan = 0
		self.playsamp = 0
		
		# check mode
		if not (mode == "echo" or mode == "record" or mode == "record4"):
			error("argument not recognised")

		# robot name
		topic_base_name = "/" + os.getenv("MIRO_ROBOT_NAME")

		# publish
		topic = topic_base_name + "/control/stream"
		print ("publish", topic)
		self.pub_stream = rospy.Publisher(topic, Int16MultiArray, queue_size=0)

		# subscribe
		topic = topic_base_name + "/sensors/stream"
		print ("subscribe", topic)
		self.sub_stream = rospy.Subscriber(topic, UInt16MultiArray, self.callback_stream, queue_size=1, tcp_nodelay=True)

		# subscribe
		topic = topic_base_name + "/sensors/mics"
		print ("subscribe", topic)
		self.sub_mics = rospy.Subscriber(topic, Int16MultiArray, self.callback_mics, queue_size=5, tcp_nodelay=True)
		
		# report
		print "recording from 4 microphones for", RECORD_TIME, "seconds..."


		####### Speech Recognition using PocketSphinx #########
		

		model_path = get_model_path()
		data_path = get_data_path()

		config = {
			'hmm': os.path.join(model_path, 'en-us'),            # hidden Markov model (acoustic model)
			'lm': os.path.join(model_path, 'en-us.lm.bin'),      # language model
			'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
		}

		ps = Pocketsphinx(**config)
		ps.decode(
			audio_file="/tmp/input.wav",  # temporary input recording
			buffer_size=2048,
			no_search=False,
			full_utt=False)

		print("Recognized: ")
		print(ps.hypothesis())  # recognized text
		print("END")
Example #7
    def test_lattice(self):
        ps = Pocketsphinx()
        ps.decode()

        lattice = ps.get_lattice()
        self.assertEqual(lattice.write('tests/goforward.lat'), None)

        lattice = ps.get_lattice()
        self.assertEqual(lattice.write_htk('tests/goforward.htk'), None)
Example #8
 def __init__(self, keyword: str, kws_threshold: float):
     self._decoder = Pocketsphinx(keyphrase=keyword,
                                  lm=False,
                                  kws_threshold=kws_threshold)
     self._sound = pyaudio.PyAudio()
     self._audio_stream = self._sound.open(rate=_SAMPLE_RATE,
                                           channels=1,
                                           format=pyaudio.paInt16,
                                           input=True,
                                           frames_per_buffer=_FRAME_LENGTH)
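
The constructor above only wires up the keyword-spotting decoder and the microphone stream; a read loop is still needed to feed audio into the decoder. Below is a minimal sketch of such a loop, written as a hypothetical run() method (not part of the original snippet). It assumes the _FRAME_LENGTH constant from the snippet and uses only decoder calls (start_utt, process_raw, hyp, end_utt) that appear in the other examples here.

 def run(self):
     self._decoder.start_utt()
     try:
         while True:
             # Read one frame of 16-bit mono audio from the microphone
             buf = self._audio_stream.read(_FRAME_LENGTH, exception_on_overflow=False)
             self._decoder.process_raw(buf, False, False)
             if self._decoder.hyp() is not None:
                 print('keyword detected')
                 # Restart the utterance so detection can continue
                 self._decoder.end_utt()
                 self._decoder.start_utt()
     finally:
         self._audio_stream.close()
         self._sound.terminate()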
Example #9
 def test_cep_decoder_hypothesis(self):
     ps = Pocketsphinx()
     with open('deps/pocketsphinx/test/data/goforward.mfc', 'rb') as f:
         with ps.start_utterance():
             f.read(4)
             buf = f.read(13780)
             ps.process_cep(buf, False, True)
     self.assertEqual(ps.hypothesis(), 'go forward ten meters')
     self.assertEqual(ps.score(), -7095)
     self.assertEqual(ps.probability(), -32715)
Example #10
 def run(self):
     print_important("Info! Thread sphinx started.") 
     self.config = {
         'verbose': True,
         'hmm': os.path.join('s2m', 'core', 'sphinx', 'fr'),
         'lm': os.path.join('s2m', 'core', 'sphinx', 'fr.lm.dmp'),
         'dict': os.path.join('s2m', 'core', 'sphinx', 's2m.dict'),
         'jsgf': os.path.join('s2m', 'core', 'sphinx', 's2m.jsgf'),
     }
     self.pocketsphinx = Pocketsphinx(**self.config)
     self.ready = True
Example #11
def getPockerSphinxDecoder():
	model_path = get_model_path()
	data_path = get_data_path()
	config = {
		'verbose': False,
		'hmm': os.path.join(model_path, 'en-us'),
		'lm': os.path.join(model_path, 'en-us.lm.bin'),
		'dict': os.path.join(model_path, 'cmudict-en-us.dict')
	}

	return Pocketsphinx(**config)
Example #12
 def __init__(self, *args, **kwargs):
     self.ps = Pocketsphinx(
         lm=False,
         dic=False,
         allphone='deps/pocketsphinx/model/en-us/en-us-phone.lm.bin',
         lw=2.0,
         pip=0.3,
         beam=1e-200,
         pbeam=1e-20,
         mmap=False)
     self.ps.decode()
     super(TestPhoneme, self).__init__(*args, **kwargs)
Example #13
def getPockerSphinxDecoder():
    model_path = get_model_path()
    data_path = get_data_path()
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict'),
        # 'topn': 2,
        # 'ds':2,
        # 'maxwpf': 5,
        # 'maxhmmpf': 3000
    }
    return Pocketsphinx(**config)
Example #14
    def __init__(self, **kwargs):
        # signal.signal(signal.SIGINT, self.stop)
        self._no_search = False
        self._full_utt = False
        hotword = kwargs.pop('hotword', ['阿Q', 'R-cute'])
        self._hotwords = hotword if isinstance(hotword, list) else [hotword]

        model_path = get_model_path()
        opt = {
            'verbose': False,
            'hmm': os.path.join(model_path, 'en-us'),
            'lm': util.resource('sphinx/rcute.lm'),
            'dic': util.resource('sphinx/rcute.dic'),
        }
        opt.update(kwargs)
        self._rec = Pocketsphinx(**opt)
Example #15
    def test_jsgf(self):
        ps = Pocketsphinx(lm='deps/pocketsphinx/test/data/turtle.lm.bin',
                          dic='deps/pocketsphinx/test/data/turtle.dic')

        # Decoding with 'turtle' language model
        ps.decode()
        self.assertEqual(ps.hypothesis(), 'go forward ten meters')

        # Switch to JSGF grammar
        jsgf = Jsgf('deps/pocketsphinx/test/data/goforward.gram')
        rule = jsgf.get_rule('goforward.move2')
        fsg = jsgf.build_fsg(rule, ps.get_logmath(), 7.5)
        ps.set_fsg('goforward', fsg)
        ps.set_search('goforward')

        # Decoding with 'goforward' grammar
        ps.decode()
        self.assertEqual(ps.hypothesis(), 'go forward ten meters')
Example #16
	def __init__(self):

		model_path = get_model_path()
		print(model_path)
		data_path = get_data_path()
		config = {
			'hmm': os.path.join(model_path, 'en-us'),            # hidden Markov model (acoustic model)
			'lm': os.path.join(model_path, 'en-us.lm.bin'),      # language model
			'dict': os.path.join(model_path, 'testdict.dict')    # custom pronunciation dictionary
		}

		# Start the PocketSphinx decoder
		self.ps = Pocketsphinx(**config)
		# Variables for Audio
		self.micbuf = np.zeros((0, 4), 'uint16')
		self.outbuf = None
		self.buffer_stuff = 0
		self.audio_level = 0
		self.timeofclap = 0
		self.playchan = 0
		self.playsamp = 0
		self.startTime = 0
		self.TimeSinceLast = 0
		self.DemoPause = False
		self.PID = ''
		self.velocity = TwistStamped()
		
		# Variables for Illumination
		self.illum = UInt32MultiArray()
		self.illum.data = [0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF]
		self.illumInt = 0
		self.illumState = 0
		

		# robot name
		topic_base_name = "/" + os.getenv("MIRO_ROBOT_NAME")

		# Publisher for illum to control the LEDs while we are processing requests
		topic = topic_base_name + "/control/illum"
		self.pub_illum = rospy.Publisher(topic, UInt32MultiArray, queue_size=0)
		self.velocity_pub = rospy.Publisher(topic_base_name + "/control/cmd_vel", TwistStamped, queue_size=0)
		# subscribe
		topic = topic_base_name + "/sensors/mics"
		self.sub_mics = rospy.Subscriber(topic, Int16MultiArray, self.callback_mics, queue_size=1, tcp_nodelay=True)
Example #17
                def detect():
                    from pocketsphinx import Pocketsphinx, Ad
                    ad = Ad(None, 16000)  # default input
                    decoder = Pocketsphinx(lm=False,
                                           hmm=hmm,
                                           dic=dic,
                                           keyphrase=keyphrase,
                                           kws_threshold=kws_threshold)

                    buf = bytearray(2048)
                    with ad:
                        with decoder.start_utterance():
                            while ad.readinto(buf) >= 0:
                                decoder.process_raw(buf, False, False)
                                if decoder.hyp():
                                    with decoder.end_utterance():
                                        logging.info('Wake word detected for %s' % system)
                                        wake_statuses[system] = 'detected'
                                        break
Example #18
            def decode():
                nonlocal decoder, decoded_phrase

                # Dynamically load decoder
                if decoder is None:
                    _LOGGER.debug('Loading decoder')
                    hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LOADING, state_attrs)
                    decoder = Pocketsphinx(
                        hmm=acoustic_model,
                        lm=language_model,
                        dic=dictionary)
                    hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs)

                # Do actual decoding
                with decoder.start_utterance():
                    decoder.process_raw(recorded_data, False, True)  # full utterance
                    hyp = decoder.hyp()
                    if hyp:
                        with decoder.end_utterance():
                            decoded_phrase = hyp.hypstr

                decoded_event.set()
Example #19
    def __init__(self, mode='from_microphone', name_dataset='plays_ru'):
        self.current_dirname = os.path.dirname(os.path.realpath(__file__))
        self.work_mode = mode
        model_path = get_model_path()

        if not (name_dataset == 'plays_ru' or name_dataset == 'subtitles_ru'
                or name_dataset == 'conversations_ru'):
            print(
                '\n[E] Invalid name_dataset value. Valid options: plays_ru, subtitles_ru or conversations_ru\n'
            )

        if self.work_mode == 'from_file':
            config = {
                'hmm': os.path.join(model_path, 'zero_ru.cd_cont_4000'),
                'lm': os.path.join(model_path,
                                   'ru_bot_' + name_dataset + '.lm'),
                'dict': os.path.join(model_path,
                                     'ru_bot_' + name_dataset + '.dic')
            }
            self.speech_from_file = Pocketsphinx(**config)
        elif self.work_mode == 'from_microphone':
            self.speech_from_microphone = LiveSpeech(
                verbose=False,
                sampling_rate=16000,
                buffer_size=2048,
                no_search=False,
                full_utt=False,
                hmm=os.path.join(model_path, 'zero_ru.cd_cont_4000'),
                lm=os.path.join(model_path, 'ru_bot_' + name_dataset + '.lm'),
                dic=os.path.join(model_path,
                                 'ru_bot_' + name_dataset + '.dic'))
        else:
            print(
                '[E] Unsupported work mode; check the value of the mode argument.'
            )
Example #20
def async_setup(hass, config):
    name = config[DOMAIN].get(CONF_NAME, DEFAULT_NAME)
    hotword = config[DOMAIN].get(CONF_HOTWORD)
    acoustic_model = os.path.expanduser(config[DOMAIN].get(
        CONF_ACOUSTIC_MODEL, DEFAULT_ACOUSTIC_MODEL))
    dictionary = os.path.expanduser(config[DOMAIN].get(CONF_DICTIONARY,
                                                       DEFAULT_DICTIONARY))
    threshold = config[DOMAIN].get(CONF_THRESHOLD, DEFAULT_THRESHOLD)

    audio_device_str = config[DOMAIN].get(CONF_AUDIO_DEVICE,
                                          DEFAULT_AUDIO_DEVICE)
    sample_rate = config[DOMAIN].get(CONF_SAMPLE_RATE, DEFAULT_SAMPLE_RATE)
    buffer_size = config[DOMAIN].get(CONF_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)

    detected_event = threading.Event()
    detected_phrase = None
    terminated = False

    from pocketsphinx import Pocketsphinx, Ad
    decoder = Pocketsphinx(hmm=acoustic_model,
                           lm=False,
                           dic=dictionary,
                           keyphrase=hotword,
                           kws_threshold=threshold)

    audio_device = Ad(audio_device_str, sample_rate)

    state_attrs = {'friendly_name': 'Hotword', 'icon': 'mdi:microphone'}

    @asyncio.coroutine
    def async_listen(call):
        nonlocal terminated, detected_phrase
        terminated = False
        detected_phrase = None

        hass.states.async_set(OBJECT_DECODER, STATE_LISTENING, state_attrs)

        def listen():
            buf = bytearray(buffer_size)

            with audio_device:
                with decoder.start_utterance():
                    while not terminated and audio_device.readinto(buf) >= 0:
                        decoder.process_raw(buf, False, False)
                        hyp = decoder.hyp()
                        if hyp:
                            with decoder.end_utterance():
                                # Make sure the hotword is matched
                                detected_phrase = hyp.hypstr
                                if detected_phrase == hotword:
                                    break

            detected_event.set()

        # Listen asynchronously
        detected_event.clear()
        thread = threading.Thread(target=listen, daemon=True)
        thread.start()
        yield from asyncio.get_event_loop().run_in_executor(
            None, detected_event.wait)

        if not terminated:
            thread.join()
            hass.states.async_set(OBJECT_DECODER, STATE_IDLE, state_attrs)

            # Fire detected event
            hass.bus.async_fire(
                EVENT_HOTWORD_DETECTED,
                {
                    'name': name  # name of the component
                })

    hass.services.async_register(DOMAIN, SERVICE_LISTEN, async_listen)
    hass.states.async_set(OBJECT_DECODER, STATE_IDLE, state_attrs)

    # Make sure the hotword detector terminates properly when Home Assistant stops
    @asyncio.coroutine
    def async_terminate(event):
        nonlocal terminated
        terminated = True
        detected_event.set()

    hass.bus.async_listen(EVENT_HOMEASSISTANT_STOP, async_terminate)

    _LOGGER.info('Started')

    return True
Example #21
        def decode():
            nonlocal decoder, decoded_phrase, data, filename

            # Check if WAV is in the correct format.
            # Convert with sox if not.
            with io.BytesIO(data) as wav_data:
                with wave.open(wav_data, mode='rb') as wav_file:
                    rate, width, channels = wav_file.getframerate(), wav_file.getsampwidth(), wav_file.getnchannels()
                    _LOGGER.debug('rate=%s, width=%s, channels=%s.' % (rate, width, channels))

                    if (rate != 16000) or (width != 2) or (channels != 1):
                        # Convert to 16-bit 16 kHz mono (required by the pocketsphinx acoustic models)
                        _LOGGER.debug('Need to convert to 16-bit 16 kHz mono.')
                        if shutil.which('sox') is None:
                            _LOGGER.error("'sox' command not found. Cannot convert WAV file to appropriate format. Expect poor performance.")
                        else:
                            temp_input_file = None
                            if filename is None:
                                # Need to write original WAV data out to a file for sox
                                temp_input_file = tempfile.NamedTemporaryFile(suffix='.wav', mode='wb+')
                                temp_input_file.write(data)
                                temp_input_file.seek(0)
                                filename = temp_input_file.name

                            # sox <IN> -r 16000 -e signed-integer -b 16 -c 1 <OUT>
                            with tempfile.NamedTemporaryFile(suffix='.wav', mode='wb+') as out_wav_file:
                                subprocess.check_call(['sox',
                                                       filename,
                                                       '-r', '16000',
                                                       '-e', 'signed-integer',
                                                       '-b', '16',
                                                       '-c', '1',
                                                       out_wav_file.name])

                                out_wav_file.seek(0)

                                # Use converted data
                                with wave.open(out_wav_file, 'rb') as wav_file:
                                    data = wav_file.readframes(wav_file.getnframes())

                            if temp_input_file is not None:
                                # Clean up temporary file
                                del temp_input_file

            # Dynamically load decoder
            if decoder is None:
                _LOGGER.debug('Loading decoder')
                hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LOADING, state_attrs)
                decoder = Pocketsphinx(
                    hmm=acoustic_model,
                    lm=language_model,
                    dic=dictionary)
                hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs)

            # Process WAV data as a complete utterance (best performance)
            with decoder.start_utterance():
                decoder.process_raw(data, False, True)  # full utterance
                if decoder.hyp():
                    with decoder.end_utterance():
                        decoded_phrase = decoder.hyp().hypstr

            decoded_event.set()
Example #22
File: sp.py Project: moysn7/SMI
from pocketsphinx import Pocketsphinx

print(Pocketsphinx().decode())  # => "go forward ten meters"
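
This one-liner works because decode() returns the decoder object itself and printing it shows the current hypothesis; spelled out, it is equivalent to:

ps = Pocketsphinx()
ps.decode()
print(ps.hypothesis())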
Example #23
	def __init__(self):

		# state
		self.micbuf = np.zeros((0, 4), 'uint16')
		self.spkrbuf = None
		self.buffer_stuff = 0

		# robot name
		topic_base = "/" + os.getenv("MIRO_ROBOT_NAME") + "/"

		# publish
		topic = topic_base + "control/stream"
		print ("publish", topic)
		self.pub_stream = rospy.Publisher(topic, Int16MultiArray, queue_size=0)

		# subscribe
		topic = topic_base + "sensors/stream"
		print ("subscribe", topic)
		self.sub_stream = rospy.Subscriber(topic, UInt16MultiArray, self.callback_stream)

		# subscribe
		topic = topic_base + "sensors/mics"
		print ("subscribe", topic)
		self.sub_mics = rospy.Subscriber(topic, Int16MultiArray, self.callback_mics)
		
		# report
		print "recording on 4 microphone channels..."


		####### Speech Recognition using PocketSphinx #########
		

		# obtain audio from the microphone

		r = sr.Recognizer()
		with sr.Microphone() as source:  # record from the default microphone
			print("Say Hello")
			audio = r.listen(source)

		#write audio as a wav file
		with open("./tmp/input.wav", "wb") as f:

			f.write(audio.get_wav_data())

		model_path = get_model_path()
		data_path = get_data_path()

		config = {
			'hmm': os.path.join(model_path, 'en-us'),            # hidden Markov model (acoustic model)
			'lm': os.path.join(model_path, 'en-us.lm.bin'),      # language model
			'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
		}

		ps = Pocketsphinx(**config)
		ps.decode(
			audio_file="./tmp/input.wav",  # the recording written above
			buffer_size=2048,
			no_search=False,
			full_utt=False)

		print(ps.hypothesis())  # recognized text
Example #24
def transcribe(audiofile):
    return Pocketsphinx()\
        .decode(audio_file=audiofile)\
        .hypothesis()
Example #25
    def __init__(self, vocabulary, hmm_dir="/usr/local/share/" +
                 "pocketsphinx/model/hmm/en_US/hub4wsj_sc_8k"):
        """
        Initializes the pocketsphinx instance.

        Arguments:
            vocabulary -- a PocketsphinxVocabulary instance
            hmm_dir -- the path of the Hidden Markov Model (HMM)
        """

        self._logger = logging.getLogger(__name__)

        # quirky bug where first import doesn't work
        # try:
        #     import pocketsphinx as ps
        # except Exception:
        #     import pocketsphinx as ps
        from pocketsphinx import Pocketsphinx

        with tempfile.NamedTemporaryFile(prefix='psdecoder_',
                                         suffix='.log', delete=False) as f:
            self._logfile = f.name

        self._logger.debug("Initializing PocketSphinx Decoder with hmm_dir " +
                           "'%s'", hmm_dir)

        # Perform some checks on the hmm_dir so that we can display more
        # meaningful error messages if necessary
        if not os.path.exists(hmm_dir):
            msg = ("hmm_dir '%s' does not exist! Please make sure that you " +
                   "have set the correct hmm_dir in your profile.") % hmm_dir
            self._logger.error(msg)
            raise RuntimeError(msg)
        # Let's check if all required files are there. Refer to:
        # http://cmusphinx.sourceforge.net/wiki/acousticmodelformat
        # for details
        missing_hmm_files = []
        for fname in ('mdef', 'feat.params', 'means', 'noisedict',
                      'transition_matrices', 'variances'):
            if not os.path.exists(os.path.join(hmm_dir, fname)):
                missing_hmm_files.append(fname)
        mixweights = os.path.exists(os.path.join(hmm_dir, 'mixture_weights'))
        sendump = os.path.exists(os.path.join(hmm_dir, 'sendump'))
        if not mixweights and not sendump:
            # We only need mixture_weights OR sendump
            missing_hmm_files.append('mixture_weights or sendump')
        if missing_hmm_files:
            self._logger.warning("hmm_dir '%s' is missing files: %s. Please " +
                                 "make sure that you have set the correct " +
                                 "hmm_dir in your profile.",
                                 hmm_dir, ', '.join(missing_hmm_files))

        # self._decoder = ps.Decoder(hmm=hmm_dir, logfn=self._logfile,
        #                            **vocabulary.decoder_kwargs)
        config = {
            'hmm': hmm_dir,
            'logfn': self._logfile
        }
        config.update(**vocabulary.decoder_kwargs)

        ps = Pocketsphinx(**config)
        self._decoder = ps.decode()
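
For context, the vocabulary argument only needs to expose a decoder_kwargs mapping that gets merged into the decoder config; note also that decode() returns the Pocketsphinx object itself, which is why its result can be stored as the decoder. A minimal stand-in might look like this (hypothetical; the real PocketsphinxVocabulary class lives elsewhere in the project, and the paths are placeholders):

class StubVocabulary(object):
    """Hypothetical stand-in for a PocketsphinxVocabulary instance."""
    @property
    def decoder_kwargs(self):
        # Point these at a real language model and dictionary
        return {
            'lm': 'languagemodel.lm',
            'dict': 'dictionary.dic',
        }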
Example #26
def async_setup(hass, config):
    name = config[DOMAIN].get(CONF_NAME, DEFAULT_NAME)
    acoustic_model = os.path.expanduser(config[DOMAIN].get(
        CONF_ACOUSTIC_MODEL, DEFAULT_ACOUSTIC_MODEL))
    language_model = os.path.expanduser(config[DOMAIN].get(
        CONF_LANGUAGE_MODEL, DEFAULT_LANGUAGE_MODEL))
    dictionary = os.path.expanduser(config[DOMAIN].get(CONF_DICTIONARY,
                                                       DEFAULT_DICTIONARY))

    audio_device_index = config[DOMAIN].get(CONF_AUDIO_DEVICE,
                                            DEFAULT_AUDIO_DEVICE)
    if (audio_device_index is not None) and (audio_device_index < 0):
        audio_device_index = None  # default device

    sample_width = 2  # 16-bit
    channels = 1  # mono
    sample_rate = config[DOMAIN].get(CONF_SAMPLE_RATE, DEFAULT_SAMPLE_RATE)
    buffer_size = config[DOMAIN].get(CONF_BUFFER_SIZE, DEFAULT_BUFFER_SIZE)

    # Set up voice activity detection (VAD)
    import webrtcvad
    vad_mode = config[DOMAIN].get(CONF_VAD_MODE, DEFAULT_VAD_MODE)
    assert 0 <= vad_mode <= 3, 'VAD mode must be in [0-3]'
    vad = webrtcvad.Vad()
    vad.set_mode(vad_mode)  # aggressiveness (0-3)

    # Controls how phrase is recorded
    min_sec = config[DOMAIN].get(CONF_MIN_SEC, DEFAULT_MIN_SEC)
    silence_sec = config[DOMAIN].get(CONF_SILENCE_SEC, DEFAULT_SILENCE_SEC)
    timeout_sec = config[DOMAIN].get(CONF_TIMEOUT_SEC, DEFAULT_TIMEOUT_SEC)
    seconds_per_buffer = buffer_size / sample_rate

    # Create speech-to-text decoder
    from pocketsphinx import Pocketsphinx, Ad
    decoder = Pocketsphinx(hmm=acoustic_model,
                           lm=language_model,
                           dic=dictionary)

    import pyaudio
    data_format = pyaudio.get_format_from_width(sample_width)

    # Events for asynchronous recording/decoding
    recorded_event = threading.Event()
    decoded_event = threading.Event()
    decoded_phrase = None
    terminated = False

    # -------------------------------------------------------------------------

    state_attrs = {
        'friendly_name': 'Speech to Text',
        'icon': 'mdi:comment-text',
        'text': ''
    }

    @asyncio.coroutine
    def async_listen(call):
        nonlocal decoded_phrase, terminated
        decoded_phrase = None
        terminated = False

        hass.states.async_set(OBJECT_POCKETSPHINX, STATE_LISTENING,
                              state_attrs)

        # Recording state
        max_buffers = int(math.ceil(timeout_sec / seconds_per_buffer))
        silence_buffers = int(math.ceil(silence_sec / seconds_per_buffer))
        min_phrase_buffers = int(math.ceil(min_sec / seconds_per_buffer))
        in_phrase = False
        after_phrase = False
        finished = False

        recorded_data = bytearray()

        # PyAudio callback for each buffer from audio device
        def stream_callback(buf, frame_count, time_info, status):
            nonlocal max_buffers, silence_buffers, min_phrase_buffers
            nonlocal in_phrase, after_phrase
            nonlocal recorded_data, finished

            # Check maximum number of seconds to record
            max_buffers -= 1
            if max_buffers <= 0:
                # Timeout
                finished = True

                # Reset
                in_phrase = False
                after_phrase = False

            # Detect speech in buffer
            is_speech = vad.is_speech(buf, sample_rate)
            if is_speech and not in_phrase:
                # Start of phrase
                in_phrase = True
                after_phrase = False
                recorded_data += buf
                min_phrase_buffers = int(
                    math.ceil(min_sec / seconds_per_buffer))
            elif in_phrase and (min_phrase_buffers > 0):
                # In phrase, before minimum seconds
                recorded_data += buf
                min_phrase_buffers -= 1
            elif in_phrase and is_speech:
                # In phrase, after minimum seconds
                recorded_data += buf
            elif not is_speech:
                # Outside of speech
                if after_phrase and (silence_buffers > 0):
                    # After phrase, before stop
                    recorded_data += buf
                    silence_buffers -= 1
                elif after_phrase and (silence_buffers <= 0):
                    # Phrase complete
                    recorded_data += buf
                    finished = True

                    # Reset
                    in_phrase = False
                    after_phrase = False
                elif in_phrase and (min_phrase_buffers <= 0):
                    # Transition to after phrase
                    after_phrase = True
                    silence_buffers = int(
                        math.ceil(silence_sec / seconds_per_buffer))

            if finished:
                recorded_event.set()

            return (buf, pyaudio.paContinue)

        # Open microphone device
        audio = pyaudio.PyAudio()
        mic = audio.open(format=data_format,
                         channels=channels,
                         rate=sample_rate,
                         input_device_index=audio_device_index,
                         input=True,
                         stream_callback=stream_callback,
                         frames_per_buffer=buffer_size)

        loop = asyncio.get_event_loop()

        # Wait for recording to complete
        recorded_event.clear()
        mic.start_stream()
        yield from loop.run_in_executor(None, recorded_event.wait)

        # Stop audio
        mic.stop_stream()
        mic.close()
        audio.terminate()

        if not terminated:
            # Fire recorded event
            hass.bus.async_fire(
                EVENT_SPEECH_RECORDED,
                {
                    'name': name,  # name of the component
                    'size': len(recorded_data)  # bytes of recorded audio data
                })

            hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING,
                                  state_attrs)

            def decode():
                nonlocal decoded_phrase
                with decoder.start_utterance():
                    decoder.process_raw(recorded_data, False,
                                        True)  # full utterance
                    hyp = decoder.hyp()
                    if hyp:
                        with decoder.end_utterance():
                            decoded_phrase = hyp.hypstr

                decoded_event.set()

            # Decode in separate thread
            decoded_event.clear()
            thread = threading.Thread(target=decode, daemon=True)
            thread.start()
            yield from loop.run_in_executor(None, decoded_event.wait)

            if not terminated:
                thread.join()
                state_attrs['text'] = decoded_phrase
                hass.states.async_set(OBJECT_POCKETSPHINX, STATE_IDLE,
                                      state_attrs)

                # Fire decoded event
                hass.bus.async_fire(
                    EVENT_SPEECH_TO_TEXT,
                    {
                        'name': name,  # name of the component
                        'text': decoded_phrase
                    })

    # -------------------------------------------------------------------------

    @asyncio.coroutine
    def async_decode(call):
        nonlocal decoded_phrase, terminated
        decoded_phrase = None
        terminated = False

        if ATTR_FILENAME in call.data:
            # Use WAV file
            filename = call.data[ATTR_FILENAME]
            with wave.open(filename, mode='rb') as wav_file:
                data = wav_file.readframes(wav_file.getnframes())
        else:
            # Use data directly from JSON
            filename = None
            data = bytearray(call.data[ATTR_DATA])

        hass.states.async_set(OBJECT_POCKETSPHINX, STATE_DECODING, state_attrs)

        def decode():
            nonlocal decoded_phrase, data, filename

            # Check if WAV is in the correct format.
            # Convert with sox if not.
            with io.BytesIO(data) as wav_data:
                with wave.open(wav_data, mode='rb') as wav_file:
                    rate, width, channels = wav_file.getframerate(
                    ), wav_file.getsampwidth(), wav_file.getnchannels()
                    _LOGGER.debug('rate=%s, width=%s, channels=%s.' %
                                  (rate, width, channels))

                    if (rate != 16000) or (width != 2) or (channels != 1):
                        # Convert to 16-bit 16 kHz mono (required by the pocketsphinx acoustic models)
                        _LOGGER.debug('Need to convert to 16-bit 16 kHz mono.')
                        if shutil.which('sox') is None:
                            _LOGGER.error(
                                "'sox' command not found. Cannot convert WAV file to appropriate format. Expect poor performance."
                            )
                        else:
                            temp_input_file = None
                            if filename is None:
                                # Need to write original WAV data out to a file for sox
                                temp_input_file = tempfile.NamedTemporaryFile(
                                    suffix='.wav', mode='wb+')
                                temp_input_file.write(data)
                                temp_input_file.seek(0)
                                filename = temp_input_file.name

                            # sox <IN> -r 16000 -e signed-integer -b 16 -c 1 <OUT>
                            with tempfile.NamedTemporaryFile(
                                    suffix='.wav', mode='wb+') as out_wav_file:
                                subprocess.check_call([
                                    'sox', filename, '-r', '16000', '-e',
                                    'signed-integer', '-b', '16', '-c', '1',
                                    out_wav_file.name
                                ])

                                out_wav_file.seek(0)

                                # Use converted data
                                with wave.open(out_wav_file, 'rb') as wav_file:
                                    data = wav_file.readframes(
                                        wav_file.getnframes())

                            if temp_input_file is not None:
                                # Clean up temporary file
                                del temp_input_file

            # Process WAV data as a complete utterance (best performance)
            with decoder.start_utterance():
                decoder.process_raw(data, False, True)  # full utterance
                if decoder.hyp():
                    with decoder.end_utterance():
                        decoded_phrase = decoder.hyp().hypstr

            decoded_event.set()

        loop = asyncio.get_event_loop()

        # Decode in separate thread
        decoded_event.clear()
        thread = threading.Thread(target=decode, daemon=True)
        thread.start()
        yield from loop.run_in_executor(None, decoded_event.wait)

        if not terminated:
            thread.join()
            state_attrs['text'] = decoded_phrase
            hass.states.async_set(OBJECT_POCKETSPHINX, STATE_IDLE, state_attrs)

            # Fire decoded event
            hass.bus.async_fire(
                EVENT_SPEECH_TO_TEXT,
                {
                    'name': name,  # name of the component
                    'text': decoded_phrase
                })

    # -------------------------------------------------------------------------

    hass.http.register_view(ExternalSpeechView)

    # Service to record commands
    hass.services.async_register(DOMAIN, SERVICE_LISTEN, async_listen)

    # Service to do speech to text
    hass.services.async_register(DOMAIN,
                                 SERVICE_DECODE,
                                 async_decode,
                                 schema=SCHEMA_SERVICE_DECODE)

    hass.states.async_set(OBJECT_POCKETSPHINX, STATE_IDLE, state_attrs)

    # Make sure everything terminates properly when Home Assistant stops
    @asyncio.coroutine
    def async_terminate(event):
        nonlocal terminated
        terminated = True
        recorded_event.set()
        decoded_event.set()

    hass.bus.async_listen(EVENT_HOMEASSISTANT_STOP, async_terminate)

    _LOGGER.info('Started')

    return True
Example #27
 def __init__(self, *args, **kwargs):
     self.ps = Pocketsphinx()
     self.ps.decode()
     super(TestRawDecoder, self).__init__(*args, **kwargs)
Example #28
from pocketsphinx import Pocketsphinx

ps = Pocketsphinx(verbose=True)
ps.decode()

print(ps.hypothesis())
Example #29
# Code retested by KhalsaLabs
# You can use your own audio file in this code
# Raw or WAV files work perfectly
# For mp3 files, you need to modify the code (add a codec); see the sketch after this example

from __future__ import print_function
import os
from pocketsphinx import Pocketsphinx, get_model_path, get_data_path

model_path = get_model_path()
data_path = get_data_path()

config = {
    'hmm': os.path.join(model_path, 'en-us'),
    'lm': os.path.join(model_path, 'en-us.lm.bin'),
    'dict': os.path.join(model_path, 'cmudict-en-us.dict')
}

ps = Pocketsphinx(**config)
ps.decode(
    audio_file=os.path.join(data_path,
                            'test1.wav'),  # add your audio file here
    buffer_size=2048,
    no_search=False,
    full_utt=False)

print(ps.hypothesis())
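
As the header comments note, mp3 input needs a decoding step first, since pocketsphinx only consumes raw/WAV PCM. One option is to convert the file to 16-bit 16 kHz mono WAV before calling decode(); the sketch below assumes ffmpeg is installed and on the PATH, and the filenames are placeholders.

import subprocess

def mp3_to_wav(mp3_path, wav_path):
    # Convert to the format the en-us acoustic model expects:
    # 16-bit signed PCM, 16 kHz sample rate, one channel
    subprocess.check_call([
        'ffmpeg', '-y', '-i', mp3_path,
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        wav_path,
    ])

mp3_to_wav('test1.mp3', 'test1.wav')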
Example #30
    def loop(self):

        # loop
        while not rospy.core.is_shutdown():
            # if recording finished
            if self.outbuf is not None:
                # write output file
                print("writing output file")
                outfilename = '/tmp/input.wav'
                file = wave.open(outfilename, 'wb')
                file.setparams((1, 2, 20000, 0, 'NONE', 'not compressed'))  # mono, 2-byte samples to match struct.pack('<h') below
                print("Starting Reshape")
                x = np.reshape(self.outbuf[:, [0, 0]], (-1))
                print("writing frames")
                print(len(x))
                values = []
                for s in x:
                    packed_value = struct.pack('<h', s)  # little-endian 16-bit sample
                    values.append(packed_value)
                value_str = b''.join(values)
                file.writeframes(value_str)

                print("Closing file")
                file.close()

                model_path = get_model_path()
                data_path = get_data_path()

                config = {
                    'hmm': os.path.join(model_path, 'en-us'),            # hidden Markov model (acoustic model)
                    'lm': os.path.join(model_path, 'en-us.lm.bin'),      # language model
                    'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
                    # 'samprate': 16000
                }
                #cmd= "ffmpeg -y -i /tmp/output.wav -ar 8000 -af asetrate=16000*" + pitch + ",aresample=16000,atempo=" + tempo + " -ac 1 /tmp/outputConv.wav"
                #cmd = "ffmpeg -y -i /tmp/input.wav -f s32le -acodec pcm_s32le -ar 16000 -ac 1 /tmp/inputConv.wav"
                #cmd = "sox /tmp/input.wav -r 16000 inputConv.wav"
                #cmd = "ffmpeg -i /tmp/input.wav -ar 16000 /tmp/inputConv.wav"
                print("Converting via FFMPEG")
                cmd = "ffmpeg -y -i /tmp/input.wav -f s16le -acodec pcm_s16le -ar 16000 -af 'aresample=20000' -ac 1 /tmp/inputConv.wav -loglevel quiet"
                os.system(cmd)
                print("Decoding Via Pocketsphinx")
                ps = Pocketsphinx(**config)
                ps.decode(
                    audio_file=(
                        "/tmp/inputConv.wav"),  #add temp input.wav file
                    buffer_size=8192,
                    no_search=False,
                    full_utt=False)

                print("Recognized: ")
                print(ps.hypothesis())  ## output

                ## Speech Analysis, (what to start?)
                if ps.hypothesis() == "hello":
                    mml.say("Hello there human")  # Change this to whatever
                elif ps.hypothesis().find("how are you") >= 0:
                    mml.say("I'm always good")
                print("END")
                self.micbuf = np.zeros((0, 4), 'uint16')
                self.outbuf = None
                self.buffer_stuff = 0

                self.playchan = 0
                self.playsamp = 0

            # state
            time.sleep(0.02)