class TestVoice(Voice):
    """Spanish voice-assistant I/O.

    Records and plays audio through PyAudio, recognizes speech with a
    PocketSphinx JSGF-grammar decoder, and synthesizes speech either
    locally (pyttsx3) or remotely (Google Cloud Text-to-Speech).

    NOTE(review): a second class with this same name appears later in the
    file and shadows this one at import time — confirm which is intended.
    """

    # playback / recording
    FORMAT = pyaudio.paInt16   # 16-bit PCM samples
    CHANNELS = 1
    RATE = 44100               # audio sample rate (Hz)
    CHUNK = 1024               # frames per buffer
    FILE_NAME = 'aux.wav'      # scratch recording file
    # recognition
    MODELDIR = "es-ES"         # PocketSphinx acoustic-model directory
    GRAMMARDIR = "gram"        # JSGF grammar directory
    # text to speech
    # BUG FIX: this constant was previously also named RATE, silently
    # clobbering the 44100 Hz sample rate above, so recording/playback
    # ran at 150 Hz. Renamed to TTS_RATE.
    TTS_RATE = 150             # pyttsx3 speaking rate (words per minute)
    VOLUME = 0.9               # pyttsx3 volume, 0.0-1.0

    def __init__(self, file_name='aux.wav', raspi=False, local=True):
        """Set up audio, the PocketSphinx decoder and the TTS backend.

        Args:
            file_name: WAV file used as the scratch recording buffer.
            raspi: True when running on a Raspberry Pi (selects input device).
            local: True -> offline pyttsx3 TTS; False -> Google Cloud TTS.
        """
        self.FILE_NAME = file_name
        self.audio = pyaudio.PyAudio()
        self.raspi = raspi
        self.local = local

        # PocketSphinx decoder configuration.
        self.config = Decoder.default_config()
        self.config.set_string('-hmm',
                               os.path.join(self.MODELDIR, 'acoustic-model'))
        # NOTE: 'pronounciation' is misspelled in the distributed CMU model
        # files themselves — do not "fix" the filename.
        self.config.set_string(
            '-dict',
            os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
        self.config.set_string('-logfn', os.devnull)  # silence sphinx logging
        self.decoder = Decoder(self.config)

        self.r = sr.Recognizer()
        print("adjunting...")
        with sr.Microphone() as source:
            self.r.adjust_for_ambient_noise(source)

        # tts backend selection
        if self.local:
            self.tts = pyttsx3.init()
            self.tts.setProperty('rate', self.TTS_RATE)
            self.tts.setProperty('volume', self.VOLUME)
            self.tts.setProperty('voice', 'spanish-latin-am')
        else:
            # Google Cloud TTS: Spanish female voice, MP3 output.
            self.tts_client = texttospeech.TextToSpeechClient()
            self.tts_voice = texttospeech.types.VoiceSelectionParams(
                language_code='es-ES',
                ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
            self.tts_audio_config = texttospeech.types.AudioConfig(
                audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    def speak(self, phrase):
        """Speak *phrase* aloud using the configured TTS backend."""
        print('decir: ' + phrase)
        if self.local:
            self.tts.say(phrase)
            self.tts.runAndWait()
        else:
            # Synthesize via the cloud API, write the MP3, play it.
            synthesis_input = texttospeech.types.SynthesisInput(text=phrase)
            response = self.tts_client.synthesize_speech(
                synthesis_input, self.tts_voice, self.tts_audio_config)
            audio_file = 'tts.mp3'
            # The response's audio_content is binary.
            with open(audio_file, 'wb') as out:
                out.write(response.audio_content)
            print('reproducir voz sintetica')
            command = '/usr/bin/mpg321 ' + audio_file
            print(command)
            os.system(command)

    def play(self, filename):
        """Play a .wav file via PyAudio or a .mp3 file via mpg321.

        Args:
            filename: path to the audio file; format chosen by extension.
        """
        print('reproduciendo archivo: ' + filename)
        extension = filename.split('.')[-1]
        if extension == 'wav':
            wf = wave.open(filename, 'rb')
            stream = self.audio.open(
                format=self.audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)
            # Stream CHUNK-sized frames until the file is exhausted.
            data = wf.readframes(self.CHUNK)
            while len(data) > 0:
                stream.write(data)
                data = wf.readframes(self.CHUNK)
            stream.stop_stream()
            stream.close()
            wf.close()  # BUG FIX: the wave file handle was never closed
        elif extension == 'mp3':
            # NOTE(security): shell command built by concatenation — fine
            # for internal filenames, do not pass untrusted paths here.
            command = '/usr/bin/mpg321 ' + filename
            print(command)
            os.system(command)

    def listen(self, duration=3):
        """Record *duration* seconds of audio to FILE_NAME.

        Returns:
            Raw PCM bytes resampled to 16 kHz / 16-bit, the format
            PocketSphinx expects.
        """
        # Input device index 2 on the Pi, 7 on the desktop — hardware
        # specific; TODO confirm these indices on the target machines.
        if self.raspi:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=1,
                                     rate=self.RATE,
                                     input_device_index=2,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        else:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=self.CHANNELS,
                                     rate=self.RATE,
                                     input_device_index=7,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        frames = []
        for _ in range(int(self.RATE / self.CHUNK * duration)):
            frames.append(
                stream.read(self.CHUNK, exception_on_overflow=False))
        stream.stop_stream()
        stream.close()

        # Persist the recording so SpeechRecognition can re-read it.
        wave_file = wave.open(self.FILE_NAME, 'wb')
        wave_file.setnchannels(1 if self.raspi else self.CHANNELS)
        wave_file.setsampwidth(self.audio.get_sample_size(self.FORMAT))
        wave_file.setframerate(self.RATE)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()

        with sr.AudioFile(self.FILE_NAME) as source:
            audio = self.r.record(source)
        return audio.get_raw_data(convert_rate=16000, convert_width=2)

    def echo(self):
        """Play back the last recording."""
        self.play(self.FILE_NAME)

    def recognize(self):
        """Listen on the microphone and decode one utterance.

        Returns:
            The decoded hypothesis string, or None when nothing could be
            recognized (decoder error or empty hypothesis).
        """
        with sr.Microphone() as source:
            audio = self.r.listen(source)
        # raw_out = self.listen()
        try:
            self.decoder.start_utt()
            self.decoder.process_raw(audio.frame_data, False, True)
            self.decoder.end_utt()
            hyp = self.decoder.hyp()
            # hyp is None when decoding produced nothing; the attribute
            # access then raises and we fall through to return None.
            return hyp.hypstr
        except Exception:
            return None

    def loadGrammar(self, grammar):
        """Reinitialize the decoder with JSGF grammar GRAMMARDIR/<grammar>.gram."""
        grammar_file = grammar + '.gram'
        c_string = os.path.join(self.GRAMMARDIR, grammar_file)  # .encode('ascii')
        print(c_string)
        self.config.set_string('-jsgf', c_string)
        self.decoder.reinit(self.config)

    def close(self):
        """Release PyAudio resources."""
        self.audio.terminate()
class TestVoice(Voice):
    """Offline-only Spanish voice-assistant I/O.

    Records and plays audio through PyAudio, recognizes speech with a
    PocketSphinx JSGF-grammar decoder, and speaks via local pyttsx3 TTS.

    NOTE(review): this is the second class named TestVoice in the file;
    it shadows the earlier, cloud-capable definition at import time —
    confirm which is intended.
    """

    # playback / recording
    FORMAT = pyaudio.paInt16   # 16-bit PCM samples
    CHANNELS = 1
    RATE = 44100               # audio sample rate (Hz)
    CHUNK = 1024               # frames per buffer
    FILE_NAME = 'aux.wav'      # scratch recording file
    # recognition
    MODELDIR = "es-ES"         # PocketSphinx acoustic-model directory
    GRAMMARDIR = "gram"        # JSGF grammar directory
    # text to speech
    # BUG FIX: this constant was previously also named RATE, silently
    # clobbering the 44100 Hz sample rate above, so recording/playback
    # ran at 150 Hz. Renamed to TTS_RATE.
    TTS_RATE = 150             # pyttsx3 speaking rate (words per minute)
    VOLUME = 0.9               # pyttsx3 volume, 0.0-1.0

    def __init__(self, file_name='aux.wav', raspi=False):
        """Set up audio, the PocketSphinx decoder and the pyttsx3 engine.

        Args:
            file_name: WAV file used as the scratch recording buffer.
            raspi: True when running on a Raspberry Pi (selects input device).
        """
        self.FILE_NAME = file_name
        self.audio = pyaudio.PyAudio()
        self.raspi = raspi

        # PocketSphinx decoder configuration.
        self.config = Decoder.default_config()
        self.config.set_string('-hmm',
                               os.path.join(self.MODELDIR, 'acoustic-model'))
        # NOTE: 'pronounciation' is misspelled in the distributed CMU model
        # files themselves — do not "fix" the filename.
        self.config.set_string(
            '-dict',
            os.path.join(self.MODELDIR, 'pronounciation-dictionary.dict'))
        self.config.set_string('-logfn', os.devnull)  # silence sphinx logging
        self.decoder = Decoder(self.config)

        self.r = sr.Recognizer()
        print("adjunting...")
        with sr.Microphone() as source:
            self.r.adjust_for_ambient_noise(source)

        # tts
        self.tts = pyttsx3.init()
        self.tts.setProperty('rate', self.TTS_RATE)
        self.tts.setProperty('volume', self.VOLUME)
        self.tts.setProperty('voice', 'spanish-latin-am')

    def speak(self, phrase):
        """Speak *phrase* aloud with the local pyttsx3 engine (blocking)."""
        self.tts.say(phrase)
        self.tts.runAndWait()

    def play(self, filename):
        """Play a .wav file via PyAudio or a .mp3 file via playsound.

        Args:
            filename: path to the audio file; format chosen by extension.
        """
        # BUG FIX: was split('.')[1], which picks the wrong token for
        # names with extra dots (e.g. 'a.b.wav'); use the last token.
        extension = filename.split('.')[-1]
        if extension == 'wav':
            wf = wave.open(filename, 'rb')
            stream = self.audio.open(
                format=self.audio.get_format_from_width(wf.getsampwidth()),
                channels=wf.getnchannels(),
                rate=wf.getframerate(),
                output=True)
            # Stream CHUNK-sized frames until the file is exhausted.
            data = wf.readframes(self.CHUNK)
            while len(data) > 0:
                stream.write(data)
                data = wf.readframes(self.CHUNK)
            stream.stop_stream()
            stream.close()
            wf.close()  # BUG FIX: the wave file handle was never closed
        elif extension == 'mp3':
            playsound(filename)

    def listen(self, duration=3):
        """Record *duration* seconds of audio to FILE_NAME.

        Returns:
            Raw PCM bytes resampled to 16 kHz / 16-bit, the format
            PocketSphinx expects.
        """
        # Input device index 2 on the Pi, 7 on the desktop — hardware
        # specific; TODO confirm these indices on the target machines.
        if self.raspi:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=1,
                                     rate=self.RATE,
                                     input_device_index=2,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        else:
            stream = self.audio.open(format=self.FORMAT,
                                     channels=self.CHANNELS,
                                     rate=self.RATE,
                                     input_device_index=7,
                                     input=True,
                                     frames_per_buffer=self.CHUNK)
        frames = []
        for _ in range(int(self.RATE / self.CHUNK * duration)):
            frames.append(
                stream.read(self.CHUNK, exception_on_overflow=False))
        stream.stop_stream()
        stream.close()

        # Persist the recording so SpeechRecognition can re-read it.
        wave_file = wave.open(self.FILE_NAME, 'wb')
        wave_file.setnchannels(1 if self.raspi else self.CHANNELS)
        wave_file.setsampwidth(self.audio.get_sample_size(self.FORMAT))
        wave_file.setframerate(self.RATE)
        wave_file.writeframes(b''.join(frames))
        wave_file.close()

        with sr.AudioFile(self.FILE_NAME) as source:
            audio = self.r.record(source)
        return audio.get_raw_data(convert_rate=16000, convert_width=2)

    def echo(self):
        """Play back the last recording."""
        self.play(self.FILE_NAME)

    def recognize(self):
        """Listen on the microphone and decode one utterance.

        Returns:
            The decoded hypothesis string, or None when nothing could be
            recognized (decoder error or empty hypothesis).
        """
        with sr.Microphone() as source:
            audio = self.r.listen(source)
        # raw_out = self.listen()
        try:
            self.decoder.start_utt()
            self.decoder.process_raw(audio.frame_data, False, True)
            self.decoder.end_utt()
            hyp = self.decoder.hyp()
            # hyp is None when decoding produced nothing; the attribute
            # access then raises and we fall through to return None.
            return hyp.hypstr
        except Exception:
            return None

    def loadGrammar(self, grammar):
        """Reinitialize the decoder with JSGF grammar GRAMMARDIR/<grammar>.gram."""
        grammar_file = grammar + '.gram'
        c_string = os.path.join(self.GRAMMARDIR, grammar_file)  # .encode('ascii')
        print(c_string)
        self.config.set_string('-jsgf', c_string)
        self.decoder.reinit(self.config)

    def close(self):
        """Release PyAudio resources."""
        self.audio.terminate()