import wave
from io import BytesIO

import pyaudio
from picotts import PicoTTS


def speak_for_me(msg):
    # Synthesize the message with Pico TTS; synth_wav() returns WAV data as bytes,
    # so it is wrapped in BytesIO before the wave module opens it.
    picotts = PicoTTS()
    wavs = picotts.synth_wav(msg)
    wav = wave.open(BytesIO(wavs))
    # print(wav.getnchannels(), wav.getframerate(), wav.getnframes())

    # Stream the frames to the default output device via PyAudio.
    p = pyaudio.PyAudio()
    stream = p.open(format=p.get_format_from_width(wav.getsampwidth()),
                    channels=wav.getnchannels(),
                    rate=wav.getframerate(),
                    output=True)

    chunk = 1024
    data = wav.readframes(chunk)
    while data:
        stream.write(data)
        data = wav.readframes(chunk)

    stream.stop_stream()
    stream.close()
    p.terminate()
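A minimal usage sketch for the helper above; the greeting text is illustrative, not from the original source:

if __name__ == '__main__':
    # PicoTTS defaults to its en-US voice when none is set.
    speak_for_me('Hello from Pico TTS!')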
import librosa
import soundfile as sf
from picotts import PicoTTS


def create_voice_file(self, voice_text, voice_type, file_path):
    # Synthesize the text with the requested Pico voice and write the raw WAV.
    picotts = PicoTTS()
    picotts.voice = voice_type
    wavs = picotts.synth_wav(voice_text)
    with open(file_path, mode="wb") as f:
        f.write(wavs)
    # Reload and resample to 8 kHz, overwriting the file in place.
    y, s = librosa.load(file_path, sr=8000)
    sf.write(file_path, y, s)
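A usage sketch for the method above; the receiver object, voice name, and output path are placeholders for illustration:

# 'en-GB' is one of the stock Pico voices; object and path are hypothetical.
tts_helper.create_voice_file('Hello world', 'en-GB', '/tmp/hello_8k.wav')

import soundfile as sf
print(sf.info('/tmp/hello_8k.wav').samplerate)  # -> 8000 after the resample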
def test_synth_wav_de(self):
    # Pico output is mono 16 kHz; 'Hallo Welt!' should yield well over
    # 20000 frames (about 1.25 s of audio).
    picotts = PicoTTS(voice='de-DE')
    wavs = picotts.synth_wav('Hallo Welt!')
    wav = wave.open(StringIO.StringIO(wavs))
    self.assertEqual(wav.getnchannels(), 1)
    self.assertEqual(wav.getframerate(), 16000)
    self.assertGreater(wav.getnframes(), 20000)
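An analogous check for the default English voice could look like this; a sketch assuming Pico's usual mono 16 kHz output, not part of the original suite:

def test_synth_wav_en(self):
    picotts = PicoTTS(voice='en-US')
    wavs = picotts.synth_wav('Hello World!')
    wav = wave.open(StringIO.StringIO(wavs))
    self.assertEqual(wav.getnchannels(), 1)
    self.assertEqual(wav.getframerate(), 16000)
    self.assertGreater(wav.getnframes(), 0)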
import mido
import pyaudio
from picotts import PicoTTS


def setup_TTS():
    # TTS objects
    picotts = PicoTTS()
    p = pyaudio.PyAudio()
    # MIDI: open the default output and prefer a 'Keystation' input if present.
    outport = mido.open_output()
    input1 = mido.get_input_names()[0]
    for inp in mido.get_input_names():
        if inp.find('Keystation') > -1:
            input1 = inp
    return picotts, p, input1, outport
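A usage sketch for the setup helper; the variable names and the event handling are illustrative:

picotts, audio, midi_in_name, midi_out = setup_TTS()
with mido.open_input(midi_in_name) as port:
    for message in port:
        print(message)  # react to keyboard events, e.g. by speaking note names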
import io
import logging
import urllib.parse

import requests
import simpleaudio as sa

# Project-local helpers assumed importable from the surrounding package:
# MaryTTS, ipa2mary, mary2ipa, sequiturclient,
# DEFAULT_MARY_LOCALE and DEFAULT_MARY_VOICE.


class TTS(object):

    def __init__(self,
                 host_tts='local',
                 port_tts=8300,
                 locale=DEFAULT_MARY_LOCALE,
                 engine='mary',
                 voice=DEFAULT_MARY_VOICE,
                 pitch=50,    # 0-99
                 speed=175):  # approx. words per minute

        self._host_tts = host_tts
        self._port_tts = port_tts
        self._locale = locale
        self._engine = engine
        self._voice = voice
        self._pitch = pitch
        self._speed = speed

        if host_tts == 'local':
            self.marytts = MaryTTS()
            # lazy-loading to reduce package dependencies
            self.picotts = None

    @property
    def locale(self):
        return self._locale

    @locale.setter
    def locale(self, v):
        self._locale = v

    @property
    def engine(self):
        return self._engine

    @engine.setter
    def engine(self, v):
        self._engine = v

    @property
    def voice(self):
        return self._voice

    @voice.setter
    def voice(self, v):
        self._voice = v

    @property
    def pitch(self):
        return self._pitch

    @pitch.setter
    def pitch(self, v):
        self._pitch = v

    @property
    def speed(self):
        return self._speed

    @speed.setter
    def speed(self, v):
        self._speed = v

    def synthesize(self, txt, mode='txt'):

        if self._host_tts == 'local':
            # import pdb; pdb.set_trace()

            wav = None

            if self.engine == 'mary':

                self.marytts.voice = self._voice
                self.marytts.locale = self._locale

                if mode == 'txt':
                    wav = self.marytts.synth_wav(txt)
                elif mode == 'ipa':
                    xs = ipa2mary('ipa', txt)
                    wav = self.marytts.synth_wav(xs, fmt='xs')
                elif mode == 'mary':
                    wav = self.marytts.synth_wav(txt, fmt='xs')
                else:
                    raise Exception("unknown mary mode '%s'" % mode)

            elif self.engine == 'pico':

                if mode == 'txt':
                    if not self.picotts:
                        from picotts import PicoTTS
                        self.picotts = PicoTTS()
                    self.picotts.voice = self._voice
                    wav = self.picotts.synth_wav(txt)
                    # logging.debug('synthesize: %s %s -> %s' % (txt, mode, repr(wav)))
                else:
                    raise Exception("unknown pico mode '%s'" % mode)

            else:
                raise Exception("unknown engine '%s'" % self.engine)

        else:
            args = {'l': self._locale,
                    'v': self._voice,
                    'e': self._engine,
                    'm': mode,
                    't': txt.encode('utf8')}
            # urllib.parse.urlencode is the Python 3 spelling of urllib.urlencode.
            url = 'http://%s:%s/tts/synth?%s' % (
                self._host_tts, self._port_tts, urllib.parse.urlencode(args))
            response = requests.get(url)
            if response.status_code != 200:
                return None
            wav = response.content

        if wav:
            logging.debug('synthesize: %s %s -> WAV' % (txt, mode))
        else:
            logging.error('synthesize: %s %s -> NO WAV' % (txt, mode))

        return wav

    def play_wav(self, wav, async_play=False):

        if self._host_tts == 'local':
            if wav:
                # Decode the in-memory WAV and play it with simpleaudio.
                with io.BytesIO(wav) as tmp_buffer:
                    wave_read = sa.wave.open(tmp_buffer, 'rb')
                    wave_obj = sa.WaveObject.from_wave_read(wave_read)
                    play_obj = wave_obj.play()
                    if not async_play:
                        play_obj.wait_done()
            else:
                raise Exception('no wav data given')
        else:
            url = 'http://%s:%s/tts/play' % (self._host_tts, self._port_tts)
            if async_play:
                url += '?async=t'
            response = requests.post(url, data=wav)

    def say(self, utterance, async_play=False):
        wav = self.synthesize(utterance)
        self.play_wav(wav, async_play=async_play)

    def say_phn(self, phn, phn_format='mary', async_play=False):
        wav = self.synthesize(phn, mode=phn_format)
        self.play_wav(wav, async_play=async_play)

    def gen_phn(self, word, model='dicts/de_g2p_model-6', phn_format='mary'):

        if self._host_tts == 'local':

            if self.engine == 'mary':
                self.marytts.voice = self._voice
                self.marytts.locale = self._locale
                mp = self.marytts.g2p(word)
                if phn_format == 'mary':
                    return mp
                elif phn_format == 'ipa':
                    return mary2ipa(word, mp)
                else:
                    raise Exception("Format not supported: '%s'" % phn_format)

            elif self.engine == 'sequitur':
                return sequiturclient.sequitur_gen_ipa(model, word)

            else:
                raise Exception("unknown engine '%s'" % self.engine)

        else:
            args = {'l': self._locale,
                    'v': self._voice,
                    'e': self._engine,
                    't': word.encode('utf8')}
            url = 'http://%s:%s/tts/g2p?%s' % (
                self._host_tts, self._port_tts, urllib.parse.urlencode(args))
            response = requests.get(url)
            if response.status_code != 200:
                return None
            return response.json()['ipa']
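A short usage sketch for this class, assuming it is importable and a local Pico voice is installed; the spoken text is illustrative:

tts = TTS(engine='pico', voice='en-US')
wav = tts.synthesize('Hello world')  # WAV data as bytes
tts.play_wav(wav)                    # blocks until playback is done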
import logging
import urllib.parse

import requests

# Project-local helpers assumed importable: PulsePlayer, ESpeakNG, MaryTTS,
# ipa2mary, ipa2xsampa; PicoTTS comes from the picotts package.


class TTS(object):

    def __init__(self,
                 host_tts='local',
                 port_tts=8300,
                 locale='en_US',
                 engine='mary',
                 voice='cmu-rms-hsmm',
                 pitch=50,    # 0-99
                 speed=175):  # approx. words per minute

        self._host_tts = host_tts
        self._port_tts = port_tts
        self._locale = locale
        self._engine = engine
        self._voice = voice
        self._pitch = pitch
        self._speed = speed

        if host_tts == 'local':
            self.player = PulsePlayer('Local TTS Client')
            self.espeak = ESpeakNG()
            self.marytts = MaryTTS()
            self.picotts = PicoTTS()

    @property
    def locale(self):
        return self._locale

    @locale.setter
    def locale(self, v):
        self._locale = v

    @property
    def engine(self):
        return self._engine

    @engine.setter
    def engine(self, v):
        self._engine = v

    @property
    def voice(self):
        return self._voice

    @voice.setter
    def voice(self, v):
        self._voice = v

    @property
    def pitch(self):
        return self._pitch

    @pitch.setter
    def pitch(self, v):
        self._pitch = v

    @property
    def speed(self):
        return self._speed

    @speed.setter
    def speed(self, v):
        self._speed = v

    def synthesize(self, txt, mode='txt'):

        if self._host_tts == 'local':

            wav = None

            if self.engine == 'mary':

                self.marytts.voice = self._voice
                self.marytts.locale = self._locale

                if mode == 'txt':
                    wav = self.marytts.synth_wav(txt)
                elif mode == 'ipa':
                    xs = ipa2mary('ipa', txt)
                    wav = self.marytts.synth_wav(xs, fmt='xs')
                else:
                    raise Exception("unknown mary mode '%s'" % mode)

            elif self.engine == 'espeak':

                if mode == 'txt':
                    self.espeak.voice = self._voice
                    self.espeak.speed = self._speed
                    self.espeak.pitch = self._pitch
                    wav = self.espeak.synth_wav(txt)
                    # logging.debug('synthesize: %s %s -> %s' % (txt, mode, repr(wav)))
                elif mode == 'ipa':
                    xs = ipa2xsampa('ipa', txt)
                    logging.debug('synthesize: %s %s -> %s' % (txt, mode, repr(xs)))
                    wav = self.espeak.synth_wav(xs, fmt='xs')
                else:
                    raise Exception("unknown espeak mode '%s'" % mode)

            elif self.engine == 'pico':

                if mode == 'txt':
                    self.picotts.voice = self._voice
                    wav = self.picotts.synth_wav(txt)
                    # logging.debug('synthesize: %s %s -> %s' % (txt, mode, repr(wav)))
                else:
                    # the original raised "unknown espeak mode" here, an apparent
                    # copy-paste slip; the message now names the pico engine
                    raise Exception("unknown pico mode '%s'" % mode)

            else:
                raise Exception("unknown engine '%s'" % self.engine)

        else:
            args = {'l': self._locale,
                    'v': self._voice,
                    'e': self._engine,
                    'm': mode,
                    't': txt.encode('utf8')}
            url = 'http://%s:%s/tts/synth?%s' % (
                self._host_tts, self._port_tts, urllib.parse.urlencode(args))
            response = requests.get(url)
            if response.status_code != 200:
                return None
            wav = response.content

        if wav:
            logging.debug('synthesize: %s %s -> WAV' % (txt, mode))
        else:
            logging.error('synthesize: %s %s -> NO WAV' % (txt, mode))

        return wav

    def play_wav(self, wav, async_play=False):
        # renamed from 'async', which became a reserved word in Python 3.7

        if self._host_tts == 'local':
            if wav:
                self.player.play(wav, async_play)
            else:
                raise Exception('no wav given')
        else:
            url = 'http://%s:%s/tts/play' % (self._host_tts, self._port_tts)
            if async_play:
                url += '?async=t'
            response = requests.post(url, data=wav)
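A sketch of driving this class's eSpeak branch with IPA input, assuming the phoneme helpers above are available; the voice name and phoneme string are illustrative:

tts = TTS()
tts.engine = 'espeak'
tts.voice = 'en'
# synthesize() converts the IPA string to X-SAMPA via ipa2xsampa first
wav = tts.synthesize(u'həˈloʊ', mode='ipa')
tts.play_wav(wav)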
import os
import subprocess
import sys
import wave

import StringIO  # Python 2 module; this class predates Python 3
import pyttsx3
from picotts import PicoTTS


class VoiceOutput(object):

    def __init__(self, args):
        reload(sys)
        sys.setdefaultencoding('utf8')  # Python 2 only
        self.lang = args.get('lang', 'fr-FR')
        self.path = args.get('path', r"/tmp/output.wav")
        self.pitch = args.get('pitch', 50)
        self.amplitude = args.get('amplitude', 90)
        self.speed = args.get('speed', 100)
        self.espeak_exec_path = args.get('espeak_exec_path', r"/usr/bin/espeak")
        self.engine = args.get('engine', 'espeak')

        if os.name == 'nt':
            self.voice_output_engine = pyttsx3.init()
            if self.lang == 'fr-FR' or self.lang == 'fr_FR':
                # the french voice
                voice = self.voice_output_engine.getProperty('voices')[0]
                self.voice_output_engine.setProperty('voice', voice.id)
        else:
            if self.engine == 'picotts':
                self.voice_output_engine = PicoTTS()
                self.voice_output_engine.voice = self.lang

    def speak(self, text):
        if os.name == 'nt':
            self.voice_output_engine.say(text)
            self.voice_output_engine.runAndWait()
        else:
            if self.engine == 'espeak':
                espeak_command = [self.espeak_exec_path,
                                  '-v' + self.lang,
                                  '-s' + str(self.speed),
                                  '-a' + str(self.amplitude),
                                  '-p' + str(self.pitch),
                                  '-w' + self.path,
                                  text]
                # generate the file with eSpeak
                subprocess.call(espeak_command, stderr=sys.stderr)
                f = wave.open(self.path, "rb")
            if self.engine == 'picotts':
                self.voice_output_engine = PicoTTS()
                self.voice_output_engine.voice = self.lang
                synth = self.voice_output_engine.synth_wav(text)
                w = StringIO.StringIO(synth)
                f = wave.open(w)
            self.play(f)

    def play(self, f):
        import pyaudio

        # instantiate PyAudio
        p = pyaudio.PyAudio()
        # open stream
        stream = p.open(format=p.get_format_from_width(f.getsampwidth()),
                        channels=f.getnchannels(),
                        rate=f.getframerate(),
                        output=True)
        # define stream chunk
        chunk = 1024
        # read data
        data = f.readframes(chunk)
        # play stream
        while data:
            stream.write(data)
            data = f.readframes(chunk)
        # stop stream
        stream.stop_stream()
        stream.close()
        # close PyAudio
        p.terminate()
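A usage sketch for the class above on a non-Windows host; the argument values are illustrative:

voice = VoiceOutput({'engine': 'picotts', 'lang': 'fr-FR'})
voice.speak(u"Bonjour tout le monde")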
import glob
import logging
import os
import struct
import wave
from datetime import datetime
from io import BytesIO
from threading import Thread

import deepspeech
import numpy as np
import pyaudio
import soundfile
from picotts import PicoTTS

# Project-local helpers assumed importable: VADAudio, ActionProvider,
# Porcupine and the 'pixels' LED interface.


class BertaDeepSpeech(Thread):
    """
    Class for wake word detection (aka Porcupine) library. It creates an input
    audio stream from a microphone, monitors it, and upon detecting the
    specified wake word(s) prints the detection time and index of the wake
    word on the console.
    """

    def __init__(self,
                 library_path,
                 model_path,
                 keyword_paths,
                 sensitivities,
                 input_device_index=None,
                 output_path=None):
        """
        Constructor.

        :param library_path: Absolute path to Porcupine's dynamic library.
        :param model_path: Absolute path to the model parameter file.
        :param keyword_paths: List of absolute paths to keyword files.
        :param sensitivities: Sensitivity parameter for each wake word. For
            more information refer to 'include/pv_porcupine.h'. It uses the
            same sensitivity value for all keywords.
        :param input_device_index: Optional argument. If provided, audio is
            recorded from this input device. Otherwise, the default audio
            input device is used.
        :param output_path: If provided, recorded audio will be stored at
            this location at the end of the run.
        """

        super(BertaDeepSpeech, self).__init__()

        self._library_path = library_path
        self._model_path = model_path
        self._keyword_paths = keyword_paths
        self._sensitivities = sensitivities
        self._input_device_index = input_device_index
        self.db_model = None
        self.pt = PicoTTS()
        self.pa = pyaudio.PyAudio()

        self._output_path = output_path
        if self._output_path is not None:
            self._recorded_frames = []

        # Load DeepSpeech model
        print('Initializing model...')
        dirname = os.path.dirname(os.path.abspath(__file__))
        model_name = glob.glob(os.path.join(dirname, 'libs/*.tflite'))[0]
        logging.info("Model: %s", model_name)
        self.model = deepspeech.Model(model_name)
        try:
            scorer_name = glob.glob(os.path.join(dirname, '*.scorer'))[0]
            logging.info("Language model: %s", scorer_name)
            self.model.enableExternalScorer(scorer_name)
        except Exception:
            # No scorer found; continue without an external language model.
            pass

    def set_model(self, db_model):
        self.db_model = db_model

    def transcribe(self):
        # Start audio with VAD
        vad_audio = VADAudio(aggressiveness=1,
                             device=None,
                             input_rate=16000,
                             file=None)
        print("Listening (ctrl-C to exit)...")
        frames = vad_audio.vad_collector()

        # Stream from microphone to DeepSpeech using VAD
        stream_context = self.model.createStream()
        listening = False
        for frame in frames:
            if frame is not None:
                if not listening:
                    pixels.listen()
                    listening = True
                logging.debug("streaming frame")
                stream_context.feedAudioContent(np.frombuffer(frame, np.int16))
            else:
                if listening:
                    listening = False
                    pixels.think()
                logging.debug("end utterance")
                text = stream_context.finishStream()
                print("Recognized: %s" % text)
                log = (text, self.analyze(text))
                return (1, log)
                # NOTE: unreachable after the return above; kept as in the source.
                if 'stop recording' in text:
                    vad_audio.destroy()
                    return 1
                stream_context = self.model.createStream()

    def find_action(self, phrase):
        words = phrase.lower().split()
        default = [x for x in ActionProvider.plugins
                   if 'default' in x.categories][0]
        for word in words:
            action = [x for x in ActionProvider.plugins
                      if word in x.categories]
            if action:
                return action[0]()
        return default()

    def analyze(self, phrase):
        """Method that analyzes the given phrase, speaks and returns the answer"""
        # find the correct action to take
        action = self.find_action(phrase)
        # perform the action and get the answer
        answer = action.perform()
        # speak the answer
        self.speek(answer)
        # return the answer for saving into the database
        return answer

    def test_phrase(self, phrase):
        """Method used in the web application to test APIs manually"""
        # find the correct action to take
        action = self.find_action(phrase)
        # perform the action and get the answer
        answer = action.perform()
        # return the answer for saving into the database
        return answer

    def speek(self, answer):
        """Method that generates the audio data and plays it through the speakers"""
        self.pa = pyaudio.PyAudio()
        # 1kb of data at a time
        chunk = 1024
        # create the picotts wav
        wavs = self.pt.synth_wav(str(answer))
        # open wav for processing
        wav = wave.open(BytesIO(wavs))
        # create audio stream for output
        stream = self.pa.open(
            format=self.pa.get_format_from_width(wav.getsampwidth()),
            channels=wav.getnchannels(),
            rate=wav.getframerate(),
            output=True)

        data = wav.readframes(chunk)
        pixels.speak()
        print("speaking here")
        while data:
            stream.write(data)
            data = wav.readframes(chunk)
        print("done speaking")
        pixels.off()
        stream.stop_stream()
        stream.close()
        self.pa.terminate()

    def run(self):
        """
        Creates an input audio stream, initializes the wake word detection
        (Porcupine) object, and monitors the audio stream for occurrences of
        the wake word(s). It prints the time of detection and the index of
        each detected wake word.
        """

        num_keywords = len(self._keyword_paths)

        keywords = list()
        for x in self._keyword_paths:
            keywords.append(
                os.path.basename(x).replace('.ppn', '').replace(
                    '_compressed', '').split('_')[0])

        print('listening for:')
        for keyword, sensitivity in zip(keywords, self._sensitivities):
            print('- %s (sensitivity: %f)' % (keyword, sensitivity))

        porcupine = None
        pa = None
        audio_stream = None
        try:
            porcupine = Porcupine(library_path=self._library_path,
                                  model_path=self._model_path,
                                  keyword_paths=self._keyword_paths,
                                  sensitivities=self._sensitivities)

            pa = pyaudio.PyAudio()

            audio_stream = pa.open(rate=porcupine.sample_rate,
                                   channels=1,
                                   format=pyaudio.paInt16,
                                   input=True,
                                   frames_per_buffer=porcupine.frame_length,
                                   input_device_index=self._input_device_index)

            while True:
                pcm = audio_stream.read(porcupine.frame_length)
                pcm = struct.unpack_from("h" * porcupine.frame_length, pcm)

                if self._output_path is not None:
                    self._recorded_frames.append(pcm)

                result = porcupine.process(pcm)
                if result >= 0:
                    print('[%s] Detected %s' %
                          (str(datetime.now()), keywords[result]))
                    pixels.wakeup()
                    audio_stream.close()
                    ds_result = self.transcribe()
                    if ds_result[0]:
                        audio_stream = pa.open(
                            rate=porcupine.sample_rate,
                            channels=1,
                            format=pyaudio.paInt16,
                            input=True,
                            frames_per_buffer=porcupine.frame_length,
                            input_device_index=self._input_device_index)
                        return ds_result[1]

        except KeyboardInterrupt:
            print('stopping ...')
            raise KeyboardInterrupt
        finally:
            if porcupine is not None:
                porcupine.delete()

            if audio_stream is not None:
                audio_stream.close()

            if pa is not None:
                pa.terminate()

            if self._output_path is not None and len(self._recorded_frames) > 0:
                recorded_audio = np.concatenate(self._recorded_frames,
                                                axis=0).astype(np.int16)
                soundfile.write(self._output_path,
                                recorded_audio,
                                samplerate=porcupine.sample_rate,
                                subtype='PCM_16')
            pixels.off()

    _AUDIO_DEVICE_INFO_KEYS = ['index', 'name', 'defaultSampleRate',
                               'maxInputChannels']

    @classmethod
    def show_audio_devices_info(cls):
        """Provides information regarding the available audio devices."""
        pa = pyaudio.PyAudio()

        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            print(', '.join("'%s': '%s'" % (k, str(info[k]))
                            for k in cls._AUDIO_DEVICE_INFO_KEYS))

        pa.terminate()
def test_voices(self):
    picotts = PicoTTS()
    voices = picotts.voices
    self.assertGreater(len(voices), 5)
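Outside a test suite, the same property can enumerate the installed Pico voices; a minimal sketch:

from picotts import PicoTTS

picotts = PicoTTS()
for v in picotts.voices:
    print(v)  # e.g. en-US, en-GB, de-DE, es-ES, fr-FR, it-IT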
import time
import wave
import StringIO

import picamera
import pyaudio
from picotts import PicoTTS
from google.cloud import vision
from google.cloud.vision import types
from PIL import Image, ImageDraw
from firebase import firebase
from socketIO_client import SocketIO, LoggingNamespace

firebase = firebase.FirebaseApplication('https://metronome-nyc.firebaseio.com', None)
camera = picamera.PiCamera()
picotts = PicoTTS()

SERVER = 'api.memeboard.net'
PORT = 80
TRAIN_ID = 10011
CAR_ID = 0
STATION_LIST = [0, 1, 2, 3]
station_index = 1


def playSound():
    global picotts
    wavs = picotts.synth_wav('Stand clear of the closing doors please.')
    wav = wave.open(StringIO.StringIO(wavs))
'''
engine.say("the temperature is -21 celsius.")
engine.runAndWait()
'''

'''
import pyttsx3

engine = pyttsx3.init()
voices = engine.getProperty('voices')
for voice in voices:
    print("Voice:")
    print(" - ID: %s" % voice.id)
    print(" - Name: %s" % voice.name)
    print(" - Languages: %s" % voice.languages)
    print(" - Gender: %s" % voice.gender)
    print(" - Age: %s" % voice.age)
'''

import io
import wave

from picotts import PicoTTS

picotts = PicoTTS()
wavs = picotts.synth_wav("Hello World!")
wav = wave.open(io.BytesIO(wavs))
print(wav.getnchannels(), wav.getframerate(), wav.getnframes())

# equivalent CLI: pico2wave -l en-US -w file.wav "This is a test of pico"
import wave
from io import BytesIO

from picotts import PicoTTS


def synth_wav(sentence):
    # synth_wav() returns the WAV data itself (bytes), not a filename,
    # so it must be wrapped in BytesIO to be opened with the wave module.
    picotts = PicoTTS(voice='fr-FR')
    wav_data = picotts.synth_wav(sentence)
    wave.open(BytesIO(wav_data), 'rb')
    return wav_data
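A usage sketch for the function above; playing the returned bytes would mirror the PyAudio loop in the first example, but simply inspecting the result looks like this:

wav_bytes = synth_wav(u"Bonjour tout le monde")
wav = wave.open(BytesIO(wav_bytes), 'rb')
print(wav.getframerate(), wav.getnframes())  # 16000, <frame count>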