def text_to_speech(self, text: str, slow: bool = False, use_cache: bool = True) -> str:
    """Synthesize ``text`` to a WAV file via eSpeak NG and return its path.

    If a valid cached file already exists for ``text`` (and ``use_cache`` is
    true), that path is returned without re-synthesizing.

    :param text: the text to speak.
    :param slow: use a slower speaking rate (80 wpm instead of 150).
    :param use_cache: allow reuse of a previously generated file.
    :return: path to the WAV file containing the synthesized speech.
    """
    # TODO: Allow various settings to be changed via config option
    from espeakng import ESpeakNG

    file_path = self._get_cache_file_path(text=text, use_cache=use_cache)
    if self._is_valid_cached_file(file_path=file_path, use_cache=use_cache):
        # Lazy %-args: formatting is skipped when DEBUG is disabled.
        LOG.debug("Using existing cached file: %s", file_path)
        return file_path

    # NOTE(review): `trace` is not a stdlib logging level -- presumably the
    # project logger defines it; confirm.
    LOG.trace('Performing TTS on text "%s" and saving result to %s', text, file_path)

    esng = ESpeakNG()
    esng.voice = "en-us"
    esng.pitch = 32  # fix: was assigned twice in a row; duplicate removed
    if slow:
        esng.speed = 80
    else:
        esng.speed = 150

    wave_data = esng.synth_wav(text)
    with open(file_path, "wb") as fp:
        fp.write(wave_data)
    return file_path
def text_to_speech(self, text, voice='en-us', bit_rate=8000):
    """Synthesize ``text`` into the shared temp file and convert it.

    :param text: the text to speak.
    :param voice: eSpeak NG voice name (default ``'en-us'``).
    :param bit_rate: target bit rate passed to ``self._convert``.
    """
    speak = ESpeakNG(volume=200)
    speak.voice = voice
    wav_data = speak.synth_wav(text)
    self._tmp_in_file.seek(0)
    self._tmp_in_file.write(wav_data)
    # fix: the file is rewritten from offset 0 on every call; without
    # truncate(), a shorter synthesis would leave stale trailing bytes
    # from a previous, longer WAV in the file.
    self._tmp_in_file.truncate()
    self._convert(bit_rate)
def test_synth_wav_xsampa(self):
    """X-SAMPA input synthesizes to a mono 22.05 kHz WAV of sane length."""
    engine = ESpeakNG(voice='english-us')
    engine.pitch = 32
    engine.speed = 150

    wav_bytes = engine.synth_wav("h@l'oU", fmt='xs')

    parsed = wave.open(BytesIO(wav_bytes))
    self.assertEqual(parsed.getnchannels(), 1)
    self.assertEqual(parsed.getframerate(), 22050)
    self.assertGreater(parsed.getnframes(), 20000)
def test_synth_wav(self):
    """Plain-text input synthesizes to a mono 22.05 kHz WAV of sane length."""
    engine = ESpeakNG(voice='english-us')
    engine.pitch = 32
    engine.speed = 150

    wav_bytes = engine.synth_wav('Hello World!')

    parsed = wave.open(BytesIO(wav_bytes))
    self.assertEqual(parsed.getnchannels(), 1)
    self.assertEqual(parsed.getframerate(), 22050)
    self.assertGreater(parsed.getnframes(), 24000)
class MService():
    """Builds a "song" by interleaving eSpeak-synthesized lyric words with
    slices of a base music track (all audio handled as pydub AudioSegments).
    """

    def __repr__(self):
        return "Mservice"

    def __init__(self, *args, **kwargs):
        # Private eSpeak NG engine used for word-by-word synthesis.
        self.__espeak = ESpeakNG()
        self.__espeak.speed = 150

    # Class-level configuration. Several of these are not referenced in
    # this class and are presumably consumed elsewhere -- TODO confirm.
    default_path = "distopianM"
    __hard_code_path = "sounds_db"
    __prefix = "-transform.wav"
    __suffix = "vocal-"
    __distopian_music = "distopianM"

    # Loaded once at class-definition time: importing this module fails
    # outright if the default track cannot be decoded.
    try:
        __distopian_segment = AudioSegment.from_file(default_path + "/distopian.mp3")
    except Exception as exc:
        # fix: was a bare `except:` which also swallowed BaseExceptions
        # (KeyboardInterrupt, SystemExit) and discarded the real decode
        # error; narrowed and chained so the cause is preserved.
        raise FileNotFoundError("Default music was not found!") from exc

    def generate_song(self, lyrics, base_music=default_path, export_path="/",
                      name_file="out", export_format="mp3"):
        """Render *lyrics* over *base_music* and export the mix.

        :param lyrics: space-separated words; lower-cased before synthesis.
        :param base_music: path to the backing track (default: preloaded one).
        :param export_path: output path passed verbatim to ``export``.
        :param name_file: NOTE(review): accepted but never used -- the export
            target is ``export_path`` alone; confirm before relying on it.
        :param export_format: pydub export format (default "mp3").
        :raises FileNotFoundError: if *base_music* does not exist.
        """
        if not os.path.exists(base_music):
            raise FileNotFoundError("The base music was not found")
        if base_music == self.default_path:
            # Use the first 4 minutes (240000 ms) of the preloaded track.
            base_music_audio = self.__distopian_segment[0:240000]
        else:
            base_music_audio = AudioSegment.from_file(base_music)
        lyrics = lyrics.lower()
        sounds_lyrics = self.convert_lyrics_to_voice(lyrics)
        number_of_segments = len(sounds_lyrics)
        segments_music = self.split_segment(base_music_audio, number_of_segments)
        segments_lyrics = self.inject_silence_to_segments(sounds_lyrics, 1500)
        concatenated_segments = self.concatenate_segments(segments_lyrics, segments_music)
        concatenated_segments.export(f"{export_path}", format=export_format)

    def concatenate_segments(self, segment_voice, segment_music):
        """Interleave voice and music segments pairwise (voice first), then
        append the remaining tail of whichever list is longer."""
        empty_segment = AudioSegment.empty()
        if len(segment_voice) >= len(segment_music):
            low = len(segment_music)
            high = len(segment_voice)
            high_list = segment_voice
        else:
            low = len(segment_voice)
            high = len(segment_music)
            high_list = segment_music
        for x in range(0, low):
            empty_segment = empty_segment + segment_voice[x] + segment_music[x]
        for x in range(low, high):
            empty_segment = empty_segment + high_list[x]
        return empty_segment

    def inject_silence_to_segments(self, segments, duration_silence_ms=1500):
        """Return new segments padded with silence on both sides."""
        silence = AudioSegment.silent(duration=duration_silence_ms)
        return [silence + segment + silence for segment in segments]

    @cached(cache=LRUCache(maxsize=150))
    def __get_text_as_raw_voice(self, text):
        """Synthesize *text* with eSpeak and return it as an AudioSegment.

        NOTE(review): caching an instance method keys on ``self`` too and
        keeps the instance alive for the cache's lifetime.
        """
        wav_generated = self.__espeak.synth_wav(text)
        song_as_bytes = BytesIO(wav_generated)
        return self.segment_from_raw_data(song_as_bytes)

    def convert_lyrics_to_voice(self, lyrics):
        """Synthesize each space-separated word of *lyrics* to audio."""
        return [self.__get_text_as_raw_voice(word) for word in lyrics.split(" ")]

    def segment_from_raw_data(self, raw_data):
        """Decode a file-like object into an AudioSegment, closing it after."""
        raw_segment = AudioSegment.from_file(raw_data)
        raw_data.close()
        return raw_segment

    def split_segment(self, segment, number_of_segments):
        """Cut *segment* into roughly equal millisecond partitions.

        NOTE(review): the final boundary is appended even when the range
        already produced it, which can yield a zero-length trailing
        partition; preserved as-is to keep behaviour identical.
        """
        list_of_partitions = []
        miliseconds_segment = int(segment.duration_seconds * 1000)
        # Never request more partitions than there are milliseconds.
        if(number_of_segments > miliseconds_segment):
            number_of_segments = miliseconds_segment
        equality = miliseconds_segment // number_of_segments
        sorted_list = [x for x in range(equality, miliseconds_segment + 1, equality)]
        sorted_list.append(miliseconds_segment)
        sorted_list.sort()
        low_begin = 0
        for part in sorted_list:
            list_of_partitions.append(segment[low_begin:part])
            low_begin = part
        return list_of_partitions
class TTS(object): def __init__( self, host_tts='local', port_tts=8300, locale='en_US', engine='mary', voice='cmu-rms-hsmm', pitch=50, # 0-99 speed=175): # approx. words per minute self._host_tts = host_tts self._port_tts = port_tts self._locale = locale self._engine = engine self._voice = voice self._pitch = pitch self._speed = speed if host_tts == 'local': self.player = PulsePlayer('Local TTS Client') self.espeak = ESpeakNG() self.marytts = MaryTTS() self.picotts = PicoTTS() @property def locale(self): return self._locale @locale.setter def locale(self, v): self._locale = v @property def engine(self): return self._engine @engine.setter def engine(self, v): self._engine = v @property def voice(self): return self._voice @voice.setter def voice(self, v): self._voice = v @property def pitch(self): return self._pitch @pitch.setter def pitch(self, v): self._pitch = v @property def speed(self): return self._speed @speed.setter def speed(self, v): self._speed = v def synthesize(self, txt, mode='txt'): if self._host_tts == 'local': # import pdb; pdb.set_trace() wav = None if self.engine == 'mary': self.marytts.voice = self._voice self.marytts.locale = self._locale if mode == 'txt': wav = self.marytts.synth_wav(txt) elif mode == 'ipa': xs = ipa2mary('ipa', txt) wav = self.marytts.synth_wav(xs, fmt='xs') else: raise Exception("unknown mary mode '%s'" % mode) elif self.engine == 'espeak': if mode == 'txt': self.espeak.voice = self._voice self.espeak.speed = self._speed self.espeak.pitch = self._pitch wav = self.espeak.synth_wav(txt) # logging.debug ('synthesize: %s %s -> %s' % (txt, mode, repr(wav))) elif mode == 'ipa': xs = ipa2xsampa('ipa', txt) logging.debug('synthesize: %s %s -> %s' % (txt, mode, repr(xs))) wav = self.espeak.synth_wav(xs, fmt='xs') elif self.engine == 'pico': if mode == 'txt': self.picotts.voice = self._voice wav = self.picotts.synth_wav(txt) # logging.debug ('synthesize: %s %s -> %s' % (txt, mode, repr(wav))) else: raise Exception("unknown espeak mode 
'%s'" % mode) else: raise Exception("unknown engine '%s'" % self.engine) else: args = { 'l': self._locale, 'v': self._voice, 'e': self._engine, 'm': mode, 't': txt.encode('utf8') } url = 'http://%s:%s/tts/synth?%s' % ( self._host_tts, self._port_tts, urllib.urlencode(args)) response = requests.get(url) if response.status_code != 200: return None wav = response.content if wav: logging.debug('synthesize: %s %s -> WAV' % (txt, mode)) else: logging.error('synthesize: %s %s -> NO WAV' % (txt, mode)) return wav def play_wav(self, wav, async=False): if self._host_tts == 'local': if wav: self.player.play(wav, async) else: raise Exception('no wav given') else: url = 'http://%s:%s/tts/play' % (self._host_tts, self._port_tts) if async: url += '?async=t' response = requests.post(url, data=wav)