class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > model file: ", model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.tts_config.text_cleaner],
                self.tts_config.phoneme_language,
                self.tts_config.enable_eos_bos_chars)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.tts_config.text_cleaner])
        self.tts_model = setup_model(self.input_size, self.tts_config)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000

    def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            pad=2,
            upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        if len(sens) == 0:
            sens = [text + '.']
        for sen in sens:
            if len(sen) < 3:
                continue
            sen = sen.strip()
            print(sen)
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > model file: ", model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [self.tts_config.text_cleaner],
                self.tts_config.phoneme_language,
                self.tts_config.enable_eos_bos_chars)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [self.tts_config.text_cleaner])
        # load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000

    def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", wavernn_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            if len(sen) < 3:
                continue
            sen = sen.strip()
            print(sen)
            seq = np.array(self.input_adapter(sen))
            text_hat = sequence_to_phoneme(seq)
            print(text_hat)
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            decoder_out, postnet_out, alignments, stop_tokens = self.tts_model.inference(
                chars_var)
            postnet_out = postnet_out[0].data.cpu().numpy()
            if self.tts_config.model == "Tacotron":
                wav = self.ap.inv_spectrogram(postnet_out.T)
            elif self.tts_config.model == "Tacotron2":
                if self.wavernn:
                    wav = self.wavernn.generate(
                        torch.FloatTensor(postnet_out.T).unsqueeze(0).cuda(),
                        batched=self.config.is_wavernn_batched,
                        target=11000,
                        overlap=550)
                else:
                    wav = self.ap.inv_mel_spectrogram(postnet_out.T)
            wavs += list(wav)
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        self.vocoder_model = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config, self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file,
                            self.config.pwgan_config, self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
        if lib_path:
            # set this if ParallelWaveGAN is not installed globally
            sys.path.append(lib_path)
        try:
            #pylint: disable=import-outside-toplevel
            from parallel_wavegan.models import ParallelWaveGANGenerator
        except ImportError as e:
            raise RuntimeError(
                f"cannot import parallel-wavegan, either install it or set its directory using the --pwgan_lib_path command line argument: {e}")
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    @staticmethod
    def split_into_sentences(text):
        text = " " + text + " <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(None, [s.strip() for s in sentences]))  # remove empty sentences
        return sentences

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            decoder_output, postnet_output, alignments, stop_tokens = run_model_torch(
                self.tts_model, inputs, self.tts_config, False, speaker_id, None)
            # convert outputs to numpy
            if self.vocoder_model:
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input = vocoder_input.cuda()
                wav = self.wavernn.generate(vocoder_input,
                                            batched=self.config.is_wavernn_batched,
                                            target=11000,
                                            overlap=550)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
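For context, this variant is driven by a simple attribute-style config. The sketch below is illustrative only: the AttrDict helper and every path are hypothetical placeholders, and only the attribute names match what the class actually reads. Note that this version's tts() has no Griffin-Lim fallback, so one of the vocoder options must actually load.

# Minimal driver sketch for the Synthesizer above (hypothetical paths).
class AttrDict(dict):
    """Hypothetical helper: attribute access over a plain dict."""
    __getattr__ = dict.__getitem__

config = AttrDict(
    use_cuda=False,
    tts_checkpoint="models/tts/checkpoint.pth.tar",          # placeholder path
    tts_config="models/tts/config.json",                     # placeholder path
    tts_speakers=None,
    vocoder_checkpoint="models/vocoder/checkpoint.pth.tar",  # placeholder path
    vocoder_config="models/vocoder/config.json",             # placeholder path
    wavernn_lib_path=None,   # skip the optional WaveRNN vocoder
    pwgan_file=None,         # skip the optional ParallelWaveGAN vocoder
    is_wavernn_batched=True,
)

synth = Synthesizer(config)
# tts() returns an io.BytesIO holding a complete wav file
with open("out.wav", "wb") as f:
    f.write(synth.tts("Hello world. This is a test.").getvalue())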
        mode=VC.mode,
        mulaw=VC.mulaw,
        pad=VC.pad,
        upsample_factors=VC.upsample_factors,
        feat_dims=VC.audio["num_mels"],
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=ap.hop_length,
        sample_rate=ap.sample_rate,
        use_aux_net=True,
        use_upsample_net=True)
    check = torch.load(args.vocoder_path)
    vocoder_model.load_state_dict(check['model'])
    vocoder_model.eval()
    if args.use_cuda:
        vocoder_model.cuda()
else:
    vocoder_model = None
    VC = None
    ap_vocoder = None

# synthesize voice
print(" > Text: {}".format(args.text))
_, _, _, wav = tts(model, vocoder_model, C, VC, args.text, ap,
wavernn = Model(
    rnn_dims=512,
    fc_dims=512,
    mode=VOCODER_CONFIG.mode,
    mulaw=VOCODER_CONFIG.mulaw,
    pad=VOCODER_CONFIG.pad,
    use_aux_net=VOCODER_CONFIG.use_aux_net,
    use_upsample_net=VOCODER_CONFIG.use_upsample_net,
    upsample_factors=VOCODER_CONFIG.upsample_factors,
    feat_dims=80,
    compute_dims=128,
    res_out_dims=128,
    res_blocks=10,
    hop_length=ap2.hop_length,
    sample_rate=ap2.sample_rate,
)
check = torch.load(VOCODER_MODEL_PATH,
                   map_location=torch.device('cuda' if use_cuda else 'cpu'))
wavernn.load_state_dict(check['model'])
if use_cuda:
    wavernn.cuda()
wavernn.eval()

model.decoder.max_decoder_steps = 500
br = 50

# synthesise_text is not defined in this snippet; see the sketch below.
stay = True
while stay:
    synthesise_text(input('Type a sentence to be synthesised > '))
    stay = input('Type exit to stop > ') != 'exit'
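The loop above calls a synthesise_text helper that the snippet does not define. Below is a plausible minimal reconstruction, not the original helper: it assumes the model, wavernn, ap, CONFIG, and use_cuda objects set up earlier, plus the TTS text_to_sequence utility, and it mirrors the mel handling and generate() arguments used in the Synthesizer variants elsewhere in this file.

# Hypothetical reconstruction of synthesise_text (all names assumed from context).
def synthesise_text(text):
    # encode text -> run Tacotron inference -> vocode with WaveRNN -> save
    inputs = torch.LongTensor(
        text_to_sequence(text, [CONFIG.text_cleaner])).unsqueeze(0)
    if use_cuda:
        inputs = inputs.cuda()
    _, postnet_output, _, _ = model.inference(inputs)
    mel = postnet_output[0].transpose(0, 1).unsqueeze(0)  # [1, n_mels, T]
    wav = wavernn.generate(mel.cuda() if use_cuda else mel,
                           batched=True, target=11000, overlap=550)
    ap.save_wav(wav, "out.wav")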
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.vocoder_model = None
        self.config = config
        print(config)
        self.seg = self.get_segmenter("en")
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_checkpoint,
                              self.config.wavernn_config, self.config.use_cuda)

    @staticmethod
    def get_segmenter(lang):
        return pysbd.Segmenter(language=lang, clean=True)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        return self.seg.segment(text)

    def tts(self, text, speaker_id=None):
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            _, postnet_output, _, _ = run_model_torch(self.tts_model, inputs,
                                                      self.tts_config, False,
                                                      speaker_id, None)
            if self.vocoder_model:
                # use native vocoder model
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                if self.use_cuda:
                    wav = wav.cpu().numpy()
                else:
                    wav = wav.numpy()
                wav = wav.flatten()
            elif self.wavernn:
                # use 3rd party WaveRNN
                vocoder_input = None
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                if self.use_cuda:
                    vocoder_input = vocoder_input.cuda()
                wav = self.wavernn.generate(vocoder_input,
                                            batched=self.config.is_wavernn_batched,
                                            target=11000,
                                            overlap=550)
            else:
                # use GL
                if self.use_cuda:
                    postnet_output = postnet_output[0].cpu()
                else:
                    postnet_output = postnet_output[0]
                postnet_output = postnet_output.numpy()
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        print(f" > Real-time factor: {process_time / audio_time}")
        return out
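Unlike the other variants, this version delegates sentence splitting to the pysbd library instead of the hand-rolled regex pipeline. A standalone sanity check of that segmenter, with an illustrative sample sentence:

# Quick check of the pysbd segmenter used by get_segmenter() above.
import pysbd

seg = pysbd.Segmenter(language="en", clean=True)
# The rule-based segmenter should keep abbreviations like "Dr." and "p.m." intact:
print(seg.segment("Dr. Smith arrived at 5 p.m. He was late!"))
# expected output: ['Dr. Smith arrived at 5 p.m.', 'He was late!']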
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        tts_config_path = resource_path(self.config.tts_config)
        tts_checkpoint_path = resource_path(self.config.tts_checkpoint)
        self.load_tts(tts_checkpoint_path, tts_config_path, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            wavernn_lib_path = resource_path(self.config.wavernn_lib_path)
            wavernn_path = resource_path(self.config.wavernn_path)
            self.load_wavernn(wavernn_lib_path, wavernn_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config, self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        print("\t[-] Loading TTS model ...")
        print("\t | > model config: ", tts_config)
        print("\t | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
        sys.path.append(lib_path)
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        print("\t[-] Loading WaveRNN model ...")
        print("\t | > model config: ", wavernn_config)
        print("\t | > model file: ", model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file)
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, text):
        text = f"{text}."  # add a full stop to the end
        wavs = []
        sens = self.split_into_sentences(text)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)
            if self.wavernn:
                postnet_output = postnet_output[0].data.cpu().numpy()
                wav = self.wavernn.generate(
                    torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(),
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_path, self.config.tts_file,
                      self.config.tts_config, config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(config.wavernn_lib_path, config.wavernn_path,
                              config.wavernn_file, config.wavernn_config,
                              config.use_cuda)

    def load_tts(self, model_path, model_file, model_config, use_cuda):
        tts_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_file)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(
                os.path.join(model_path, self.config.tts_speakers))
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(self.model_file, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp and self.tts_config.model in ["Tacotron", "TacotronGST"]:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if TTS is not installed globally
        from WaveRNN.models.wavernn import Model
        wavernn_config = os.path.join(model_path, model_config)
        model_file = os.path.join(model_path, model_file)
        self.wavernn_config = load_config(wavernn_config)
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file, map_location=torch.device('cpu'))
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " "
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = [s.strip() for s in sentences]
        return sentences

    def tts(self, sens):
        wavs = []
        #sens = self.split_into_sentences(text)
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)
            if self.wavernn:
                postnet_output = postnet_output[0].data.cpu().numpy()
                wav = self.wavernn.generate(
                    torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(),
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 1000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class Synthesizer(object):
    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not available on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config, self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_file, self.config.pwgan_config,
                            self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        ).cuda()
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, model_file, model_config, use_cuda):
        #pylint: disable=import-outside-toplevel
        from parallel_wavegan.models import ParallelWaveGANGenerator
        from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        self.pwgan_ap = AudioProcessorVocoder(**self.pwgan_config["audio"])
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        text = " " + text + " <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(None, [s.strip() for s in sentences]))
        return sentences

    def tts(self, text):
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        if not sens:
            sens = [text + '.']
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)
            if self.pwgan:
                input_tensor = torch.FloatTensor(postnet_output.T).unsqueeze(0)
                if self.use_cuda:
                    input_tensor = input_tensor.cuda()
                wav = self.pwgan.inference(
                    input_tensor,
                    hop_size=self.pwgan_ap.hop_length).data.cpu().numpy()
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)
            wavs += list(wav)
            wavs += [0] * 10000
        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
class MozillaTTS:
    """
    Wrapper for Mozilla TTS

    Related repositories:
    - Mozilla TTS:
        - https://github.com/mozilla/TTS - commit 824c091
        - data: https://drive.google.com/drive/folders/1FJRjGDAqWIyZRX4CsppaIPEW8UWXCWzF?usp=drive_open
    - WaveRNN (optional):
        - https://github.com/erogol/WaveRNN - commit 8a1c152
        - data: https://drive.google.com/drive/folders/1wpPn3a0KQc6EYtKL0qOi4NqEmhML71Ve
    """
    def __init__(self, tts_model, tts_config, wavernn_model=None,
                 wavernn_config=None, device="cpu"):
        from TTS.utils.generic_utils import load_config
        self.tts_config = load_config(tts_config)
        self.tts_config.windowing = True
        if not torch.cuda.is_available():
            device = "cpu"
        self.use_cuda = device != "cpu"
        self.device = torch.device(device)
        self.tts_model_path = tts_model
        self._load_tts()
        if wavernn_model and wavernn_config:
            self.use_gl = False
            self.batched_wavernn = True
            self.wavernn_model_path = wavernn_model
            self.wavernn_config = load_config(wavernn_config)
            self._load_wavernn()
        else:
            self.use_gl = True

    def _load_tts(self):
        # LOAD TTS MODEL
        from TTS.utils.text.symbols import symbols, phonemes
        from TTS.utils.audio import AudioProcessor
        from TTS.utils.generic_utils import setup_model
        # load the model
        num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
        self.tts_model = setup_model(num_chars, self.tts_config)
        # load the audio processor
        self._ap = AudioProcessor(**self.tts_config.audio)
        # load model state
        cp = torch.load(self.tts_model_path,
                        map_location=lambda storage, loc: storage)
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        self.tts_model.to(self.device)
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 2000

    def _load_wavernn(self):
        from WaveRNN.models.wavernn import Model
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode="mold",
            pad=2,
            upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
            feat_dims=self.wavernn_config.audio["num_mels"],
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self._ap.hop_length,
            sample_rate=self._ap.sample_rate,
        ).to(self.device)
        check = torch.load(self.wavernn_model_path)
        self.wavernn.load_state_dict(check['model'])
        self.wavernn.eval()

    def __call__(self, text, out_path):
        waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
            self.tts_model, text, self.tts_config, self.use_cuda, self._ap,
            False, self.tts_config.enable_eos_bos_chars)
        if not self.use_gl:
            waveform = self.wavernn.generate(
                torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(self.device),
                batched=self.batched_wavernn, target=11000, overlap=550)
        self._ap.save_wav(waveform, out_path)
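The wrapper is invoked like a function. A usage sketch follows; the file names are placeholders standing in for the checkpoints and configs linked in the docstring, and omitting the WaveRNN arguments falls back to Griffin-Lim decoding:

# Example invocation of the MozillaTTS wrapper (placeholder paths).
mtts = MozillaTTS(tts_model="checkpoint_261000.pth.tar",  # placeholder file
                  tts_config="config.json",               # placeholder file
                  wavernn_model=None,   # no WaveRNN -> use_gl=True (Griffin-Lim)
                  wavernn_config=None,
                  device="cuda")
mtts("Hello from the Mozilla TTS wrapper.", "hello.wav")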
    upsample_factors=VOCODER_CONFIG.upsample_factors,
    feat_dims=VOCODER_CONFIG.audio["num_mels"],
    compute_dims=128,
    res_out_dims=128,
    res_blocks=10,
    hop_length=ap_vocoder.hop_length,
    sample_rate=ap_vocoder.sample_rate,
    use_upsample_net=True,
    use_aux_net=True
).cuda()
check = torch.load(VOCODER_MODEL_PATH)
wavernn.load_state_dict(check['model'], strict=False)
if use_cuda:
    wavernn.cuda()
wavernn.eval()
print(check['step'])

# ### Comparison with https://mycroft.ai/blog/available-voices/
model.eval()
model.decoder.max_decoder_steps = 2000

# speaker_id = 0
for speaker_id in range(5):
    sentence = "怎么网络不好啊,为什么上不去"  # "Why is the network so bad? Why can't I get online?"
    align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap,
                                        use_gl=use_gl, figures=False,
                                        use_pinyin=use_pinyin)
    sentence = "晚饭吃什么"  # "What's for dinner?"
    align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap,
                                        use_gl=use_gl, figures=False,
                                        use_pinyin=use_pinyin)