class Synthesizer(object): def __init__(self, config): self.wavernn = None self.pwgan = None self.config = config self.use_cuda = self.config.use_cuda if self.use_cuda: assert torch.cuda.is_available( ), "CUDA is not availabe on this machine." self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file, self.config.wavernn_config, self.config.use_cuda) if self.config.pwgan_lib_path: self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file, self.config.pwgan_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): # pylint: disable=global-statement global symbols, phonemes print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) if 'characters' in self.tts_config.keys(): symbols, phonemes = make_symbols(**self.tts_config.characters) if self.use_phonemes: self.input_size = len(phonemes) else: self.input_size = len(symbols) # TODO: fix this for multi-speaker model - load speakers if self.config.tts_speakers is not None: self.tts_speakers = load_speaker_mapping(self.config.tts_speakers) num_speakers = len(self.tts_speakers) else: num_speakers = 0 self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) # load model state cp = torch.load(tts_checkpoint, map_location=torch.device('cpu')) # load the model self.tts_model.load_state_dict(cp['model']) if use_cuda: self.tts_model.cuda() self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 3000 if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) def load_wavernn(self, lib_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. sys.path.append( lib_path) # set this if WaveRNN is not installed globally #pylint: disable=import-outside-toplevel from WaveRNN.models.wavernn import Model print(" > Loading WaveRNN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(model_config) # This is the default architecture we use for our models. # You might need to update it self.wavernn = Model( rnn_dims=512, fc_dims=512, mode=self.wavernn_config.mode, mulaw=self.wavernn_config.mulaw, pad=self.wavernn_config.pad, use_aux_net=self.wavernn_config.use_aux_net, use_upsample_net=self.wavernn_config.use_upsample_net, upsample_factors=self.wavernn_config.upsample_factors, feat_dims=80, compute_dims=128, res_out_dims=128, res_blocks=10, hop_length=self.ap.hop_length, sample_rate=self.ap.sample_rate, ).cuda() check = torch.load(model_file, map_location="cpu") self.wavernn.load_state_dict(check['model']) if use_cuda: self.wavernn.cuda() self.wavernn.eval() def load_pwgan(self, lib_path, model_file, model_config, use_cuda): sys.path.append( lib_path) # set this if ParallelWaveGAN is not installed globally #pylint: disable=import-outside-toplevel from parallel_wavegan.models import ParallelWaveGANGenerator print(" > Loading PWGAN model ...") print(" | > model config: ", model_config) print(" | > model file: ", model_file) with open(model_config) as f: self.pwgan_config = yaml.load(f, Loader=yaml.Loader) self.pwgan = ParallelWaveGANGenerator( **self.pwgan_config["generator_params"]) self.pwgan.load_state_dict( torch.load(model_file, map_location="cpu")["model"]["generator"]) self.pwgan.remove_weight_norm() if use_cuda: self.pwgan.cuda() self.pwgan.eval() def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) self.ap.save_wav(wav, path) @staticmethod def split_into_sentences(text): text = " " + text + " <stop>" text = text.replace("\n", " ") text = re.sub(prefixes, "\\1<prd>", text) text = re.sub(websites, "<prd>\\1", text) if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>") text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text) text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text) text = re.sub( alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text) text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text) text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text) text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text) text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text) if "”" in text: text = text.replace(".”", "”.") if "\"" in text: text = text.replace(".\"", "\".") if "!" in text: text = text.replace("!\"", "\"!") if "?" in text: text = text.replace("?\"", "\"?") text = text.replace(".", ".<stop>") text = text.replace("?", "?<stop>") text = text.replace("!", "!<stop>") text = text.replace("<prd>", ".") sentences = text.split("<stop>") sentences = sentences[:-1] sentences = list(filter( None, [s.strip() for s in sentences])) # remove empty sentences return sentences def tts(self, text, speaker_id=None): wavs = [] sens = self.split_into_sentences(text) print(sens) speaker_id = id_to_torch(speaker_id) if speaker_id is not None and self.use_cuda: speaker_id = speaker_id.cuda() for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config) inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda) inputs = inputs.unsqueeze(0) # synthesize voice decoder_output, postnet_output, alignments, stop_tokens = run_model_torch( self.tts_model, inputs, self.tts_config, False, speaker_id, None) # convert outputs to numpy postnet_output, decoder_output, _, _ = parse_outputs_torch( postnet_output, decoder_output, alignments, stop_tokens) if self.pwgan: vocoder_input = torch.FloatTensor( postnet_output.T).unsqueeze(0) if self.use_cuda: vocoder_input.cuda() wav = self.pwgan.inference(vocoder_input, hop_size=self.ap.hop_length) elif self.wavernn: vocoder_input = None if self.tts_config.model == "Tacotron": vocoder_input = torch.FloatTensor( self.ap.out_linear_to_mel( linear_spec=postnet_output.T).T).T.unsqueeze(0) else: vocoder_input = torch.FloatTensor( postnet_output.T).unsqueeze(0) if self.use_cuda: vocoder_input.cuda() wav = self.wavernn.generate( vocoder_input, batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence wav = trim_silence(wav, self.ap) wavs += list(wav) wavs += [0] * 10000 out = io.BytesIO() self.save_wav(wavs, out) return out
class Synthesizer(object): def __init__(self, config): self.wavernn = None self.config = config self.use_cuda = self.config.use_cuda print(self.config) if self.use_cuda: assert torch.cuda.is_available(), "CUDA is not availabe on this machine." self.load_tts(self.config.tts_checkpoint, self.config.tts_config, self.config.use_cuda) if self.config.wavernn_lib_path: self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_path, self.config.wavernn_file, self.config.wavernn_config, self.config.use_cuda) def load_tts(self, tts_checkpoint, tts_config, use_cuda): print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > checkpoint file: ", tts_checkpoint) self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) if self.use_phonemes: self.input_size = len(phonemes) else: self.input_size = len(symbols) # load speakers if self.config.tts_speakers is not None: self.tts_speakers = load_speaker_mapping(os.path.join(model_path, self.config.tts_speakers)) num_speakers = len(self.tts_speakers) else: num_speakers = 0 self.tts_model = setup_model(self.input_size, num_speakers=num_speakers, c=self.tts_config) # load model state cp = torch.load(tts_checkpoint, map_location=torch.device('cpu')) # load the model self.tts_model.load_state_dict(cp['model']) if use_cuda: self.tts_model.cuda() self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 3000 if 'r' in cp: self.tts_model.decoder.set_r(cp['r']) def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): # TODO: set a function in wavernn code base for model setup and call it here. sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model wavernn_config = os.path.join(model_path, model_config) model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") print(" | > model config: ", wavernn_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(wavernn_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, mode=self.wavernn_config.mode, mulaw=self.wavernn_config.mulaw, pad=self.wavernn_config.pad, use_aux_net=self.wavernn_config.use_aux_net, use_upsample_net = self.wavernn_config.use_upsample_net, upsample_factors=self.wavernn_config.upsample_factors, feat_dims=80, compute_dims=128, res_out_dims=128, res_blocks=10, hop_length=self.ap.hop_length, sample_rate=self.ap.sample_rate, ).cuda() check = torch.load(model_file) self.wavernn.load_state_dict(check['model']) if use_cuda: self.wavernn.cuda() self.wavernn.eval() def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) self.ap.save_wav(wav, path) def split_into_sentences(self, text): text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1<prd>", text) text = re.sub(websites, "<prd>\\1", text) if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>") text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text) text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text) text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text) text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text) text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text) text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text) text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text) if "”" in text: text = text.replace(".”", "”.") if "\"" in text: text = text.replace(".\"", "\".") if "!" in text: text = text.replace("!\"", "\"!") if "?" in text: text = text.replace("?\"", "\"?") text = text.replace(".", ".<stop>") text = text.replace("?", "?<stop>") text = text.replace("!", "!<stop>") text = text.replace("<prd>", ".") sentences = text.split("<stop>") sentences = sentences[:-1] sentences = [s.strip() for s in sentences] return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) print(sens) if not sens: sens = [text+'.'] for sen in sens: # preprocess the given text inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda) # synthesize voice decoder_output, postnet_output, alignments, _ = run_model( self.tts_model, inputs, self.tts_config, False, None, None) # convert outputs to numpy postnet_output, decoder_output, _ = parse_outputs( postnet_output, decoder_output, alignments) if self.wavernn: postnet_output = postnet_output[0].data.cpu().numpy() wav = self.wavernn.generate(torch.FloatTensor(postnet_output.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = inv_spectrogram(postnet_output, self.ap, self.tts_config) # trim silence wav = trim_silence(wav, self.ap) wavs += list(wav) wavs += [0] * 10000 out = io.BytesIO() self.save_wav(wavs, out) return out
class Synthesizer(object): def __init__(self, config): self.wavernn = None self.config = config self.use_cuda = config.use_cuda if self.use_cuda: assert torch.cuda.is_available( ), "CUDA is not availabe on this machine." self.load_tts(self.config.tts_path, self.config.tts_file, self.config.tts_config, config.use_cuda) if self.config.wavernn_lib_path: self.load_wavernn(config.wavernn_lib_path, config.wavernn_path, config.wavernn_file, config.wavernn_config, config.use_cuda) def load_tts(self, model_path, model_file, model_config, use_cuda): tts_config = os.path.join(model_path, model_config) self.model_file = os.path.join(model_path, model_file) print(" > Loading TTS model ...") print(" | > model config: ", tts_config) print(" | > model file: ", model_file) self.tts_config = load_config(tts_config) self.use_phonemes = self.tts_config.use_phonemes self.ap = AudioProcessor(**self.tts_config.audio) if self.use_phonemes: self.input_size = len(phonemes) self.input_adapter = lambda sen: phoneme_to_sequence( sen, [self.tts_config.text_cleaner], self.tts_config. phoneme_language, self.tts_config.enable_eos_bos_chars) else: self.input_size = len(symbols) self.input_adapter = lambda sen: text_to_sequence( sen, [self.tts_config.text_cleaner]) self.tts_model = setup_model(self.input_size, self.tts_config) # load model state if use_cuda: cp = torch.load(self.model_file) else: cp = torch.load(self.model_file, map_location=lambda storage, loc: storage) # load the model self.tts_model.load_state_dict(cp['model']) if use_cuda: self.tts_model.cuda() self.tts_model.eval() self.tts_model.decoder.max_decoder_steps = 3000 def load_wavernn(self, lib_path, model_path, model_file, model_config, use_cuda): sys.path.append(lib_path) # set this if TTS is not installed globally from WaveRNN.models.wavernn import Model wavernn_config = os.path.join(model_path, model_config) model_file = os.path.join(model_path, model_file) print(" > Loading WaveRNN model ...") print(" | > model config: ", wavernn_config) print(" | > model file: ", model_file) self.wavernn_config = load_config(wavernn_config) self.wavernn = Model( rnn_dims=512, fc_dims=512, mode=self.wavernn_config.mode, pad=2, upsample_factors=self.wavernn_config. upsample_factors, # set this depending on dataset feat_dims=80, compute_dims=128, res_out_dims=128, res_blocks=10, hop_length=self.ap.hop_length, sample_rate=self.ap.sample_rate, ).cuda() check = torch.load(model_file) self.wavernn.load_state_dict(check['model']) if use_cuda: self.wavernn.cuda() self.wavernn.eval() def save_wav(self, wav, path): # wav *= 32767 / max(1e-8, np.max(np.abs(wav))) wav = np.array(wav) self.ap.save_wav(wav, path) def split_into_sentences(self, text): text = " " + text + " " text = text.replace("\n", " ") text = re.sub(prefixes, "\\1<prd>", text) text = re.sub(websites, "<prd>\\1", text) if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>") text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text) text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text) text = re.sub( alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text) text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text) text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text) text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text) text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text) if "”" in text: text = text.replace(".”", "”.") if "\"" in text: text = text.replace(".\"", "\".") if "!" in text: text = text.replace("!\"", "\"!") if "?" in text: text = text.replace("?\"", "\"?") text = text.replace(".", ".<stop>") text = text.replace("?", "?<stop>") text = text.replace("!", "!<stop>") text = text.replace("<prd>", ".") sentences = text.split("<stop>") sentences = sentences[:-1] sentences = [s.strip() for s in sentences] return sentences def tts(self, text): wavs = [] sens = self.split_into_sentences(text) if len(sens) == 0: sens = [text + '.'] for sen in sens: if len(sen) < 3: continue sen = sen.strip() print(sen) seq = np.array(self.input_adapter(sen)) text_hat = sequence_to_phoneme(seq) print(text_hat) chars_var = torch.from_numpy(seq).unsqueeze(0).long() if self.use_cuda: chars_var = chars_var.cuda() decoder_out, postnet_out, alignments, stop_tokens = self.tts_model.inference( chars_var) postnet_out = postnet_out[0].data.cpu().numpy() if self.tts_config.model == "Tacotron": wav = self.ap.inv_spectrogram(postnet_out.T) elif self.tts_config.model == "Tacotron2": if self.wavernn: wav = self.wavernn.generate( torch.FloatTensor(postnet_out.T).unsqueeze(0).cuda(), batched=self.config.is_wavernn_batched, target=11000, overlap=550) else: wav = self.ap.inv_mel_spectrogram(postnet_out.T) wavs += list(wav) wavs += [0] * 10000 out = io.BytesIO() self.save_wav(wavs, out) return out
mulaw=VC.mulaw, pad=VC.pad, upsample_factors=VC.upsample_factors, feat_dims=VC.audio["num_mels"], compute_dims=128, res_out_dims=128, res_blocks=10, hop_length=ap.hop_length, sample_rate=ap.sample_rate, ) check = torch.load(args.vocoder_path) vocoder_model.load_state_dict(check['model']) vocoder_model.eval() if args.use_cuda: vocoder_model.cuda() else: vocoder_model = None VC = None # synthesize voice print(" > Text: {}".format(args.text)) _, _, _, wav = tts(model, vocoder_model, C, VC, args.text, ap, args.use_cuda, args.batched_vocoder, figures=False)
pad=VOCODER_CONFIG.pad, upsample_factors=VOCODER_CONFIG.upsample_factors, feat_dims=VOCODER_CONFIG.audio["num_mels"], compute_dims=128, res_out_dims=128, res_blocks=10, hop_length=ap_vocoder.hop_length, sample_rate=ap_vocoder.sample_rate, use_upsample_net = True, use_aux_net = True ).cuda() check = torch.load(VOCODER_MODEL_PATH) wavernn.load_state_dict(check['model'], strict=False) if use_cuda: wavernn.cuda() wavernn.eval(); print(check['step']) # ### Comparision with https://mycroft.ai/blog/available-voices/ model.eval() model.decoder.max_decoder_steps = 2000 # speaker_id = 0 for speaker_id in range(5): sentence = "怎么网络不好啊,为什么上不去" align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=False, use_pinyin=use_pinyin) sentence = "晚饭吃什么" align, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, use_gl=use_gl, figures=False, use_pinyin=use_pinyin)