def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    model.eval()
    device = next(model.parameters()).device
    x, m, ids, x_lens, m_lens = session.val_sample
    x, m = x.to(device), m.to(device)

    # Ground-truth-aligned pass (teacher forcing on the target mels)
    m1_hat, m2_hat, att = model(x, m)
    att = np_now(att)[0]
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m = np_now(m)[0, :600, :]

    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_fig = plot_mel(m)

    self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)
    target_wav = reconstruct_waveform(m)
    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)

    # Free-running pass (model generates without seeing the target mels)
    m1_hat, m2_hat, att = model.generate(x[0].tolist(), steps=m_lens[0] + 20)
    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    self.writer.add_figure('Generated/attention', att_fig, model.step)
    self.writer.add_figure('Generated/target', m_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = reconstruct_waveform(m2_hat)
    self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=hp.sample_rate)
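# --- Reference sketch (not part of the original file) ----------------------
# np_now, plot_attention, plot_mel, reconstruct_waveform and hp are imported
# from elsewhere in this repo. np_now is a thin tensor-to-numpy shim; a
# minimal sketch, assuming a detached CPU copy is all that is needed:
import numpy as np
import torch

def np_now(x: torch.Tensor) -> np.ndarray:
    """Detach from the graph, move to CPU and convert to a NumPy array."""
    return x.detach().cpu().numpy()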
def generate_plots(self, model: Tacotron, session: TTSSession) -> None:
    model.eval()
    device = next(model.parameters()).device
    batch = session.val_sample
    batch = to_device(batch, device=device)

    m1_hat, m2_hat, att = model(batch['x'], batch['mel'])
    att = np_now(att)[0]
    m1_hat = np_now(m1_hat)[0, :600, :]
    m2_hat = np_now(m2_hat)[0, :600, :]
    m_target = np_now(batch['mel'])[0, :600, :]

    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_target_fig = plot_mel(m_target)

    self.writer.add_figure('Ground_Truth_Aligned/attention', att_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = self.dsp.griffinlim(m2_hat)
    target_wav = self.dsp.griffinlim(m_target)
    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)

    m1_hat, m2_hat, att = model.generate(batch['x'][0:1], steps=batch['mel_len'][0] + 20)
    att_fig = plot_attention(att)
    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)

    self.writer.add_figure('Generated/attention', att_fig, model.step)
    self.writer.add_figure('Generated/target', m_target_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    m2_hat_wav = self.dsp.griffinlim(m2_hat)
    self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
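# --- Reference sketch (not part of the original file) ----------------------
# The refactor above assumes a to_device helper that moves every tensor in
# the batch dict onto the target device; a minimal sketch under that
# assumption:
from typing import Any, Dict
import torch

def to_device(batch: Dict[str, Any], device: torch.device) -> Dict[str, Any]:
    """Move all tensor values of a batch dict to `device`; leave the rest as-is."""
    return {k: v.to(device) if isinstance(v, torch.Tensor) else v
            for k, v in batch.items()}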
voc_k = voc_model.get_step() // 1000
tts_k = tts_model.get_step() // 1000
r = tts_model.r

simple_table([('WaveRNN', str(voc_k) + 'k'),
              (f'Tacotron(r={r})', str(tts_k) + 'k'),
              ('Generation Mode', 'Batched' if batched else 'Unbatched'),
              ('Target Samples', 11_000 if batched else 'N/A'),
              ('Overlap Samples', 550 if batched else 'N/A')])

for i, x in enumerate(inputs, 1):
    print(f'\n| Generating {i}/{len(inputs)}')
    _, m, attention = tts_model.generate(x)

    if input_text:
        save_path = f'quick_start/__input_{input_text[:10]}_{tts_k}k.wav'
    else:
        save_path = f'quick_start/{i}_batched{str(batched)}_{tts_k}k.wav'
    # save_attention(attention, save_path)

    m = torch.tensor(m).unsqueeze(0)
    m = (m + 4) / 8
    print(m.size())
    print(m)
    voc_model.generate(m, save_path, batched, 11_000, 550, hp.mu_law)

print('\n\nDone.\n')
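# --- Worked check (not part of the original file) ---------------------------
# The (m + 4) / 8 step assumes the log-mels lie in [-4, 4] and rescales them
# to the [0, 1] range the WaveRNN vocoder expects:
#   m = -4  ->  (-4 + 4) / 8 = 0.0
#   m =  0  ->  ( 0 + 4) / 8 = 0.5
#   m =  4  ->  ( 4 + 4) / 8 = 1.0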
class TaiwaneseTacotron:

    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        # iters was read below but never set in the original; 32 is an
        # assumed Griffin-Lim iteration default
        self.args.iters = 32

        # ================ vocoder ================ #
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            # argparse.ArgumentError requires an argument object; a plain
            # ValueError is the right exception here
            raise ValueError('Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # Set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        # ================ others ================ #
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print(paths.base)

        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === WaveRNN === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)
            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                                      num_chars=len(symbols),
                                      encoder_dims=hp.tts_encoder_dims,
                                      decoder_dims=hp.tts_decoder_dims,
                                      n_mels=hp.num_mels,
                                      fft_bins=hp.num_mels,
                                      postnet_dims=hp.tts_postnet_dims,
                                      encoder_K=hp.tts_encoder_K,
                                      lstm_dims=hp.tts_lstm_dims,
                                      postnet_K=hp.tts_postnet_K,
                                      num_highways=hp.tts_num_highways,
                                      dropout=hp.tts_dropout,
                                      stop_threshold=hp.tts_stop_threshold).to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitialising Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Information === #
        if hp.tts_model == 'tacotron':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'WaveRNN'),
                              ('WaveRNN', str(voc_k) + 'k'),
                              ('Generation Mode', 'Batched' if self.args.batched else 'Unbatched'),
                              ('Target Samples', self.args.target if self.args.batched else 'N/A'),
                              ('Overlap Samples', self.args.overlap if self.args.batched else 'N/A')])
            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])
        elif hp.tts_model == 'tacotron2':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'WaveRNN'),
                              ('WaveRNN', str(voc_k) + 'k'),
                              ('Generation Mode', 'Batched' if self.args.batched else 'Unbatched'),
                              ('Target Samples', self.args.target if self.args.batched else 'N/A'),
                              ('Overlap Samples', self.args.overlap if self.args.batched else 'N/A')])
            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

    def generate(self, 華, input_text):
        # input_text is passed through so the gen_* helpers can derive an
        # output name from it (it was referenced there but never passed
        # in the original)
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs, input_text)
        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs, input_text)
        else:
            print(f'Wrong tts model type {hp.tts_model}')
        print('\n\nDone.\n')

    # custom function
    def gen_tacotron2(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)
            x = np.array(x)[None, :]
            x = torch.from_numpy(x).cuda().long()
            self.tts_model.eval()
            mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(x)

            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'

            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'[,.!? ]', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            else:
                output_name = 華[:8]
            print(output_name)
            save_path = f'output/{output_name}.wav'

            if self.args.vocoder == 'wavernn':
                m = mel_outputs_postnet
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap, hp.mu_law)
            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)

    # custom function
    def gen_tacotron(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)
            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)

            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'

            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'[,.!? ]', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            else:
                output_name = 華[:8]
            print(output_name)
            save_path = f'output/{output_name}.wav'

            if self.args.vocoder == 'wavernn':
                m = torch.tensor(m).unsqueeze(0)
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap, hp.mu_law)
            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
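# --- Hypothetical usage (not part of the original file) ---------------------
# How the class above is driven; the prompt strings are invented examples:
if __name__ == '__main__':
    tts = TaiwaneseTacotron()
    # 華 carries the Chinese-character form of the prompt (used only to name
    # the output file); input_text is the romanised form fed to
    # text_to_sequence.
    tts.generate(華='你好嗎', input_text='li2-ho2-boo5')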
def TTS_Wave(self):
    os.makedirs('quick_start/tts_weights/', exist_ok=True)
    os.makedirs('quick_start/voc_weights/', exist_ok=True)

    with zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r') as zip_ref:
        zip_ref.extractall('quick_start/voc_weights/')
    with zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r') as zip_ref:
        zip_ref.extractall('quick_start/tts_weights/')

    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
    parser.add_argument('--input_text', '-i', type=str,
                        help='[string] Type in something here and TTS will generate it!')
    parser.add_argument('--batched', '-b', dest='batched', action='store_true',
                        help='Fast Batched Generation (lower quality)')
    parser.add_argument('--unbatched', '-u', dest='batched', action='store_false',
                        help='Slower Unbatched Generation (better quality)')
    parser.add_argument('--target', '-t', type=int,
                        help='[int] number of samples in each batch index')
    parser.add_argument('--overlap', '-o', type=int,
                        help='[int] number of crossover samples')
    parser.add_argument('--force_cpu', '-c', action='store_true',
                        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.set_defaults(batched=hp.voc_gen_batched)
    parser.set_defaults(target=hp.voc_target)
    parser.set_defaults(overlap=hp.voc_overlap)
    parser.set_defaults(input_text=None)
    parser.set_defaults(weights_path=None)
    args = parser.parse_args()

    batched = args.batched
    target = args.target
    overlap = args.overlap
    input_text = args.input_text
    weights_path = args.weights_path

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        torch.cuda.set_device(0)
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising WaveRNN Model...\n')
    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode='MOL').to(device)
    voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

    print('\nInitialising Tacotron Model...\n')
    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).to(device)
    tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('final.txt') as f:
            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000
    r = tts_model.get_r()

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):
        # fixed: the original printed a quoted literal instead of an f-string
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)

        # both branches of the original if/else produced the same path
        save_path = './output_audio/' + str(i) + '.wav'
        # save_attention(attention, save_path)

        m = torch.tensor(m).unsqueeze(0)
        m = (m + 4) / 8  # rescale mels from [-4, 4] to [0, 1]
        voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)

        # Stitch successive clips into a single file named after self.path
        if i == 2:
            temp1 = AudioSegment.from_wav('./output_audio/' + str(i - 1) + '.wav')
            temp2 = AudioSegment.from_wav('./output_audio/' + str(i) + '.wav')
            combined_sounds = temp1 + temp2
            os.remove('./output_audio/' + str(i - 1) + '.wav')
            os.remove('./output_audio/' + str(i) + '.wav')
            combined_sounds.export('./output_audio/' + self.path[:-4] + '.wav', format='wav')
        elif i > 2:
            preTemp = AudioSegment.from_wav('./output_audio/' + self.path[:-4] + '.wav')
            newTemp = AudioSegment.from_wav('./output_audio/' + str(i) + '.wav')
            combined_sounds = preTemp + newTemp
            os.remove('./output_audio/' + self.path[:-4] + '.wav')
            os.remove('./output_audio/' + str(i) + '.wav')
            combined_sounds.export('./output_audio/' + self.path[:-4] + '.wav', format='wav')

    print('Done')
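# --- Alternative sketch (not part of the original file) ---------------------
# The export/remove loop above re-reads the growing output file on every
# iteration. pydub segments can instead be accumulated in memory and exported
# once; this assumes all clips fit in RAM and ffmpeg/libav is installed:
from pydub import AudioSegment

def concat_wavs(paths, out_path):
    """Concatenate WAV files in order and write a single output file."""
    combined = AudioSegment.empty()
    for p in paths:
        combined += AudioSegment.from_wav(p)
    combined.export(out_path, format='wav')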
voc_k = voc_model.get_step() // 1000
tts_k = tts_model.get_step() // 1000

simple_table([('WaveRNN', str(voc_k) + 'k'),
              ('Tacotron', str(tts_k) + 'k'),
              ('r', tts_model.r.item()),
              ('Generation Mode', 'Batched' if batched else 'Unbatched'),
              ('Target Samples', target if batched else 'N/A'),
              ('Overlap Samples', overlap if batched else 'N/A')])

for i, x in enumerate(inputs, 1):
    spk_embds, file_name = get_spk_embed(files, enc_path)
    print(f'\n| Generating {i}/{len(inputs)}')
    _, m, attention = tts_model.generate(x, spk_embds)

    if input_text:
        # save_path = f'{paths.tts_output}__input_{input_text[:10]}_{tts_k}k.wav'
        save_path = f'{out}{i}_{file_name}_batched{str(batched)}_{tts_k}k.wav'
    else:
        save_path = f'{out}{i}_{file_name}_batched{str(batched)}_{tts_k}k.wav'

    if save_attn:
        save_attention(attention, save_path)

    m = torch.tensor(m).unsqueeze(0)
    m = (m + 4) / 8
    voc_model.generate(m, spk_embds, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)

print('\n\nDone.\n')
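# --- Hypothetical sketch (not part of the original file) --------------------
# get_spk_embed is defined elsewhere; it returns a speaker embedding for the
# reference audio plus a name for the output file. One plausible shape,
# assuming a Resemblyzer-style encoder (VoiceEncoder/embed_utterance are that
# library's API, not necessarily this repo's; enc_path is unused here and
# kept only to match the call site):
from pathlib import Path
from resemblyzer import VoiceEncoder, preprocess_wav

def get_spk_embed(files, enc_path):
    """Embed the first reference wav; return (embedding, file stem)."""
    wav_path = Path(files[0])
    encoder = VoiceEncoder()  # the real code presumably loads weights from enc_path
    spk_embd = encoder.embed_utterance(preprocess_wav(wav_path))
    return spk_embd, wav_path.stem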