def generate_plots(self, model: ForwardTacotron, session: ForwardSession) -> None:
    """Log mel figures and audio for the session's validation sample.

    Two groups are written to ``self.writer``, each tagged with ``model.step``:

    * ``Ground_Truth_Aligned/*`` -- output of a forward pass that is given the
      target mel, the durations and the mel lengths.
    * ``Generated/*`` -- output of ``model.generate`` on the first input
      sequence, cut to ``x_lens[0]`` entries.

    Pitch plots are not produced by this variant.
    """
    model.eval()
    device = next(model.parameters()).device

    x, m, ids, x_lens, mel_lens, dur = session.val_sample
    x = x.to(device)
    m = m.to(device)
    dur = dur.to(device)
    mel_lens = mel_lens.to(device)

    # Forward pass with ground-truth inputs; keep the first item of the batch
    # and at most 600 entries along the second axis.
    gta_linear, gta_postnet, dur_hat = model(x, m, dur, mel_lens)
    gta_linear = np_now(gta_linear)[0, :600, :]
    gta_postnet = np_now(gta_postnet)[0, :600, :]
    target_mel = np_now(m)[0, :600, :]

    gta_linear_fig = plot_mel(gta_linear)
    gta_postnet_fig = plot_mel(gta_postnet)
    target_fig = plot_mel(target_mel)

    gta_figures = (
        ('Ground_Truth_Aligned/target', target_fig),
        ('Ground_Truth_Aligned/linear', gta_linear_fig),
        ('Ground_Truth_Aligned/postnet', gta_postnet_fig),
    )
    for tag, figure in gta_figures:
        self.writer.add_figure(tag, figure, model.step)

    gta_postnet_wav = reconstruct_waveform(gta_postnet)
    target_wav = reconstruct_waveform(target_mel)
    gta_audio = (
        ('Ground_Truth_Aligned/target_wav', target_wav),
        ('Ground_Truth_Aligned/postnet_wav', gta_postnet_wav),
    )
    for tag, wav in gta_audio:
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

    # Free-running generation from the unpadded first input sequence.
    first_sequence = x[0, :x_lens[0]].tolist()
    gen_linear, gen_postnet, dur_hat = model.generate(first_sequence)
    gen_linear_fig = plot_mel(gen_linear)
    gen_postnet_fig = plot_mel(gen_postnet)

    gen_figures = (
        ('Generated/target', target_fig),
        ('Generated/linear', gen_linear_fig),
        ('Generated/postnet', gen_postnet_fig),
    )
    for tag, figure in gen_figures:
        self.writer.add_figure(tag, figure, model.step)

    gen_postnet_wav = reconstruct_waveform(gen_postnet)
    gen_audio = (
        ('Generated/target_wav', target_wav),
        ('Generated/postnet_wav', gen_postnet_wav),
    )
    for tag, wav in gen_audio:
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float,float]:
    """Compute the average validation losses over ``val_set``.

    The model is switched to eval mode and every batch is run under
    ``torch.no_grad()``. Losses are computed with ``self.l1_loss``.

    Args:
        model: Model to evaluate; called as
            ``model(x, m, dur, mel_lens, pitch, puncts)``.
        val_set: Sized iterable yielding
            ``(x, m, ids, x_lens, mel_lens, dur, pitch, puncts)`` batches.

    Returns:
        ``(mel_loss, duration_loss, pitch_loss)`` as Python floats, each
        averaged over the number of batches. The mel loss is the sum of the
        losses of both mel outputs of the model.

    Raises:
        ZeroDivisionError: If ``val_set`` is empty.
    """
    model.eval()
    m_val_loss = 0
    dur_val_loss = 0
    pitch_val_loss = 0
    device = next(model.parameters()).device
    for x, m, _, x_lens, mel_lens, dur, pitch, puncts in val_set:
        x, m, dur, x_lens, mel_lens, pitch, puncts = (
            x.to(device),
            m.to(device),
            dur.to(device),
            x_lens.to(device),
            mel_lens.to(device),
            pitch.to(device),
            puncts.to(device),
        )
        with torch.no_grad():
            m1_hat, m2_hat, dur_hat, pitch_hat = model(
                x, m, dur, mel_lens, pitch, puncts
            )
            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
            pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
            # Convert every loss with .item() so plain floats are accumulated
            # (previously the pitch loss was summed as a tensor).
            m_val_loss += m1_loss.item() + m2_loss.item()
            dur_val_loss += dur_loss.item()
            pitch_val_loss += pitch_loss.item()
    num_batches = len(val_set)
    m_val_loss /= num_batches
    dur_val_loss /= num_batches
    pitch_val_loss /= num_batches
    return m_val_loss, dur_val_loss, pitch_val_loss
def evaluate(self, model: ForwardTacotron, val_set: DataLoader) -> Dict[str, float]:
    """Compute the average validation losses over ``val_set``.

    The model is switched to eval mode and every batch is moved to the
    model's device with ``to_device`` and run under ``torch.no_grad()``.
    Losses are computed with ``self.l1_loss``.

    Args:
        model: Model to evaluate; called as ``model(batch)`` and expected to
            return a mapping with the keys ``'mel'``, ``'mel_post'``,
            ``'dur'``, ``'pitch'`` and ``'energy'``.
        val_set: Sized iterable of batches providing the keys ``'mel'``,
            ``'mel_len'``, ``'dur'``, ``'pitch'``, ``'energy'`` and
            ``'x_len'``.

    Returns:
        Dict with the keys ``'mel_loss'``, ``'dur_loss'``, ``'pitch_loss'``
        and ``'energy_loss'``; each value is a Python float averaged over the
        number of batches. The mel loss is the sum of the losses of the
        ``'mel'`` and ``'mel_post'`` predictions.

    Raises:
        ZeroDivisionError: If ``val_set`` is empty.
    """
    model.eval()
    m_val_loss = 0
    dur_val_loss = 0
    pitch_val_loss = 0
    energy_val_loss = 0
    device = next(model.parameters()).device
    for batch in val_set:
        batch = to_device(batch, device=device)
        with torch.no_grad():
            pred = model(batch)
            m1_loss = self.l1_loss(pred['mel'], batch['mel'], batch['mel_len'])
            m2_loss = self.l1_loss(pred['mel_post'], batch['mel'], batch['mel_len'])
            dur_loss = self.l1_loss(pred['dur'].unsqueeze(1), batch['dur'].unsqueeze(1), batch['x_len'])
            pitch_loss = self.l1_loss(pred['pitch'], batch['pitch'].unsqueeze(1), batch['x_len'])
            energy_loss = self.l1_loss(pred['energy'], batch['energy'].unsqueeze(1), batch['x_len'])
            # Convert every loss with .item() so plain floats are accumulated
            # (previously pitch and energy were summed as tensors).
            pitch_val_loss += pitch_loss.item()
            energy_val_loss += energy_loss.item()
            m_val_loss += m1_loss.item() + m2_loss.item()
            dur_val_loss += dur_loss.item()
    num_batches = len(val_set)
    return {
        'mel_loss': m_val_loss / num_batches,
        'dur_loss': dur_val_loss / num_batches,
        'pitch_loss': pitch_val_loss / num_batches,
        'energy_loss': energy_val_loss / num_batches
    }
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log mel figures and audio for the session's validation sample.

    Two groups are written to ``self.writer``, each tagged with ``model.step``:

    * ``Ground_Truth_Aligned/*`` -- output of ``model(x, m, dur)``.
    * ``Generated/*`` -- output of ``model.generate`` on the first input
      sequence of the batch.

    Mels are passed through ``rescale_mel`` before waveform reconstruction.
    """
    model.eval()
    device = next(model.parameters()).device

    x, m, ids, lens, dur = session.val_sample
    x = x.to(device)
    m = m.to(device)
    dur = dur.to(device)

    # Forward pass with ground-truth inputs; keep the first item of the batch
    # and at most 600 entries along the second axis.
    gta_linear, gta_postnet, dur_hat = model(x, m, dur)
    gta_linear = np_now(gta_linear)[0, :600, :]
    gta_postnet = np_now(gta_postnet)[0, :600, :]
    target_mel = np_now(m)[0, :600, :]

    # The ground-truth-aligned figures are drawn before rescaling.
    gta_linear_fig = plot_mel(gta_linear)
    gta_postnet_fig = plot_mel(gta_postnet)
    target_fig = plot_mel(target_mel)

    gta_figures = (
        ('Ground_Truth_Aligned/target', target_fig),
        ('Ground_Truth_Aligned/linear', gta_linear_fig),
        ('Ground_Truth_Aligned/postnet', gta_postnet_fig),
    )
    for tag, figure in gta_figures:
        self.writer.add_figure(tag, figure, model.step)

    gta_linear = rescale_mel(gta_linear)
    gta_postnet = rescale_mel(gta_postnet)
    target_mel = rescale_mel(target_mel)

    gta_postnet_wav = reconstruct_waveform(gta_postnet)
    target_wav = reconstruct_waveform(target_mel)
    gta_audio = (
        ('Ground_Truth_Aligned/target_wav', target_wav),
        ('Ground_Truth_Aligned/postnet_wav', gta_postnet_wav),
    )
    for tag, wav in gta_audio:
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)

    # Free-running generation; here the figures are drawn after rescaling.
    gen_linear, gen_postnet, dur_hat = model.generate(x[0].tolist())
    gen_linear = rescale_mel(gen_linear)
    gen_postnet = rescale_mel(gen_postnet)
    gen_linear_fig = plot_mel(gen_linear)
    gen_postnet_fig = plot_mel(gen_postnet)

    gen_figures = (
        ('Generated/target', target_fig),
        ('Generated/linear', gen_linear_fig),
        ('Generated/postnet', gen_postnet_fig),
    )
    for tag, figure in gen_figures:
        self.writer.add_figure(tag, figure, model.step)

    gen_postnet_wav = reconstruct_waveform(gen_postnet)
    gen_audio = (
        ('Generated/target_wav', target_wav),
        ('Generated/postnet_wav', gen_postnet_wav),
    )
    for tag, wav in gen_audio:
        self.writer.add_audio(tag=tag, snd_tensor=wav,
                              global_step=model.step,
                              sample_rate=hp.sample_rate)
def evaluate(self, model: ForwardTacotron, val_set: Dataset) -> Tuple[float, float]:
    """Return the average mel and duration validation losses.

    The model is put into eval mode and each ``(x, m, ids, lens, dur)`` batch
    of ``val_set`` is run under ``torch.no_grad()``. The mel loss adds up
    ``self.l1_loss`` for both mel outputs of the model; the duration loss is
    ``F.l1_loss`` between predicted and target durations. Both totals are
    divided by ``len(val_set)``.
    """
    model.eval()
    device = next(model.parameters()).device

    mel_total = 0
    dur_total = 0
    for x, m, ids, lens, dur in val_set:
        x = x.to(device)
        m = m.to(device)
        dur = dur.to(device)
        lens = lens.to(device)
        with torch.no_grad():
            linear_hat, postnet_hat, dur_hat = model(x, m, dur)
            linear_loss = self.l1_loss(linear_hat, m, lens)
            postnet_loss = self.l1_loss(postnet_hat, m, lens)
            duration_loss = F.l1_loss(dur_hat, dur)
            mel_total += linear_loss.item() + postnet_loss.item()
            dur_total += duration_loss.item()

    return mel_total / len(val_set), dur_total / len(val_set)
def generate_plots(self, model: ForwardTacotron, session: TTSSession) -> None:
    """Log mel, pitch and energy figures plus audio for the validation sample.

    Two groups are written to ``self.writer``, each tagged with ``model.step``:

    * ground-truth-aligned output of ``model(batch)`` (``Ground_Truth_Aligned/*``
      and the ``*/target`` and ``*/ground_truth_aligned`` pitch/energy figures);
    * free-running output of ``model.generate`` on the first input sequence,
      cut to ``batch['x_len'][0]`` entries (``Generated/*`` and the
      ``*/generated`` pitch/energy figures).

    Audio is reconstructed with ``self.dsp.griffinlim`` and logged with
    ``self.dsp.sample_rate``.
    """
    model.eval()
    device = next(model.parameters()).device
    batch = session.val_sample
    batch = to_device(batch, device=device)

    pred = model(batch)
    # Keep the first item of the batch and at most 600 entries along the
    # second axis.
    m1_hat = np_now(pred['mel'])[0, :600, :]
    m2_hat = np_now(pred['mel_post'])[0, :600, :]
    m_target = np_now(batch['mel'])[0, :600, :]

    m1_hat_fig = plot_mel(m1_hat)
    m2_hat_fig = plot_mel(m2_hat)
    m_target_fig = plot_mel(m_target)
    pitch_fig = plot_pitch(np_now(batch['pitch'][0]))
    pitch_gta_fig = plot_pitch(np_now(pred['pitch'].squeeze()[0]))
    # Energy curves are drawn with the same helper as pitch curves.
    energy_fig = plot_pitch(np_now(batch['energy'][0]))
    energy_gta_fig = plot_pitch(np_now(pred['energy'].squeeze()[0]))

    self.writer.add_figure('Pitch/target', pitch_fig, model.step)
    self.writer.add_figure('Pitch/ground_truth_aligned', pitch_gta_fig, model.step)
    self.writer.add_figure('Energy/target', energy_fig, model.step)
    self.writer.add_figure('Energy/ground_truth_aligned', energy_gta_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/target', m_target_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Ground_Truth_Aligned/postnet', m2_hat_fig, model.step)

    m2_hat_wav = self.dsp.griffinlim(m2_hat)
    target_wav = self.dsp.griffinlim(m_target)

    self.writer.add_audio(tag='Ground_Truth_Aligned/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Ground_Truth_Aligned/postnet_wav', snd_tensor=m2_hat_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)

    gen = model.generate(batch['x'][0:1, :batch['x_len'][0]])

    m1_hat_fig = plot_mel(np_now(gen['mel']))
    m2_hat_fig = plot_mel(np_now(gen['mel_post']))
    pitch_gen_fig = plot_pitch(np_now(gen['pitch'].squeeze()))
    energy_gen_fig = plot_pitch(np_now(gen['energy'].squeeze()))

    self.writer.add_figure('Pitch/generated', pitch_gen_fig, model.step)
    self.writer.add_figure('Energy/generated', energy_gen_fig, model.step)
    self.writer.add_figure('Generated/target', m_target_fig, model.step)
    self.writer.add_figure('Generated/linear', m1_hat_fig, model.step)
    self.writer.add_figure('Generated/postnet', m2_hat_fig, model.step)

    # Build the generated audio from the generated postnet mel. Previously the
    # ground-truth-aligned `m2_hat` was reused here, so 'Generated/postnet_wav'
    # duplicated 'Ground_Truth_Aligned/postnet_wav'.
    # NOTE(review): squeeze() drops a possible leading batch axis of size 1;
    # confirm the layout returned by model.generate.
    gen_m2_hat_wav = self.dsp.griffinlim(np_now(gen['mel_post'].squeeze()))

    self.writer.add_audio(tag='Generated/target_wav', snd_tensor=target_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
    self.writer.add_audio(tag='Generated/postnet_wav', snd_tensor=gen_m2_hat_wav,
                          global_step=model.step, sample_rate=self.dsp.sample_rate)
def main():
    """Export a trained ForwardTacotron model as two ONNX graphs.

    Parses the command line, configures ``hp`` from ``--hp_file``, loads the
    model weights (``--tts_weights`` or ``paths.forward_latest_weights``) on
    the CPU and writes:

    * ``./onnx/forward_tacotron_duration_prediction.onnx`` -- the
      ``DurationPredictor`` wrapper, input ``input_seq``, outputs
      ``embeddings`` and ``duration``;
    * ``./onnx/forward_tacotron_regression.onnx`` -- the ``Tacotron`` wrapper,
      input ``data``, output ``mel``.

    A fixed example sentence is used as the export input. ``--alpha`` is
    accepted on the command line but is not used by this function.
    """
    # Parse Arguments
    parser = argparse.ArgumentParser(description='TTS Generator')
    parser.add_argument(
        '--tts_weights', type=str,
        help='[string/path] Load in different FastSpeech weights')
    parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
                        help='The file to use for the hyperparameters')
    parser.add_argument(
        '--alpha', type=float, default=1.,
        help='Parameter for controlling length regulator for speedup '
             'or slow-down of generated speech, e.g. alpha=2.0 is double-time')

    args = parser.parse_args()

    # Create the output directory only after the arguments were accepted, so
    # `--help` or invalid arguments leave no directory behind. `exist_ok`
    # replaces the former exists-then-mkdir check.
    os.makedirs('onnx', exist_ok=True)

    hp.configure(args.hp_file)

    input_text = "the forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."
    tts_weights = args.tts_weights

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Forward TTS Model...\n')
    tts_model = ForwardTacotron(embed_dims=hp.forward_embed_dims,
                                num_chars=len(symbols),
                                durpred_rnn_dims=hp.forward_durpred_rnn_dims,
                                durpred_conv_dims=hp.forward_durpred_conv_dims,
                                rnn_dim=hp.forward_rnn_dims,
                                postnet_k=hp.forward_postnet_K,
                                postnet_dims=hp.forward_postnet_dims,
                                prenet_k=hp.forward_prenet_K,
                                prenet_dims=hp.forward_prenet_dims,
                                highways=hp.forward_num_highways,
                                dropout=hp.forward_dropout,
                                n_mels=hp.num_mels).to(device)

    tts_load_path = tts_weights or paths.forward_latest_weights
    tts_model.load(tts_load_path)

    # Both export wrappers are built around the same loaded model.
    encoder = DurationPredictor(tts_model)
    decoder = Tacotron(tts_model)

    tts_model.eval()
    encoder.eval()
    decoder.eval()

    opset_version = 10

    with torch.no_grad():
        input_seq = text_to_sequence(input_text.strip(), hp.tts_cleaner_names)
        input_seq = torch.as_tensor(input_seq, dtype=torch.long, device=device).unsqueeze(0)

        # FIRST STEP: predict symbols duration
        torch.onnx.export(encoder, input_seq, "./onnx/forward_tacotron_duration_prediction.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["input_seq"],
                          output_names=["embeddings", "duration"])

        x, durations = encoder(input_seq)

        # SECOND STEP: expand symbols by durations
        x = encoder.lr(x, durations)

        # THIRD STEP: generate mel
        torch.onnx.export(decoder, x, "./onnx/forward_tacotron_regression.onnx",
                          opset_version=opset_version,
                          do_constant_folding=True,
                          input_names=["data"],
                          output_names=["mel"])

    print('Done!')