def gen_tacotron(self, 華, inputs):
    for i, x in enumerate(inputs, 1):
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = self.tts_model.generate(x)
        # Fix mel spectrogram scaling to be from 0 to 1
        m = (m + 4) / 8
        np.clip(m, 0, 1, out=m)

        if self.args.vocoder == 'griffinlim':
            v_type = self.args.vocoder
        elif self.args.vocoder == 'wavernn' and self.args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        # == define output name == #
        if len(華) == 0:
            output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
        elif 1 <= len(華) <= 9:
            output_name = 華[:-1]
        elif 9 < len(華):
            output_name = 華[:8]
        print(output_name)
        save_path = "output/{}.wav".format(output_name)

        if self.args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            self.voc_model.generate(m, save_path, self.args.batched,
                                    hp.voc_target, hp.voc_overlap, hp.mu_law)
        elif self.args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=self.args.iters)
            save_wav(wav, save_path)
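# A minimal standalone sketch of the mel rescaling used in gen_tacotron above, assuming
# (as the `(m + 4) / 8` line implies) that the Tacotron mels lie roughly in [-4, 4];
# the example array below is illustrative only.
import numpy as np

example_mel = np.array([-5.0, -4.0, 0.0, 4.0])
example_mel = (example_mel + 4) / 8            # map [-4, 4] -> [0, 1]
np.clip(example_mel, 0, 1, out=example_mel)    # clamp out-of-range values (here the -5.0)
print(example_mel)                             # [0.   0.   0.5  1. ]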
def tsau(input_text, save_path):
    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

    if args.vocoder == 'wavernn':
        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000
        simple_table([('Tacotron', str(tts_k) + 'k'),
                      ('r', tts_model.r),
                      ('Vocoder Type', 'WaveRNN'),
                      ('WaveRNN', str(voc_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])
    elif args.vocoder == 'griffinlim':
        tts_k = tts_model.get_step() // 1000
        simple_table([('Tacotron', str(tts_k) + 'k'),
                      ('r', tts_model.r),
                      ('Vocoder Type', 'Griffin-Lim'),
                      ('GL Iters', args.iters)])

    for i, x in enumerate(inputs, 1):
        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
        # Fix mel spectrogram scaling to be from 0 to 1
        m = (m + 4) / 8
        np.clip(m, 0, 1, out=m)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if save_attn:
            save_attention(attention, save_path)

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, target, overlap, hp.mu_law)
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
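# A hedged usage sketch for tsau(), assuming the module-level globals it relies on
# (args, hp, tts_model, voc_model, batched, target, overlap, save_attn) have already
# been initialised by this script; the text and output path are illustrative only.
#
#     tsau('This is a test sentence.', 'output/test.wav')   # synthesise a single sentence
#     tsau(None, 'output/from_file.wav')                     # fall back to sentences.txt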
def gen_tacotron2(self, 華, inputs):
    for i, x in enumerate(inputs, 1):
        print(f'\n| Generating {i}/{len(inputs)}')
        print(x)
        x = np.array(x)[None, :]
        x = torch.autograd.Variable(torch.from_numpy(x)).cuda().long()
        self.tts_model.eval()
        mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(x)

        if self.args.vocoder == 'griffinlim':
            v_type = self.args.vocoder
        elif self.args.vocoder == 'wavernn' and self.args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        # == define output name == #
        if len(華) == 0:
            output_name = re.split(r'\,|\.|\!|\?| ', input_text)[0]
        elif 1 <= len(華) <= 9:
            output_name = 華[:-1]
        elif 9 < len(華):
            output_name = 華[:8]
        print(output_name)
        save_path = "output/{}.wav".format(output_name)

        if self.args.vocoder == 'wavernn':
            m = mel_outputs_postnet
            self.voc_model.generate(m, save_path, self.args.batched,
                                    hp.voc_target, hp.voc_overlap, hp.mu_law)
        elif self.args.vocoder == 'griffinlim':
            m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
            wav = reconstruct_waveform(m, n_iter=self.args.iters)
            save_wav(wav, save_path)
        _, m, dur, pitch = tts_model.generate(x, alpha=args.alpha, pitch_function=pitch_function)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if input_text:
            save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k_amp{args.amp}.wav'
        else:
            save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}_amp{args.amp}.wav'

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)
        if args.vocoder == 'melgan':
            m = torch.tensor(m).unsqueeze(0)
            torch.save(m, paths.forward_output / f'{i}_{tts_k}_alpha{args.alpha}_amp{args.amp}.mel')
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
def dual_transform(self, model_tts, model_asr, optimizer_tts, optimizer_asr, asr_test_set,
                   m_loss_avg, dur_loss_avg, device, asr_current_step, e, epochs, duration_avg,
                   total_iters, tts_s_loss, asr_s_loss, tts_lr, tts_dt_path):
    print('\n\nStarting DualTransformation loop...\n')
    tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
    os.makedirs(tmp_dir, exist_ok=True)

    # generate tmp ASR training data from unpaired text
    asr_train_data = []
    input_set = get_unpaired_txt(35)
    text = [clean_text(v) for v in input_set]
    inputs = [text_to_sequence(t) for t in text]

    # generate unpaired (audio, text) pairs for ASR from TTS
    for i, x in enumerate(inputs, 1):
        _, m, dur = model_tts.generate(x, alpha=1.)
        wav = reconstruct_waveform(m, n_iter=32)
        wav_path = os.path.join(tmp_dir, f'{i}.wav')
        save_wav(wav, wav_path)
        asr_train_data.append((wav_path, text[i - 1]))

    dt_asr_data = load_dt_data(asr_train_data)

    # reinit trainer with only the tmp train data
    asr_trainer_dt = init_trainer(dt_asr_data, None)
    dt_train = asr_trainer_dt.get_train_dataloader()

    # unsupervised training loop for ASR
    for step, inputs in enumerate(dt_train, 1):
        model_asr.train()
        model_asr.to(device)
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        outputs = model_asr(**inputs)
        asr_u_loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        msg_asr = f'| ASR MODEL (unsupervised training): '\
                  f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) | Loss ASR: {asr_u_loss:#.4} '\
                  f' ||||||||||||||||||||||||||||||||||||||||||||||||'
        stream(msg_asr)

    # generate tmp TTS data from ASR predictions
    asr_predict_for_dt(model_asr)
    subprocess.check_output('python preprocess.py -p "./data/speech-sme-tts" -d=True',
                            shell=True, stderr=subprocess.STDOUT)
    print('Finished preprocessing for tmp data!')
    tmp_tts_train = get_tts_datasets(tts_dt_path, batch_size=2, r=1, model_type='forward_dt')
    print("Loaded tmp dataset!")

    # unsupervised TTS training; gradients are not stepped here, the combined
    # backward/step happens after the loop
    for i, (x, m, ids, x_lens, mel_lens, dur) in enumerate(tmp_tts_train, 1):
        start = time.time()
        model_tts.to(device)
        model_tts.train()
        x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device), \
            x_lens.to(device), mel_lens.to(device)
        m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)
        m1_loss = self.l1_loss(m1_hat, m, mel_lens)
        m2_loss = self.l1_loss(m2_hat, m, mel_lens)
        dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
        tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
        torch.nn.utils.clip_grad_norm_(model_tts.parameters(), hp.tts_clip_grad_norm)
        m_loss_avg.add(m1_loss.item() + m2_loss.item())
        dur_loss_avg.add(dur_loss.item())
        step = model_tts.get_step()
        k = step // 1000
        duration_avg.add(time.time() - start)
        speed = 1. / duration_avg.get()
        msg_tts = f'| TTS MODEL (unsupervised training): '\
                  f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '
        stream(msg_tts)

    # TODO: combine L and update
    # combine supervised and unsupervised losses and update both models
    combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss)
    combined_loss.to(device)
    combined_loss.backward()
    optimizer_tts.step()
    for state in optimizer_asr.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.to(device)
    optimizer_asr.step()

    m_loss_avg.reset()
    duration_avg.reset()

    dt_msg = f'\n\nFinished DT loop in epoch {e}!\n'
    stream(dt_msg)
    print(' ')
    return tts_u_loss, asr_u_loss
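# A minimal sketch of the dual-transformation loss weighting computed at the end of
# dual_transform above, assuming the four losses are scalar tensors; the numbers are
# illustrative only. Supervised losses are down-weighted by 0.5 relative to the
# unsupervised losses produced inside the loop.
import torch

tts_s_loss, asr_s_loss = torch.tensor(1.2), torch.tensor(0.8)   # supervised losses
tts_u_loss, asr_u_loss = torch.tensor(0.6), torch.tensor(0.4)   # unsupervised (DT) losses
combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss)
print(combined_loss)  # tensor(2.)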