def train_session(self, model: Tacotron, optimizer: Optimizer, session: TTSSession) -> None:
    """Run one Tacotron training session.

    The epoch count is ``(session.max_step - model.get_step()) // len(session.train_set) + 1``.
    Every batch performs one optimizer update on the sum of the L1 losses of
    ``m1_hat`` and ``m2_hat`` against ``m``. After every epoch the model is
    evaluated on ``session.val_set`` and a checkpoint is written.

    Args:
        model: Model to train; its ``r`` attribute is set to ``session.r``.
        optimizer: Optimizer whose param-group learning rates are overwritten
            with ``session.lr``.
        session: Supplies the train/val sets and ``max_step``, ``r``, ``bs``, ``lr``.
    """
    remaining_steps = session.max_step - model.get_step()
    batches_per_epoch = len(session.train_set)
    num_epochs = remaining_steps // batches_per_epoch + 1
    model.r = session.r

    simple_table([
        (f'Steps with r={session.r}', str(remaining_steps // 1000) + 'k Steps'),
        ('Batch Size', session.bs),
        ('Learning Rate', session.lr),
        ('Outputs/Step (r)', model.r),
    ])

    for param_group in optimizer.param_groups:
        param_group['lr'] = session.lr

    loss_avg = Averager()
    duration_avg = Averager()
    # Inputs are moved to whatever device the model parameters live on.
    device = next(model.parameters()).device

    for epoch in range(1, num_epochs + 1):
        for batch_idx, (x, m, ids, x_lens, mel_lens) in enumerate(session.train_set, 1):
            tic = time.time()
            model.train()
            x = x.to(device)
            m = m.to(device)

            m1_hat, m2_hat, attention = model(x, m)
            loss = F.l1_loss(m1_hat, m) + F.l1_loss(m2_hat, m)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
            optimizer.step()

            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - tic)
            speed = 1. / duration_avg.get()
            msg = (
                f'| Epoch: {epoch}/{num_epochs} ({batch_idx}/{batches_per_epoch}) | Loss: {loss_avg.get():#.4} '
                f'| {speed:#.2} steps/s | Step: {k}k | '
            )

            if step % hp.tts_checkpoint_every == 0:
                save_checkpoint('tts', self.paths, model, optimizer,
                                name=f'taco_step{k}K', is_silent=True)

            if step % hp.tts_plot_every == 0:
                self.generate_plots(model, session)

            _, batch_att_scores = attention_score(attention, mel_lens)
            mean_att_score = torch.mean(batch_att_scores)

            # The step is queried again for every scalar, as before.
            train_scalars = (
                ('Attention_Score/train', mean_att_score),
                ('Loss/train', loss),
                ('Params/reduction_factor', session.r),
                ('Params/batch_size', session.bs),
                ('Params/learning_rate', session.lr),
            )
            for tag, value in train_scalars:
                self.writer.add_scalar(tag, value, model.get_step())

            stream(msg)

        # End of epoch: validate, checkpoint, and restart the running averages.
        val_loss, val_att_score = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        self.writer.add_scalar('Attention_Score/val', val_att_score, model.get_step())
        save_checkpoint('tts', self.paths, model, optimizer, is_silent=True)

        loss_avg.reset()
        duration_avg.reset()
        print(' ')
def train_session(self, model: ForwardTacotron, optimizer: Optimizer, session: TTSSession) -> None:
    """Run one ForwardTacotron training session with duration and pitch losses.

    The epoch count is ``(session.max_step - model.get_step()) // len(session.train_set) + 1``.
    Each batch performs one optimizer update on
    ``m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss``. After every
    epoch the model is evaluated on ``session.val_set``, a checkpoint is
    written and all running averages are reset.

    Args:
        model: Model to train.
        optimizer: Optimizer whose param-group learning rates are overwritten
            with ``session.lr``.
        session: Supplies the train/val sets and ``max_step``, ``bs``, ``lr``.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    pitch_loss_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
                session.train_set, 1):
            start = time.time()
            model.train()
            x, m, dur, x_lens, mel_lens, pitch, puncts = (
                x.to(device),
                m.to(device),
                dur.to(device),
                x_lens.to(device),
                mel_lens.to(device),
                pitch.to(device),
                puncts.to(device),
            )

            m1_hat, m2_hat, dur_hat, pitch_hat = model(
                x, m, dur, mel_lens, pitch, puncts
            )

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
            pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)

            loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
            optimizer.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'forward_step{k}K'
                save_checkpoint('forward', self.paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if step % hp.forward_plot_every == 0:
                self.generate_plots(model, session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
        self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
        self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
        save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

        # Reset every running average so the progress line reflects the
        # current epoch only. The duration-loss average used to be skipped
        # here and therefore kept averaging over all previous epochs.
        m_loss_avg.reset()
        dur_loss_avg.reset()
        duration_avg.reset()
        pitch_loss_avg.reset()
        print(' ')
def train_session(self,
                  model_tts: ForwardTacotron,
                  model_asr: Wav2Vec2ForCTC,
                  optimizer_tts: Optimizer,
                  tts_session: ForwardSession,
                  asr_session: ASRSession,
                  asr_trainer,
                  optimizer_asr) -> None:
    """Run the joint TTS/ASR session: supervised passes, evaluation, dual transformation.

    Per epoch this method (1) iterates ``tts_session.train_set`` computing the
    supervised TTS loss, (2) iterates ``asr_session.train_set`` computing the
    supervised ASR loss, (3) evaluates both models, (4) calls
    ``self.dual_transform`` and (5) writes checkpoints plus a JSON log entry,
    then terminates the process with exit code 11.

    NOTE(review): the nesting was reconstructed from a source whose line breaks
    and indentation were lost. In particular, the final save-and-exit section
    is assumed to sit inside the epoch loop — confirm against the authoritative
    version.

    Args:
        model_tts: TTS model.
        model_asr: ASR model; called as ``model_asr(**inputs)``.
        optimizer_tts: TTS optimizer; its learning rate is set to ``tts_session.lr``.
        tts_session: Supplies TTS train/val sets and ``max_step``, ``bs``, ``lr``, ``path``.
        asr_session: Supplies ``train_set`` and ``test_set`` for the ASR model.
        asr_trainer: Object providing ``prediction_step`` and ``compute_metrics``.
        optimizer_asr: ASR optimizer; its state dict is saved with the ASR checkpoint.
    """
    asr_trainer_state = {'logs': []}
    current_step = model_tts.get_step()
    tts_training_steps = tts_session.max_step - current_step
    try:
        _, asr_current_step = get_last_checkpoint(
            './checkpoints/sme_speech_tts.asr_forward/', 'model_at')
        asr_training_steps = tts_session.max_step - asr_current_step
    except Exception:
        # Lookup failed (presumably no ASR checkpoint yet — TODO confirm):
        # fall back to step 0. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit are not swallowed here.
        asr_current_step = 0
        asr_training_steps = tts_training_steps

    total_iters = len(tts_session.train_set)
    epochs = tts_training_steps // total_iters + 1

    simple_table([
        ('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'),
        ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'),
        ('Batch Size TTS', tts_session.bs),
        ('Learning Rate', tts_session.lr)
    ])

    for g in optimizer_tts.param_groups:
        g['lr'] = tts_session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    device = next(model_tts.parameters()).device  # use same device as model parameters
    warnings.filterwarnings('ignore', category=UserWarning)

    for e in range(1, epochs + 1):

        # Supervised TTS pass over the epoch.
        for i, (x, m, ids, x_lens, mel_lens, dur) in enumerate(tts_session.train_set, 1):
            start = time.time()
            model_tts.train()
            x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device), \
                x_lens.to(device), mel_lens.to(device)

            m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)

            tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss
            optimizer_tts.zero_grad()
            # NOTE(review): the backward pass and optimizer step are disabled
            # in this loop; the loss is only computed and handed to
            # dual_transform below.
            # tts_s_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_tts.parameters(), hp.tts_clip_grad_norm)
            # optimizer_tts.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model_tts.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)

            speed = 1. / duration_avg.get()
            msg_tts = f'| TTS MODEL (supervised training ): '\
                      f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'forward_step{k}K'
                save_checkpoint('forward', self.paths, model_tts, optimizer_tts,
                                name=ckpt_name, is_silent=True)

            if step % hp.forward_plot_every == 0:
                self.generate_plots(model_tts, tts_session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model_tts.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model_tts.get_step())
            self.writer.add_scalar('Params/batch_size', tts_session.bs, model_tts.get_step())
            self.writer.add_scalar('Params/learning_rate', tts_session.lr, model_tts.get_step())

            stream(msg_tts)

        # Supervised ASR pass. NOTE(review): no backward/step is performed
        # here either; only the last batch's loss survives the loop.
        for step, inputs in enumerate(asr_session.train_set):
            optimizer_asr.zero_grad()
            model_asr.to(device)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            model_asr.train()
            outputs = model_asr(**inputs)
            asr_s_loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            msg_asr = f'| ASR MODEL (supervised training) : '\
                      f'| Epoch: {e}/{epochs} ({step}/{len(asr_session.train_set)}) | Loss ASR: {asr_s_loss:#.4} '\
                      f' ||||||||||||||||||||||'
            stream(msg_asr)

        # Supervised TTS evaluation.
        m_val_loss, dur_val_loss = self.evaluate(model_tts, tts_session.val_set)
        eval_tts_msg = f'| TTS MODEL (supervised eval ): '\
                       f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \
                       f'| Dur Val Loss: {dur_val_loss:#.4} '
        stream(eval_tts_msg)
        tts_eval_loss = m_val_loss + dur_val_loss

        # Supervised ASR evaluation.
        print('\nEvaluating ASR model ...')
        asr_eval_loss = 0
        eval_wer = 0
        num_eval_batches = 0
        for step, inputs in enumerate(asr_session.test_set):
            asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step(
                model_asr, inputs, False)
            asr_eval_loss += asr_eval_loss_i
            # NOTE(review): the return value is discarded; if logits_a is a
            # torch tensor this call does not move the object passed to
            # compute_metrics — TODO decide whether it should be assigned.
            logits_a.to('cpu')
            eval_wer_i = asr_trainer.compute_metrics(
                EvalPrediction(predictions=logits_a, label_ids=labels_a))
            eval_wer += eval_wer_i['wer']
            num_eval_batches += 1

        # Average over the number of batches. The zero-based loop index is one
        # less than that count (and zero for a single batch), so it must not
        # be used as the divisor. An empty test set yields 0.0.
        eval_wer = eval_wer / max(num_eval_batches, 1)
        asr_eval_loss = asr_eval_loss / max(num_eval_batches, 1)
        msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} | Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} |||||||||||||||||||||||||||||||||||||||||||||||||||||'
        stream(msg_asr_eval)

        # Dual transformation loop.
        tts_u_loss, asr_u_loss = self.dual_transform(
            model_tts, model_asr, optimizer_tts, optimizer_asr,
            asr_session.test_set, m_loss_avg, dur_loss_avg, device,
            asr_current_step, e, epochs, duration_avg, total_iters,
            tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path)

        step += 1
        asr_path = 'checkpoint-27364'
        modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/'
        new_check = modelasr_folder + asr_path
        os.makedirs(new_check, exist_ok=True)

        save_checkpoint('forward', self.paths, model_tts, optimizer_tts, is_silent=True)

        if "logs" not in asr_trainer_state:
            asr_trainer_state['logs'] = []
        # float() keeps the fractional part of each loss while still yielding
        # a JSON-serialisable number; int() logged every loss below 1 as 0.
        asr_trainer_state['logs'].append({
            'step': step,
            'epoch': e,
            'asr_s_loss': float(asr_s_loss),
            'asr_u_loss': float(asr_u_loss),
            'tts_s_loss': float(tts_s_loss),
            'tts_u_loss': float(tts_u_loss),
            'tts_eval_loss': float(tts_eval_loss),
            'asr_eval_loss': float(asr_eval_loss),
            'eval_wer': eval_wer
        })
        with open(f'{modelasr_folder+ asr_path}/dt_trainer_state.json', 'w') as f:
            json.dump(asr_trainer_state, f)
        model_asr.save_pretrained(f'{new_check}')
        torch.save(optimizer_asr.state_dict(), f'{new_check}/optimizer.pt')
        print("Exiting due to cuda OOM!")
        exit(11)
def train_session(self, model: WaveRNN, optimizer: Optimizer, session: VocSession, train_gta: bool) -> None:
    """Run one WaveRNN training session.

    The epoch count is ``(session.max_step - model.get_step()) // len(session.train_set) + 1``.
    Each batch performs one optimizer update on ``self.loss_func(y_hat, y)``.
    Samples are generated and step checkpoints written at the intervals given
    in ``self.train_cfg``. After every epoch the model is evaluated on
    ``session.val_set`` and saved as ``latest_model.pt``.

    Args:
        model: Vocoder to train; ``model.mode`` ('RAW' or 'MOL') selects how
            prediction and target are reshaped before the loss.
        optimizer: Optimizer whose param-group learning rates are overwritten
            with ``session.lr``.
        session: Supplies the train/val sets and ``max_step``, ``bs``, ``lr``.
        train_gta: Only shown in the summary table.
    """
    remaining_steps = session.max_step - model.get_step()
    batches_per_epoch = len(session.train_set)
    num_epochs = remaining_steps // batches_per_epoch + 1

    simple_table([
        (f'Steps ', str(remaining_steps // 1000) + 'k'),
        ('Batch Size', session.bs),
        ('Learning Rate', session.lr),
        ('Sequence Length', self.train_cfg['seq_len']),
        ('GTA Training', train_gta),
    ])

    for param_group in optimizer.param_groups:
        param_group['lr'] = session.lr

    loss_avg = Averager()
    duration_avg = Averager()
    # Batches are moved to whatever device the model parameters live on.
    device = next(model.parameters()).device

    for epoch in range(1, num_epochs + 1):
        for batch_idx, batch in enumerate(session.train_set, 1):
            tic = time.time()
            model.train()
            batch = to_device(batch, device=device)
            wave_in, target = batch['x'], batch['y']

            y_hat = model(wave_in, batch['mel'])
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                target = batch['y'].float()
            target = target.unsqueeze(-1)

            loss = self.loss_func(y_hat, target)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           self.train_cfg['clip_grad_norm'])
            optimizer.step()

            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - tic)
            speed = 1. / duration_avg.get()
            msg = (
                f'| Epoch: {epoch}/{num_epochs} ({batch_idx}/{batches_per_epoch}) | Loss: {loss_avg.get():#.4} '
                f'| {speed:#.2} steps/s | Step: {k}k | '
            )

            if step % self.train_cfg['gen_samples_every'] == 0:
                stream(msg + 'generating samples...')
                gen_result = self.generate_samples(model, session)
                if gen_result is not None:
                    mel_loss, gen_wav = gen_result
                    self.writer.add_scalar('Loss/generated_mel_l1', mel_loss, model.get_step())
                    self.track_top_models(mel_loss, gen_wav, model)

            if step % self.train_cfg['checkpoint_every'] == 0:
                step_ckpt_path = self.paths.voc_checkpoints / f'wavernn_step{k}k.pt'
                save_checkpoint(model=model, optim=optimizer, config=self.config,
                                path=step_ckpt_path)

            # The step is queried again for every scalar, as before.
            train_scalars = (
                ('Loss/train', loss),
                ('Params/batch_size', session.bs),
                ('Params/learning_rate', session.lr),
            )
            for tag, value in train_scalars:
                self.writer.add_scalar(tag, value, model.get_step())

            stream(msg)

        # End of epoch: validate, save the latest model, restart the averages.
        val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        latest_path = self.paths.voc_checkpoints / 'latest_model.pt'
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=latest_path)

        loss_avg.reset()
        duration_avg.reset()
        print(' ')
def train_session(self, model: ForwardTacotron, optimizer: Optimizer, session: TTSSession) -> None:
    """Run one ForwardTacotron training session with duration, pitch and energy losses.

    The epoch count is ``(session.max_step - model.get_step()) // len(session.train_set) + 1``.
    Before the forward pass the pitch and energy inputs are multiplied by
    random 0/1 masks (``rand > zoneout`` threshold from ``self.train_cfg``),
    while unmasked copies are kept as loss targets. The total loss is
    ``m1 + m2`` plus the duration, pitch and energy losses weighted by the
    ``*_loss_factor`` entries of ``self.train_cfg``. After every epoch the
    model is evaluated on ``session.val_set``, saved as ``latest_model.pt``
    and all running averages are reset.

    Args:
        model: Model to train; called with the whole batch dict.
        optimizer: Optimizer whose param-group learning rates are overwritten
            with ``session.lr``.
        session: Supplies the train/val sets and ``max_step``, ``bs``, ``lr``.
    """
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    pitch_loss_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            batch = to_device(batch, device=device)
            start = time.time()
            model.train()

            pitch_zoneout_mask = torch.rand(
                batch['x'].size()) > self.train_cfg['pitch_zoneout']
            energy_zoneout_mask = torch.rand(
                batch['x'].size()) > self.train_cfg['energy_zoneout']

            # Keep unmasked copies as targets; only the model inputs are masked.
            pitch_target = batch['pitch'].detach().clone()
            energy_target = batch['energy'].detach().clone()
            batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(device).float()
            batch['energy'] = batch['energy'] * energy_zoneout_mask.to(device).float()

            pred = model(batch)

            m1_loss = self.l1_loss(pred['mel'], batch['mel'], batch['mel_len'])
            m2_loss = self.l1_loss(pred['mel_post'], batch['mel'], batch['mel_len'])

            dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                    batch['dur'].unsqueeze(1), batch['x_len'])
            pitch_loss = self.l1_loss(pred['pitch'],
                                      pitch_target.unsqueeze(1), batch['x_len'])
            energy_loss = self.l1_loss(pred['energy'],
                                       energy_target.unsqueeze(1), batch['x_len'])

            loss = m1_loss + m2_loss \
                + self.train_cfg['dur_loss_factor'] * dur_loss \
                + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                + self.train_cfg['energy_loss_factor'] * energy_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           self.train_cfg['clip_grad_norm'])
            optimizer.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % self.train_cfg['checkpoint_every'] == 0:
                save_checkpoint(model=model, optim=optimizer, config=self.config,
                                path=self.paths.forward_checkpoints / f'forward_step{k}k.pt')

            if step % self.train_cfg['plot_every'] == 0:
                self.generate_plots(model, session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
            self.writer.add_scalar('Energy_Loss/train', energy_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        val_out = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'], model.get_step())
        self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'], model.get_step())
        self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'], model.get_step())
        self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'], model.get_step())
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=self.paths.forward_checkpoints / 'latest_model.pt')

        # Reset every running average so the progress line reflects the
        # current epoch only. The duration-loss average used to be skipped
        # here and therefore kept averaging over all previous epochs.
        m_loss_avg.reset()
        dur_loss_avg.reset()
        duration_avg.reset()
        pitch_loss_avg.reset()
        print(' ')