def validate(self):
    """Evaluate on the dev set, log WER and examples, checkpoint on improvement.

    Puts the model (and optional embedding decoder) in eval mode, computes
    attention-based and CTC-based error rates over ``self.dv_set``, writes a
    few sample decodings/alignments to the logger for the median batch, saves
    ``best_{task}.pth`` when the averaged error improves plus ``latest.pth``
    every call, then restores train mode.
    """
    # Eval mode
    self.model.eval()
    if self.emb_decoder is not None:
        self.emb_decoder.eval()
    dev_wer = {'att': [], 'ctc': []}
    for i, data in enumerate(self.dv_set):
        self.progress('Valid step - {}/{}'.format(i + 1, len(self.dv_set)))
        # Fetch data
        feat, feat_len, txt, txt_len = self.fetch_data(data)
        # Forward model (decode length capped at DEV_STEP_RATIO x label length)
        with torch.no_grad():
            ctc_output, encode_len, att_output, att_align, dec_state = \
                self.model(feat, feat_len, int(max(txt_len) * self.DEV_STEP_RATIO),
                           emb_decoder=self.emb_decoder)
        dev_wer['att'].append(cal_er(self.tokenizer, att_output, txt))
        dev_wer['ctc'].append(
            cal_er(self.tokenizer, ctc_output, txt, ctc=True))
        # Show some example on tensorboard (median batch only)
        if i == len(self.dv_set) // 2:
            # Fix: use a distinct index `j` — the original reused `i` here,
            # shadowing the outer enumerate counter.
            for j in range(min(len(txt), self.DEV_N_EXAMPLE)):
                if self.step == 1:
                    self.write_log('true_text{}'.format(j),
                                   self.tokenizer.decode(txt[j].tolist()))
                if att_output is not None:
                    self.write_log(
                        'att_align{}'.format(j),
                        feat_to_fig(att_align[j, 0, :, :].cpu().detach()))
                    self.write_log(
                        'att_text{}'.format(j),
                        self.tokenizer.decode(
                            att_output[j].argmax(dim=-1).tolist()))
                if ctc_output is not None:
                    self.write_log(
                        'ctc_text{}'.format(j),
                        self.tokenizer.decode(
                            ctc_output[j].argmax(dim=-1).tolist(),
                            ignore_repeat=True))
    # Ckpt if performance improves
    for task in ['att', 'ctc']:
        dev_wer[task] = sum(dev_wer[task]) / len(dev_wer[task])
        if dev_wer[task] < self.best_wer[task]:
            self.best_wer[task] = dev_wer[task]
            self.save_checkpoint('best_{}.pth'.format(task), 'wer',
                                 dev_wer[task])
        self.write_log('wer', {'dv_' + task: dev_wer[task]})
    self.save_checkpoint('latest.pth', 'wer', dev_wer['att'], show_msg=False)
    # Resume training
    self.model.train()
    if self.emb_decoder is not None:
        self.emb_decoder.train()
def exec(self):
    """Run the paired/unpaired ASR<->TTS cycle-training loop until max_step.

    Alternates each step between a speech-first cycle
    (speech -> text -> speech, even steps) and a text-first cycle
    (text -> speech -> text, odd steps). Every step consumes one paired
    batch; the unpaired batch and its reconstruction loss are added only
    once the corresponding weight > 0 and start step has passed. CUDA OOM
    during a step is caught, gradients are dropped, and the batch is skipped.
    """
    self.verbose(
        ['Total training steps {}.'.format(human_format(self.max_step))])
    self.timer.set()
    # Lazily-produced values consumed by the logging blocks below;
    # None means "not produced yet this run".
    unpair_speech_loss, unpair_text_loss, unsup_pred, unsup_trans, unsup_align = None, None, None, None, None
    # NOTE(review): these two flags are never read again in this method —
    # presumably leftovers; confirm before removing.
    ctc_nan_flag, ignore_speech_flag = 0, 0
    tok_usage, gt_usage = [], []  # token histograms over unsupervised speech
    cnter = {'ctc_nan': 0, 'unp_sph': 0, 'unp_txt': 0}  # per-log-window counters
    while self.step < self.max_step:
        # --------------------- Load data ----------------------- #
        # Unpair setting
        unpair_mel, unpair_aug_mel, unpair_linear, unpair_text, unpair_sid = None, None, None, None, None
        post_pred, asr_post_loss = None, None  # For ASR postnet only
        # Unpaired objectives only activate after their start steps.
        use_unpair_text = self.unpair_text_weight > 0 and self.step > self.unpair_text_start_step
        use_unpair_speech = self.unpair_speech_weight > 0 and self.step > self.unpair_speech_start_step
        tf_rate = self.optimizer.pre_step(
            self.step)  # Catch the returned tf_rate if needed
        # ToDo : change # of sup. step = 2 x # of unsup. step ?
        mel, aug_mel, linear, text, sid = self.fetch_data(
            iter_name='pair_iter')
        # Load unpaired data only when use_unpair_xxx == True
        if self.step % 2 == 0:  #2 # if True:
            # ASR first
            speech_first = True
            if use_unpair_speech:
                unpair_mel, unpair_aug_mel, unpair_linear, unpair_text, unpair_sid = \
                    self.fetch_data(iter_name='unpair_iter')
        else:
            # TTS first
            speech_first = False
            if use_unpair_text:
                cnter['unp_txt'] += 1
                unpair_mel, unpair_aug_mel, unpair_linear, unpair_text, unpair_sid = \
                    self.fetch_data(iter_name='unpair_iter')
        total_loss = 0
        bs = len(mel)
        self.timer.cnt('rd')
        try:
            # ----------------------- Forward ------------------------ #
            if speech_first:
                # Cycle : speech -> text -> speech
                pair_prob, _, unpair_prob, unpair_latent, unpair_latent_len, pair_post_prob, _ = \
                    self.model.speech_to_text(paired_mel=aug_mel,
                                              unpaired_mel=unpair_aug_mel)
                # Check to involve unsupervised Speech2Speech
                if unpair_latent is not None:
                    # ASR output is the representataion for speech2speech
                    cnter['unp_sph'] += 1
                    ignore_speech_cycle = False
                    unpaired_teacher = unpair_mel
                else:
                    # ASR output is all blank (cannot be passed to TTS)
                    # only paired text is used
                    ignore_speech_cycle = True
                    unpaired_teacher = None
                # text -> speech
                pair_mel_pred, pair_linear_pred, pair_align, _, \
                    unpair_mel_pred, unpair_linear_pred, unpair_align, _ = \
                    self.model.text_to_speech(paired_text=text,
                                              paired_sid=sid,
                                              unpaired_sid=unpair_sid,
                                              unpaired_latent=unpair_latent,
                                              unpaired_text=None,
                                              unpaired_latent_len=unpair_latent_len,
                                              paired_teacher=mel,
                                              unpaired_teacher=unpaired_teacher,
                                              tf_rate=tf_rate)
            else:
                # Cycle : text -> speech -> text
                pair_mel_pred, pair_linear_pred, pair_align, _, \
                    unpair_mel_pred, unpair_linear_pred, unpair_align, _ = \
                    self.model.text_to_speech(paired_text=text,
                                              paired_sid=sid,
                                              unpaired_sid=unpair_sid,
                                              unpaired_latent=None,
                                              unpaired_text=unpair_text,
                                              unpaired_latent_len=None,
                                              paired_teacher=mel,
                                              unpaired_teacher=None,
                                              tf_rate=tf_rate)
                if use_unpair_text:
                    unpair_mel_pred = unpair_mel_pred.detach(
                    )  # Stop-grad for tts in text2text
                pair_prob, _, unpair_prob, unpair_latent, unpair_latent_len, pair_post_prob, _ = \
                    self.model.speech_to_text(paired_mel=aug_mel,
                                              unpaired_mel=unpair_mel_pred,  #None, #unpair_mel_pred, #None, #unpaired_mel= unpair_mel_pred,
                                              using_fake_mel=use_unpair_text)
            # Paired ASR loss
            asr_loss = self.compute_ctcloss(aug_mel, pair_prob, text)
            if self.model.use_asr_postnet:
                # ASR weight is split between main output and postnet output.
                total_loss = total_loss + self.asr_weight * (
                    1 - self.model.asr_postnet_weight) * asr_loss
                asr_post_loss = self.compute_ctcloss(aug_mel,
                                                     pair_post_prob,
                                                     text,
                                                     apply_log=False)
                total_loss = total_loss + self.asr_weight * self.model.asr_postnet_weight * asr_post_loss
            else:
                total_loss = total_loss + self.asr_weight * asr_loss
            # NOTE(review): a nan/inf asr_loss has already been folded into
            # total_loss above, and resetting asr_loss to the int 0 here
            # makes the later `asr_loss.item()` log call raise
            # AttributeError — confirm whether this is intended.
            if math.isnan(asr_loss) or math.isinf(asr_loss):
                cnter['ctc_nan'] += 1
                asr_loss = 0
            # Paired TTS loss
            mel_loss = self.freq_loss(pair_mel_pred, mel)
            linear_loss = self.freq_loss(pair_linear_pred, linear)
            tts_loss = mel_loss + linear_loss
            total_loss = total_loss + self.tts_weight * tts_loss
            # Unpaired loss
            if speech_first:
                # Unpaired speech reconstruction loss
                if not ignore_speech_cycle:
                    unpair_speech_loss = self.freq_loss(unpair_mel_pred, unpair_mel) + \
                        self.freq_loss(unpair_linear_pred, unpair_linear)
                    #total_loss += self.unpair_speech_weight*unpair_speech_loss
                    if self.step > self.unpair_speech_start_step:
                        total_loss += self.unpair_speech_weight * unpair_speech_loss
            elif use_unpair_text:
                # Unpaired text reconstruction loss
                # CTC wants time-major log-probs; EPS guards against log(0).
                ctc_input = (unpair_prob + EPS).transpose(0, 1).log()
                if self.paras.actual_len:
                    # Estimate frame count from non-padding token count,
                    # padded up to a multiple of n_frames_per_step.
                    asr_input_len = (unpair_text != 0).sum(
                        dim=-1) * FRAME_PHN_RATIO
                    asr_input_len = asr_input_len + asr_input_len % self.model.n_frames_per_step
                    ctc_len = 1 + (asr_input_len // self.model.time_reduce_factor)
                else:
                    ctc_len = torch.LongTensor(
                        [unpair_prob.shape[1]] * unpair_prob.shape[0]).to(device=self.device)
                unpair_text_loss = self.ctc_loss(
                    ctc_input,
                    unpair_text.to_sparse().values(),
                    ctc_len,
                    torch.sum(unpair_text != 0, dim=-1))
                # NOTE(review): same int-reset issue as asr_loss above —
                # `unpair_text_loss.item()` in the log step would then raise.
                if math.isnan(unpair_text_loss) or math.isinf(
                        unpair_text_loss):
                    cnter['ctc_nan'] += 1
                    unpair_text_loss = 0
                total_loss += self.unpair_text_weight * unpair_text_loss
            # VQ-loss
            # if vq_loss>0:
            #     total_loss += self.model.vq_weight*vq_loss
            # if commit_loss>0:
            #     total_loss += self.model.commit_weight*commit_loss
            # Statics (over unsup. speech only)
            if speech_first and use_unpair_speech:
                unsup_pred = unpair_prob.argmax(dim=-1).cpu()
                unsup_trans = unpair_text.cpu()
                tok_usage += unsup_pred.flatten().tolist()
                gt_usage += unsup_trans.flatten().tolist()
                if unpair_align is not None:
                    unsup_align = unpair_align.detach().cpu()
                else:
                    unsup_align = [None] * bs
            self.timer.cnt('fw')
            # ----------------------- Backward ------------------------ #
            grad_norm = self.backward(total_loss)
            # For debugging
            # if math.isnan(grad_norm):
            #     import IPython
            #     IPython.embed()
            self.step += 1
            # Log
            if (self.step == 1) or (self.step % self._PROGRESS_STEP == 0):
                self.progress('Tr stat | Loss - {:.2f} (CTC-nan/unp-sph/unp-txt={}/{}/{}) | Grad. Norm - {:.2f} | {} '
                              .format(total_loss.cpu().item(),
                                      cnter['ctc_nan'], cnter['unp_sph'],
                                      cnter['unp_txt'], grad_norm,
                                      self.timer.show()))
                self.write_log(
                    'txt_loss', {
                        'pair': asr_loss.item() if asr_loss is not None else None,
                        'unpair': unpair_text_loss.item() if unpair_text_loss is not None else None,
                        'post': asr_post_loss.item() if asr_post_loss is not None else None
                    })
                self.write_log(
                    'speech_loss', {
                        'pair': tts_loss.item() if tts_loss is not None else None,
                        'unpair': unpair_speech_loss.item() if unpair_speech_loss is not None else None
                    })
                #self.write_log('stop_err',{'tr':stop_err})
                # if commit_loss>0:
                #     self.write_log('commit',{'tr':commit_loss})
                # if vq_loss>0:
                #     self.write_log('commit',{'vq':vq_loss})
                #     self.write_log('temperature',{'temp':self.model.codebook.temp.data})
                # self.write_log('ppx',{'tr':cal_ppx(p_code)})
                # Reset per-window counters after each log flush.
                for k in cnter.keys():
                    cnter[k] = 0
            if (self.step == 1) or (self.step % ATTENTION_PLOT_STEP == 0):
                align = pair_align.cpu()  # align shape BxDsxEs
                sup_pred = pair_prob.argmax(dim=-1).cpu()
                sup_trans = text.cpu()
                if self.model.use_asr_postnet:
                    post_pred = pair_post_prob.argmax(dim=-1).cpu()
                self.write_log(
                    'per', {
                        'pair': cal_per(sup_pred, sup_trans),
                        'unpair': cal_per(unsup_pred, unsup_trans),
                        'post': cal_per(post_pred, sup_trans)
                    })
                self.write_log(
                    'unpair_hist',
                    data_to_bar(tok_usage, gt_usage, self.vocab_size,
                                self.tokenizer._vocab_list))
                for i in range(LISTEN_N_EXAMPLES):
                    self.write_log(
                        'pair_align{}'.format(i),
                        feat_to_fig(align[i].cpu().detach()))
                    if unsup_align is not None and unsup_align[
                            i] is not None:
                        self.write_log(
                            'unpair_align{}'.format(i),
                            feat_to_fig(unsup_align[i].cpu().detach()))
                tok_usage, gt_usage = [], []
            # Validation
            if (self.step == 1) or (self.step % self.valid_step == 0):
                self.validate()
            # End of step
            self.timer.set()
            if self.step > self.max_step:
                break
        except RuntimeError as e:
            if 'out of memory' in str(e):
                # Skip this batch: drop all gradients and release cache.
                self.verbose('WARNING: ran out of memory, retrying batch')
                for p in self.model.parameters():
                    if p.grad is not None:
                        del p.grad  # free some memory
                torch.cuda.empty_cache()
            else:
                print(repr(e))
                errorout()
def validate(self):
    """Evaluate ASR PER and TTS reconstruction loss on the dev set.

    Runs the ASR branch (PER, plus postnet PER when available) and the TTS
    branch (mel + linear loss) over ``self.dev_set``, checkpoints according
    to ``self.paras.store_best_per``, and writes sample spectrograms,
    waveforms and decodings for the median batch. Restores train mode.
    """
    # Eval mode
    self.model.eval()
    dev_tts_loss, dev_per, dev_post_per, dev_stop_err = [], [], [], []
    for i in range(len(self.dev_set)):
        self.progress('Valid step - {}/{}'.format(i + 1, len(self.dev_set)))
        # Fetch data
        mel, aug_mel, linear, text, sid = self.fetch_data(
            iter_name='dev_iter')
        # Forward model
        with torch.no_grad():
            # test ASR
            pair_prob, _, _, _, _, pair_post_prob, _ = self.model.speech_to_text(
                paired_mel=mel, unpaired_mel=None)
            dev_per.append(cal_per(pair_prob, text))
            if pair_post_prob is not None:
                dev_post_per.append((cal_per(pair_post_prob, text)))
            # test TTS (Note: absolute dec step now)
            pair_mel_pred, pair_linear_pred, pair_align, _, _, _, _, _ = \
                self.model.text_to_speech(paired_text=text,
                                          paired_sid=sid,
                                          unpaired_sid=None,
                                          unpaired_latent=None,
                                          unpaired_text=None,
                                          unpaired_latent_len=None,
                                          paired_teacher=mel.shape[1],
                                          unpaired_teacher=None,
                                          tf_rate=0.0)
            dev_tts_loss.append(
                self.freq_loss(pair_mel_pred, mel) +
                self.freq_loss(pair_linear_pred, linear))
            if i == len(self.dev_set) // 2:
                # pick n longest samples in the median batch
                # NOTE(review): these locals are consumed after the loop, so
                # this branch must trigger — it does for any non-empty
                # dev_set since len//2 < len.
                sample_txt = text.cpu()[:LISTEN_N_EXAMPLES]
                hyp = pair_prob.argmax(dim=-1).cpu()[:LISTEN_N_EXAMPLES]
                mel_p = pair_mel_pred.cpu()[:LISTEN_N_EXAMPLES]
                linear_p = pair_linear_pred.cpu()[:LISTEN_N_EXAMPLES]
                #post_mel_p = tts_pred.cpu()[:LISTEN_N_EXAMPLES,1] # PostNet product
                align_p = pair_align.cpu()[:LISTEN_N_EXAMPLES]
                sample_mel = mel.cpu()[:LISTEN_N_EXAMPLES]
                sample_linear = linear.cpu()[:LISTEN_N_EXAMPLES]
    # Ckpt if performance improves
    dev_tts_loss = sum(dev_tts_loss) / len(dev_tts_loss)
    dev_per = sum(dev_per) / len(dev_per)
    # Postnet PER is only averaged when the postnet produced output.
    dev_post_per = sum(dev_post_per) / len(dev_post_per) if len(
        dev_post_per) > 0 else None
    #dev_stop_err = sum(dev_stop_err)/len(dev_stop_err)
    if self.paras.store_best_per:
        # Track best PER (main output and postnet output share best_per).
        if dev_per < self.best_per:
            self.best_per = dev_per
            self.save_checkpoint('best_per.pth', dev_per)
        if (dev_post_per is not None) and (dev_post_per < self.best_per):
            self.best_per = dev_post_per
            self.save_checkpoint('best_post_per.pth', dev_post_per)
    else:
        if dev_tts_loss < self.best_tts_loss:
            self.best_tts_loss = dev_tts_loss
            if self.step > 1:
                self.save_checkpoint('tts_{}.pth'.format(self.step),
                                     dev_tts_loss)
        if dev_per < self.best_per:
            self.best_per = dev_per
            if self.step > 1:
                self.save_checkpoint('asr_{}.pth'.format(self.step),
                                     dev_per)
        if (dev_post_per is not None) and (dev_post_per < self.best_per):
            self.best_per = dev_post_per
            self.save_checkpoint(
                'best_post_per.pth', dev_post_per
            )  # Note: didnot recode best per from postnet or not
    if ((self.step > 1) and (self.step % CKPT_STEP == 0)) and not self.paras.store_best_per:
        # Regular ckpt
        self.save_checkpoint('step_{}.pth'.format(self.step), dev_tts_loss)
    # Logger
    # Write model output (no G-F-lim if picking per)
    for i, (m_p, l_p, a_p, h_p) in enumerate(zip(mel_p, linear_p, align_p, hyp)):
        self.write_log('hyp_text{}'.format(i),
                       self.tokenizer.decode(h_p.tolist()))
        self.write_log('mel_spec{}'.format(i), feat_to_fig(m_p))
        self.write_log('linear_spec{}'.format(i), feat_to_fig(l_p))
        self.write_log('dv_align{}'.format(i), feat_to_fig(a_p))
        if not self.paras.store_best_per:
            # Waveform synthesis (Griffin-Lim) skipped in store_best_per mode.
            self.write_log('mel_wave{}'.format(i),
                           self.audio_converter.feat_to_wave(m_p))
            self.write_log('linear_wave{}'.format(i),
                           self.audio_converter.feat_to_wave(l_p))
    # Write ground truth (once, at the very first validation)
    if self.step == 1:
        for i, (mel, linear, gt_txt) in enumerate(
                zip(sample_mel, sample_linear, sample_txt)):
            self.write_log('truth_text{}'.format(i),
                           self.tokenizer.decode(gt_txt.tolist()))
            self.write_log('mel_spec{}_gt'.format(i), feat_to_fig(mel))
            self.write_log('mel_wave{}_gt'.format(i),
                           self.audio_converter.feat_to_wave(mel))
            self.write_log('linear_spec{}_gt'.format(i), feat_to_fig(linear))
            self.write_log('linear_wave{}_gt'.format(i),
                           self.audio_converter.feat_to_wave(linear))
    self.write_log('speech_loss', {'dev': dev_tts_loss})
    self.write_log('per', {'dev': dev_per, 'dev_post': dev_post_per})
    self.write_log('codebook',
                   (self.model.codebook.embedding.weight.data,
                    self.tokenizer._vocab_list))
    #self.write_log('stop_err',{'dev':dev_stop_err})
    # Resume training
    self.model.train()
def validate(self, _dv_set, _name):
    """Evaluate on one dev set, log WER/CER and examples, checkpoint on improvement.

    Args:
        _dv_set: dev-set dataloader to iterate.
        _name: tag for this dev set, used in log keys and checkpoint names.

    Tracks three metrics per decoder ('att'/'ctc'): WER, CER, and the metric
    selected by ``self.val_mode`` which drives checkpointing against
    ``self.best_wer[task][_name]``. Restores train mode (and re-applies
    transfer-learning layer freezing) before returning.
    """
    # Eval mode
    self.model.eval()
    if self.emb_decoder is not None:
        self.emb_decoder.eval()
    dev_wer = {'att': [], 'ctc': []}
    dev_cer = {'att': [], 'ctc': []}
    dev_er = {'att': [], 'ctc': []}  # metric chosen by self.val_mode
    for i, data in enumerate(_dv_set):
        self.progress('Valid step - {}/{}'.format(i + 1, len(_dv_set)))
        # Fetch data
        feat, feat_len, txt, txt_len = self.fetch_data(data)
        # Forward model (decode length capped at DEV_STEP_RATIO x label length)
        with torch.no_grad():
            ctc_output, encode_len, att_output, att_align, dec_state = \
                self.model(feat, feat_len,
                           int(max(txt_len) * self.DEV_STEP_RATIO),
                           emb_decoder=self.emb_decoder)
        if att_output is not None:
            dev_wer['att'].append(
                cal_er(self.tokenizer, att_output, txt, mode='wer'))
            dev_cer['att'].append(
                cal_er(self.tokenizer, att_output, txt, mode='cer'))
            dev_er['att'].append(
                cal_er(self.tokenizer, att_output, txt, mode=self.val_mode))
        if ctc_output is not None:
            dev_wer['ctc'].append(
                cal_er(self.tokenizer, ctc_output, txt, mode='wer', ctc=True))
            dev_cer['ctc'].append(
                cal_er(self.tokenizer, ctc_output, txt, mode='cer', ctc=True))
            dev_er['ctc'].append(
                cal_er(self.tokenizer, ctc_output, txt, mode=self.val_mode,
                       ctc=True))
        # Show some example on tensorboard (median batch only)
        if i == len(_dv_set) // 2:
            # Fix: use a distinct index `j` — the original reused `i` here,
            # shadowing the outer enumerate counter.
            for j in range(min(len(txt), self.DEV_N_EXAMPLE)):
                if self.step == 1:
                    self.write_log('true_text_{}_{}'.format(_name, j),
                                   self.tokenizer.decode(txt[j].tolist()))
                if att_output is not None:
                    self.write_log(
                        'att_align_{}_{}'.format(_name, j),
                        feat_to_fig(att_align[j, 0, :, :].cpu().detach()))
                    self.write_log(
                        'att_text_{}_{}'.format(_name, j),
                        self.tokenizer.decode(
                            att_output[j].argmax(dim=-1).tolist()))
                if ctc_output is not None:
                    self.write_log(
                        'ctc_text_{}_{}'.format(_name, j),
                        self.tokenizer.decode(
                            ctc_output[j].argmax(dim=-1).tolist(),
                            ignore_repeat=True))
    # Ckpt if performance improves (only for decoders that produced output)
    tasks = []
    if len(dev_er['att']) > 0:
        tasks.append('att')
    if len(dev_er['ctc']) > 0:
        tasks.append('ctc')
    for task in tasks:
        dev_er[task] = sum(dev_er[task]) / len(dev_er[task])
        dev_wer[task] = sum(dev_wer[task]) / len(dev_wer[task])
        dev_cer[task] = sum(dev_cer[task]) / len(dev_cer[task])
        if dev_er[task] < self.best_wer[task][_name]:
            self.best_wer[task][_name] = dev_er[task]
            self.save_checkpoint(
                'best_{}_{}.pth'.format(
                    task,
                    _name + (self.save_name if self.transfer_learning else '')),
                self.val_mode, dev_er[task], _name)
        if self.step >= self.max_step:
            # Always snapshot at the final validation.
            self.save_checkpoint(
                'last_{}_{}.pth'.format(
                    task,
                    _name + (self.save_name if self.transfer_learning else '')),
                self.val_mode, dev_er[task], _name)
        self.write_log(self.WER,
                       {'dv_' + task + '_' + _name.lower(): dev_wer[task]})
        self.write_log('cer',
                       {'dv_' + task + '_' + _name.lower(): dev_cer[task]})
        # if self.transfer_learning:
        #     print('[{}] WER {:.4f} / CER {:.4f} on {}'.format(
        #         human_format(self.step), dev_wer[task], dev_cer[task], _name))
    # Resume training
    self.model.train()
    if self.transfer_learning:
        # Keep transferred layers frozen after switching back to train mode.
        self.model.encoder.fix_layers(self.fix_enc)
        if self.fix_dec and self.model.enable_att:
            self.model.decoder.fix_layers()
        if self.fix_dec and self.model.enable_ctc:
            self.model.fix_ctc_layer()
    if self.emb_decoder is not None:
        self.emb_decoder.train()