def evaluate(model, criterion, criterion_st, ap, global_step, epoch): data_loader = setup_loader(ap, is_val=True) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) model.eval() epoch_time = 0 avg_postnet_loss = 0 avg_decoder_loss = 0 avg_stop_loss = 0 print("\n > Validation") if c.test_sentences_file is None: test_sentences = [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", "I'm sorry Dave. I'm afraid I can't do that.", "This cake is great. It's so delicious and moist." ] else: with open(c.test_sentences_file, "r") as f: test_sentences = [s.strip() for s in f.readlines()] with torch.no_grad(): if data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() # setup input data text_input = data[0] text_lengths = data[1] speaker_names = data[2] linear_input = data[3] if c.model in [ "Tacotron", "TacotronGST" ] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] if c.use_speaker_embedding: speaker_ids = [ speaker_mapping[speaker_name] for speaker_name in speaker_names ] speaker_ids = torch.LongTensor(speaker_ids) else: speaker_ids = None # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) # dispatch data to GPU if use_cuda: text_input = text_input.cuda() mel_input = mel_input.cuda() mel_lengths = mel_lengths.cuda() linear_input = linear_input.cuda() if c.model in [ "Tacotron", "TacotronGST" ] else None stop_targets = stop_targets.cuda() if speaker_ids is not None: speaker_ids = speaker_ids.cuda() # forward pass decoder_output, postnet_output, alignments, stop_tokens =\ model.forward(text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation stop_loss = criterion_st( stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss + stop_loss step_time = time.time() - start_time epoch_time += step_time if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} " "StopLoss: {:.5f} ".format(loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item()), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) if c.stopnet: stop_loss = reduce_tensor(stop_loss.data, num_gpus) avg_postnet_loss += float(postnet_loss.item()) avg_decoder_loss += float(decoder_loss.item()) avg_stop_loss += stop_loss.item() if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_eval_figures(global_step, eval_figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # compute average losses avg_postnet_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) # Plot Validation Stats epoch_stats = { "loss_postnet": avg_postnet_loss, "loss_decoder": avg_decoder_loss, "stop_loss": avg_stop_loss } tb_logger.tb_eval_stats(global_step, epoch_stats) if args.rank == 0 and epoch > c.test_delay_epochs: # test sentences test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") speaker_id = 0 if c.use_speaker_embedding else None style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap, speaker_id=speaker_id, style_wav=style_wav) file_path = os.path.join(AUDIO_PATH, str(global_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav test_figures['{}-prediction'.format(idx)] = plot_spectrogram( postnet_output, ap) test_figures['{}-alignment'.format(idx)] = plot_alignment( alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) return avg_postnet_loss
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, epoch, use_half=False): data_loader = setup_loader(is_val=False, verbose=(epoch == 0), use_half=use_half) model.train() epoch_time = 0 avg_postnet_loss = 0 avg_decoder_loss = 0 avg_stop_loss = 0 avg_step_time = 0 print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) start_time = time.time() for num_iter, data in enumerate(data_loader): # setup input data text_input = data[0] text_lengths = data[1] linear_input = data[2] if c.model == "Tacotron" else None mel_input = data[3] if not use_half else data[3].type(torch.half) mel_lengths = data[4] if not use_half else data[4].type(torch.half) stop_targets = data[5] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) stop_targets = stop_targets if not use_half else stop_targets.type( torch.half) current_step = num_iter + args.restore_step + \ epoch * len(data_loader) + 1 # setup lr if c.lr_decay: scheduler.step() optimizer.zero_grad() if optimizer_st: optimizer_st.zero_grad() # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) linear_input = linear_input.cuda( non_blocking=True) if c.model == "Tacotron" else None stop_targets = stop_targets.cuda(non_blocking=True) decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model == "Tacotron": postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model == "Tacotron": postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) USE_HALF_LOSS_SCALE = 10.0 if use_half: postnet_loss = postnet_loss * USE_HALF_LOSS_SCALE decoder_loss = decoder_loss * USE_HALF_LOSS_SCALE loss = decoder_loss + postnet_loss if not c.separate_stopnet and c.stopnet: loss += stop_loss loss.backward() optimizer, current_lr = weight_decay(optimizer, c.wd) grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() # backpass and check the grad norm for stop loss if c.separate_stopnet: USE_HALF_STOP_LOSS_SCALE = 1 stop_loss = stop_loss * USE_HALF_STOP_LOSS_SCALE stop_loss.backward() optimizer_st, _ = weight_decay(optimizer_st, c.wd) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: grad_norm_st = 0 step_time = time.time() - start_time start_time = time.time() epoch_time += step_time if current_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}" .format(num_iter, batch_n_iter, current_step, loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item(), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss if args.rank == 0: avg_postnet_loss += float(postnet_loss.item()) avg_decoder_loss += float(decoder_loss.item()) avg_stop_loss += stop_loss if type(stop_loss) is float else float( stop_loss.item()) avg_step_time += step_time # Plot Training Iter Stats iter_stats = { "loss_posnet": postnet_loss.item(), "loss_decoder": decoder_loss.item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time } tb_logger.tb_train_iter_stats(current_step, iter_stats) if current_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, postnet_loss.item(), OUT_PATH, current_step, epoch) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().type( torch.float).numpy() gt_spec = linear_input[0].data.cpu().type(torch.float).numpy( ) if c.model == "Tacotron" else mel_input[0].data.cpu().type( torch.float).numpy() align_img = alignments[0].data.cpu().type(torch.float).numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_train_figures(current_step, figures) # Sample audio if c.model == "Tacotron": train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_train_audios(current_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) avg_postnet_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss avg_step_time /= (num_iter + 1) # print epoch stats print(" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f}".format(current_step, avg_total_loss, avg_postnet_loss, avg_decoder_loss, avg_stop_loss, epoch_time, avg_step_time), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": avg_postnet_loss, "loss_decoder": avg_decoder_loss, "stop_loss": avg_stop_loss, "epoch_time": epoch_time } tb_logger.tb_train_epoch_stats(current_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, current_step) return avg_postnet_loss, current_step
def evaluate(model, criterion, criterion_st, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) model.eval() epoch_time = 0 eval_values_dict = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, 'avg_stop_loss': 0, 'avg_align_score': 0 } if c.bidirectional_decoder: eval_values_dict['avg_decoder_b_loss'] = 0 # decoder backward loss eval_values_dict['avg_decoder_c_loss'] = 0 # decoder consistency loss keep_avg = KeepAverage() keep_avg.add_values(eval_values_dict) print("\n > Validation") with torch.no_grad(): if data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() # format data text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data( data) assert mel_input.shape[1] % model.decoder.r == 0 # forward pass model if c.bidirectional_decoder: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation stop_loss = criterion_st( stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss + stop_loss # backward decoder loss if c.bidirectional_decoder: if c.loss_masking: decoder_backward_loss = criterion( torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) else: decoder_backward_loss = criterion( torch.flip(decoder_backward_output, dims=(1, )), mel_input) decoder_c_loss = torch.nn.functional.l1_loss( torch.flip(decoder_backward_output, dims=(1, )), decoder_output) loss += decoder_backward_loss + decoder_c_loss keep_avg.update_values({ 'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item() }) step_time = time.time() - start_time epoch_time += step_time # compute alignment score align_score = alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_score', align_score) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) if c.stopnet: stop_loss = reduce_tensor(stop_loss.data, num_gpus) keep_avg.update_values({ 'avg_postnet_loss': float(postnet_loss.item()), 'avg_decoder_loss': float(decoder_loss.item()), 'avg_stop_loss': float(stop_loss.item()), }) if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} " "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}" .format(loss.item(), postnet_loss.item(), keep_avg['avg_postnet_loss'], decoder_loss.item(), keep_avg['avg_decoder_loss'], stop_loss.item(), keep_avg['avg_stop_loss'], align_score, keep_avg['avg_align_score']), flush=True) if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } # Sample audio if c.model in ["Tacotron", "TacotronGST"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # Plot Validation Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stop_loss": keep_avg['avg_stop_loss'], "alignment_score": keep_avg['avg_align_score'] } if c.bidirectional_decoder: epoch_stats['loss_decoder_backward'] = keep_avg[ 'avg_decoder_b_loss'] align_b_img = alignments_backward[idx].data.cpu().numpy() eval_figures['alignment_backward'] = plot_alignment( align_b_img) tb_logger.tb_eval_stats(global_step, epoch_stats) tb_logger.tb_eval_figures(global_step, eval_figures) if args.rank == 0 and epoch > c.test_delay_epochs: if c.test_sentences_file is None: test_sentences = [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", "I'm sorry Dave. I'm afraid I can't do that.", "This cake is great. It's so delicious and moist." ] else: with open(c.test_sentences_file, "r") as f: test_sentences = [s.strip() for s in f.readlines()] # test sentences test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") speaker_id = 0 if c.use_speaker_embedding else None style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap, speaker_id=speaker_id, style_wav=style_wav) file_path = os.path.join(AUDIO_PATH, str(global_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav test_figures['{}-prediction'.format(idx)] = plot_spectrogram( postnet_output, ap) test_figures['{}-alignment'.format(idx)] = plot_alignment( alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) return keep_avg['avg_postnet_loss']
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, global_step, epoch, criterion_gst=None, optimizer_gst=None): data_loader = setup_loader(ap, is_val=False, verbose=(epoch == 0)) if c.use_speaker_embedding: speaker_mapping = load_speaker_mapping(OUT_PATH) model.train() epoch_time = 0 avg_postnet_loss = 0 avg_decoder_loss = 0 avg_stop_loss = 0 avg_gst_loss = 0 avg_step_time = 0 avg_loader_time = 0 print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) if use_cuda: batch_n_iter = int(len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() for num_iter, data in enumerate(data_loader): start_time = time.time() # setup input data text_input = data[0] text_lengths = data[1] speaker_names = data[2] linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None mel_input = data[4] mel_lengths = data[5] stop_targets = data[6] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) loader_time = time.time() - end_time if c.use_speaker_embedding: speaker_ids = [speaker_mapping[speaker_name] for speaker_name in speaker_names] speaker_ids = torch.LongTensor(speaker_ids) else: speaker_ids = None # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) global_step += 1 # setup lr if c.lr_decay: scheduler.step() optimizer.zero_grad() if optimizer_gst: optimizer_gst.zero_grad() if optimizer_st: optimizer_st.zero_grad() # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None stop_targets = stop_targets.cuda(non_blocking=True) if speaker_ids is not None: speaker_ids = speaker_ids.cuda(non_blocking=True) # forward pass model decoder_output, postnet_output, alignments, stop_tokens, text_gst = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) gst_loss = torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss if not c.separate_stopnet and c.stopnet: loss += stop_loss if c.text_gst and criterion_gst and optimizer_gst: mel_gst, _ = model.gst(mel_input) gst_loss = criterion_gst(text_gst, mel_gst.squeeze().detach()) gst_loss.backward() optimizer_gst.step() loss.backward() optimizer, current_lr = weight_decay(optimizer, c.wd) grad_norm, _ = check_update(model, c.grad_clip) optimizer.step() # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() optimizer_st, _ = weight_decay(optimizer_st, c.wd) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: grad_norm_st = 0 step_time = time.time() - start_time epoch_time += step_time if global_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} GSTLoss:{:.5f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item(), gst_loss.item(), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) gst_loss = reduce_tensor(gst_loss.data, num_gpus) if c.text_gst else gst_loss loss = reduce_tensor(loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss if args.rank == 0: avg_postnet_loss += float(postnet_loss.item()) avg_decoder_loss += float(decoder_loss.item()) avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()) avg_gst_loss += float(gst_loss.item()) avg_step_time += step_time avg_loader_time += loader_time # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: iter_stats = {"loss_posnet": postnet_loss.item(), "loss_decoder": decoder_loss.item(), "gst_loss" : gst_loss.item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time} tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, optimizer_gst, postnet_loss.item(), OUT_PATH, global_step, epoch) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_train_figures(global_step, figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) end_time = time.time() avg_postnet_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) avg_gst_loss /= (num_iter + 1) avg_total_loss = avg_decoder_loss + avg_postnet_loss + avg_stop_loss avg_step_time /= (num_iter + 1) avg_loader_time /= (num_iter + 1) # print epoch stats print( " | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} AvgGSTLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format(global_step, avg_total_loss, avg_postnet_loss, avg_decoder_loss, avg_gst_loss, avg_stop_loss, epoch_time, avg_step_time, avg_loader_time), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = {"loss_postnet": avg_postnet_loss, "loss_decoder": avg_decoder_loss, "stop_loss": avg_stop_loss, "gst_loss" : avg_gst_loss, "epoch_time": epoch_time} tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) return avg_postnet_loss, global_step
def evaluate(model, criterion, criterion_st, ap, current_step, epoch): data_loader = setup_loader(is_val=True) model.eval() epoch_time = 0 avg_postnet_loss = 0 avg_decoder_loss = 0 avg_stop_loss = 0 print("\n > Validation") if c.test_sentences_file is None: test_sentences = [ "Evinizde çocuklar televizyonun karşısına dizilmiş oturuyorlar.", "Karşınızda reklamlara çıkan çocukların elinde çikulatalar, püskevitler, birbirlerine ikram ediyorlar, birbirleriyle yiyorlar, şakalaşıyorlar.", "O çocuk aklından geçiriyor 'benim de bir çikulatam olsa, benim de bir püskevitim olsa' diyor.", "Anne bana niye almıyorsunuz diyor, bizde niye yok diyor." ] else: with open(c.test_sentences_file, "r") as f: test_sentences = [s.strip() for s in f.readlines()] with torch.no_grad(): if data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() # setup input data text_input = data[0] text_lengths = data[1] linear_input = data[2] if c.model == "Tacotron" else None mel_input = data[3] mel_lengths = data[4] stop_targets = data[5] # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) # dispatch data to GPU if use_cuda: text_input = text_input.cuda() mel_input = mel_input.cuda() mel_lengths = mel_lengths.cuda() linear_input = linear_input.cuda( ) if c.model == "Tacotron" else None stop_targets = stop_targets.cuda() # forward pass decoder_output, postnet_output, alignments, stop_tokens =\ model.forward(text_input, text_lengths, mel_input) # loss computation stop_loss = criterion_st( stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model == "Tacotron": postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model == "Tacotron": postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss + stop_loss step_time = time.time() - start_time epoch_time += step_time if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} " "StopLoss: {:.5f} ".format(loss.item(), postnet_loss.item(), decoder_loss.item(), stop_loss.item()), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) if c.stopnet: stop_loss = reduce_tensor(stop_loss.data, num_gpus) avg_postnet_loss += float(postnet_loss.item()) avg_decoder_loss += float(decoder_loss.item()) avg_stop_loss += stop_loss.item() if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy( ) if c.model == "Tacotron" else mel_input[idx].data.cpu( ).numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_eval_figures(current_step, eval_figures) # Sample audio if c.model == "Tacotron": eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # compute average losses avg_postnet_loss /= (num_iter + 1) avg_decoder_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) # Plot Validation Stats epoch_stats = { "loss_postnet": avg_postnet_loss, "loss_decoder": avg_decoder_loss, "stop_loss": avg_stop_loss } tb_logger.tb_eval_stats(current_step, epoch_stats) if args.rank == 0 and epoch > c.test_delay_epochs: # test sentences test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap) file_path = os.path.join(AUDIO_PATH, str(current_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav test_figures['{}-prediction'.format(idx)] = plot_spectrogram( postnet_output, ap) test_figures['{}-alignment'.format(idx)] = plot_alignment( alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(current_step, test_figures) return avg_postnet_loss
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 train_values = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, 'avg_stop_loss': 0, 'avg_align_score': 0, 'avg_step_time': 0, 'avg_loader_time': 0, 'avg_alignment_score': 0 } if c.bidirectional_decoder: train_values['avg_decoder_b_loss'] = 0 # decoder backward loss train_values['avg_decoder_c_loss'] = 0 # decoder consistency loss keep_avg = KeepAverage() keep_avg.add_values(train_values) print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) if use_cuda: batch_n_iter = int( len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() for num_iter, data in enumerate(data_loader): start_time = time.time() # format data text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data( data) loader_time = time.time() - end_time global_step += 1 # setup lr if c.noam_schedule: scheduler.step() optimizer.zero_grad() if optimizer_st: optimizer_st.zero_grad() # forward pass model if c.bidirectional_decoder: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1) if c.loss_masking: decoder_loss = criterion(decoder_output, mel_input, mel_lengths) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input, mel_lengths) else: postnet_loss = criterion(postnet_output, mel_input, mel_lengths) else: decoder_loss = criterion(decoder_output, mel_input) if c.model in ["Tacotron", "TacotronGST"]: postnet_loss = criterion(postnet_output, linear_input) else: postnet_loss = criterion(postnet_output, mel_input) loss = decoder_loss + postnet_loss if not c.separate_stopnet and c.stopnet: loss += stop_loss # backward decoder if c.bidirectional_decoder: if c.loss_masking: decoder_backward_loss = criterion( torch.flip(decoder_backward_output, dims=(1, )), mel_input, mel_lengths) else: decoder_backward_loss = criterion( torch.flip(decoder_backward_output, dims=(1, )), mel_input) decoder_c_loss = torch.nn.functional.l1_loss( torch.flip(decoder_backward_output, dims=(1, )), decoder_output) loss += decoder_backward_loss + decoder_c_loss keep_avg.update_values({ 'avg_decoder_b_loss': decoder_backward_loss.item(), 'avg_decoder_c_loss': decoder_c_loss.item() }) loss.backward() optimizer, current_lr = adam_weight_decay(optimizer) grad_norm, grad_flag = check_update(model, c.grad_clip, ignore_stopnet=True) optimizer.step() # compute alignment score align_score = alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_score', align_score) # backpass and check the grad norm for stop loss if c.separate_stopnet: stop_loss.backward() optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: grad_norm_st = 0 step_time = time.time() - start_time epoch_time += step_time if global_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} PostnetLoss:{:.5f} " "DecoderLoss:{:.5f} StopLoss:{:.5f} AlignScore:{:.4f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} " "LoaderTime:{:.2f} LR:{:.6f}".format( num_iter, batch_n_iter, global_step, postnet_loss.item(), decoder_loss.item(), stop_loss.item(), align_score, grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, loader_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: postnet_loss = reduce_tensor(postnet_loss.data, num_gpus) decoder_loss = reduce_tensor(decoder_loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) if c.stopnet else stop_loss if args.rank == 0: update_train_values = { 'avg_postnet_loss': float(postnet_loss.item()), 'avg_decoder_loss': float(decoder_loss.item()), 'avg_stop_loss': stop_loss if isinstance(stop_loss, float) else float(stop_loss.item()), 'avg_step_time': step_time, 'avg_loader_time': loader_time } keep_avg.update_values(update_train_values) # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: iter_stats = { "loss_posnet": postnet_loss.item(), "loss_decoder": decoder_loss.item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time } tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, postnet_loss.item(), OUT_PATH, global_step, epoch) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img), } if c.bidirectional_decoder: figures["alignment_backward"] = plot_alignment( alignments_backward[0].data.cpu().numpy()) tb_logger.tb_train_figures(global_step, figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_mel_spectrogram(const_spec.T) tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) end_time = time.time() # print epoch stats print(" | > EPOCH END -- GlobalStep:{} " "AvgPostnetLoss:{:.5f} AvgDecoderLoss:{:.5f} " "AvgStopLoss:{:.5f} AvgAlignScore:{:3f} EpochTime:{:.2f} " "AvgStepTime:{:.2f} AvgLoaderTime:{:.2f}".format( global_step, keep_avg['avg_postnet_loss'], keep_avg['avg_decoder_loss'], keep_avg['avg_stop_loss'], keep_avg['avg_align_score'], epoch_time, keep_avg['avg_step_time'], keep_avg['avg_loader_time']), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stop_loss": keep_avg['avg_stop_loss'], "alignment_score": keep_avg['avg_align_score'], "epoch_time": epoch_time } tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) return keep_avg['avg_postnet_loss'], global_step
def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler, ap, epoch): data_loader = setup_loader(is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 avg_linear_loss = 0 avg_mel_loss = 0 avg_stop_loss = 0 avg_step_time = 0 print("\n > Epoch {}/{}".format(epoch, c.epochs), flush=True) n_priority_freq = int(3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq']) if num_gpus > 0: batch_n_iter = int( len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) for num_iter, data in enumerate(data_loader): start_time = time.time() # setup input data text_input = data[0] text_lengths = data[1] linear_input = data[2] mel_input = data[3] mel_lengths = data[4] stop_targets = data[5] avg_text_length = torch.mean(text_lengths.float()) avg_spec_length = torch.mean(mel_lengths.float()) # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float() current_step = num_iter + args.restore_step + \ epoch * len(data_loader) + 1 # setup lr if c.lr_decay: scheduler.step() optimizer.zero_grad() optimizer_st.zero_grad() # dispatch data to GPU if use_cuda: text_input = text_input.cuda(non_blocking=True) text_lengths = text_lengths.cuda(non_blocking=True) mel_input = mel_input.cuda(non_blocking=True) mel_lengths = mel_lengths.cuda(non_blocking=True) linear_input = linear_input.cuda(non_blocking=True) stop_targets = stop_targets.cuda(non_blocking=True) # compute mask for padding mask = sequence_mask(text_lengths) # forward pass mel_output, linear_output, alignments, stop_tokens = model( text_input, mel_input, mask) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) mel_loss = criterion(mel_output, mel_input, mel_lengths) linear_loss = (1 - c.loss_weight) * criterion(linear_output, linear_input, mel_lengths)\ + c.loss_weight * criterion(linear_output[:, :, :n_priority_freq], linear_input[:, :, :n_priority_freq], mel_lengths) loss = mel_loss + linear_loss # backpass and check the grad norm for spec losses loss.backward(retain_graph=True) optimizer, current_lr = weight_decay(optimizer, c.wd) grad_norm, _ = check_update(model, 1.0) optimizer.step() # backpass and check the grad norm for stop loss stop_loss.backward() optimizer_st, _ = weight_decay(optimizer_st, c.wd) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() step_time = time.time() - start_time epoch_time += step_time if current_step % c.print_step == 0: print( " | > Step:{}/{} GlobalStep:{} TotalLoss:{:.5f} LinearLoss:{:.5f} " "MelLoss:{:.5f} StopLoss:{:.5f} GradNorm:{:.5f} " "GradNormST:{:.5f} AvgTextLen:{:.1f} AvgSpecLen:{:.1f} StepTime:{:.2f} LR:{:.6f}" .format(num_iter, batch_n_iter, current_step, loss.item(), linear_loss.item(), mel_loss.item(), stop_loss.item(), grad_norm, grad_norm_st, avg_text_length, avg_spec_length, step_time, current_lr), flush=True) # aggregate losses from processes if num_gpus > 1: linear_loss = reduce_tensor(linear_loss.data, num_gpus) mel_loss = reduce_tensor(mel_loss.data, num_gpus) loss = reduce_tensor(loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) if args.rank == 0: avg_linear_loss += float(linear_loss.item()) avg_mel_loss += float(mel_loss.item()) avg_stop_loss += stop_loss.item() avg_step_time += step_time # Plot Training Iter Stats iter_stats = { "loss_posnet": linear_loss.item(), "loss_decoder": mel_loss.item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time } tb_logger.tb_train_iter_stats(current_step, iter_stats) if current_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint(model, optimizer, optimizer_st, linear_loss.item(), OUT_PATH, current_step, epoch) # Diagnostic visualizations const_spec = linear_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_train_figures(current_step, figures) # Sample audio tb_logger.tb_train_audios( current_step, {'TrainAudio': ap.inv_spectrogram(const_spec.T)}, c.audio["sample_rate"]) avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss avg_step_time /= (num_iter + 1) # print epoch stats print(" | > EPOCH END -- GlobalStep:{} AvgTotalLoss:{:.5f} " "AvgLinearLoss:{:.5f} AvgMelLoss:{:.5f} " "AvgStopLoss:{:.5f} EpochTime:{:.2f} " "AvgStepTime:{:.2f}".format(current_step, avg_total_loss, avg_linear_loss, avg_mel_loss, avg_stop_loss, epoch_time, avg_step_time), flush=True) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": avg_linear_loss, "loss_decoder": avg_mel_loss, "stop_loss": avg_stop_loss, "epoch_time": epoch_time } tb_logger.tb_train_epoch_stats(current_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, current_step) return avg_linear_loss, current_step
def evaluate(model, criterion, criterion_st, ap, current_step, epoch): data_loader = setup_loader(is_val=True) model.eval() epoch_time = 0 avg_linear_loss = 0 avg_mel_loss = 0 avg_stop_loss = 0 print("\n > Validation") test_sentences = [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", "I'm sorry Dave. I'm afraid I can't do that.", "This cake is great. It's so delicious and moist." ] n_priority_freq = int(3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq']) with torch.no_grad(): if data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() # setup input data text_input = data[0] text_lengths = data[1] linear_input = data[2] mel_input = data[3] mel_lengths = data[4] stop_targets = data[5] # set stop targets view, we predict a single stop token per r frames prediction stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // c.r, -1) stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float() # dispatch data to GPU if use_cuda: text_input = text_input.cuda() mel_input = mel_input.cuda() mel_lengths = mel_lengths.cuda() linear_input = linear_input.cuda() stop_targets = stop_targets.cuda() # forward pass mel_output, linear_output, alignments, stop_tokens =\ model.forward(text_input, mel_input) # loss computation stop_loss = criterion_st(stop_tokens, stop_targets) mel_loss = criterion(mel_output, mel_input, mel_lengths) linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \ + 0.5 * criterion(linear_output[:, :, :n_priority_freq], linear_input[:, :, :n_priority_freq], mel_lengths) loss = mel_loss + linear_loss + stop_loss step_time = time.time() - start_time epoch_time += step_time if num_iter % c.print_step == 0: print( " | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} " "StopLoss: {:.5f} ".format(loss.item(), linear_loss.item(), mel_loss.item(), stop_loss.item()), flush=True) # aggregate losses from processes if num_gpus > 1: linear_loss = reduce_tensor(linear_loss.data, num_gpus) mel_loss = reduce_tensor(mel_loss.data, num_gpus) stop_loss = reduce_tensor(stop_loss.data, num_gpus) avg_linear_loss += float(linear_loss.item()) avg_mel_loss += float(mel_loss.item()) avg_stop_loss += stop_loss.item() if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = linear_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } tb_logger.tb_eval_figures(current_step, eval_figures) # Sample audio tb_logger.tb_eval_audios( current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)}, c.audio["sample_rate"]) # compute average losses avg_linear_loss /= (num_iter + 1) avg_mel_loss /= (num_iter + 1) avg_stop_loss /= (num_iter + 1) # Plot Validation Stats epoch_stats = { "loss_postnet": avg_linear_loss, "loss_decoder": avg_mel_loss, "stop_loss": avg_stop_loss } tb_logger.tb_eval_stats(current_step, epoch_stats) if args.rank == 0 and epoch > c.test_delay_epochs: # test sentences test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, linear_spec, _, stop_tokens = synthesis( model, test_sentence, c, use_cuda, ap) file_path = os.path.join(AUDIO_PATH, str(current_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav test_figures['{}-prediction'.format(idx)] = plot_spectrogram( linear_spec, ap) test_figures['{}-alignment'.format(idx)] = plot_alignment( alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(current_step, test_figures) return avg_linear_loss
def evaluate(model, criterion, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) model.eval() epoch_time = 0 eval_values_dict = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, 'avg_stopnet_loss': 0, 'avg_align_error': 0 } if c.bidirectional_decoder: eval_values_dict['avg_decoder_b_loss'] = 0 # decoder backward loss eval_values_dict['avg_decoder_c_loss'] = 0 # decoder consistency loss if c.ga_alpha > 0: eval_values_dict['avg_ga_loss'] = 0 # guidede attention loss keep_avg = KeepAverage() keep_avg.add_values(eval_values_dict) c_logger.print_eval_start() if data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() # format data text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data( data) assert mel_input.shape[1] % model.decoder.r == 0 # forward pass model if c.bidirectional_decoder: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) decoder_backward_output = None # set the alignment lengths wrt reduction factor for guided attention if mel_lengths.max() % model.decoder.r != 0: alignment_lengths = ( mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r else: alignment_lengths = mel_lengths // model.decoder.r # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, linear_input, stop_tokens, stop_targets, mel_lengths, decoder_backward_output, alignments, alignment_lengths, text_lengths) if c.bidirectional_decoder: keep_avg.update_values({ 'avg_decoder_b_loss': loss_dict['decoder_b_loss'].item(), 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item() }) if c.ga_alpha > 0: keep_avg.update_values( {'avg_ga_loss': loss_dict['ga_loss'].item()}) # step time step_time = time.time() - start_time epoch_time += step_time # compute alignment score align_error = 1 - alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_error', align_error) # aggregate losses from processes if num_gpus > 1: loss_dict['postnet_loss'] = reduce_tensor( loss_dict['postnet_loss'].data, num_gpus) loss_dict['decoder_loss'] = reduce_tensor( loss_dict['decoder_loss'].data, num_gpus) if c.stopnet: loss_dict['stopnet_loss'] = reduce_tensor( loss_dict['stopnet_loss'].data, num_gpus) keep_avg.update_values({ 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), 'avg_stopnet_loss': float(loss_dict['stopnet_loss'].item()), }) if c.print_eval: c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } # Sample audio if c.model in ["Tacotron", "TacotronGST"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_melspectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # Plot Validation Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stopnet_loss": keep_avg['avg_stopnet_loss'], "alignment_score": keep_avg['avg_align_error'], } if c.bidirectional_decoder: epoch_stats['loss_decoder_backward'] = keep_avg[ 'avg_decoder_b_loss'] align_b_img = alignments_backward[idx].data.cpu().numpy() eval_figures['alignment_backward'] = plot_alignment( align_b_img) if c.ga_alpha > 0: epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss'] tb_logger.tb_eval_stats(global_step, epoch_stats) tb_logger.tb_eval_figures(global_step, eval_figures) if args.rank == 0 and epoch > c.test_delay_epochs: if c.test_sentences_file is None: test_sentences = [ "Con la mia voce posso dire cose splendide.", "Ciao Marco ed Alice, come state?", "Ora che ho una voce, voglio solo parlare.", "Tra tutte le cose che ho letto, in tanti anni, questo libro è davvero il mio preferito." ] else: with open(c.test_sentences_file, "r") as f: test_sentences = [s.strip() for s in f.readlines()] # test sentences test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") speaker_id = 0 if c.use_speaker_embedding else None style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis( model, test_sentence, c, use_cuda, ap, speaker_id=speaker_id, style_wav=style_wav, truncated=False, enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument use_griffin_lim=True, do_trim_silence=False) file_path = os.path.join(AUDIO_PATH, str(global_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav test_figures['{}-prediction'.format(idx)] = plot_spectrogram( postnet_output, ap) test_figures['{}-alignment'.format(idx)] = plot_alignment( alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) return keep_avg.avg_values
def train(model, criterion, optimizer, optimizer_st, scheduler, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=False, verbose=(epoch == 0)) model.train() epoch_time = 0 train_values = { 'avg_postnet_loss': 0, 'avg_decoder_loss': 0, 'avg_stopnet_loss': 0, 'avg_align_error': 0, 'avg_step_time': 0, 'avg_loader_time': 0 } if c.bidirectional_decoder: train_values['avg_decoder_b_loss'] = 0 # decoder backward loss train_values['avg_decoder_c_loss'] = 0 # decoder consistency loss if c.ga_alpha > 0: train_values['avg_ga_loss'] = 0 # guidede attention loss keep_avg = KeepAverage() keep_avg.add_values(train_values) if use_cuda: batch_n_iter = int( len(data_loader.dataset) / (c.batch_size * num_gpus)) else: batch_n_iter = int(len(data_loader.dataset) / c.batch_size) end_time = time.time() c_logger.print_train_start() for num_iter, data in enumerate(data_loader): start_time = time.time() # format data text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, avg_text_length, avg_spec_length = format_data( data) loader_time = time.time() - end_time global_step += 1 # setup lr if c.noam_schedule: scheduler.step() optimizer.zero_grad() if optimizer_st: optimizer_st.zero_grad() # forward pass model if c.bidirectional_decoder: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) decoder_backward_output = None # set the alignment lengths wrt reduction factor for guided attention if mel_lengths.max() % model.decoder.r != 0: alignment_lengths = ( mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r else: alignment_lengths = mel_lengths // model.decoder.r # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, linear_input, stop_tokens, stop_targets, mel_lengths, decoder_backward_output, alignments, alignment_lengths, text_lengths) if c.bidirectional_decoder: keep_avg.update_values({ 'avg_decoder_b_loss': loss_dict['decoder_backward_loss'].item(), 'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item() }) if c.ga_alpha > 0: keep_avg.update_values( {'avg_ga_loss': loss_dict['ga_loss'].item()}) # backward pass loss_dict['loss'].backward() optimizer, current_lr = adam_weight_decay(optimizer) grad_norm, _ = check_update(model, c.grad_clip, ignore_stopnet=True) optimizer.step() # compute alignment error (the lower the better ) align_error = 1 - alignment_diagonal_score(alignments) keep_avg.update_value('avg_align_error', align_error) loss_dict['align_error'] = align_error # backpass and check the grad norm for stop loss if c.separate_stopnet: loss_dict['stopnet_loss'].backward() optimizer_st, _ = adam_weight_decay(optimizer_st) grad_norm_st, _ = check_update(model.decoder.stopnet, 1.0) optimizer_st.step() else: grad_norm_st = 0 step_time = time.time() - start_time epoch_time += step_time # update avg stats update_train_values = { 'avg_postnet_loss': float(loss_dict['postnet_loss'].item()), 'avg_decoder_loss': float(loss_dict['decoder_loss'].item()), 'avg_stopnet_loss': loss_dict['stopnet_loss'].item() \ if isinstance(loss_dict['stopnet_loss'], float) else float(loss_dict['stopnet_loss'].item()), 'avg_step_time': step_time, 'avg_loader_time': loader_time } keep_avg.update_values(update_train_values) if global_step % c.print_step == 0: c_logger.print_train_step(batch_n_iter, num_iter, global_step, avg_spec_length, avg_text_length, step_time, loader_time, current_lr, loss_dict, keep_avg.avg_values) # aggregate losses from processes if num_gpus > 1: loss_dict['postnet_loss'] = reduce_tensor( loss_dict['postnet_loss'].data, num_gpus) loss_dict['decoder_loss'] = reduce_tensor( loss_dict['decoder_loss'].data, num_gpus) loss_dict['loss'] = reduce_tensor(loss_dict['loss'].data, num_gpus) loss_dict['stopnet_loss'] = reduce_tensor( loss_dict['stopnet_loss'].data, num_gpus) if c.stopnet else loss_dict['stopnet_loss'] if args.rank == 0: # Plot Training Iter Stats # reduce TB load if global_step % 10 == 0: iter_stats = { "loss_posnet": loss_dict['postnet_loss'].item(), "loss_decoder": loss_dict['decoder_loss'].item(), "lr": current_lr, "grad_norm": grad_norm, "grad_norm_st": grad_norm_st, "step_time": step_time } tb_logger.tb_train_iter_stats(global_step, iter_stats) if global_step % c.save_step == 0: if c.checkpoint: # save model save_checkpoint( model, optimizer, global_step, epoch, model.decoder.r, OUT_PATH, optimizer_st=optimizer_st, model_loss=loss_dict['postnet_loss'].item()) # Diagnostic visualizations const_spec = postnet_output[0].data.cpu().numpy() gt_spec = linear_input[0].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[0].data.cpu().numpy() align_img = alignments[0].data.cpu().numpy() figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img), } if c.bidirectional_decoder: figures["alignment_backward"] = plot_alignment( alignments_backward[0].data.cpu().numpy()) tb_logger.tb_train_figures(global_step, figures) # Sample audio if c.model in ["Tacotron", "TacotronGST"]: train_audio = ap.inv_spectrogram(const_spec.T) else: train_audio = ap.inv_melspectrogram(const_spec.T) tb_logger.tb_train_audios(global_step, {'TrainAudio': train_audio}, c.audio["sample_rate"]) end_time = time.time() # print epoch stats c_logger.print_train_epoch_end(global_step, epoch, epoch_time, keep_avg) # Plot Epoch Stats if args.rank == 0: # Plot Training Epoch Stats epoch_stats = { "loss_postnet": keep_avg['avg_postnet_loss'], "loss_decoder": keep_avg['avg_decoder_loss'], "stopnet_loss": keep_avg['avg_stopnet_loss'], "alignment_score": keep_avg['avg_align_error'], "epoch_time": epoch_time } if c.ga_alpha > 0: epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss'] tb_logger.tb_train_epoch_stats(global_step, epoch_stats) if c.tb_model_param_stats: tb_logger.tb_model_weights(model, global_step) return keep_avg.avg_values, global_step
def evaluate(model, criterion, ap, global_step, epoch): data_loader = setup_loader(ap, model.decoder.r, is_val=True) model.eval() epoch_time = 0 keep_avg = KeepAverage() c_logger.print_eval_start() if data_loader is not None: for num_iter, data in enumerate(data_loader): start_time = time.time() # format data text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data( data) assert mel_input.shape[1] % model.decoder.r == 0 # forward pass model if c.bidirectional_decoder or c.double_decoder_consistency: decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) else: decoder_output, postnet_output, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) decoder_backward_output = None alignments_backward = None # set the alignment lengths wrt reduction factor for guided attention if mel_lengths.max() % model.decoder.r != 0: alignment_lengths = ( mel_lengths + (model.decoder.r - (mel_lengths.max() % model.decoder.r))) // model.decoder.r else: alignment_lengths = mel_lengths // model.decoder.r # compute loss loss_dict = criterion(postnet_output, decoder_output, mel_input, linear_input, stop_tokens, stop_targets, mel_lengths, decoder_backward_output, alignments, alignment_lengths, alignments_backward, text_lengths) # step time step_time = time.time() - start_time epoch_time += step_time # compute alignment score align_error = 1 - alignment_diagonal_score(alignments) loss_dict['align_error'] = align_error # aggregate losses from processes if num_gpus > 1: loss_dict['postnet_loss'] = reduce_tensor( loss_dict['postnet_loss'].data, num_gpus) loss_dict['decoder_loss'] = reduce_tensor( loss_dict['decoder_loss'].data, num_gpus) if c.stopnet: loss_dict['stopnet_loss'] = reduce_tensor( loss_dict['stopnet_loss'].data, num_gpus) # detach loss values loss_dict_new = dict() for key, value in loss_dict.items(): if isinstance(value, (int, float)): loss_dict_new[key] = value else: loss_dict_new[key] = value.item() loss_dict = loss_dict_new # update avg stats update_train_values = dict() for key, value in loss_dict.items(): update_train_values['avg_' + key] = value keep_avg.update_values(update_train_values) if c.print_eval: c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values) if args.rank == 0: # Diagnostic visualizations idx = np.random.randint(mel_input.shape[0]) const_spec = postnet_output[idx].data.cpu().numpy() gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [ "Tacotron", "TacotronGST" ] else mel_input[idx].data.cpu().numpy() align_img = alignments[idx].data.cpu().numpy() eval_figures = { "prediction": plot_spectrogram(const_spec, ap), "ground_truth": plot_spectrogram(gt_spec, ap), "alignment": plot_alignment(align_img) } # Sample audio if c.model in ["Tacotron", "TacotronGST"]: eval_audio = ap.inv_spectrogram(const_spec.T) else: eval_audio = ap.inv_melspectrogram(const_spec.T) tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio}, c.audio["sample_rate"]) # Plot Validation Stats if c.bidirectional_decoder or c.double_decoder_consistency: align_b_img = alignments_backward[idx].data.cpu().numpy() eval_figures['alignment2'] = plot_alignment(align_b_img) tb_logger.tb_eval_stats(global_step, keep_avg.avg_values) tb_logger.tb_eval_figures(global_step, eval_figures) if args.rank == 0 and epoch > c.test_delay_epochs: if c.test_sentences_file is None: test_sentences = [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", "Be a voice, not an echo.", "I'm sorry Dave. I'm afraid I can't do that.", "This cake is great. It's so delicious and moist.", "Prior to November 22, 1963." ] else: with open(c.test_sentences_file, "r") as f: test_sentences = [s.strip() for s in f.readlines()] # test sentences test_audios = {} test_figures = {} print(" | > Synthesizing test sentences") speaker_id = 0 if c.use_speaker_embedding else None style_wav = c.get("style_wav_for_test") for idx, test_sentence in enumerate(test_sentences): try: wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis( model, test_sentence, c, use_cuda, ap, speaker_id=speaker_id, style_wav=style_wav, truncated=False, enable_eos_bos_chars=c.enable_eos_bos_chars, #pylint: disable=unused-argument use_griffin_lim=True, do_trim_silence=False) file_path = os.path.join(AUDIO_PATH, str(global_step)) os.makedirs(file_path, exist_ok=True) file_path = os.path.join(file_path, "TestSentence_{}.wav".format(idx)) ap.save_wav(wav, file_path) test_audios['{}-audio'.format(idx)] = wav test_figures['{}-prediction'.format(idx)] = plot_spectrogram( postnet_output, ap) test_figures['{}-alignment'.format(idx)] = plot_alignment( alignment) except: print(" !! Error creating Test Sentence -", idx) traceback.print_exc() tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate']) tb_logger.tb_test_figures(global_step, test_figures) return keep_avg.avg_values