def main():
    """Run the full pipeline: parametrize speech, analyze acoustic parameters, synthesize.

    Side effect: ensures a local ``plots`` directory exists for output figures.
    """
    # makedirs(..., exist_ok=True) replaces the isdir()+mkdir() pair: it is
    # idiomatic (no `== False`) and free of the check-then-create race.
    os.makedirs('plots', exist_ok=True)
    speech_parametrization()
    acoustic_parameters_analysis()
    synthesis()
def tts(model, vocoder_model, C, VC, text, ap, use_cuda, batched_vocoder, figures=False, text_gst=True):
    """Synthesize `text` with `model`; optionally re-vocode the decoder output.

    Args:
        model: trained TTS model (its decoder step cap is raised in place).
        vocoder_model: neural vocoder or None (None -> keep `synthesis` waveform).
        C: TTS config; VC: vocoder config (unused here); ap: audio processor.
        batched_vocoder: forwarded to `vocoder_model.generate`.
        figures: unused in this variant.
        text_gst: forwarded to `synthesis` for text-based GST conditioning.

    Returns:
        (alignment, postnet_output, stop_tokens, waveform)
    """
    t_1 = time.time()
    use_vocoder_model = vocoder_model is not None
    # Raise the decoder step cap so long inputs are not truncated mid-sentence.
    model.decoder.max_decoder_steps = 50000
    # NOTE(review): speaker_id=False looks odd — sibling variants pass None.
    # Confirm `synthesis` treats False as "no speaker".
    waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
        model, text=text, CONFIG=C, use_cuda=use_cuda, ap=ap, speaker_id=False,
        style_wav=None, enable_eos_bos_chars=C.enable_eos_bos_chars,
        text_gst=text_gst)
    if use_vocoder_model:
        # Vocoder path: decoder mel frames, time-major -> batch of one.
        vocoder_input = torch.FloatTensor(decoder_outputs.T).unsqueeze(0)
        waveform = vocoder_model.generate(
            vocoder_input.cuda() if use_cuda else vocoder_input,
            batched=batched_vocoder,
            target=11000,
            overlap=550)
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, postnet_output, stop_tokens, waveform
def tts(model, C, text, ap, use_cuda, text_gst=True, persistent=False):
    """Synthesize `text` and also return the decoder mel frames as a tensor.

    Returns:
        (alignment, postnet_output, stop_tokens, waveform, mels) where `mels`
        is a FloatTensor built from the transposed decoder outputs.
    """
    # Allow very long utterances before the decoder gives up.
    model.decoder.max_decoder_steps = 50000
    outputs = synthesis(model,
                        text=text,
                        CONFIG=C,
                        use_cuda=use_cuda,
                        ap=ap,
                        speaker_id=None,
                        style_wav=None,
                        enable_eos_bos_chars=C.enable_eos_bos_chars,
                        text_gst=text_gst,
                        persistent=persistent)
    waveform, alignment, decoder_frames, postnet_output, stop_tokens = outputs
    # Time-major decoder output -> float tensor for downstream consumers.
    mels = torch.FloatTensor(decoder_frames.T)
    return alignment, postnet_output, stop_tokens, waveform, mels
def tts(model, vocoder_model, C, VC, text, ap, use_cuda, batched_vocoder, figures=False):
    """Synthesize `text`; when a vocoder is given, re-vocode the postnet output.

    Args:
        model: trained TTS model.
        vocoder_model: neural vocoder or None (None -> keep `synthesis` waveform).
        C: TTS config; VC: vocoder config (unused here); ap: audio processor.
        batched_vocoder: forwarded to `vocoder_model.generate`.
        figures: unused in this variant.

    Returns:
        (alignment, postnet_output, stop_tokens, waveform)
    """
    started_at = time.time()
    have_vocoder = vocoder_model is not None
    waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
        model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
    if have_vocoder:
        if C.model == "Tacotron":
            # Tacotron emits linear spectrograms; the vocoder consumes mels.
            postnet_output = ap.out_linear_to_mel(postnet_output.T).T
        spec_batch = torch.FloatTensor(postnet_output.T).unsqueeze(0)
        if use_cuda:
            spec_batch = spec_batch.cuda()
        waveform = vocoder_model.generate(spec_batch,
                                          batched=batched_vocoder,
                                          target=11000,
                                          overlap=550)
    print(" > Run-time: {}".format(time.time() - started_at))
    return alignment, postnet_output, stop_tokens, waveform
def evaluate(model, criterion, criterion_st, ap, current_step, epoch, use_half=False):
    """Run validation: synthesize the configured test sentences and log them.

    NOTE(review): the original data-loader validation loop (loss computation,
    TensorBoard eval figures/audio, loss averaging) is commented out upstream,
    so the avg_* accumulators below stay 0 and the returned avg_postnet_loss
    is always 0. Only test-sentence synthesis is active in this variant.
    """
    # data_loader = setup_loader(is_val=True)
    model.eval()
    epoch_time = 0
    avg_postnet_loss = 0
    avg_decoder_loss = 0
    avg_stop_loss = 0
    print("\n > Validation")
    # Fall back to built-in pinyin test sentences when no sentence file is set.
    if c.test_sentences_file is None:
        test_sentences = [
            "wo3 jin1 tian1 zhen1 de5 shuai1 dao4 bao4 biao3.",
            "zhe4 ge5 mo2 xing2 you3 gou4 nan2 xun4 lian4, wo3 lei4 le5",
        ]
    else:
        with open(c.test_sentences_file, "r") as f:
            test_sentences = [s.strip() for s in f.readlines()]
    # (The disabled `with torch.no_grad():` validation loop over the data
    # loader — batch setup, forward pass, decoder/postnet/stop losses,
    # multi-GPU reduction, eval figures/audio and loss averaging — was
    # commented out here and has been elided for readability.)
    if args.rank == 0 and epoch >= c.test_delay_epochs:
        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap)
                if use_half:
                    # Upcast half-precision outputs so plotting/saving work in full precision.
                    wav, alignment, decoder_output, postnet_output, stop_tokens = wav.astype(
                        np.float), alignment.astype(
                            np.float), decoder_output.astype(
                                np.float), postnet_output.astype(
                                    np.float), stop_tokens.type(torch.float)
                file_path = os.path.join(AUDIO_PATH, str(current_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(
                    alignment)
            except:
                # Best-effort: a failing sentence must not abort validation.
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(current_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(current_step, test_figures)
    return avg_postnet_loss
def evaluate(model, criterion, criterion_gst, criterion_st, ap, global_step, epoch):
    """Run one validation pass and synthesize test sentences (with and without GST).

    Computes decoder/postnet/stop/GST losses over the validation loader, logs
    diagnostics to TensorBoard on rank 0, then synthesizes every test sentence
    twice (text_gst False and True). Returns the average postnet loss.
    """
    data_loader = setup_loader(ap, is_val=True)
    if c.use_speaker_embedding:
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    model.eval()
    epoch_time = 0
    avg_postnet_loss = 0
    avg_decoder_loss = 0
    avg_stop_loss = 0
    avg_gst_loss = 0
    print("\n > Validation")
    if c.test_sentences_file is None:
        test_sentences = [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "Be a voice, not an echo.",
            "It was neither an assault by the Picards nor the Burgundians, nor a hunt led along in procession, nor a revolt of scholars in the town of Laas, nor an entry of our much dread lord, monsieur the king, nor even a pretty hanging of male and female thieves by the courts of Paris .",
            "It was barely two days since the last cavalcade of that nature, that of the Flemish ambassadors charged with concluding the marriage between the dauphin and Marguerite of Flanders ."
        ]
    else:
        with open(c.test_sentences_file, "r") as f:
            test_sentences = [s.strip() for s in f.readlines()]
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()
                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                speaker_names = data[2]
                # Linear target only exists for linear-spectrogram models.
                linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None
                mel_input = data[4]
                mel_lengths = data[5]
                stop_targets = data[6]
                if c.use_speaker_embedding:
                    speaker_ids = [speaker_mapping[speaker_name]
                                   for speaker_name in speaker_names]
                    speaker_ids = torch.LongTensor(speaker_ids)
                else:
                    speaker_ids = None
                # set stop targets view, we predict a single stop token per r frames prediction
                stop_targets = stop_targets.view(text_input.shape[0],
                                                 stop_targets.size(1) // c.r,
                                                 -1)
                stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
                # dispatch data to GPU
                if use_cuda:
                    text_input = text_input.cuda()
                    mel_input = mel_input.cuda()
                    mel_lengths = mel_lengths.cuda()
                    linear_input = linear_input.cuda() if c.model in ["Tacotron", "TacotronGST"] else None
                    stop_targets = stop_targets.cuda()
                    if speaker_ids is not None:
                        speaker_ids = speaker_ids.cuda()
                # forward pass
                decoder_output, postnet_output, alignments, stop_tokens, text_gst =\
                    model.forward(text_input, text_lengths, mel_input,
                                  speaker_ids=speaker_ids)
                # loss computation
                stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
                gst_loss = torch.zeros(1)
                if c.loss_masking:
                    decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
                else:
                    decoder_loss = criterion(decoder_output, mel_input)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input)
                if c.text_gst:
                    # Compare text-predicted GST embedding against the (detached)
                    # reference embedding computed from the ground-truth mel.
                    mel_gst, _ = model.gst(mel_input)
                    gst_loss = criterion_gst(text_gst, mel_gst.squeeze().detach())
                # NOTE(review): gst_loss is printed/averaged but NOT added into
                # `loss` — confirm this is intentional.
                loss = decoder_loss + postnet_loss + stop_loss
                step_time = time.time() - start_time
                epoch_time += step_time
                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
                        "StopLoss: {:.5f} GSTLoss: {:.5f} ".format(loss.item(),
                                                                   postnet_loss.item(),
                                                                   decoder_loss.item(),
                                                                   stop_loss.item(),
                                                                   gst_loss.item()),
                        flush=True)
                # aggregate losses from processes
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    gst_loss = reduce_tensor(gst_loss.data, num_gpus)
                    if c.stopnet:
                        stop_loss = reduce_tensor(stop_loss.data, num_gpus)
                avg_postnet_loss += float(postnet_loss.item())
                avg_decoder_loss += float(decoder_loss.item())
                avg_gst_loss += float(gst_loss.item())
                avg_stop_loss += stop_loss.item()
            if args.rank == 0:
                # Diagnostic visualizations (uses the last validation batch)
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[idx].data.cpu().numpy()
                align_img = alignments[idx].data.cpu().numpy()
                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }
                tb_logger.tb_eval_figures(global_step, eval_figures)
                # Sample audio
                if c.model in ["Tacotron", "TacotronGST"]:
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])
                # compute average losses
                avg_postnet_loss /= (num_iter + 1)
                avg_decoder_loss /= (num_iter + 1)
                avg_stop_loss /= (num_iter + 1)
                avg_gst_loss /= (num_iter + 1)
                # Plot Validation Stats
                epoch_stats = {"loss_postnet": avg_postnet_loss,
                               "loss_decoder": avg_decoder_loss,
                               "stop_loss": avg_stop_loss,
                               "gst_loss": avg_gst_loss}
                tb_logger.tb_eval_stats(global_step, epoch_stats)
    if args.rank == 0 and epoch > c.test_delay_epochs:
        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
        style_wav = c.get("style_wav_for_test")
        # First pass: reference-style conditioning only (text_gst disabled).
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap,
                    speaker_id=speaker_id, style_wav=style_wav,
                    text_gst=False)
                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
            except:
                # Best-effort: a failing sentence must not abort validation.
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
        # Second pass: same sentences, conditioned on the text-predicted GST.
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap,
                    speaker_id=speaker_id, style_wav=style_wav,
                    text_gst=True)
                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_GST_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio-GST'.format(idx)] = wav
                test_figures['{}-prediction-GST'.format(idx)] = plot_spectrogram(postnet_output, ap)
                test_figures['{}-alignment-GST'.format(idx)] = plot_alignment(alignment)
            except:
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
    return avg_postnet_loss
def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
    """Run one validation epoch, log diagnostics, and synthesize test sentences.

    Returns the average postnet loss over the validation set (stays 0 when the
    loader is empty/None).
    """
    data_loader = setup_loader(is_val=True)
    model.eval()
    epoch_time = 0
    avg_postnet_loss = 0
    avg_decoder_loss = 0
    avg_stop_loss = 0
    print("\n > Validation")
    test_sentences = [
        "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        "Be a voice, not an echo.",
        "I'm sorry Dave. I'm afraid I can't do that.",
        "This cake is great. It's so delicious and moist."
    ]
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()
                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                # Linear target only exists for the linear-spectrogram Tacotron.
                linear_input = data[2] if c.model == "Tacotron" else None
                mel_input = data[3]
                mel_lengths = data[4]
                stop_targets = data[5]
                # set stop targets view, we predict a single stop token per r frames prediction
                stop_targets = stop_targets.view(text_input.shape[0],
                                                 stop_targets.size(1) // c.r,
                                                 -1)
                stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
                # dispatch data to GPU
                if use_cuda:
                    text_input = text_input.cuda()
                    mel_input = mel_input.cuda()
                    mel_lengths = mel_lengths.cuda()
                    linear_input = linear_input.cuda() if c.model == "Tacotron" else None
                    stop_targets = stop_targets.cuda()
                # forward pass
                decoder_output, postnet_output, alignments, stop_tokens =\
                    model.forward(text_input, text_lengths, mel_input)
                # loss computation
                stop_loss = criterion_st(stop_tokens, stop_targets)
                decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
                if c.model == "Tacotron":
                    postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
                else:
                    postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
                loss = decoder_loss + postnet_loss + stop_loss
                step_time = time.time() - start_time
                epoch_time += step_time
                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
                        "StopLoss: {:.5f} ".format(loss.item(),
                                                   postnet_loss.item(),
                                                   decoder_loss.item(),
                                                   stop_loss.item()),
                        flush=True)
                # aggregate losses from processes
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    stop_loss = reduce_tensor(stop_loss.data, num_gpus)
                avg_postnet_loss += float(postnet_loss.item())
                avg_decoder_loss += float(decoder_loss.item())
                avg_stop_loss += stop_loss.item()
            if args.rank == 0:
                # Diagnostic visualizations (uses the last validation batch)
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy() if c.model == "Tacotron" else mel_input[idx].data.cpu().numpy()
                align_img = alignments[idx].data.cpu().numpy()
                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }
                tb_logger.tb_eval_figures(current_step, eval_figures)
                # Sample audio
                if c.model == "Tacotron":
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])
                # compute average losses
                avg_postnet_loss /= (num_iter + 1)
                avg_decoder_loss /= (num_iter + 1)
                avg_stop_loss /= (num_iter + 1)
                # Plot Validation Stats
                epoch_stats = {"loss_postnet": avg_postnet_loss,
                               "loss_decoder": avg_decoder_loss,
                               "stop_loss": avg_stop_loss}
                tb_logger.tb_eval_stats(current_step, epoch_stats)
    if args.rank == 0 and epoch > c.test_delay_epochs:
        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap)
                file_path = os.path.join(AUDIO_PATH, str(current_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
            except:
                # Best-effort: keep going with remaining sentences.
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(current_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(current_step, test_figures)
    return avg_postnet_loss
def evaluate(model, criterion, criterion_st, ap, current_step):
    """Run one validation epoch (linear + mel losses), log to TensorBoard,
    then synthesize test sentences with Griffin-Lim.

    Returns the average linear-spectrogram loss (0 when the loader is empty).
    """
    data_loader = setup_loader(is_val=True)
    model.eval()
    epoch_time = 0
    avg_linear_loss = 0
    avg_mel_loss = 0
    avg_stop_loss = 0
    print(" | > Validation")
    test_sentences = [
        "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        "Be a voice, not an echo.",
        "I'm sorry Dave. I'm afraid I can't do that.",
        "This cake is great. It's so delicious and moist."
    ]
    # Number of low-frequency bins (<3 kHz) that get extra weight in the
    # linear loss below.
    n_priority_freq = int(
        3000 / (c.audio['sample_rate'] * 0.5) * c.audio['num_freq'])
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()
                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                linear_input = data[2]
                mel_input = data[3]
                mel_lengths = data[4]
                stop_targets = data[5]
                # set stop targets view, we predict a single stop token per r frames prediction
                stop_targets = stop_targets.view(text_input.shape[0],
                                                 stop_targets.size(1) // c.r,
                                                 -1)
                stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float()
                # dispatch data to GPU
                if use_cuda:
                    text_input = text_input.cuda()
                    mel_input = mel_input.cuda()
                    mel_lengths = mel_lengths.cuda()
                    linear_input = linear_input.cuda()
                    stop_targets = stop_targets.cuda()
                # forward pass
                mel_output, linear_output, alignments, stop_tokens =\
                    model.forward(text_input, mel_input)
                # loss computation
                stop_loss = criterion_st(stop_tokens, stop_targets)
                mel_loss = criterion(mel_output, mel_input, mel_lengths)
                # Half of the linear loss comes from all bins, half from the
                # priority (low-frequency) bins only.
                linear_loss = 0.5 * criterion(linear_output, linear_input, mel_lengths) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_input[:, :, :n_priority_freq],
                                      mel_lengths)
                loss = mel_loss + linear_loss + stop_loss
                step_time = time.time() - start_time
                epoch_time += step_time
                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} LinearLoss: {:.5f} MelLoss:{:.5f} "
                        "StopLoss: {:.5f} ".format(loss.item(),
                                                   linear_loss.item(),
                                                   mel_loss.item(),
                                                   stop_loss.item()),
                        flush=True)
                avg_linear_loss += linear_loss.item()
                avg_mel_loss += mel_loss.item()
                avg_stop_loss += stop_loss.item()
            # Diagnostic visualizations (uses the last validation batch)
            idx = np.random.randint(mel_input.shape[0])
            const_spec = linear_output[idx].data.cpu().numpy()
            gt_spec = linear_input[idx].data.cpu().numpy()
            align_img = alignments[idx].data.cpu().numpy()
            const_spec = plot_spectrogram(const_spec, ap)
            gt_spec = plot_spectrogram(gt_spec, ap)
            align_img = plot_alignment(align_img)
            tb.add_figure('ValVisual/Reconstruction', const_spec, current_step)
            tb.add_figure('ValVisual/GroundTruth', gt_spec, current_step)
            tb.add_figure('ValVisual/ValidationAlignment', align_img,
                          current_step)
            # Sample audio
            audio_signal = linear_output[idx].data.cpu().numpy()
            ap.griffin_lim_iters = 60
            audio_signal = ap.inv_spectrogram(audio_signal.T)
            try:
                tb.add_audio(
                    'ValSampleAudio',
                    audio_signal,
                    current_step,
                    sample_rate=c.audio["sample_rate"])
            except:
                # sometimes audio signal is out of boundaries
                pass
            # compute average losses
            avg_linear_loss /= (num_iter + 1)
            avg_mel_loss /= (num_iter + 1)
            avg_stop_loss /= (num_iter + 1)
            avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss
            # Plot Learning Stats
            tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss,
                          current_step)
            tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss,
                          current_step)
            tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
            tb.add_scalar('ValEpochLoss/Stop_loss', avg_stop_loss,
                          current_step)
    # test sentences
    ap.griffin_lim_iters = 60
    for idx, test_sentence in enumerate(test_sentences):
        try:
            wav, alignment, linear_spec, _, stop_tokens = synthesis(
                model, test_sentence, c, use_cuda, ap)
            file_path = os.path.join(AUDIO_PATH, str(current_step))
            os.makedirs(file_path, exist_ok=True)
            file_path = os.path.join(file_path,
                                     "TestSentence_{}.wav".format(idx))
            ap.save_wav(wav, file_path)
            wav_name = 'TestSentences/{}'.format(idx)
            tb.add_audio(
                wav_name,
                wav,
                current_step,
                sample_rate=c.audio['sample_rate'])
            linear_spec = plot_spectrogram(linear_spec, ap)
            align_img = plot_alignment(alignment)
            tb.add_figure('TestSentences/{}_Spectrogram'.format(idx),
                          linear_spec, current_step)
            tb.add_figure('TestSentences/{}_Alignment'.format(idx), align_img,
                          current_step)
        except:
            # Best-effort: a failing sentence must not abort validation.
            print(" !! Error creating Test Sentence -", idx)
            traceback.print_exc()
            pass
    return avg_linear_loss
def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
    """Run one validation epoch, log diagnostics, and synthesize test sentences.

    Handles optional loss masking and an optional stopnet; returns the average
    postnet loss (0 when the loader is empty/None).
    """
    data_loader = setup_loader(is_val=True)
    model.eval()
    epoch_time = 0
    avg_postnet_loss = 0
    avg_decoder_loss = 0
    avg_stop_loss = 0
    print("\n > Validation")
    # Default (Turkish) test sentences unless a sentence file is configured.
    if c.test_sentences_file is None:
        test_sentences = [
            "Evinizde çocuklar televizyonun karşısına dizilmiş oturuyorlar.",
            "Karşınızda reklamlara çıkan çocukların elinde çikulatalar, püskevitler, birbirlerine ikram ediyorlar, birbirleriyle yiyorlar, şakalaşıyorlar.",
            "O çocuk aklından geçiriyor 'benim de bir çikulatam olsa, benim de bir püskevitim olsa' diyor.",
            "Anne bana niye almıyorsunuz diyor, bizde niye yok diyor."
        ]
    else:
        with open(c.test_sentences_file, "r") as f:
            test_sentences = [s.strip() for s in f.readlines()]
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()
                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                # Linear target only exists for the linear-spectrogram Tacotron.
                linear_input = data[2] if c.model == "Tacotron" else None
                mel_input = data[3]
                mel_lengths = data[4]
                stop_targets = data[5]
                # set stop targets view, we predict a single stop token per r frames prediction
                stop_targets = stop_targets.view(text_input.shape[0],
                                                 stop_targets.size(1) // c.r,
                                                 -1)
                stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
                # dispatch data to GPU
                if use_cuda:
                    text_input = text_input.cuda()
                    mel_input = mel_input.cuda()
                    mel_lengths = mel_lengths.cuda()
                    linear_input = linear_input.cuda(
                    ) if c.model == "Tacotron" else None
                    stop_targets = stop_targets.cuda()
                # forward pass
                decoder_output, postnet_output, alignments, stop_tokens =\
                    model.forward(text_input, text_lengths, mel_input)
                # loss computation
                stop_loss = criterion_st(
                    stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
                if c.loss_masking:
                    decoder_loss = criterion(decoder_output, mel_input,
                                             mel_lengths)
                    if c.model == "Tacotron":
                        postnet_loss = criterion(postnet_output, linear_input,
                                                 mel_lengths)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input,
                                                 mel_lengths)
                else:
                    decoder_loss = criterion(decoder_output, mel_input)
                    if c.model == "Tacotron":
                        postnet_loss = criterion(postnet_output, linear_input)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input)
                loss = decoder_loss + postnet_loss + stop_loss
                step_time = time.time() - start_time
                epoch_time += step_time
                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
                        "StopLoss: {:.5f} ".format(loss.item(),
                                                   postnet_loss.item(),
                                                   decoder_loss.item(),
                                                   stop_loss.item()),
                        flush=True)
                # aggregate losses from processes
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    if c.stopnet:
                        stop_loss = reduce_tensor(stop_loss.data, num_gpus)
                avg_postnet_loss += float(postnet_loss.item())
                avg_decoder_loss += float(decoder_loss.item())
                avg_stop_loss += stop_loss.item()
            if args.rank == 0:
                # Diagnostic visualizations (uses the last validation batch)
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy(
                ) if c.model == "Tacotron" else mel_input[idx].data.cpu(
                ).numpy()
                align_img = alignments[idx].data.cpu().numpy()
                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }
                tb_logger.tb_eval_figures(current_step, eval_figures)
                # Sample audio
                if c.model == "Tacotron":
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])
                # compute average losses
                avg_postnet_loss /= (num_iter + 1)
                avg_decoder_loss /= (num_iter + 1)
                avg_stop_loss /= (num_iter + 1)
                # Plot Validation Stats
                epoch_stats = {
                    "loss_postnet": avg_postnet_loss,
                    "loss_decoder": avg_decoder_loss,
                    "stop_loss": avg_stop_loss
                }
                tb_logger.tb_eval_stats(current_step, epoch_stats)
    if args.rank == 0 and epoch > c.test_delay_epochs:
        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap)
                file_path = os.path.join(AUDIO_PATH, str(current_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(
                    alignment)
            except:
                # Best-effort: keep going with remaining sentences.
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(current_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(current_step, test_figures)
    return avg_postnet_loss
def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
    """Run one validation epoch with optional speaker embeddings, log
    diagnostics, and synthesize test sentences.

    Returns the average postnet loss (0 when the loader is empty/None).
    """
    data_loader = setup_loader(ap, is_val=True)
    if c.use_speaker_embedding:
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    model.eval()
    epoch_time = 0
    avg_postnet_loss = 0
    avg_decoder_loss = 0
    avg_stop_loss = 0
    print("\n > Validation")
    if c.test_sentences_file is None:
        test_sentences = [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "Be a voice, not an echo.",
            "I'm sorry Dave. I'm afraid I can't do that.",
            "This cake is great. It's so delicious and moist."
        ]
    else:
        with open(c.test_sentences_file, "r") as f:
            test_sentences = [s.strip() for s in f.readlines()]
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()
                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                speaker_names = data[2]
                # Linear target only exists for linear-spectrogram models.
                linear_input = data[3] if c.model in [
                    "Tacotron", "TacotronGST"
                ] else None
                mel_input = data[4]
                mel_lengths = data[5]
                stop_targets = data[6]
                if c.use_speaker_embedding:
                    speaker_ids = [
                        speaker_mapping[speaker_name]
                        for speaker_name in speaker_names
                    ]
                    speaker_ids = torch.LongTensor(speaker_ids)
                else:
                    speaker_ids = None
                # set stop targets view, we predict a single stop token per r frames prediction
                stop_targets = stop_targets.view(text_input.shape[0],
                                                 stop_targets.size(1) // c.r,
                                                 -1)
                stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
                # dispatch data to GPU
                if use_cuda:
                    text_input = text_input.cuda()
                    mel_input = mel_input.cuda()
                    mel_lengths = mel_lengths.cuda()
                    linear_input = linear_input.cuda() if c.model in [
                        "Tacotron", "TacotronGST"
                    ] else None
                    stop_targets = stop_targets.cuda()
                    if speaker_ids is not None:
                        speaker_ids = speaker_ids.cuda()
                # forward pass
                decoder_output, postnet_output, alignments, stop_tokens =\
                    model.forward(text_input, text_lengths, mel_input,
                                  speaker_ids=speaker_ids)
                # loss computation
                stop_loss = criterion_st(
                    stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
                if c.loss_masking:
                    decoder_loss = criterion(decoder_output, mel_input,
                                             mel_lengths)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input,
                                                 mel_lengths)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input,
                                                 mel_lengths)
                else:
                    decoder_loss = criterion(decoder_output, mel_input)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input)
                loss = decoder_loss + postnet_loss + stop_loss
                step_time = time.time() - start_time
                epoch_time += step_time
                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
                        "StopLoss: {:.5f} ".format(loss.item(),
                                                   postnet_loss.item(),
                                                   decoder_loss.item(),
                                                   stop_loss.item()),
                        flush=True)
                # aggregate losses from processes
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    if c.stopnet:
                        stop_loss = reduce_tensor(stop_loss.data, num_gpus)
                avg_postnet_loss += float(postnet_loss.item())
                avg_decoder_loss += float(decoder_loss.item())
                avg_stop_loss += stop_loss.item()
            if args.rank == 0:
                # Diagnostic visualizations (uses the last validation batch)
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
                    "Tacotron", "TacotronGST"
                ] else mel_input[idx].data.cpu().numpy()
                align_img = alignments[idx].data.cpu().numpy()
                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }
                tb_logger.tb_eval_figures(current_step, eval_figures)
                # Sample audio
                if c.model in ["Tacotron", "TacotronGST"]:
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                tb_logger.tb_eval_audios(current_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])
                # compute average losses
                avg_postnet_loss /= (num_iter + 1)
                avg_decoder_loss /= (num_iter + 1)
                avg_stop_loss /= (num_iter + 1)
                # Plot Validation Stats
                epoch_stats = {
                    "loss_postnet": avg_postnet_loss,
                    "loss_decoder": avg_decoder_loss,
                    "stop_loss": avg_stop_loss
                }
                tb_logger.tb_eval_stats(current_step, epoch_stats)
    if args.rank == 0 and epoch > c.test_delay_epochs:
        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        # Multi-speaker models always synthesize test sentences as speaker 0.
        speaker_id = 0 if c.use_speaker_embedding else None
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model, test_sentence, c, use_cuda, ap,
                    speaker_id=speaker_id)
                file_path = os.path.join(AUDIO_PATH, str(current_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(
                    alignment)
            except:
                # Best-effort: keep going with remaining sentences.
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(current_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(current_step, test_figures)
    return avg_postnet_loss
def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
    """Run one validation pass and log diagnostics to TensorBoard.

    Iterates the validation data loader with gradients disabled, computing
    decoder/postnet/stopnet losses (and, when ``c.bidirectional_decoder`` is
    set, backward-decoder and decoder-consistency losses). On rank 0 it logs
    spectrogram/alignment figures and sample audio for the last batch, and
    (currently only when ``epoch == -1`` — see NOTE below) synthesizes the
    configured test sentences.

    Args:
        model: TTS model; called as ``model(text, lengths, mel, speaker_ids=...,
            ref_cond=...)``. With ``ref_cond=False`` it returns three fewer
            outputs (no ``mu``/``logvar``/``z``).
        criterion: spectrogram loss; takes ``(output, target[, lengths])``
            depending on ``c.loss_masking``.
        criterion_st: stopnet loss, used only when ``c.stopnet`` is enabled.
        ap: audio processor used for spectrogram inversion and saving wavs.
        global_step (int): global training step, used as the logging step.
        epoch (int): current epoch index.

    Returns:
        float: running average of the postnet loss over the validation set.

    Relies on module-level globals: ``c`` (config), ``args``, ``num_gpus``,
    ``tb_logger``, ``AUDIO_PATH``, ``OUT_PATH``, ``use_cuda`` and helpers
    (``setup_loader``, ``format_data``, ``synthesis``, ...).
    """
    data_loader = setup_loader(ap, model.decoder.r, is_val=True)
    if c.use_speaker_embedding:
        # NOTE(review): `speaker_mapping` is loaded but never used in this
        # function; kept to preserve the original behavior (file read).
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    model.eval()
    epoch_time = 0
    eval_values_dict = {
        'avg_postnet_loss': 0,
        'avg_decoder_loss': 0,
        'avg_stop_loss': 0,
        'avg_align_score': 0
    }
    if c.bidirectional_decoder:
        eval_values_dict['avg_decoder_b_loss'] = 0  # decoder backward loss
        eval_values_dict['avg_decoder_c_loss'] = 0  # decoder consistency loss
    keep_avg = KeepAverage()
    keep_avg.add_values(eval_values_dict)
    print("\n > Validation")
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()

                # format data
                text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(
                    data)
                assert mel_input.shape[1] % model.decoder.r == 0

                # forward pass model; ref_cond=False yields 3 fewer outputs
                # (no mu/logvar/z) than ref_cond=True.
                if c.bidirectional_decoder:
                    decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward, mu, logvar, z = model(
                        text_input,
                        text_lengths,
                        mel_input,
                        speaker_ids=speaker_ids,
                        ref_cond=True)
                    _, postnet_output_noRef, _, _, _, _ = model(
                        text_input,
                        text_lengths,
                        mel_input,
                        speaker_ids=speaker_ids,
                        ref_cond=False)
                else:
                    decoder_output, postnet_output, alignments, stop_tokens, mu, logvar, z = model(
                        text_input,
                        text_lengths,
                        mel_input,
                        speaker_ids=speaker_ids,
                        ref_cond=True)
                    _, postnet_output_noRef, _, _ = model(
                        text_input,
                        text_lengths,
                        mel_input,
                        speaker_ids=speaker_ids,
                        ref_cond=False)

                # loss computation
                stop_loss = criterion_st(
                    stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
                if c.loss_masking:
                    decoder_loss = criterion(decoder_output, mel_input,
                                             mel_lengths)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        # Tacotron's postnet predicts linear spectrograms.
                        postnet_loss = criterion(postnet_output, linear_input,
                                                 mel_lengths)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input,
                                                 mel_lengths)
                else:
                    decoder_loss = criterion(decoder_output, mel_input)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input)
                loss = decoder_loss + postnet_loss + stop_loss

                # backward decoder loss: the backward decoder emits frames in
                # reverse time order, so flip it before comparing.
                if c.bidirectional_decoder:
                    if c.loss_masking:
                        decoder_backward_loss = criterion(
                            torch.flip(decoder_backward_output, dims=(1, )),
                            mel_input, mel_lengths)
                    else:
                        decoder_backward_loss = criterion(
                            torch.flip(decoder_backward_output, dims=(1, )),
                            mel_input)
                    decoder_c_loss = torch.nn.functional.l1_loss(
                        torch.flip(decoder_backward_output, dims=(1, )),
                        decoder_output)
                    loss += decoder_backward_loss + decoder_c_loss
                    keep_avg.update_values({
                        'avg_decoder_b_loss':
                        decoder_backward_loss.item(),
                        'avg_decoder_c_loss':
                        decoder_c_loss.item()
                    })

                step_time = time.time() - start_time
                epoch_time += step_time

                # compute alignment score
                align_score = alignment_diagonal_score(alignments)
                keep_avg.update_value('avg_align_score', align_score)

                # aggregate losses from processes (distributed training)
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    if c.stopnet:
                        stop_loss = reduce_tensor(stop_loss.data, num_gpus)

                keep_avg.update_values({
                    'avg_postnet_loss':
                    float(postnet_loss.item()),
                    'avg_decoder_loss':
                    float(decoder_loss.item()),
                    'avg_stop_loss':
                    float(stop_loss.item()),
                })

                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
                        "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}"
                        .format(loss.item(), postnet_loss.item(),
                                keep_avg['avg_postnet_loss'],
                                decoder_loss.item(),
                                keep_avg['avg_decoder_loss'],
                                stop_loss.item(), keep_avg['avg_stop_loss'],
                                align_score, keep_avg['avg_align_score']),
                        flush=True)

            if args.rank == 0:
                # Diagnostic visualizations for a random sample of the last
                # validation batch.
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                const_spec_noRef = postnet_output_noRef[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
                    "Tacotron", "TacotronGST"
                ] else mel_input[idx].data.cpu().numpy()
                align_img = alignments[idx].data.cpu().numpy()

                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "prediction_noRef": plot_spectrogram(const_spec_noRef, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }

                # Sample audio: Tacotron predicts linear spectrograms, other
                # models predict mels, so pick the matching inversion.
                if c.model in ["Tacotron", "TacotronGST"]:
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                    eval_audio_noRef = ap.inv_spectrogram(const_spec_noRef.T)
                    ground_truth_audio = ap.inv_spectrogram(gt_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                    eval_audio_noRef = ap.inv_mel_spectrogram(
                        const_spec_noRef.T)
                    ground_truth_audio = ap.inv_mel_spectrogram(gt_spec.T)
                tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])
                tb_logger.tb_eval_audios(global_step,
                                         {"ValAudioNoRef": eval_audio_noRef},
                                         c.audio["sample_rate"])
                tb_logger.tb_eval_audios(global_step,
                                         {"RefAudio": ground_truth_audio},
                                         c.audio["sample_rate"])

                # Plot Validation Stats
                epoch_stats = {
                    "loss_postnet": keep_avg['avg_postnet_loss'],
                    "loss_decoder": keep_avg['avg_decoder_loss'],
                    "stop_loss": keep_avg['avg_stop_loss'],
                    "alignment_score": keep_avg['avg_align_score']
                }
                if c.bidirectional_decoder:
                    epoch_stats['loss_decoder_backward'] = keep_avg[
                        'avg_decoder_b_loss']
                    align_b_img = alignments_backward[idx].data.cpu().numpy()
                    eval_figures['alignment_backward'] = plot_alignment(
                        align_b_img)
                tb_logger.tb_eval_stats(global_step, epoch_stats)
                tb_logger.tb_eval_figures(global_step, eval_figures)

    # NOTE(review): `epoch == -1` disables test-sentence synthesis in normal
    # runs; the commented condition suggests `epoch >= c.test_delay_epochs`
    # was intended. Left unchanged to preserve current behavior — confirm.
    if args.rank == 0 and epoch == -1:  # >= c.test_delay_epochs:
        if c.test_sentences_file is None:
            test_sentences = [
                "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                "Be a voice, not an echo.",
                "I'm sorry Dave. I'm afraid I can't do that.",
                "This cake is great. It's so delicious and moist."
            ]
        else:
            with open(c.test_sentences_file, "r") as f:
                test_sentences = [s.strip() for s in f.readlines()]

        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            # Best-effort: a failure on one sentence is logged and the
            # remaining sentences are still synthesized.
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model,
                    test_sentence,
                    c,
                    use_cuda,
                    ap,
                    speaker_id=speaker_id,
                    style_wav=style_wav)
                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(
                    alignment)
            except Exception:  # was a bare `except:`; don't swallow SystemExit/KeyboardInterrupt
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
    return keep_avg['avg_postnet_loss']