def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True, filename="example.wav"):
    # relies on module-level globals: runcounter, wavernn, batched_wavernn, OUT_FOLDER
    global runcounter
    t_1 = time.time()
    # submatch = re.sub(r'\s+',' ',text)
    # filenamematch = re.search( r'([^\s]+\s?\d+)', submatch)
    # if filenamematch:
    #     filename = filenamematch.group(0) + '_' + str(runcounter) + '.wav'
    # else:
    #     filename = 'tempout_' + str(runcounter) + '.wav'
    runcounter += 1
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, truncated=False)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        waveform = wavernn.generate(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(),
            batched=batched_wavernn, target=11000, overlap=550)
    print(" > Run-time: {}".format(time.time() - t_1))
    os.makedirs(OUT_FOLDER, exist_ok=True)
    out_path = os.path.join(OUT_FOLDER, filename)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform
def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True, counter=0):
    # speaker_id added to the signature: the original referenced it without defining it;
    # vocoder_model and ap_vocoder are module-level globals here
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    mel_postnet_spec = ap._denormalize(mel_postnet_spec)
    print(mel_postnet_spec.shape)
    print("max- ", mel_postnet_spec.max(), " -- min- ", mel_postnet_spec.min())
    if not use_gl:
        waveform = vocoder_model.inference(
            torch.FloatTensor(ap_vocoder._normalize(mel_postnet_spec).T).unsqueeze(0),
            hop_size=ap_vocoder.hop_length)
        if use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
    print(waveform.shape)
    # print(" > Run-time: {}".format(time.time() - t_1))
    if figures:
        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length,
                  CONFIG, ap._denormalize(mel_spec))
    os.makedirs('configuration/voice/result', exist_ok=True)
    file_name = "part" + str(counter) + ".wav"
    out_path = os.path.join('configuration/voice/result/', file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform
def tts(text, model, vocoder_model, speaker_id, CONFIG, use_cuda, ap, use_gl, figures=True):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)
    # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)
    if not use_gl:
        waveform = vocoder_model.inference(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
        if use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(time.time() - t_1))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    return alignment, mel_postnet_spec, stop_tokens, waveform
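A minimal usage sketch for the variant above. Everything here is assumed to be prepared elsewhere (a loaded Tacotron-style model, a trained neural vocoder, the config object, and an AudioProcessor-style ap), so the names are placeholders rather than a fixed API.

import torch

use_cuda = torch.cuda.is_available()
# model, vocoder_model, CONFIG and ap are assumed restored from checkpoints;
# speaker_id=None selects the single-speaker path.
alignment, mel, stop_tokens, wav = tts(
    "Hello world.", model, vocoder_model, None, CONFIG, use_cuda, ap,
    use_gl=False)  # use_gl=True would fall back to Griffin-Lim instead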
def tts(model, vocoder_model, C, VC, text, ap, ap_vocoder, use_cuda,
        batched_vocoder, speaker_id=None, figures=False):
    t_1 = time.time()
    use_vocoder_model = vocoder_model is not None
    waveform, alignment, _, postnet_output, stop_tokens = synthesis(
        model, text, C, use_cuda, ap, speaker_id, False,
        C.enable_eos_bos_chars)
    if C.model == "Tacotron" and use_vocoder_model:
        postnet_output = ap.out_linear_to_mel(postnet_output.T).T
    # correct if there is a scale difference b/w two models
    postnet_output = ap._denormalize(postnet_output)
    postnet_output = ap_vocoder._normalize(postnet_output)
    if use_vocoder_model:
        vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
        waveform = vocoder_model.generate(
            vocoder_input.cuda() if use_cuda else vocoder_input,
            batched=batched_vocoder,
            target=8000,
            overlap=400)
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, postnet_output, stop_tokens, waveform
def tts(self, model, text, CONFIG, use_cuda, ap, OUT_FILE):
    import numpy as np
    waveform, alignment, spectrogram, mel_spectrogram, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap)
    ap.save_wav(waveform, OUT_FILE)
    wav_norm = waveform * (32767 / max(0.01, np.max(np.abs(waveform))))
    return alignment, spectrogram, stop_tokens, wav_norm
def tts(self, text, interactive=False, printable=False):
    figures = True
    t_1 = time.time()
    tmodel = copy.deepcopy(self.model)
    #tvoc = copy.deepcopy(self.vocoder_model)
    enable_chars = self.tts_config.enable_eos_bos_chars
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        tmodel, text, self.tts_config, self.use_cuda, self.ap, self.speaker_id,
        style_wav=None, truncated=False, enable_eos_bos_chars=enable_chars)
    # mel_postnet_spec = ap._denormalize(mel_postnet_spec.T)
    del tmodel
    gc.collect(2)
    if not self.use_gl:
        waveform = self.vocoder_model.inference(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
        if self.use_cuda:
            waveform = waveform.cpu()
        # always convert to numpy; the original skipped this on the CUDA path
        waveform = waveform.numpy()
    #del tvoc
    if printable:
        rtf = (time.time() - t_1) / (len(waveform) / self.ap.sample_rate)
        tps = (time.time() - t_1) / len(waveform)
        usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print(waveform.shape)
        print(" > Run-time: {}".format(time.time() - t_1))
        print(" > Memory Used: {} MB".format(math.floor(usage / 1024)))
        print(" > Real-time factor: {}".format(rtf))
        print(" > Time per step: {}".format(tps))
    if interactive:
        IPython.display.display(
            IPython.display.Audio(waveform, rate=self.sample_rate))
    gc.collect(2)
    return alignment, mel_postnet_spec, stop_tokens, waveform
def tts(model, vocoder_model, C, VC, text, ap, ap_vocoder, use_cuda,
        batched_vocoder, speaker_id=None, style_input=None, figures=False):
    use_vocoder_model = vocoder_model is not None
    waveform, alignment, _, postnet_output, stop_tokens, _ = synthesis(
        model, text, C, use_cuda, ap, speaker_id, style_input=style_input,
        truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)
    if C.model == "Tacotron" and use_vocoder_model:
        postnet_output = ap.out_linear_to_mel(postnet_output.T).T
    # correct if there is a scale difference b/w two models
    if use_vocoder_model:
        vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
        waveform = vocoder_model.inference(vocoder_input)
        if use_cuda:
            waveform = waveform.cpu()
        #waveform = waveform.detach().numpy()
        waveform = waveform.numpy()
        waveform = waveform.flatten()
    # if use_vocoder_model:
    #     postnet_output = ap._denormalize(postnet_output)
    #     postnet_output = ap_vocoder._normalize(postnet_output)
    #     vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
    #     waveform = vocoder_model.generate(
    #         vocoder_input.cuda() if use_cuda else vocoder_input,
    #         batched=batched_vocoder,
    #         target=8000,
    #         overlap=400)
    return alignment, postnet_output, stop_tokens, waveform
def tts(self, text, use_gl=False):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        self.model, text, self.TTS_CONFIG, self.use_cuda, self.ap,
        self.speaker_id, style_wav=None, truncated=False,
        enable_eos_bos_chars=self.TTS_CONFIG.enable_eos_bos_chars)
    # mel_postnet_spec = self.ap._denormalize(mel_postnet_spec.T)
    if not use_gl:
        waveform = self.vocoder_model.inference(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
        if self.use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
    rtf = (time.time() - t_1) / (len(waveform) / self.ap.sample_rate)
    tps = (time.time() - t_1) / len(waveform)
    if self.verbose:
        print(" > Run-time: {}".format(time.time() - t_1))
        print(" > Real-time factor: {}".format(rtf))
        print(" > Time per step: {}".format(tps))
    return alignment, mel_postnet_spec, stop_tokens, waveform
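The real-time factor and time-per-step printed above follow directly from wall-clock time and output length; a small self-contained check with assumed numbers:

elapsed = 2.0          # assumed seconds spent in synthesis
n_samples = 88200      # assumed number of generated samples
sample_rate = 22050    # samples per second
rtf = elapsed / (n_samples / sample_rate)  # 2.0 / 4.0 = 0.5, i.e. twice as fast as real time
tps = elapsed / n_samples                  # ~2.27e-05 s (about 23 microseconds) per sample
print(rtf, tps)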
def tts(model, text, CONFIG, use_cuda, ap, use_gl):
    # NOTE: speaker_id and vocoder_model are expected as module-level globals here
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, enable_eos_bos_chars=CONFIG.enable_eos_bos_chars)
    if not use_gl:
        waveform = vocoder_model.inference(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
        waveform = waveform.flatten()
        if use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
    return waveform
def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):
    # NOTE: speaker_id is expected as a module-level global here
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, style_wav=None,
        truncated=False, use_griffin_lim=True,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars, do_trim_silence=False)
    OUT_FOLDER = "/content/output"  # path where the audio files will be saved
    os.makedirs(OUT_FOLDER, exist_ok=True)
    file_name = text.replace(" ", "_").replace(".", "") + ".wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform
def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None):
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        truncated=True,
        enable_eos_bos_chars=CONFIG.enable_eos_bos_chars,
    )
    if CONFIG.model == "Tacotron" and not use_gl:
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
    if not use_gl:
        waveform = wavernn.generate(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(),
            batched=batched_wavernn,
            target=11000,
            overlap=550,
        )
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, mel_postnet_spec, stop_tokens, waveform
def tts(model, vocoder_model, C, VC, text, ap, ap_vocoder, use_cuda,
        batched_vocoder, speaker_id=None, style_wav=None, figures=False,
        target=8000, overlap=400):
    t_1 = time.time()
    use_vocoder_model = vocoder_model is not None
    waveform, alignment, _, postnet_output, stop_tokens = synthesis(
        model, text, C, use_cuda, ap, speaker_id, style_wav=style_wav,
        truncated=False, enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=(not use_vocoder_model), do_trim_silence=True)
    if C.model == "Tacotron" and use_vocoder_model:
        postnet_output = ap.out_linear_to_mel(postnet_output.T).T
    # correct if there is a scale difference b/w two models
    if batched_vocoder:
        print('using batched vocoder and target: ', target, ' and overlap: ', overlap)
    if use_vocoder_model:
        #postnet_output = ap._denormalize(postnet_output)
        #postnet_output = ap_vocoder._normalize(postnet_output)
        vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
        waveform = vocoder_model.generate(
            vocoder_input.cuda() if use_cuda else vocoder_input,
            batched=batched_vocoder,
            target=target,    # was hardcoded to 8000, ignoring the parameter
            overlap=overlap)  # was hardcoded to 400, ignoring the parameter
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, postnet_output, stop_tokens, waveform
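For the batched vocoder path above, larger `target` values mean fewer, longer segments per generation batch, while `overlap` controls the cross-faded region between segments. A thin, illustrative wrapper; the loaded model, vocoder, and config objects are assumed to exist, and the 11000/550 pair simply mirrors values used by other variants in this collection:

import torch

def synthesize_batched(model, vocoder_model, C, VC, ap, ap_vocoder, text,
                       target=11000, overlap=550):
    # trade throughput (target) against seam quality (overlap)
    return tts(model, vocoder_model, C, VC, text, ap, ap_vocoder,
               use_cuda=torch.cuda.is_available(), batched_vocoder=True,
               target=target, overlap=overlap)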
def tts(model, raw_text, CONFIG, use_cuda, ap, use_gl, figures=False, use_pinyin=False):
    # NOTE: style, speaker_id, ap_vocoder, wavernn, batched_wavernn and
    # OUT_FOLDER are expected as module-level globals here
    if use_pinyin:
        text = " ".join(lazy_pinyin(raw_text, style=style))
    else:
        text = raw_text
    t_1 = time.time()
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, speaker_id, None, False)
    if CONFIG.model == "Tacotron" and not use_gl:
        # correct the normalization differences b/w TTS and the vocoder.
        mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
        mel_postnet_spec = ap._denormalize(mel_postnet_spec)
    if not use_gl:
        mel_postnet_spec = ap_vocoder._normalize(mel_postnet_spec)
        waveform = wavernn.generate(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).cuda(),
            batched=batched_wavernn, target=8000, overlap=400)
    print(" > Run-time: {}".format(time.time() - t_1))
    if figures:
        visualize(alignment, mel_postnet_spec, stop_tokens, raw_text,
                  ap.hop_length, CONFIG, mel_spec)
    # IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
    os.makedirs(OUT_FOLDER, exist_ok=True)
    file_name = raw_text.replace(" ", "_").replace(".", "") + f"-{speaker_id}.wav"
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform
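A quick illustration of what the pinyin branch above feeds the model, using pypinyin directly; the snippet's `style` global is assumed to be a pypinyin style constant such as Style.TONE3:

from pypinyin import Style, lazy_pinyin

# convert Chinese characters to tone-numbered pinyin tokens before synthesis
print(" ".join(lazy_pinyin("你好世界", style=Style.TONE3)))  # -> ni3 hao3 shi4 jie4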
def tts(model, vocoder_model, C, VC, text, ap, use_cuda, batched_vocoder, figures=False):
    t_1 = time.time()
    use_vocoder_model = vocoder_model is not None
    waveform, alignment, decoder_outputs, postnet_output, stop_tokens = synthesis(
        model, text, C, use_cuda, ap, False, C.enable_eos_bos_chars)
    if C.model == "Tacotron" and use_vocoder_model:
        postnet_output = ap.out_linear_to_mel(postnet_output.T).T
    if use_vocoder_model:
        vocoder_input = torch.FloatTensor(postnet_output.T).unsqueeze(0)
        waveform = vocoder_model.generate(
            vocoder_input.cuda() if use_cuda else vocoder_input,
            batched=batched_vocoder,
            target=11000,
            overlap=550)
    print(" > Run-time: {}".format(time.time() - t_1))
    return alignment, postnet_output, stop_tokens, waveform
def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
    data_loader = setup_loader(ap, is_val=True)
    if c.use_speaker_embedding:
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    model.eval()
    epoch_time = 0
    avg_postnet_loss = 0
    avg_decoder_loss = 0
    avg_stop_loss = 0
    print("\n > Validation")
    if c.test_sentences_file is None:
        test_sentences = [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "Be a voice, not an echo.",
            "I'm sorry Dave. I'm afraid I can't do that.",
            "This cake is great. It's so delicious and moist."
        ]
    else:
        with open(c.test_sentences_file, "r") as f:
            test_sentences = [s.strip() for s in f.readlines()]
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()

                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                speaker_names = data[2]
                linear_input = data[3] if c.model in ["Tacotron", "TacotronGST"] else None
                mel_input = data[4]
                mel_lengths = data[5]
                stop_targets = data[6]

                if c.use_speaker_embedding:
                    speaker_ids = [
                        speaker_mapping[speaker_name]
                        for speaker_name in speaker_names
                    ]
                    speaker_ids = torch.LongTensor(speaker_ids)
                else:
                    speaker_ids = None

                # set stop targets view, we predict a single stop token per r frames prediction
                stop_targets = stop_targets.view(text_input.shape[0],
                                                 stop_targets.size(1) // c.r, -1)
                stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)

                # dispatch data to GPU
                if use_cuda:
                    text_input = text_input.cuda()
                    mel_input = mel_input.cuda()
                    mel_lengths = mel_lengths.cuda()
                    linear_input = linear_input.cuda() if c.model in [
                        "Tacotron", "TacotronGST"
                    ] else None
                    stop_targets = stop_targets.cuda()
                    if speaker_ids is not None:
                        speaker_ids = speaker_ids.cuda()

                # forward pass
                decoder_output, postnet_output, alignments, stop_tokens =\
                    model.forward(text_input, text_lengths, mel_input,
                                  speaker_ids=speaker_ids)

                # loss computation
                stop_loss = criterion_st(
                    stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
                if c.loss_masking:
                    decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
                else:
                    decoder_loss = criterion(decoder_output, mel_input)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input)
                loss = decoder_loss + postnet_loss + stop_loss

                step_time = time.time() - start_time
                epoch_time += step_time

                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} DecoderLoss:{:.5f} "
                        "StopLoss: {:.5f} ".format(loss.item(),
                                                   postnet_loss.item(),
                                                   decoder_loss.item(),
                                                   stop_loss.item()),
                        flush=True)

                # aggregate losses from processes
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    if c.stopnet:
                        stop_loss = reduce_tensor(stop_loss.data, num_gpus)

                avg_postnet_loss += float(postnet_loss.item())
                avg_decoder_loss += float(decoder_loss.item())
                avg_stop_loss += stop_loss.item()

            if args.rank == 0:
                # Diagnostic visualizations
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
                    "Tacotron", "TacotronGST"
                ] else mel_input[idx].data.cpu().numpy()
                align_img = alignments[idx].data.cpu().numpy()

                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }
                tb_logger.tb_eval_figures(global_step, eval_figures)

                # Sample audio
                if c.model in ["Tacotron", "TacotronGST"]:
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])

            # compute average losses
            avg_postnet_loss /= (num_iter + 1)
            avg_decoder_loss /= (num_iter + 1)
            avg_stop_loss /= (num_iter + 1)

            # Plot Validation Stats
            epoch_stats = {
                "loss_postnet": avg_postnet_loss,
                "loss_decoder": avg_decoder_loss,
                "stop_loss": avg_stop_loss
            }
            tb_logger.tb_eval_stats(global_step, epoch_stats)

    if args.rank == 0 and epoch > c.test_delay_epochs:
        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model,
                    test_sentence,
                    c,
                    use_cuda,
                    ap,
                    speaker_id=speaker_id,
                    style_wav=style_wav)
                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
            except:
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios,
                                 c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
    return avg_postnet_loss
def evaluate(model, criterion, criterion_st, ap, global_step, epoch):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True)
    if c.use_speaker_embedding:
        speaker_mapping = load_speaker_mapping(OUT_PATH)
    model.eval()
    epoch_time = 0
    eval_values_dict = {
        'avg_postnet_loss': 0,
        'avg_decoder_loss': 0,
        'avg_stop_loss': 0,
        'avg_align_score': 0
    }
    if c.bidirectional_decoder:
        eval_values_dict['avg_decoder_b_loss'] = 0  # decoder backward loss
        eval_values_dict['avg_decoder_c_loss'] = 0  # decoder consistency loss
    keep_avg = KeepAverage()
    keep_avg.add_values(eval_values_dict)
    print("\n > Validation")
    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                start_time = time.time()

                # format data
                text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
                assert mel_input.shape[1] % model.decoder.r == 0

                # forward pass model
                if c.bidirectional_decoder:
                    decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                        text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                else:
                    decoder_output, postnet_output, alignments, stop_tokens = model(
                        text_input, text_lengths, mel_input, speaker_ids=speaker_ids)

                # loss computation
                stop_loss = criterion_st(
                    stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
                if c.loss_masking:
                    decoder_loss = criterion(decoder_output, mel_input, mel_lengths)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input, mel_lengths)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input, mel_lengths)
                else:
                    decoder_loss = criterion(decoder_output, mel_input)
                    if c.model in ["Tacotron", "TacotronGST"]:
                        postnet_loss = criterion(postnet_output, linear_input)
                    else:
                        postnet_loss = criterion(postnet_output, mel_input)
                loss = decoder_loss + postnet_loss + stop_loss

                # backward decoder loss
                if c.bidirectional_decoder:
                    if c.loss_masking:
                        decoder_backward_loss = criterion(
                            torch.flip(decoder_backward_output, dims=(1, )),
                            mel_input, mel_lengths)
                    else:
                        decoder_backward_loss = criterion(
                            torch.flip(decoder_backward_output, dims=(1, )),
                            mel_input)
                    decoder_c_loss = torch.nn.functional.l1_loss(
                        torch.flip(decoder_backward_output, dims=(1, )),
                        decoder_output)
                    loss += decoder_backward_loss + decoder_c_loss
                    keep_avg.update_values({
                        'avg_decoder_b_loss': decoder_backward_loss.item(),
                        'avg_decoder_c_loss': decoder_c_loss.item()
                    })

                step_time = time.time() - start_time
                epoch_time += step_time

                # compute alignment score
                align_score = alignment_diagonal_score(alignments)
                keep_avg.update_value('avg_align_score', align_score)

                # aggregate losses from processes
                if num_gpus > 1:
                    postnet_loss = reduce_tensor(postnet_loss.data, num_gpus)
                    decoder_loss = reduce_tensor(decoder_loss.data, num_gpus)
                    if c.stopnet:
                        stop_loss = reduce_tensor(stop_loss.data, num_gpus)

                keep_avg.update_values({
                    'avg_postnet_loss': float(postnet_loss.item()),
                    'avg_decoder_loss': float(decoder_loss.item()),
                    'avg_stop_loss': float(stop_loss.item()),
                })

                if num_iter % c.print_step == 0:
                    print(
                        " | > TotalLoss: {:.5f} PostnetLoss: {:.5f} - {:.5f} DecoderLoss:{:.5f} - {:.5f} "
                        "StopLoss: {:.5f} - {:.5f} AlignScore: {:.4f} : {:.4f}"
                        .format(loss.item(), postnet_loss.item(),
                                keep_avg['avg_postnet_loss'],
                                decoder_loss.item(),
                                keep_avg['avg_decoder_loss'],
                                stop_loss.item(), keep_avg['avg_stop_loss'],
                                align_score, keep_avg['avg_align_score']),
                        flush=True)

            if args.rank == 0:
                # Diagnostic visualizations
                idx = np.random.randint(mel_input.shape[0])
                const_spec = postnet_output[idx].data.cpu().numpy()
                gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
                    "Tacotron", "TacotronGST"
                ] else mel_input[idx].data.cpu().numpy()
                align_img = alignments[idx].data.cpu().numpy()

                eval_figures = {
                    "prediction": plot_spectrogram(const_spec, ap),
                    "ground_truth": plot_spectrogram(gt_spec, ap),
                    "alignment": plot_alignment(align_img)
                }

                # Sample audio
                if c.model in ["Tacotron", "TacotronGST"]:
                    eval_audio = ap.inv_spectrogram(const_spec.T)
                else:
                    eval_audio = ap.inv_mel_spectrogram(const_spec.T)
                tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
                                         c.audio["sample_rate"])

                # Plot Validation Stats
                epoch_stats = {
                    "loss_postnet": keep_avg['avg_postnet_loss'],
                    "loss_decoder": keep_avg['avg_decoder_loss'],
                    "stop_loss": keep_avg['avg_stop_loss'],
                    "alignment_score": keep_avg['avg_align_score']
                }
                if c.bidirectional_decoder:
                    epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss']
                    align_b_img = alignments_backward[idx].data.cpu().numpy()
                    eval_figures['alignment_backward'] = plot_alignment(align_b_img)
                tb_logger.tb_eval_stats(global_step, epoch_stats)
                tb_logger.tb_eval_figures(global_step, eval_figures)

    if args.rank == 0 and epoch > c.test_delay_epochs:
        if c.test_sentences_file is None:
            test_sentences = [
                "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                "Be a voice, not an echo.",
                "I'm sorry Dave. I'm afraid I can't do that.",
                "This cake is great. It's so delicious and moist."
            ]
        else:
            with open(c.test_sentences_file, "r") as f:
                test_sentences = [s.strip() for s in f.readlines()]

        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
                    model,
                    test_sentence,
                    c,
                    use_cuda,
                    ap,
                    speaker_id=speaker_id,
                    style_wav=style_wav)
                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
            except:
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
    return keep_avg['avg_postnet_loss']
def __call__(self, text, out_path):
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        self.tts_model, text, self.tts_config, self.use_cuda, self._ap, False,
        self.tts_config.enable_eos_bos_chars)
    if not self.use_gl:
        waveform = self.wavernn.generate(
            torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(self.device),
            batched=self.batched_wavernn, target=11000, overlap=550)
    self._ap.save_wav(waveform, out_path)
def evaluate(model, criterion, ap, global_step, epoch):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True)
    model.eval()
    epoch_time = 0
    keep_avg = KeepAverage()
    c_logger.print_eval_start()
    if data_loader is not None:
        for num_iter, data in enumerate(data_loader):
            start_time = time.time()

            # format data
            text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
            assert mel_input.shape[1] % model.decoder.r == 0

            # forward pass model
            if c.bidirectional_decoder or c.double_decoder_consistency:
                decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                    text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
            else:
                decoder_output, postnet_output, alignments, stop_tokens = model(
                    text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                decoder_backward_output = None
                alignments_backward = None

            # set the alignment lengths wrt reduction factor for guided attention
            if mel_lengths.max() % model.decoder.r != 0:
                alignment_lengths = (
                    mel_lengths +
                    (model.decoder.r -
                     (mel_lengths.max() % model.decoder.r))) // model.decoder.r
            else:
                alignment_lengths = mel_lengths // model.decoder.r

            # compute loss
            loss_dict = criterion(postnet_output, decoder_output, mel_input,
                                  linear_input, stop_tokens, stop_targets,
                                  mel_lengths, decoder_backward_output,
                                  alignments, alignment_lengths,
                                  alignments_backward, text_lengths)

            # step time
            step_time = time.time() - start_time
            epoch_time += step_time

            # compute alignment score
            align_error = 1 - alignment_diagonal_score(alignments)
            loss_dict['align_error'] = align_error

            # aggregate losses from processes
            if num_gpus > 1:
                loss_dict['postnet_loss'] = reduce_tensor(
                    loss_dict['postnet_loss'].data, num_gpus)
                loss_dict['decoder_loss'] = reduce_tensor(
                    loss_dict['decoder_loss'].data, num_gpus)
                if c.stopnet:
                    loss_dict['stopnet_loss'] = reduce_tensor(
                        loss_dict['stopnet_loss'].data, num_gpus)

            # detach loss values
            loss_dict_new = dict()
            for key, value in loss_dict.items():
                if isinstance(value, (int, float)):
                    loss_dict_new[key] = value
                else:
                    loss_dict_new[key] = value.item()
            loss_dict = loss_dict_new

            # update avg stats
            update_train_values = dict()
            for key, value in loss_dict.items():
                update_train_values['avg_' + key] = value
            keep_avg.update_values(update_train_values)

            if c.print_eval:
                c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)

        if args.rank == 0:
            # Diagnostic visualizations
            idx = np.random.randint(mel_input.shape[0])
            const_spec = postnet_output[idx].data.cpu().numpy()
            gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
                "Tacotron", "TacotronGST"
            ] else mel_input[idx].data.cpu().numpy()
            align_img = alignments[idx].data.cpu().numpy()

            eval_figures = {
                "prediction": plot_spectrogram(const_spec, ap),
                "ground_truth": plot_spectrogram(gt_spec, ap),
                "alignment": plot_alignment(align_img)
            }

            # Sample audio
            if c.model in ["Tacotron", "TacotronGST"]:
                eval_audio = ap.inv_spectrogram(const_spec.T)
            else:
                eval_audio = ap.inv_melspectrogram(const_spec.T)
            tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
                                     c.audio["sample_rate"])

            # Plot Validation Stats
            if c.bidirectional_decoder or c.double_decoder_consistency:
                align_b_img = alignments_backward[idx].data.cpu().numpy()
                eval_figures['alignment2'] = plot_alignment(align_b_img)
            tb_logger.tb_eval_stats(global_step, keep_avg.avg_values)
            tb_logger.tb_eval_figures(global_step, eval_figures)

    if args.rank == 0 and epoch > c.test_delay_epochs:
        if c.test_sentences_file is None:
            test_sentences = [
                "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
                "Be a voice, not an echo.",
                "I'm sorry Dave. I'm afraid I can't do that.",
                "This cake is great. It's so delicious and moist.",
                "Prior to November 22, 1963."
            ]
        else:
            with open(c.test_sentences_file, "r") as f:
                test_sentences = [s.strip() for s in f.readlines()]

        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
                    model,
                    test_sentence,
                    c,
                    use_cuda,
                    ap,
                    speaker_id=speaker_id,
                    style_wav=style_wav,
                    truncated=False,
                    enable_eos_bos_chars=c.enable_eos_bos_chars,  #pylint: disable=unused-argument
                    use_griffin_lim=True,
                    do_trim_silence=False)

                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
            except:
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
    return keep_avg.avg_values
def tts(self, model, text, CONFIG, use_cuda, ap):
    waveform, alignment, spectrogram, mel_spectrogram, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap)
    ap.save_wav(waveform, 'out.wav')
    return alignment, spectrogram, stop_tokens
def evaluate(model, criterion, ap, global_step, epoch):
    data_loader = setup_loader(ap, model.decoder.r, is_val=True)
    model.eval()
    epoch_time = 0
    eval_values_dict = {
        'avg_postnet_loss': 0,
        'avg_decoder_loss': 0,
        'avg_stopnet_loss': 0,
        'avg_align_error': 0
    }
    if c.bidirectional_decoder:
        eval_values_dict['avg_decoder_b_loss'] = 0  # decoder backward loss
        eval_values_dict['avg_decoder_c_loss'] = 0  # decoder consistency loss
    if c.ga_alpha > 0:
        eval_values_dict['avg_ga_loss'] = 0  # guided attention loss
    keep_avg = KeepAverage()
    keep_avg.add_values(eval_values_dict)
    c_logger.print_eval_start()
    if data_loader is not None:
        for num_iter, data in enumerate(data_loader):
            start_time = time.time()

            # format data
            text_input, text_lengths, mel_input, mel_lengths, linear_input, stop_targets, speaker_ids, _, _ = format_data(data)
            assert mel_input.shape[1] % model.decoder.r == 0

            # forward pass model
            if c.bidirectional_decoder:
                decoder_output, postnet_output, alignments, stop_tokens, decoder_backward_output, alignments_backward = model(
                    text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
            else:
                decoder_output, postnet_output, alignments, stop_tokens = model(
                    text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
                decoder_backward_output = None

            # set the alignment lengths wrt reduction factor for guided attention
            if mel_lengths.max() % model.decoder.r != 0:
                alignment_lengths = (
                    mel_lengths +
                    (model.decoder.r -
                     (mel_lengths.max() % model.decoder.r))) // model.decoder.r
            else:
                alignment_lengths = mel_lengths // model.decoder.r

            # compute loss
            loss_dict = criterion(postnet_output, decoder_output, mel_input,
                                  linear_input, stop_tokens, stop_targets,
                                  mel_lengths, decoder_backward_output,
                                  alignments, alignment_lengths, text_lengths)
            if c.bidirectional_decoder:
                keep_avg.update_values({
                    'avg_decoder_b_loss': loss_dict['decoder_b_loss'].item(),
                    'avg_decoder_c_loss': loss_dict['decoder_c_loss'].item()
                })
            if c.ga_alpha > 0:
                keep_avg.update_values({'avg_ga_loss': loss_dict['ga_loss'].item()})

            # step time
            step_time = time.time() - start_time
            epoch_time += step_time

            # compute alignment score
            align_error = 1 - alignment_diagonal_score(alignments)
            keep_avg.update_value('avg_align_error', align_error)

            # aggregate losses from processes
            if num_gpus > 1:
                loss_dict['postnet_loss'] = reduce_tensor(
                    loss_dict['postnet_loss'].data, num_gpus)
                loss_dict['decoder_loss'] = reduce_tensor(
                    loss_dict['decoder_loss'].data, num_gpus)
                if c.stopnet:
                    loss_dict['stopnet_loss'] = reduce_tensor(
                        loss_dict['stopnet_loss'].data, num_gpus)

            keep_avg.update_values({
                'avg_postnet_loss': float(loss_dict['postnet_loss'].item()),
                'avg_decoder_loss': float(loss_dict['decoder_loss'].item()),
                'avg_stopnet_loss': float(loss_dict['stopnet_loss'].item()),
            })

            if c.print_eval:
                c_logger.print_eval_step(num_iter, loss_dict, keep_avg.avg_values)

        if args.rank == 0:
            # Diagnostic visualizations
            idx = np.random.randint(mel_input.shape[0])
            const_spec = postnet_output[idx].data.cpu().numpy()
            gt_spec = linear_input[idx].data.cpu().numpy() if c.model in [
                "Tacotron", "TacotronGST"
            ] else mel_input[idx].data.cpu().numpy()
            align_img = alignments[idx].data.cpu().numpy()

            eval_figures = {
                "prediction": plot_spectrogram(const_spec, ap),
                "ground_truth": plot_spectrogram(gt_spec, ap),
                "alignment": plot_alignment(align_img)
            }

            # Sample audio
            if c.model in ["Tacotron", "TacotronGST"]:
                eval_audio = ap.inv_spectrogram(const_spec.T)
            else:
                eval_audio = ap.inv_melspectrogram(const_spec.T)
            tb_logger.tb_eval_audios(global_step, {"ValAudio": eval_audio},
                                     c.audio["sample_rate"])

            # Plot Validation Stats
            epoch_stats = {
                "loss_postnet": keep_avg['avg_postnet_loss'],
                "loss_decoder": keep_avg['avg_decoder_loss'],
                "stopnet_loss": keep_avg['avg_stopnet_loss'],
                "alignment_score": keep_avg['avg_align_error'],
            }
            if c.bidirectional_decoder:
                epoch_stats['loss_decoder_backward'] = keep_avg['avg_decoder_b_loss']
                align_b_img = alignments_backward[idx].data.cpu().numpy()
                eval_figures['alignment_backward'] = plot_alignment(align_b_img)
            if c.ga_alpha > 0:
                epoch_stats['guided_attention_loss'] = keep_avg['avg_ga_loss']
            tb_logger.tb_eval_stats(global_step, epoch_stats)
            tb_logger.tb_eval_figures(global_step, eval_figures)

    if args.rank == 0 and epoch > c.test_delay_epochs:
        if c.test_sentences_file is None:
            # Italian test sentences, kept verbatim since the model under test is Italian;
            # roughly: "With my voice I can say wonderful things.", "Hi Marco and Alice,
            # how are you?", "Now that I have a voice, I just want to talk.", "Of all the
            # things I have read over the years, this book is truly my favorite."
            test_sentences = [
                "Con la mia voce posso dire cose splendide.",
                "Ciao Marco ed Alice, come state?",
                "Ora che ho una voce, voglio solo parlare.",
                "Tra tutte le cose che ho letto, in tanti anni, questo libro è davvero il mio preferito."
            ]
        else:
            with open(c.test_sentences_file, "r") as f:
                test_sentences = [s.strip() for s in f.readlines()]

        # test sentences
        test_audios = {}
        test_figures = {}
        print(" | > Synthesizing test sentences")
        speaker_id = 0 if c.use_speaker_embedding else None
        style_wav = c.get("style_wav_for_test")
        for idx, test_sentence in enumerate(test_sentences):
            try:
                wav, alignment, decoder_output, postnet_output, stop_tokens, inputs = synthesis(
                    model,
                    test_sentence,
                    c,
                    use_cuda,
                    ap,
                    speaker_id=speaker_id,
                    style_wav=style_wav,
                    truncated=False,
                    enable_eos_bos_chars=c.enable_eos_bos_chars,  #pylint: disable=unused-argument
                    use_griffin_lim=True,
                    do_trim_silence=False)

                file_path = os.path.join(AUDIO_PATH, str(global_step))
                os.makedirs(file_path, exist_ok=True)
                file_path = os.path.join(file_path,
                                         "TestSentence_{}.wav".format(idx))
                ap.save_wav(wav, file_path)
                test_audios['{}-audio'.format(idx)] = wav
                test_figures['{}-prediction'.format(idx)] = plot_spectrogram(
                    postnet_output, ap)
                test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
            except:
                print(" !! Error creating Test Sentence -", idx)
                traceback.print_exc()
        tb_logger.tb_test_audios(global_step, test_audios, c.audio['sample_rate'])
        tb_logger.tb_test_figures(global_step, test_figures)
    return keep_avg.avg_values