import numpy as np
import torch

from text import text_to_sequence


def log_audio(model: Tacotron2, iteration: int, logger: Tacotron2Logger, waveglow):
    """Synthesize a fixed test sentence and write it to the TensorBoard log."""
    text = "Does it work yet?"
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    with torch.no_grad():
        # Tacotron 2 predicts the mel spectrogram; WaveGlow vocodes it to audio.
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    # `hparams` is assumed to be in scope, as in the Tacotron 2 training script.
    logger.add_audio(text, audio[0].cpu(), global_step=iteration,
                     sample_rate=hparams.sampling_rate)
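For context, this hook is meant to be called periodically from the training loop. Below is a minimal sketch of the call site, assuming the usual train.py names (model, logger, iteration, hparams.iters_per_checkpoint); loading the vocoder from the published NVIDIA torch.hub entry point, rather than from a local checkpoint, is likewise an assumption.

import torch

# Assumption: pull a pretrained WaveGlow from the NVIDIA torch.hub entry point.
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub',
                          'nvidia_waveglow', model_math='fp32')
waveglow = waveglow.remove_weightnorm(waveglow).cuda().eval()

# Inside the training loop: log a sample at every checkpoint interval.
if iteration % hparams.iters_per_checkpoint == 0:
    model.eval()  # disable dropout so the sample reflects inference-mode quality
    log_audio(model, iteration, logger, waveglow)
    model.train()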
def log_audio(model: Tacotron2, iteration: int, logger: Tacotron2Logger, waveglow,
              inference_batch, text_encoded, mel):
    """Synthesize the reference utterance under each emotion ID and log the audio."""
    # Run the source data through the model to obtain its rhythm,
    # using Tacotron 2 as a forced aligner.
    x, y = model.parse_batch(inference_batch)
    with torch.no_grad():
        # A teacher-forced forward pass yields the alignment map ("rhythm").
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = model(x)
    # Reorder to (mel steps, batch, text steps), the layout
    # inference_noattention expects for the attention map.
    rhythm = rhythm.permute(1, 0, 2)

    for emotion in range(4):
        emotion_id = torch.LongTensor([emotion]).cuda()
        with torch.no_grad():
            # Re-synthesize with the fixed rhythm but a different emotion ID.
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = model.inference_noattention(
                (text_encoded, mel, emotion_id, rhythm))
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)
        logger.add_audio(f"Emotion {emotion}", audio[0].cpu(),
                         global_step=iteration, sample_rate=hparams.sampling_rate)
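Wiring up the emotion-conditioned variant requires a reference utterance whose rhythm the model can extract. A hedged sketch follows, assuming the Tacotron 2 data utilities (TextMelLoader, TextMelCollate) and a dataset item layout that starts with (text_encoded, mel, ...); none of these names are confirmed by the snippet above.

import torch

# Assumptions: TextMelLoader/TextMelCollate mirror the Tacotron 2 data_utils
# API, and each dataset item starts with (text_encoded, mel, ...).
from data_utils import TextMelLoader, TextMelCollate

val_set = TextMelLoader(hparams.validation_files, hparams)
collate_fn = TextMelCollate(hparams.n_frames_per_step)

# One-item batch for parse_batch, plus raw tensors for inference_noattention.
inference_batch = collate_fn([val_set[0]])
text_encoded = val_set[0][0].unsqueeze(0).cuda().long()
mel = val_set[0][1].unsqueeze(0).cuda()

log_audio(model, iteration, logger, waveglow,
          inference_batch, text_encoded, mel)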