Example 1
def log_audio(model: Tacotron2, iteration: int, logger: Tacotron2Logger, waveglow):
    # Encode a fixed test sentence into a symbol-ID sequence (batch of one).
    text = "Does it work yet?"
    sequence = array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    # Predict mel spectrograms with Tacotron 2, then vocode them with WaveGlow;
    # no gradients are needed for either step.
    with torch.no_grad():
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    # Write the generated waveform to TensorBoard, tagged with the input text.
    logger.add_audio(text, audio[0].data.cpu(), global_step=iteration,
                     sample_rate=hparams.sampling_rate)
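A minimal sketch of how a helper like this might be called from a training loop, in the spirit of the NVIDIA Tacotron 2 training script; the checkpoint interval, the WaveGlow loading and the variable names below are assumptions, not part of the example above.

# Hypothetical call site (names, interval and checkpoint layout are assumptions).
waveglow = torch.load(waveglow_path)['model'].cuda().eval()  # assumed published-checkpoint layout

for iteration, batch in enumerate(train_loader):
    ...  # forward/backward/optimizer step
    if iteration % hparams.iters_per_checkpoint == 0:  # assumed logging interval
        model.eval()                                    # disable dropout for inference
        log_audio(model, iteration, logger, waveglow)
        model.train()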
Example 2
def log_audio(model: Tacotron2, iteration: int, logger: Tacotron2Logger,
              waveglow, inference_batch, text_encoded, mel):
    # Parse the source batch so Tacotron 2 can be used as a forced aligner
    # to extract the rhythm (alignment map) of the reference utterance.
    x, y = model.parse_batch(inference_batch)

    with torch.no_grad():
        # Teacher-forced forward pass; the returned alignments give the rhythm.
        mel_outputs, mel_outputs_postnet, gate_outputs, rhythm = model.forward(x)
        # Reorder the alignment axes to the layout expected by inference_noattention.
        rhythm = rhythm.permute(1, 0, 2)

    # Re-synthesise the same text with each of the four emotion labels while
    # keeping the extracted rhythm fixed, so only the emotion conditioning varies.
    for emotion in range(4):
        emotion_id = torch.LongTensor([emotion]).cuda()

        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, gate_outputs, _ = model.inference_noattention(
                (text_encoded, mel, emotion_id, rhythm))
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)

        # Log one waveform per emotion ID to TensorBoard.
        logger.add_audio(f"Emotion {emotion}",
                         audio[0].data.cpu(),
                         global_step=iteration,
                         sample_rate=hparams.sampling_rate)
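The call site would mirror the first example but with the extra conditioning inputs prepared beforehand; a hedged sketch, where inference_batch, text_encoded and mel are assumed to come from a held-out reference utterance and their exact preparation depends on the surrounding project.

# Hypothetical call site; the reference inputs are assumptions, not shown above.
if iteration % hparams.iters_per_checkpoint == 0:
    model.eval()
    log_audio(model, iteration, logger, waveglow,
              inference_batch, text_encoded, mel)
    model.train()

Because the rhythm is extracted once and reused for every emotion ID, the four logged clips differ only in their emotion conditioning, which makes them easy to compare side by side in TensorBoard.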