def tts(model, text, spk, qF0s):
    """Run a Tacotron model on one input and invert its linear output to audio.

    The model is moved to the GPU when the module-level ``use_cuda`` flag is
    set. ``text`` is used as given; no text-to-sequence conversion happens
    here.

    Args:
        model: Callable module exposing ``encoder`` and ``postnet``
            submodules. It is called as ``model(sequence, spk, qF0s)`` and
            must return three values.
        text: Input accepted by ``np.array`` (presumably integer symbol
            ids -- confirm against the caller).
        spk: Speaker value; wrapped in a one-element list before being
            turned into an array (presumably a speaker id -- confirm).
        qF0s: Input accepted by ``np.array``; cast with ``.long()`` right
            before the forward pass (presumably quantized F0 values --
            confirm).

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``: the result of
        ``audio.inv_spectrogram`` on the transposed first linear output, the
        first alignment as a NumPy array, and ``audio.denormalize`` of the
        first linear output.
    """
    if use_cuda:
        model = model.cuda()

    # Only the encoder and the postnet are switched to eval mode here; the
    # mode of the remaining submodules is left as the caller set it.
    for submodule in (model.encoder, model.postnet):
        submodule.eval()

    # Each input becomes a tensor; text and qF0s get a leading axis of size 1.
    text_batch = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    speaker = Variable(torch.from_numpy(np.array([spk])))
    f0_batch = Variable(torch.from_numpy(np.array(qF0s))).unsqueeze(0)

    if use_cuda:
        text_batch, speaker, f0_batch = (
            text_batch.cuda(), speaker.cuda(), f0_batch.cuda())

    # Greedy decoding
    _, linear_batch, attention_batch = model(
        text_batch, speaker, f0_batch.long())

    # Keep only the first item of each output and move it to NumPy.
    linear = linear_batch[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear)
    alignment = attention_batch[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear.T)

    return waveform, alignment, spectrogram
def tts(model, text):
    """Run a Tacotron model on one input and invert its linear output to audio.

    The model is moved to the GPU when the module-level ``use_cuda`` flag is
    set. ``text`` is used as given; no text-to-sequence conversion happens
    here.

    Args:
        model: Callable module exposing ``encoder`` and ``postnet``
            submodules. It is called as ``model(sequence)`` and must return
            three values.
        text: Input accepted by ``np.array`` (presumably integer symbol
            ids -- confirm against the caller).

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``: the result of
        ``audio.inv_spectrogram`` on the transposed first linear output, the
        first alignment as a NumPy array, and ``audio.denormalize`` of the
        first linear output.
    """
    if use_cuda:
        model = model.cuda()

    # TODO: Turning off dropout of decoder's prenet causes serious performance
    # regression, not sure why. For that reason model.decoder.eval() is NOT
    # called; only the encoder and the postnet are switched to eval mode.
    for submodule in (model.encoder, model.postnet):
        submodule.eval()

    # Convert the input to a tensor with a leading axis of size 1.
    text_batch = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    if use_cuda:
        text_batch = text_batch.cuda()

    # Greedy decoding
    _, linear_batch, attention_batch = model(text_batch)

    # Keep only the first item of each output and move it to NumPy.
    linear = linear_batch[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear)
    alignment = attention_batch[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear.T)

    return waveform, alignment, spectrogram
def tts(model, text, mel):
    """Generate audio via ``model.forward_generate_gst`` from text and a mel input.

    The model is moved to the GPU when the module-level ``use_cuda`` flag is
    set.

    Args:
        model: Module exposing ``encoder`` and ``postnet`` submodules and a
            ``forward_generate_gst(sequence, mel)`` method returning three
            values.
        text: Input accepted by ``np.array`` (presumably integer symbol
            ids -- confirm against the caller).
        mel: NumPy array handed directly to ``torch.from_numpy``
            (presumably a reference mel spectrogram -- confirm).

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``: the result of
        ``audio.inv_spectrogram`` on the transposed first linear output, the
        first alignment as a NumPy array, and ``audio.denormalize`` of the
        first linear output.
    """
    if use_cuda:
        model = model.cuda()

    # Only the encoder and the postnet are switched to eval mode here.
    for submodule in (model.encoder, model.postnet):
        submodule.eval()

    # Both inputs become tensors with a leading axis of size 1.
    text_batch = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    mel_batch = Variable(torch.from_numpy(mel)).unsqueeze(0)

    if use_cuda:
        text_batch, mel_batch = text_batch.cuda(), mel_batch.cuda()

    _, linear_batch, attention_batch = model.forward_generate_gst(
        text_batch, mel_batch)

    # Keep only the first item of each output and move it to NumPy.
    linear = linear_batch[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear)
    alignment = attention_batch[0].cpu().data.numpy()

    # Invert the (transposed) linear output to an audio signal.
    waveform = audio.inv_spectrogram(linear.T)

    return waveform, alignment, spectrogram
def tts(model, text, tones):
    """Run a Tacotron model on text plus tones and invert its linear output.

    The model is moved to the GPU when the module-level ``use_cuda`` flag is
    set.

    Args:
        model: Callable module exposing ``encoder`` and ``postnet``
            submodules. It is called as ``model(sequence, tones)`` and must
            return two values.
        text: Input accepted by ``np.array`` (presumably integer symbol
            ids -- confirm against the caller).
        tones: Input accepted by ``np.array`` (presumably one tone label
            per symbol -- confirm).

    Returns:
        Tuple ``(waveform, spectrogram)``: the result of
        ``audio.inv_spectrogram`` on the transposed first linear output and
        ``audio.denormalize`` of the first linear output.
    """
    if use_cuda:
        model = model.cuda()

    # Only the encoder and the postnet are switched to eval mode here.
    for submodule in (model.encoder, model.postnet):
        submodule.eval()

    # Both inputs become tensors with a leading axis of size 1.
    text_batch = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    tone_batch = Variable(torch.from_numpy(np.array(tones))).unsqueeze(0)

    if use_cuda:
        text_batch, tone_batch = text_batch.cuda(), tone_batch.cuda()

    # This model variant returns no alignments.
    _, linear_batch = model(text_batch, tone_batch)

    # Keep only the first item of the linear output and move it to NumPy.
    linear = linear_batch[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear)

    # Invert the (transposed) linear output to an audio signal.
    waveform = audio.inv_spectrogram(linear.T)

    return waveform, spectrogram
def synthesize(model, mspec, spk):
    """Generate a waveform from a spectrogram input and a speaker value.

    The model is moved to the GPU when the module-level ``use_cuda`` flag is
    set and is put fully into eval mode. Both forward calls run under
    ``torch.no_grad()``.

    Args:
        model: Module exposing ``forward_getlatents(sequence)`` and
            ``forward_eval(sequence, spk)``; the latter must return two
            values.
        mspec: Input accepted by ``np.array`` (presumably a mel
            spectrogram -- confirm against the caller).
        spk: Input accepted by ``np.array`` (presumably a speaker id or
            embedding -- confirm).

    Returns:
        The result of ``audio.inv_spectrogram`` applied to the transposed
        first linear output of ``model.forward_eval``.
    """
    if use_cuda:
        model = model.cuda()

    model.eval()

    # Both inputs become tensors with a leading axis of size 1.
    sequence = Variable(torch.from_numpy(np.array(mspec))).unsqueeze(0)
    spk = Variable(torch.from_numpy(np.array(spk))).unsqueeze(0)

    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()

    with torch.no_grad():
        # The return value is discarded.
        # NOTE(review): presumably this call stores latents on the model for
        # forward_eval to use -- confirm before removing or reordering it.
        model.forward_getlatents(sequence)
        _, linear_outputs = model.forward_eval(sequence, spk)

    # Only the waveform is returned, so the denormalized spectrogram that was
    # previously computed and thrown away is no longer produced.
    linear_output = linear_outputs[0].cpu().data.numpy()
    return audio.inv_spectrogram(linear_output.T)
Beispiel #6
0
 def display_mel(self, mel, tag='', sr=22050):
     """Render a mel spectrogram as a PNG image and log it via ``add_image``.

     Args:
         mel: Spectrogram passed to ``denormalize`` together with
             ``self.config`` (presumably normalized, time-major -- confirm).
         tag: Tag forwarded to ``self.add_image``.
         sr: Sampling rate forwarded to ``buffer_mel``.
     """
     # Denormalize, then transpose before plotting.
     transposed = tf.transpose(denormalize(mel, self.config))
     # buffer_mel returns a buffer object; its raw contents are decoded as PNG.
     png_bytes = buffer_mel(transposed, sr=sr).getvalue()
     decoded = tf.image.decode_png(png_bytes, channels=3)
     # Add a leading axis of size 1 before handing the image to add_image.
     batched = tf.expand_dims(decoded, 0)
     self.add_image(tag, batched)