def tts(model, text, spk, qF0s):
    """Synthesize a waveform from encoded text, a speaker id, and a
    quantized F0 contour using a Tacotron model.

    Args:
        model: Tacotron model exposing ``encoder``/``postnet`` submodules.
        text: encoded symbol-id sequence (array-like of ints).
        spk: integer speaker id.
        qF0s: quantized F0 values aligned with ``text``.

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``: predicted audio
        signal, attention alignment matrix, and the denormalized linear
        spectrogram.
    """
    if use_cuda:
        model = model.cuda()
    # NOTE(review): decoder is left in train mode on purpose here —
    # presumably to keep prenet dropout active during decoding; confirm.
    model.encoder.eval()
    model.postnet.eval()

    # Batchify inputs (leading batch dimension of 1 for text and F0s).
    sequence = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    spk = Variable(torch.from_numpy(np.array([spk])))
    qF0s = Variable(torch.from_numpy(np.array(qF0s))).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()
        qF0s = qF0s.cuda()

    # Greedy decoding; qF0s are cast to long for embedding lookup.
    mel_outputs, linear_outputs, alignments = model(sequence, spk, qF0s.long())

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal from the (transposed) linear spectrogram.
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def tts(model, text):
    """Convert an encoded text sequence to a speech waveform with a
    Tacotron model.

    Args:
        model: Tacotron model exposing ``encoder``/``postnet`` submodules.
        text: encoded symbol-id sequence (array-like of ints).

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``: predicted audio
        signal, attention alignment matrix, and the denormalized linear
        spectrogram.
    """
    if use_cuda:
        model = model.cuda()
    # TODO: Turning off dropout of decoder's prenet causes serious
    # performance regression, not sure why.
    # model.decoder.eval()
    model.encoder.eval()
    model.postnet.eval()

    # Batchify the input (leading batch dimension of 1).
    sequence = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments = model(sequence)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()

    # Predicted audio signal from the (transposed) linear spectrogram.
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram
def tts(model, text, mel):
    """Convert text to a speech waveform, conditioning the Tacotron
    model on a reference mel spectrogram via ``forward_generate_gst``.

    Returns:
        Tuple ``(waveform, alignment, spectrogram)``.
    """
    if use_cuda:
        model = model.cuda()
    model.encoder.eval()
    model.postnet.eval()

    # Wrap both inputs as batch-of-one variables.
    text_batch = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    mel_batch = Variable(torch.from_numpy(mel)).unsqueeze(0)
    if use_cuda:
        text_batch = text_batch.cuda()
        mel_batch = mel_batch.cuda()

    mel_outputs, linear_outputs, alignments = model.forward_generate_gst(
        text_batch, mel_batch)

    linear = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear)
    alignment = alignments[0].cpu().data.numpy()
    waveform = audio.inv_spectrogram(linear.T)

    return waveform, alignment, spectrogram
def tts(model, text, tones):
    """Convert text (plus a parallel tone sequence) to a speech
    waveform given a Tacotron model.

    Returns:
        Tuple ``(waveform, spectrogram)``.
    """
    if use_cuda:
        model = model.cuda()
    model.encoder.eval()
    model.postnet.eval()

    # Batchify text and tones (leading batch dimension of 1 each).
    text_batch = Variable(torch.from_numpy(np.array(text))).unsqueeze(0)
    tone_batch = Variable(torch.from_numpy(np.array(tones))).unsqueeze(0)
    if use_cuda:
        text_batch = text_batch.cuda()
        tone_batch = tone_batch.cuda()

    mel_outputs, linear_outputs = model(text_batch, tone_batch)

    linear = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear)
    waveform = audio.inv_spectrogram(linear.T)

    return waveform, spectrogram
def synthesize(model, mspec, spk):
    """Synthesize a waveform from a mel spectrogram and a speaker id.

    Unlike the ``tts`` helpers, the input here is acoustic (a mel
    spectrogram), not text.

    Args:
        model: model exposing ``forward_getlatents`` / ``forward_eval``.
        mspec: mel spectrogram (numpy array).
        spk: speaker id (convertible by ``np.array``).

    Returns:
        The predicted audio waveform (numpy array).
    """
    if use_cuda:
        model = model.cuda()
    model.eval()

    # Batchify inputs (leading batch dimension of 1).
    sequence = Variable(torch.from_numpy(np.array(mspec))).unsqueeze(0)
    spk = Variable(torch.from_numpy(np.array(spk))).unsqueeze(0)
    if use_cuda:
        sequence = sequence.cuda()
        spk = spk.cuda()

    # Run the whole inference pass without building an autograd graph
    # (the original only wrapped forward_getlatents; forward_eval is
    # inference too, so keeping it inside no_grad saves memory without
    # changing the output).
    with torch.no_grad():
        # Populate the model's latents from the reference spectrogram
        # before decoding.
        model.forward_getlatents(sequence)
        mel_outputs, linear_outputs = model.forward_eval(sequence, spk)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio.denormalize(linear_output)
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform
def display_mel(self, mel, tag='', sr=22050):
    """Render a (normalized) mel spectrogram as a PNG image and log it
    via ``self.add_image``.

    Args:
        mel: normalized mel spectrogram; denormalized with
            ``self.config`` before plotting.
        tag: label under which the image is logged.
        sr: sample rate passed to ``buffer_mel`` (default 22050).
    """
    # Undo normalization, transpose for display, and rasterize to PNG.
    amplitude_mel = denormalize(mel, self.config)
    png_buffer = buffer_mel(tf.transpose(amplitude_mel), sr=sr)
    decoded = tf.image.decode_png(png_buffer.getvalue(), channels=3)
    # add_image expects a leading batch dimension.
    self.add_image(tag, tf.expand_dims(decoded, 0))