forked from NVIDIA/tacotron2
/
tts_server_utils.py
112 lines (99 loc) · 3.62 KB
/
tts_server_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import sys
sys.path.append('waveglow/')
import numpy as np
import torch
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence
from denoiser import Denoiser
from io import BytesIO
import wave
from pydub import AudioSegment
import click
hparams = create_hparams()
hparams.sampling_rate = 22050
def load_taco2(chk_pt_path):
model = load_model(hparams)
model.load_state_dict(torch.load(chk_pt_path)['state_dict'])
_ = model.cuda().eval().half()
return model
def load_waveglow(chk_pt_path):
waveglow = torch.load(chk_pt_path)['model']
waveglow.cuda().eval().half()
for k in waveglow.convinv:
k.float()
denoiser = Denoiser(waveglow)
return waveglow, denoiser
def mel_gen(text, model):
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
sequence = torch.autograd.Variable(
torch.from_numpy(sequence)).cuda().long()
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
return mel_outputs_postnet
def gen_audio_vocoder(mel_output, waveglow, denoiser):
with torch.no_grad():
audio = waveglow.infer(mel_output, sigma=0.666)
audio = audio/torch.max(torch.abs(audio))
audio_denoised = denoiser(audio , strength=0.01)[:, 0]
audio_denoised = audio_denoised/torch.max(torch.abs(audio_denoised))
return wav_bytes(float2pcm(audio_denoised[0].data.cpu().numpy()))
def gen_e2e_taco(text, taco_model, wavenet_model, denoiser):
mel_out = mel_gen(text, taco_model)
return gen_audio_vocoder(mel_out, wavenet_model, denoiser)
def float2pcm(sig, dtype="int16"):
"""Convert floating point signal with a range from -1 to 1 to PCM.
Any signal values outside the interval [-1.0, 1.0) are clipped.
No dithering is used.
Note that there are different possibilities for scaling floating
point numbers to PCM numbers, this function implements just one of
them. For an overview of alternatives see
http://blog.bjornroche.com/2009/12/int-float-int-its-jungle-out-there.html
Parameters
----------
sig : array_like
Input array, must have floating point type.
dtype : data type, optional
Desired (integer) data type.
Returns
-------
numpy.ndarray
Integer data, scaled and clipped to the range of the given
*dtype*.
See Also
--------
pcm2float, dtype
"""
sig = np.asarray(sig)
if sig.dtype.kind != "f":
raise TypeError("'sig' must be a float array")
dtype = np.dtype(dtype)
if dtype.kind not in "iu":
raise TypeError("'dtype' must be an integer type")
i = np.iinfo(dtype)
abs_max = 2 ** (i.bits - 1)
offset = i.min + abs_max
return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
def wav_bytes(tts_bytes):
wav_file = BytesIO()
with wave.open(wav_file, "wb") as obj:
obj.setnchannels(1) # mono
obj.setsampwidth(2)
obj.setframerate(22050)
obj.writeframesraw(tts_bytes)
return wav_file.getvalue()
@click.command()
@click.option('--taco', help='Path to Taco 2 model')
@click.option('--wavenet', help='path to wavenet model')
def main(taco, wavenet):
taco_model = load_taco2(taco)
wave_model, denoiser = load_waveglow(wavenet)
for i in range(5):
entered_inp = input("Enter your content: ")
audio_samples = gen_e2e_taco(entered_inp, taco_model, wave_model, denoiser)
with open('./test-{}.wav'.format(i), 'wb') as fp:
fp.write(audio_samples)
if __name__ == "__main__":
main()