# web_app.py
# Forked from NVIDIA/tacotron2.
import sys
sys.path.append('waveglow/')
# import glow
from flask import Flask, render_template, redirect, url_for, request, send_from_directory
import numpy as np
import os
from pydub import AudioSegment
from hparams import create_hparams
from train import load_model
import torch
from denoiser import Denoiser
from text import text_to_sequence
from scipy.io.wavfile import write
import uuid
import time
# Directory (relative path) where synthesized WAV files are written and from
# which the download routes serve them.
SYNTH_DIR = 'static/synth_wav'
# Model handles shared by the request handlers; assigned by init_model().
tacotron2_model = None
waveglow_model = None
# Stays None unless init_model() constructs a Denoiser.
denoiser = None
def init_model():
    """Load Tacotron 2, WaveGlow and the WaveGlow denoiser onto the GPU.

    Assigns the module-level ``tacotron2_model``, ``waveglow_model`` and
    ``denoiser`` globals that the request handlers read. Both checkpoint
    paths are hard-coded below.

    The denoiser is always constructed. ``simple_synth`` applies it whenever
    a request passes ``denoiser_strength > 0``; the previous hard-coded
    ``denoiser_strength = 0.0`` gate left ``denoiser`` as ``None``, so every
    such request failed with ``TypeError``.

    Raises:
        Any exception raised by ``torch.load`` / ``load_state_dict`` when a
        checkpoint is missing or incompatible, or by ``.cuda()`` when no
        CUDA device is available.
    """
    print("init model!!!!")
    global tacotron2_model
    global waveglow_model
    global denoiser

    # Alternative Tacotron 2 checkpoints (previously commented-out choices):
    #   outdir_korean/checkpoint_8800
    #   ../models/tacotron2/outdir_korean/checkpoint_25000
    #   ../tacotron2-pytorch/outdir/checkpoint_15000
    #   ../models/tacotron2/outdir_korean/checkpoint_15000
    #   outdir_lj_korean/checkpoint_5000
    #   outdir_longtrain/checkpoint_439500
    tacotron2_path = "outdir_finetune/checkpoint_62500"
    # Alternative WaveGlow checkpoints:
    #   ../waveglow/checkpoints/waveglow_335000
    #   ../waveglow-fix/checkpoints_longtrain/waveglow_484000
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    sampling_rate = 22050

    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    tacotron2_model = load_model(hparams)
    tacotron2_model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    tacotron2_model.cuda().eval().half()

    # NOTE: this checkpoint stores a whole pickled model object under
    # 'model', so torch.load unpickles arbitrary code -- only load
    # checkpoints from a trusted source. (An alternative that rebuilt
    # glow.WaveGlow from waveglow/config.json and copied the state dict was
    # sketched here before but never enabled.)
    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    # The model runs in fp16, but its `convinv` layers are switched back to
    # fp32 after the .half() conversion.
    for k in waveglow_model.convinv:
        k.float()

    # Built unconditionally so per-request denoising always has a denoiser.
    denoiser = Denoiser(waveglow_model)
def create_app():
    """Build the Flask application and load the models before returning it.

    Returns:
        The new ``Flask`` instance. ``init_model()`` has already run by the
        time the caller receives it.
    """
    flask_app = Flask(__name__)
    # Model loading happens once, eagerly, as part of app construction.
    init_model()
    return flask_app
# Module-level application object used by the route decorators below.
# create_app() also calls init_model(), so importing this module loads the
# models.
app = create_app()
# Alternative without model loading: app = Flask(__name__)
@app.route("/simple_synth")
def simple_synth():
    """Synthesize speech for the requested text and render the result page.

    Query parameters:
        input_text: Text to synthesize. When missing or empty, the page is
            rendered without running any synthesis.
        sigma: Passed to ``waveglow_model.infer`` (default 0.8).
        sampling_rate: Sample rate written to the WAV file (default 22050).
        denoiser_strength: Strength passed to the denoiser; values <= 0
            skip denoising (default 0.0).

    Returns:
        The rendered ``simple_synth.html`` template. After a synthesis,
        ``synth_wav_path`` is the generated file name inside ``SYNTH_DIR``
        and ``elapsed`` is the time in seconds spent before the file write.
    """
    global denoiser

    text = request.args.get('input_text', default=None, type=str)
    sigma = request.args.get('sigma', default=0.8, type=float)
    sampling_rate = request.args.get('sampling_rate', default=22050, type=int)
    denoiser_strength = request.args.get('denoiser_strength', default=0.0, type=float)

    if not text:
        return render_template("simple_synth.html", input_text=None, synth_wav_path=None,
                               sigma=sigma, sampling_rate=sampling_rate,
                               denoiser_strength=denoiser_strength, elapsed=None)

    start = time.time()

    # Other cleaners that have been used here: 'transliteration_cleaners',
    # 'korean_cleaners'.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()

    MAX_WAV_VALUE = 32768.0
    # Both models are used for inference only, so run everything without
    # autograd bookkeeping (the Tacotron 2 call used to sit outside this).
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = tacotron2_model.inference(sequence)
        audio = waveglow_model.infer(mel_outputs_postnet, sigma=sigma)
        if denoiser_strength > 0:
            # The denoiser may never have been built at start-up; create it
            # on first use instead of calling None.
            if denoiser is None:
                denoiser = Denoiser(waveglow_model)
            audio = denoiser(audio, denoiser_strength)
        # Scale in fp32 so the int16 bounds below are exactly representable.
        audio = audio.float() * MAX_WAV_VALUE

    audio = audio.squeeze().cpu().numpy()
    # Clip before the integer cast: out-of-range samples would otherwise
    # wrap around in int16.
    audio = np.clip(audio, -MAX_WAV_VALUE, MAX_WAV_VALUE - 1).astype('int16')

    os.makedirs(SYNTH_DIR, exist_ok=True)

    elapsed = time.time() - start

    filename = str(uuid.uuid4()) + '.wav'
    audio_path = os.path.join(SYNTH_DIR, filename)
    write(audio_path, sampling_rate, audio)

    return render_template("simple_synth.html", input_text=text,
                           synth_wav_path=filename,
                           sigma=sigma, sampling_rate=sampling_rate,
                           denoiser_strength=denoiser_strength, elapsed=elapsed)
@app.after_request
def add_header(r):
    """Mark every outgoing response as non-cacheable.

    Sets ``Cache-Control``, ``Pragma`` and ``Expires`` so clients fetch a
    fresh copy each time. ``Cache-Control`` used to be assigned a second
    time with ``'public, max-age=0'``, which silently replaced the
    ``no-store`` directive and contradicted the other two headers; that
    override has been removed.

    Args:
        r: The response object about to be sent.

    Returns:
        The same response object, with the headers set.
    """
    r.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
    r.headers["Pragma"] = "no-cache"
    r.headers["Expires"] = "0"
    return r
@app.route('/download/<path:filename>', methods=['GET', 'POST'])
def download(filename):
    """Send a file from ``SYNTH_DIR`` as an attachment.

    Args:
        filename: Path of the file relative to ``SYNTH_DIR``, taken from
            the URL.

    Returns:
        The response produced by ``send_from_directory``.
    """
    # Positional arguments on purpose: the second parameter was renamed
    # from `filename` to `path` in Flask 2.0 and the old keyword was later
    # removed, so passing it by position works on both old and new Flask.
    return send_from_directory(SYNTH_DIR, filename, as_attachment=True)
@app.route('/download_mp3/<path:filename>', methods=['GET', 'POST'])
def download_mp3(filename):
    """Convert a WAV in ``SYNTH_DIR`` to MP3 and send it as an attachment.

    The MP3 is written next to the WAV, with the same base name, and is
    re-exported on every request.

    Args:
        filename: Path of the source WAV relative to ``SYNTH_DIR``, taken
            from the URL.

    Returns:
        The response produced by ``send_from_directory`` for the MP3 file.

    Raises:
        404 (via ``flask.abort``) when the requested path escapes
        ``SYNTH_DIR`` or the WAV file does not exist.
    """
    from flask import abort

    # `filename` comes straight from the URL. Make sure the resolved path
    # stays inside SYNTH_DIR before reading or writing anything on disk.
    synth_root = os.path.abspath(SYNTH_DIR)
    wav_path = os.path.abspath(os.path.join(synth_root, filename))
    if not wav_path.startswith(synth_root + os.sep) or not os.path.isfile(wav_path):
        abort(404)

    new_filename = os.path.splitext(filename)[0] + ".mp3"
    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"
    AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3")

    # Positional arguments: the `filename=` keyword of send_from_directory
    # was renamed to `path` in Flask 2.0 and later removed.
    return send_from_directory(SYNTH_DIR, new_filename, as_attachment=True)
if __name__ == "__main__":
    # Script entry point: start the server with debug mode on and threading
    # off.
    # NOTE(review): debug=True normally also enables Flask's auto-reloader,
    # which re-runs this module in a child process and would load the
    # models a second time -- confirm, and consider use_reloader=False.
    run_options = {"debug": True, "threaded": False}
    app.run(**run_options)