-
Notifications
You must be signed in to change notification settings - Fork 8
/
utils.py
175 lines (146 loc) · 5.46 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import yaml
import math
import numpy as np
from scipy.special import expn
from scipy.signal import wiener
def parse_yaml(filepath="conf.yaml"):
    """
    Parse the YAML configuration file and return the parsed info.

    Args:
        filepath (string): relative path of the YAML configuration file

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content. If reading
        or parsing the opened file raises, a message is printed and the
        caught exception object is RETURNED (not raised), so callers that
        need to detect failure must check ``isinstance(result, Exception)``.

    NOTE:
        Errors raised while opening the file (e.g. a missing file) are not
        caught here and propagate to the caller.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(filepath, 'r', encoding="utf-8") as fin:
        try:
            return yaml.safe_load(fin)
        except Exception as exc:
            # Best-effort behaviour kept for existing callers: report the
            # failure and hand the exception back instead of raising it.
            print("ERROR while parsing YAML conf.")
            return exc
# REF: https://github.com/rajivpoddar/logmmse/blob/master/logmmse.py
def logmmse(x, Srate, noise_frames=6, Slen=0, eta=0.15, saved_params=None):
    """
    Frame-based noise suppression of a signal (see the logmmse reference
    implementation this function is derived from).

    The signal is cut into Hann-windowed frames that overlap by 50%, a noise
    power spectrum is estimated, a per-bin spectral gain is applied to every
    frame and the frames are recombined by overlap-add.

    Args:
        x (numpy.array): the input samples (indexed as a 1D sequence)
        Srate (int): sample rate of ``x``; only used to derive ``Slen``
            when ``Slen`` is 0
        noise_frames (int): number of leading, non-overlapping frames of
            ``x`` used to build the initial noise estimate (ignored when
            ``saved_params`` is given)
        Slen (int): frame length in samples. 0 means ``floor(0.02 * Srate)``.
            An odd length is increased by one.
        eta (float): threshold on the per-frame voice-activity statistic;
            frames scoring below it are treated as noise and are used to
            update the noise estimate
        saved_params (dict): state returned by a previous call (keys
            'noise_mu2', 'Xk_prev', 'x_old'); when given, it replaces the
            initial noise estimation and the zero initial state

    Returns:
        tuple ``(xfinal, params)`` where ``xfinal`` is the processed signal
        (``Nframes * Slen/2`` samples, i.e. shorter than ``x``) and
        ``params`` is a dict with the keys listed above that can be passed
        as ``saved_params`` to a following call.
    """
    if Slen == 0:
        Slen = int(math.floor(0.02 * Srate))
    if Slen % 2 == 1:
        Slen = Slen + 1

    # frame overlap in percent -> len1 is the overlap, len2 the hop size
    PERC = 50
    len1 = int(math.floor(Slen * PERC / 100))
    len2 = int(Slen - len1)

    # Hann window scaled so that its sum equals the hop size
    win = np.hanning(Slen)
    win = win * len2 / np.sum(win)
    nFFT = 2 * Slen

    x_old = np.zeros(len1)
    Xk_prev = np.zeros(len1)
    Nframes = int(math.floor(len(x) / len2) - math.floor(Slen / len2))
    xfinal = np.zeros(Nframes * len2)

    if saved_params is None:
        # initial noise power spectrum: squared mean magnitude spectrum of
        # the first `noise_frames` non-overlapping frames
        noise_mean = np.zeros(nFFT)
        for j in range(0, Slen * noise_frames, Slen):
            noise_mean = noise_mean + \
                np.absolute(np.fft.fft(win * x[j:j + Slen], nFFT, axis=0))
        noise_mu2 = (noise_mean / noise_frames) ** 2
    else:
        noise_mu2 = saved_params['noise_mu2']
        Xk_prev = saved_params['Xk_prev']
        x_old = saved_params['x_old']

    aa = 0.98  # smoothing factor of the a-priori SNR estimate
    mu = 0.98  # smoothing factor of the noise power update
    # NOTE: `eta` is deliberately NOT reassigned here, so the value passed
    # by the caller is the one used for the noise-update decision below.
    ksi_min = 10 ** (-25 / 10)

    for k in range(0, Nframes * len2, len2):
        insign = win * x[k:k + Slen]
        spec = np.fft.fft(insign, nFFT, axis=0)
        sig = np.absolute(spec)
        sig2 = sig ** 2

        # a-posteriori SNR, capped at 40
        gammak = np.minimum(sig2 / noise_mu2, 40)

        # True as soon as Xk_prev contains at least one zero, which covers
        # the all-zero initial state used for the very first frame.
        if not Xk_prev.all():
            ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
        else:
            ksi = aa * Xk_prev / noise_mu2 + \
                (1 - aa) * np.maximum(gammak - 1, 0)
            ksi = np.maximum(ksi_min, ksi)

        # voice-activity statistic; below `eta` the frame updates the noise
        log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
        vad_decision = np.sum(log_sigma_k) / Slen
        if vad_decision < eta:
            noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2

        # spectral gain
        A = ksi / (1 + ksi)
        vk = A * gammak
        ei_vk = 0.5 * expn(1, vk)
        hw = A * np.exp(ei_vk)

        sig = sig * hw
        Xk_prev = sig ** 2
        xi_w = np.fft.ifft(hw * spec, nFFT, axis=0)
        xi_w = np.real(xi_w)

        # overlap-add: first half of this frame + second half of the last
        xfinal[k:k + len2] = x_old + xi_w[0:len1]
        x_old = xi_w[len1:Slen]

    return xfinal, {'noise_mu2': noise_mu2, 'Xk_prev': Xk_prev, 'x_old': x_old}
def reduce_noise(audio, method="wiener", sr=16000):
    """
    Apply a certain method to reduce audio noise.

    Args:
        audio (numpy.array): the audio data as 1D numpy array
        method (str): noise-reduction method, either "wiener" (default) or
            "logmmse"
        sr (int): sample rate of ``audio``; only used by the "logmmse"
            method. Defaults to 16000.

    Returns:
        numpy.array of the audio after reducing the noise

    Raises:
        AssertionError: if ``method`` is not one of the supported methods

    NOTE: wiener is faster than logmmse
    NOTE: with "logmmse" the audio is processed in independent 60-second
    chunks; the state returned by ``logmmse`` for one chunk is not passed
    on to the next one.
    """
    methods = ["logmmse", "wiener"]
    # Raised explicitly (instead of `assert`) so the check is still
    # performed when Python runs with -O.
    if method not in methods:
        raise AssertionError(
            "noise-reduction method must be one of these " + str(methods))

    if method == "wiener":
        return wiener(audio)

    # method == "logmmse"
    # you can tune the following values
    initial_noise, window_size, noise_threshold = 6, 0, 0.15
    # number of samples in one 60-second chunk
    chunk_size = int(60 * sr)
    total_frames = audio.shape[0]
    # Start from an empty float32 array so that empty input still yields an
    # empty float32 result; chunks are joined once at the end.
    pieces = [np.array([], dtype=np.float32)]
    # iterate 60 seconds at a time (slicing clips the last, shorter chunk)
    for start in range(0, total_frames, chunk_size):
        audio_subsample = audio[start:start + chunk_size]
        _output, _ = logmmse(audio_subsample, sr, initial_noise,
                             window_size, noise_threshold)
        pieces.append(_output)
    return np.concatenate(pieces)
def normalize_audio(audio, method="z-score"):
    """
    Normalize/scale the audio based on different normalization methods.
    Normalization is important for the ASR model to treat loud audio as
    well as quiet audio.

    Args:
        audio (numpy.array): the audio data as 1D numpy array
        method (str): normalization method, one of "-1_1", "z-score"
            (default) or "mean"

    Returns:
        numpy.array of the audio data after normalization. Floating-point
        input keeps its dtype; integer/boolean input is converted to
        float64 first. Empty input is returned without scaling.

    Raises:
        AssertionError: if ``method`` is not one of the supported methods

    NOTE:
        "-1_1" normalization is a special case of min-max normalization where
        the min is -1 and the max is 1
    NOTE:
        When the audio is constant (max == min, or std == 0) the scaling
        denominator would be zero; it is replaced by 1 so the result stays
        finite ("mean"/"z-score" give all zeros, "-1_1" gives all -1).
    REF:
        https://en.wikipedia.org/wiki/Normalization
    """
    norm_methods = ["-1_1", "z-score", "mean"]
    # Raised explicitly (instead of `assert`) so the check is still
    # performed when Python runs with -O.
    if method not in norm_methods:
        raise AssertionError(
            "normalization method must be one of these " + str(norm_methods))

    audio = np.asarray(audio)
    # Fixed-width integers (e.g. int16 PCM) overflow in `audio - min` and
    # `max - min`; do the arithmetic in floating point instead.
    if audio.dtype.kind in "iub":
        audio = audio.astype(np.float64)
    # Nothing to scale; also avoids min()/max() failing on empty input.
    if audio.size == 0:
        return audio

    if method == "-1_1":
        # apply min-max normalization to the range [-1, 1]
        new_min, new_max = -1, 1
        old_min, old_max = audio.min(), audio.max()
        span = old_max - old_min
        if span == 0:
            span = 1
        scaled = (audio - old_min) / span * (new_max - new_min) + new_min
    elif method == "mean":
        # apply mean normalization
        meu = np.mean(audio)
        span = audio.max() - audio.min()
        if span == 0:
            span = 1
        scaled = (audio - meu) / span
    else:
        # apply z-score normalization (standardization)
        std = np.std(audio)
        meu = np.mean(audio)
        if std == 0:
            std = 1
        scaled = (audio - meu) / std
    # get back the scaled version
    return scaled