forked from bregmanstudio/AudioSpectrumPatchApproximation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
AudioSpectrumPatchApproximation.py
333 lines (308 loc) · 14.6 KB
/
AudioSpectrumPatchApproximation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
"""
ASPA - AudioSpectrumPatchApproximation.py
Time-frequency 2D sparse code dictionary learning and approximation in Python
https://github.com/bregmanstudio/Audio2DSparseApprox
Assumes python distribution with numply / scipy installed
Michael A. Casey, Bregman Media Labs 2014-2015, Dartmouth College
See attached license file (Apache 2.0)
Dependencies:
sklearn - http://scikit-learn.org/stable/
skimage - http://scikit-image.org/docs/dev/api/skimage.html
bregman - https://github.com/bregmanstudio/BregmanToolkit
scikits.audiolab - https://pypi.python.org/pypi/scikits.audiolab/
Example: (see AudioSpectrumPatchApproximation.ipynb)
"""
import os, sys, glob
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.feature_extraction.image import extract_patches_2d
from sklearn.feature_extraction.image import reconstruct_from_patches_2d
from sklearn.decomposition import ProjectedGradientNMF
from skimage.filter import gabor_kernel
import bregman.suite as br
import pdb
class SparseApproxSpectrum(object):
"""class for 2D patch analysis of audio files
initialization:
patch_size - size of time-frequency 2D patches in spectrogram units (freq,time) [(12,12)]
max_samples - if num audio patches exceeds this threshold, randomly sample spectrum [1000000]
**omp_args - keyword arguments to OrthogonalMatchingPursuit(...) [None]
"""
def __init__(self, patch_size=(12,12), max_samples=1000000, **omp_args):
self.patch_size = patch_size
self.max_samples = max_samples
self.omp = OrthogonalMatchingPursuit(**omp_args)
self.D = None
self.data = None
self.components = None
self.zscore=False
self.log_amplitude=False
def _extract_data_patches(self, X, zscore, log_amplitude):
"utility method for converting spectrogram data to 2D patches "
self.zscore=zscore
self.log_amplitude=log_amplitude
self.X = X
if self.log_amplitude:
X = np.log(1+X)
data = extract_patches_2d(X, self.patch_size)
data = data.reshape(data.shape[0], -1)
if len(data)>self.max_samples:
data = np.random.permutation(data)[:self.max_samples]
print data.shape
if self.zscore:
self.mn = np.mean(data, axis=0)
self.std = np.std(data, axis=0)
data -= self.mn
data /= self.std
self.data = data
def make_gabor_field(self, X, zscore=True, log_amplitude=True, thetas=range(4),
sigmas=(1,3), frequencies=(0.05, 0.25)) :
"""Given a spectrogram, prepare 2D patches and Gabor filter bank kernels
inputs:
X - spectrogram data (frequency x time)
zscore - whether to zscore the ensemble of 2D patches [True]
log_amplitude - whether to apply log(1+X) scaling of spectrogram data [True]
thetas - list of 2D Gabor filter orientations in units of pi/4. [range(4)]
sigmas - list of 2D Gabor filter standard deviations in oriented direction [(1,3)]
frequencies - list of 2D Gabor filter frequencies [(0.05,0.25)]
outputs:
self.data - 2D patches of input spectrogram
self.D.components_ - Gabor dictionary of thetas x sigmas x frequencies atoms
"""
self._extract_data_patches(X, zscore, log_amplitude)
self.n_components = len(thetas)*len(sigmas)*len(frequencies)
self.thetas = thetas
self.sigmas = sigmas
self.frequencies = frequencies
a,b = self.patch_size
self.kernels = []
for theta in thetas:
theta = theta / 4. * np.pi
for sigma in sigmas:
for frequency in frequencies:
kernel = np.real(gabor_kernel(frequency, theta=theta,
sigma_x=sigma, sigma_y=sigma))
c,d = kernel.shape
if c<=a:
z = np.zeros(self.patch_size)
z[(a/2-c/2):(a/2-c/2+c),(b/2-d/2):(b/2-d/2+d)] = kernel
else:
z = kernel[(c/2-a/2):(c/2-a/2+a),(d/2-b/2):(d/2-b/2+b)]
self.kernels.append(z.flatten())
class Bunch:
def __init__(self, **kwds):
self.__dict__.update(kwds)
self.D = Bunch(components_ = np.vstack(self.kernels))
def extract_codes(self, X, n_components=16, zscore=True, log_amplitude=True, **mbl_args):
"""Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data
inputs:
X - spectrogram data (frequency x time)
n_components - how many components to extract [16]
zscore - whether to zscore the ensemble of 2D patches [True]
log_amplitude - whether to apply log(1+X) scaling of spectrogram data [True]
**mbl_args - keyword arguments for MiniBatchDictionaryLearning.fit(...) [None]
outputs:
self.data - 2D patches of input spectrogram
self.D.components_ - dictionary of learned 2D atoms for sparse coding
"""
self._extract_data_patches(X, zscore, log_amplitude)
self.n_components = n_components
self.dico = MiniBatchDictionaryLearning(n_components=self.n_components, **mbl_args)
print "Dictionary learning from data..."
self.D = self.dico.fit(self.data)
def plot_codes(self, cbar=False, show_axis=False, **kwargs):
"plot the learned or generated 2D sparse code dictionary"
N = int(np.ceil(np.sqrt(self.n_components)))
kwargs.setdefault('cmap', plt.cm.gray_r)
kwargs.setdefault('origin','bottom')
kwargs.setdefault('interpolation','nearest')
for i, comp in enumerate(self.D.components_):
plt.subplot(N, N, i+1)
plt.imshow(comp.reshape(self.patch_size), **kwargs)
if cbar:
plt.colorbar()
if not show_axis:
plt.axis('off')
plt.xticks(())
plt.yticks(())
plt.title('%d'%(i))
plt.suptitle('Dictionary of Spectrum Patches\n', fontsize=14)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
def extract_audio_dir_codes(self, dir_expr='/home/mkc/exp/FMRI/stimuli/Wav6sRamp/*.wav', **mbl_args):
"""apply dictionary learning to entire directory of audio files (requires LOTS of RAM)
inputs:
**mbl_args - keyword arguments for MiniBatchDictionaryLearning.fit(...) [None]
"""
flist=glob.glob(dir_expr)
self.X = np.vstack([br.feature_scale(br.LogFrequencySpectrum(f, nbpo=24, nhop=1024).X,normalize=1).T for f in flist]).T
self.D = extract_codes(self.X, **mbl_args)
def _get_approximation_coefs(self, data, components):
"""utility function to fit dictionary components to data
inputs:
data - spectrogram data (frqeuency x time) [None]
components - the dictionary components to fit to the data [None]
"""
w = np.array([self.omp.fit(components.T, d.T).coef_ for d in data])
return w
def reconstruct_spectrum(self, w=None, randomize=False):
"""reconstruct by fitting current 2D dictionary to self.data
inputs:
w - per-component reconstruction weights [None=calculate weights]
randomize - randomly permute components after getting weights [False]
returns:
self.X_hat - spectral reconstruction of self.data
"""
data = self.data
components = self.D.components_
if w is None:
self.w = self._get_approximation_coefs(data, components)
w = self.w
if randomize:
components = np.random.permutation(components)
recon = np.dot(w, components)
if self.zscore:
recon = recon * self.std
recon = recon + self.mn
recon = recon.reshape(-1, *self.patch_size)
self.X_hat = reconstruct_from_patches_2d(recon, self.X.shape)
if self.log_amplitude:
self.X_hat = np.exp(self.X_hat) - 1.0 # invert log transform
def reconstruct_individual_spectra(self, w=None, randomize=False, plotting=False, rectify=True, **kwargs):
"""fit each dictionary component to self.data
inputs:
w - per-component reconstruction weights [None=calculate weights]
randomize - randomly permute components after getting weights [False]
plotting - whether to subplot individual spectrum reconstructions [True]
rectify- remove negative ("dark energy") from individual reconstructions [True]
**kwargs - keyword arguments for plotting
returns:
self.X_hat_l - list of indvidual spectrum reconstructions per dictionary atom
"""
omp_args = {}
self.reconstruct_spectrum(w, randomize, **omp_args)
w, components = self.w, self.D.components_
self.X_hat_l = []
for i in range(len(self.w.T)):
r=np.array((np.matrix(w)[:,i]*np.matrix(components)[i,:])).reshape(-1,*self.patch_size)
X_hat = reconstruct_from_patches_2d(r, self.X.shape)
if self.log_amplitude:
X_hat = np.exp(X_hat) - 1.0
if rectify: # half wave rectification
X_hat[X_hat<0] = 0
self.X_hat_l.append(X_hat)
if plotting:
self.plot_individual_spectra(**kwargs)
def plot_individual_spectra(self, **kwargs):
"plot individual spectrum reconstructions for self.X_hat_l"
if self.X_hat_l is None: return
plt.figure()
rn = np.ceil(self.n_components**0.5)
for k in range(self.n_components):
plt.subplot(rn,rn,k+1)
br.feature_plot(self.X_hat_l[k], nofig=1, **kwargs)
plt.title('%d'%(k))
plt.suptitle('Component Reconstructions\n', fontsize=14)
class SparseApproxSpectrumNonNegative(SparseApproxSpectrum):
"""Non-negative sparse dictionary learning from 2D spectrogram patches
initialization:
patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
"""
def __init__(self, patch_size=(12,12), max_samples=1000000):
self.patch_size = patch_size
self.max_samples = max_samples
self.D = None
self.data = None
self.components = None
self.zscore=False
self.log_amplitude=False
def extract_codes(self, X, n_components=16, log_amplitude=True, **nmf_args):
"""Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data
inputs:
X - spectrogram data (frequency x time)
n_components - how many components to extract [16]
log_amplitude - weather to apply log amplitude scaling log(1+X)
**nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
outputs:
self.data - 2D patches of input spectrogram
self.D.components_ - dictionary of 2D NMF components
"""
zscore=False
self._extract_data_patches(X, zscore, log_amplitude)
self.n_components=n_components
nmf_args.setdefault('sparseness','components')
nmf_args.setdefault('init','nndsvd')
nmf_args.setdefault('beta',0.5)
print "NMF..."
self.model = ProjectedGradientNMF(n_components=self.n_components, **nmf_args)
self.model.fit(self.data)
self.D = self.model
def reconstruct_spectrum(self, w=None, randomize=False):
"reconstruct by fitting current NMF 2D dictionary to self.data"
if w is None:
self.w = self.model.transform(self.data)
w = self.w
return SparseApproxSpectrum.reconstruct_spectrum(self, w=w, randomize=randomize)
class SparseApproxSpectrumPLCA2D(SparseApproxSpectrum):
"""PLCA2D sparse dictionary learning from 2D spectrogram patches
initialization:
patch_size=(12,12) - size of time-frequency 2D patches in spectrogram units (freq,time)
max_samples=1000000 - if num audio patches exceeds this threshold, randomly sample spectrum
"""
def __init__(self, patch_size=(12,12), max_samples=1000000):
self.patch_size = patch_size
self.max_samples = max_samples
self.D = None
self.data = None
self.components = None
self.zscore=False
self.log_amplitude=False
self.X_hat_l=None
self.X_hat=None
def extract_codes(self, F, n_components=16, log_amplitude=True, **nmf_args):
"""Given a spectrogram, learn a dictionary of 2D patch atoms from spectrogram data
inputs:
F - feature class with spectrogram data (frequency x time)
n_components - how many components to extract [16]
log_amplitude - weather to apply log amplitude scaling log(1+X)
**nmf_args - keyword arguments for ProjectedGradientNMF(...) [None]
outputs:
self.data - 2D patches of input spectrogram
self.D.components_ - dictionary of 2D NMF components
"""
zscore=False
self.F = F
self.data = F.X
self.X = F.X
self.n_components = n_components
self.log_amplitude = log_amplitude
if self.log_amplitude:
F.X = np.log(1+F.X)
F.separate(br.SIPLCA2, n_components, win=self.patch_size, **nmf_args)
class Bunch:
def __init__(self, **kwds):
self.__dict__.update(kwds)
self.D = Bunch(components_ = np.vstack(F.w.swapaxes(0,1).reshape(F.w.shape[1],-1)))
self.w = F.w
self.z = F.z
self.h = F.h
def plot_codes(self, cbar=False, show_axis=False, **kwargs):
patch_size = self.patch_size
self.patch_size = (self.F.w.shape[0],self.F.w.shape[2])
super(SparseApproxSpectrumPLCA2D, self).plot_codes(cbar, show_axis, **kwargs)
self.patch_size = patch_size
def reconstruct_individual_spectra(self, w=None, randomize=False):
"reconstruct by fitting current NMF 2D dictionary to self.data"
w = w if w is not None else self.w
self.X_hat_l = self.F.invert_components(br.SIPLCA2, w, self.z, self.h)
if self.log_amplitude:
self.X_hat_l = [np.exp(x)-1.0 for x in self.X_hat_l]
self.reconstruct_spectrum(w, randomize)
def reconstruct_spectrum(self, w=None, randomize=False):
w = w if w is not None else self.w
if self.X_hat_l is None:
self.reconstruct_individual_spectra(w,randomize)
self.X_hat = np.array(self.X_hat_l).sum(0)