def extract_features(fname, bdir, sox, htk_mfc, mfc_extension, stereo_wav,
                     gammatones, spectrograms, filterbanks):
    """Extract the requested audio features for one WAV file.

    Writes sibling files next to the WAV (HCopy MFCCs, ``*_gamma.npy``,
    ``*_specgram.npy``, ``*_fbanks.npy``) depending on the boolean switches.

    Parameters
    ----------
    fname : str, base file name (must end in '.wav', otherwise a no-op)
    bdir : str, directory containing the file
    sox : bool, re-encode with sox, keeping the original as '.rawaudio'
    htk_mfc : bool, run HTK's HCopy with the 'wav_config' file
    mfc_extension : str, extension for the HCopy output
    stereo_wav : bool, average the two channels of stereo input
    gammatones, spectrograms, filterbanks : bool, feature switches
    """
    if fname[-4:] != '.wav':
        return
    rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
    wavfname = bdir + '/' + fname
    tempfname = bdir + '/' + fname[:-4] + '_temp.wav'  # temp fname with .wav for sox
    mfccfname = bdir + '/' + fname[:-4] + mfc_extension
    if sox:
        shutil.move(wavfname, tempfname)
        call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
        shutil.move(tempfname, rawfname)
    if htk_mfc:
        call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
    srate = 16000
    sound, srate = readwav(wavfname)
    if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
        sound = 0.5 * (sound[:, 0] + sound[:, 1])  # for stereo wav, sum both channels
    if gammatones:
        gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
        tmp_snd = loadsound(wavfname)
        gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
        gamma_fb = Gammatone(tmp_snd, gamma_cf)
        # BUGFIX: numpy.save needs a binary handle; 'w' breaks under Python 3
        with open(gammatonefname, 'wb') as o_f:
            npsave(o_f, gamma_fb.process())
    if spectrograms:
        powerspec, _, _, _ = specgram(
            sound, NFFT=int(srate * SPECGRAM_WINDOW), Fs=srate,
            noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
        specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
        with open(specgramfname, 'wb') as o_f:
            npsave(o_f, powerspec.T)
    if filterbanks:
        # convert to Mel filterbanks
        fbanks = Spectral(nfilt=N_FBANKS,          # nb of filters in mel bank
                          alpha=0.97,              # pre-emphasis
                          do_dct=False,            # we do not want MFCCs
                          compression='log',
                          fs=srate,                # sampling rate
                          lowerf=50,               # lower frequency
                          frate=FBANKS_RATE,       # frame rate
                          wlen=FBANKS_WINDOW,      # window length
                          nfft=1024,               # length of dft
                          do_deltas=False,         # speed
                          do_deltasdeltas=False)   # acceleration
        sound /= np.abs(sound).max(axis=0)  # TODO put that as option
        fbank = fbanks.transform(sound)
        fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
        with open(fbanksfname, 'wb') as o_f:
            npsave(o_f, fbank)
    # TODO wavelets scattergrams / scalograms
    # Python 2 print statement replaced with the print function.
    print("dealt with file", wavfname)
def main(args, samplegraph):
    """Read an edge list into a graph, fit a Spectral embedding, save it."""
    start = time.time()
    graph = Graph()
    print("Reading...")
    graph.read_edgelist(samplegraph)
    embedding = Spectral(graph=graph, dim=args.representation_size)
    elapsed = time.time() - start
    print(elapsed)
    print("Saving embeddings...")
    embedding.save_embeddings(args.output)
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    """Compute spectral + VAD features for every annotated segment in *df*.

    Rows of *df* carry (filename, start, end) and optionally a 'label'
    column.  Returns {filename: float32 feature matrix}; when *return_y*
    is requested and labels are present, also {filename: uint8 labels}.
    """
    # Labels can only be produced if the dataframe actually carries them.
    return_y = return_y and 'label' in df
    X, y = {}, {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for fname in df.filename.unique():
        signal, samplerate = wavread(fname)
        if samplerate != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], samplerate)
            )
        spectrum = spectrum_encoder.transform(signal)
        # per-dimension mean/variance normalization
        spectrum = (spectrum - spectrum.mean(0)) / spectrum.std(0)
        voicing = vad_encoder.activations(signal)
        voicing = voicing.reshape(voicing.shape[0], -1)
        if stacksize > 1:
            spectrum = roll_array(spectrum, stacksize)
            voicing = roll_array(voicing, stacksize)
        feats_per_seg = []
        labels_per_seg = []
        for _, row in df[df.filename == fname].iterrows():
            first = int(row.start * frate)
            last = int(row.end * frate)
            segment = np.hstack(
                (spectrum[first:last], voicing[first:last])
            )
            feats_per_seg.append(segment.astype(np.float32))
            if return_y:
                labels_per_seg.append(
                    np.ones(segment.shape[0], dtype=np.uint8) *
                    label2ix[row.label]
                )
        X[fname] = np.vstack(feats_per_seg)
        if return_y:
            y[fname] = np.hstack(labels_per_seg)
    return (X, y) if return_y else X
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    """Per-file feature extraction: spectral frames + VAD activations.

    For each unique file in *df*, features of all its annotated segments
    are stacked into one float32 matrix; when *return_y* is set and the
    dataframe has a 'label' column, per-frame uint8 labels are returned too.
    """
    if return_y:
        return_y = 'label' in df
    out_feats = {}
    out_labels = {}
    enc_spec = Spectral(**spec_kwargs)
    enc_vad = VAD(**vad_kwargs)
    for file_ix, wav in enumerate(df.filename.unique()):
        audio, rate = wavread(wav)
        if rate != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], rate))
        spec = enc_spec.transform(audio)
        spec = (spec - spec.mean(0)) / spec.std(0)  # mvn per dimension
        if stacksize > 1:
            spec = roll_array(spec, stacksize)
        vad = enc_vad.activations(audio)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)
        chunks = []
        label_chunks = []
        for _, seg in df[df.filename == wav].iterrows():
            lo = int(seg.start * frate)
            hi = int(seg.end * frate)
            feat = np.hstack((spec[lo:hi], vad[lo:hi]))
            chunks.append(feat.astype(np.float32))
            if return_y:
                label_chunks.append(
                    np.ones(feat.shape[0], dtype=np.uint8) *
                    label2ix[seg.label])
        out_feats[wav] = np.vstack(chunks)
        if return_y:
            out_labels[wav] = np.hstack(label_chunks)
    if return_y:
        return out_feats, out_labels
    return out_feats
def do_fbank(fname):
    """Return Mel filterbank features of the wav file *fname*."""
    rate, samples = wavfile.read(fname)
    mel_bank = Spectral(nfilt=N_FBANKS,         # nb of filters in mel bank
                        alpha=0.97,             # pre-emphasis
                        do_dct=False,           # we do not want MFCCs
                        fs=rate,                # sampling rate
                        frate=FBANKS_RATE,      # frame rate
                        wlen=FBANKS_WINDOW,     # window length
                        nfft=1024,              # length of dft
                        do_deltas=False,        # speed
                        do_deltasdeltas=False   # acceleration
                        )
    return mel_bank.transform(samples)
def do_fbank(fname):
    """40-band Mel filterbank features of a wav file, as float32."""
    rate, samples = wavfile.read(fname)
    bank = Spectral(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        do_dct=False,           # we do not want MFCCs
        fs=rate,                # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=1024,              # length of dft
        do_deltas=False,        # speed
        do_deltasdeltas=False   # acceleration
    )
    features = bank.transform(samples)
    return np.array(features, dtype='float32')
def do_fbank(fname):
    """Compute Mel filterbank features for *fname*, reporting progress.

    Returns the raw output of ``Spectral.transform`` on the file's samples.
    """
    srate, sound = wavfile.read(fname)
    fbanks = Spectral(nfilt=N_FBANKS,          # nb of filters in mel bank
                      alpha=0.97,              # pre-emphasis
                      do_dct=False,            # we do not want MFCCs
                      fs=srate,                # sampling rate
                      frate=FBANKS_RATE,       # frame rate
                      wlen=FBANKS_WINDOW,      # window length
                      nfft=1024,               # length of dft
                      do_deltas=False,         # speed
                      do_deltasdeltas=False    # acceleration
                      )
    fb = fbanks.transform(sound)
    # Python 2-only `print "did:", fname` replaced with the print function.
    print("did:", fname)
    return fb
def do_mfccs(fname):
    """Return 13 standard cepstral coefficients of a wav file as float32."""
    rate, signal = wavfile.read(fname)
    cepstral = Spectral(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        fs=rate,                # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=512,               # length of dft
        ncep=13,                # nb of cepstral coefficients
        lowerf=100,
        upperf=6855.4976,
        do_deltas=False,        # speed
        do_deltasdeltas=False   # acceleration
    )
    return np.array(cepstral.transform(signal), dtype='float32')
def do_fbank(fname):
    """Read a wav with soundfile, return 40-band Mel filterbanks (float32)."""
    samples, rate = sf.read(fname)
    mel = Spectral(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        do_dct=False,           # we do not want MFCCs
        fs=rate,                # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=1024,              # length of dft
        do_deltas=False,        # speed
        do_deltasdeltas=False   # acceleration
    )
    banks = mel.transform(samples)
    return np.array(banks, dtype='float32')
def do_fbank(fname):
    """Return Mel filterbank features for ``bdir + fname + '.wav'``.

    If a precomputed sibling '.npy' file exists it is loaded instead of
    recomputing.  NOTE(review): relies on a module-level `bdir`; this
    function never writes the cache file — presumably a companion script
    does, confirm.
    """
    fn = bdir + fname + '.wav'
    try:
        with open(fn[:-3] + 'npy', 'rb') as rfb:
            fb = np.load(rfb)
    except IOError:
        srate, sound = wavfile.read(fn)
        fbanks = Spectral(nfilt=N_FBANKS,          # nb of filters in mel bank
                          alpha=0.97,              # pre-emphasis
                          do_dct=False,            # we do not want MFCCs
                          fs=srate,                # sampling rate
                          frate=FBANKS_RATE,       # frame rate
                          wlen=FBANKS_WINDOW,      # window length
                          nfft=1024,               # length of dft
                          do_deltas=False,         # speed
                          do_deltasdeltas=False    # acceleration
                          )
        fb = np.array(fbanks.transform(sound), dtype='float32')
        # Python 2 print statement replaced with the print function.
        print("did:", fn)
    return fb
def run_all_spectral_clustering(
        rad, date_range, boxcox=False, norm=True,
        params=None, methods=None, m_params=None, n_clusters=20):
    """
    Invoke all spectral clustering algorithm
    rad: Radar code
    date_range: Date range
    boxcox: apply a Box-Cox transform before clustering
    norm: normalize the selected parameters
    params: data columns fed to the clustering (default set below)
    methods: clustering method codes to run
    m_params: per-method keyword parameters
    n_clusters: number of clusters
    """
    # BUGFIX: list/dict defaults were mutable default arguments, shared
    # across calls; use None sentinels and build fresh objects instead.
    if params is None:
        params = ["bmnum", "v", "p_l", "w_l", "slist", "elv", "time_index"]
    if methods is None:
        methods = ["spc", "spcb", "spcc"]
    if m_params is None:
        m_params = {"spc": {}, "spcb": {}, "spcc": {}}
    fd = FetchData(rad, date_range)
    beams, _ = fd.fetch_data(
        v_params=["elv", "v", "w_l", "gflg", "p_l", "slist", "v_e"])
    rec = fd.convert_to_pandas(beams)
    rec["time_index"] = utils.time_days_to_index(
        [x.to_pydatetime() for x in rec["time"].tolist()])
    if boxcox:
        rec = utils.boxcox_tx(rec)
    if norm:
        rec = utils.normalize(rec, params)
    print("\n", rec.head())
    for method in methods:
        print("\n >> Running {c} clustering".format(c=method))
        model = Spectral(method, rec[params].values, n_clusters)
        model.setup(m_params[method])
        model.run()
        print("\n Estimating model skills.")
        # Skills is constructed for its side effects / diagnostics
        skill = Skills(model.data, model.obj.labels_)
    return
def __init__(self, stacksize=40, normalize='mvn', n_noise_fr=0, fs=16000,
             window_length=0.050, window_shift=0.010, nfft=1024,
             scale='mel', lowerf=120, upperf=7000, nfilt=40,
             taper_filt=True, compression='log', dct=False, nceps=13,
             log_e=True, lifter=22, deltas=False, remove_dc=False,
             medfilt_t=0, medfilt_s=(0, 0), noise_fr=0, pre_emph=0.97,
             feat_cache=None, noise_cache=None, wav_cache=None,
             n_jobs=1, verbose=False):
    """Configure normalization, the spectral encoder, frame stacking and
    (optionally shared) caches for audio, noise and features."""
    self.stacksize = stacksize
    self.normalize = normalize
    # Select the normalizer class by name; unrecognized names are a no-op.
    scaler_cls = {'mvn': StandardScaler,
                  'zca': ZCA,
                  'minmax': MinMaxScaler}.get(self.normalize,
                                              IdentityTransform)
    self.normalizer = scaler_cls()
    self.n_noise_fr = n_noise_fr
    self.fs = fs
    self.window_length = window_length
    self.window_shift = window_shift
    self.nfft = nfft
    self.scale = scale
    self.lowerf = lowerf
    self.upperf = upperf
    self.nfilt = nfilt
    self.taper_filt = taper_filt
    self.compression = compression
    self.dct = dct
    self.nceps = nceps
    self.log_e = log_e
    self.lifter = lifter
    self.deltas = deltas
    self.remove_dc = remove_dc
    self.medfilt_t = medfilt_t
    self.medfilt_s = medfilt_s
    self.noise_fr = noise_fr
    self.pre_emph = pre_emph
    self.n_jobs = n_jobs
    self.verbose = verbose
    # Spectral front-end configured with the same parameters.
    self.encoder = Spectral(fs=fs, window_length=window_length,
                            window_shift=window_shift, nfft=nfft,
                            scale=scale, lowerf=lowerf, upperf=upperf,
                            nfilt=nfilt, taper_filt=taper_filt,
                            compression=compression, dct=dct, nceps=nceps,
                            log_e=log_e, lifter=lifter, deltas=deltas,
                            remove_dc=remove_dc, medfilt_t=medfilt_t,
                            medfilt_s=medfilt_s, noise_fr=noise_fr,
                            pre_emph=pre_emph)
    # Dimensionality of one stacked feature vector.
    self.D = self.encoder.n_features * self.stacksize
    # Caches may be injected so several loaders can share them.
    self.wav_cache = wav_cache if wav_cache else {}
    self.noise_cache = noise_cache if noise_cache else {}
    self.feat_cache = feat_cache if feat_cache else {}
class FeatureLoader(TransformerMixin, BaseEstimator):
    """Scikit-learn style loader that turns (filename, start) index rows
    into stacked spectral feature vectors, memoizing loaded audio, noise
    estimates and computed features."""

    def __init__(self, stacksize=40, normalize='mvn', n_noise_fr=0,
                 fs=16000, window_length=0.050, window_shift=0.010,
                 nfft=1024, scale='mel', lowerf=120, upperf=7000, nfilt=40,
                 taper_filt=True, compression='log', dct=False, nceps=13,
                 log_e=True, lifter=22, deltas=False, remove_dc=False,
                 medfilt_t=0, medfilt_s=(0, 0), noise_fr=0, pre_emph=0.97,
                 feat_cache=None, noise_cache=None, wav_cache=None,
                 n_jobs=1, verbose=False):
        # number of consecutive frames concatenated into one feature vector
        self.stacksize = stacksize
        self.normalize = normalize
        # choose the feature normalizer; unknown names fall back to a no-op
        if self.normalize == 'mvn':
            self.normalizer = StandardScaler()
        elif self.normalize == 'zca':
            self.normalizer = ZCA()
        elif self.normalize == 'minmax':
            self.normalizer = MinMaxScaler()
        else:
            self.normalizer = IdentityTransform()
        self.n_noise_fr = n_noise_fr
        self.fs = fs
        self.window_length = window_length
        self.window_shift = window_shift
        self.nfft = nfft
        self.scale = scale
        self.lowerf = lowerf
        self.upperf = upperf
        self.nfilt = nfilt
        self.taper_filt = taper_filt
        self.compression = compression
        self.dct = dct
        self.nceps = nceps
        self.log_e = log_e
        self.lifter = lifter
        self.deltas = deltas
        self.remove_dc = remove_dc
        self.medfilt_t = medfilt_t
        self.medfilt_s = medfilt_s
        self.noise_fr = noise_fr
        self.pre_emph = pre_emph
        self.n_jobs = n_jobs
        self.verbose = verbose
        # spectral front-end configured from the same parameters
        self.encoder = Spectral(fs=fs, window_length=window_length,
                                window_shift=window_shift, nfft=nfft,
                                scale=scale, lowerf=lowerf, upperf=upperf,
                                nfilt=nfilt, taper_filt=taper_filt,
                                compression=compression, dct=dct,
                                nceps=nceps, log_e=log_e, lifter=lifter,
                                deltas=deltas, remove_dc=remove_dc,
                                medfilt_t=medfilt_t, medfilt_s=medfilt_s,
                                noise_fr=noise_fr, pre_emph=pre_emph)
        # total dimensionality of one stacked feature vector
        self.D = self.encoder.n_features * self.stacksize
        # caches may be injected so several loaders can share them
        self.wav_cache = wav_cache if wav_cache else {}
        self.noise_cache = noise_cache if noise_cache else {}
        self.feat_cache = feat_cache if feat_cache else {}

    def clear_cache(self):
        """Drop all memoized audio, noise and feature data."""
        self.wav_cache = {}
        self.noise_cache = {}
        self.feat_cache = {}

    def get_params(self, deep=True):
        """Constructor parameters minus the runtime-only settings."""
        p = super(FeatureLoader, self).get_params()
        del p['n_jobs']
        del p['verbose']
        return p

    def get_key(self):
        """'Frozen' dictionary representation of this object's parameters.

        Used as key in caching.
        """
        p = self.get_params()
        # the caches themselves must not be part of the cache key
        del p['wav_cache']
        del p['noise_cache']
        del p['feat_cache']
        return tuple(sorted(p.items()))

    def _load_wav(self, fname):
        """ Memoized audio loader. """
        key = fname
        if not key in self.wav_cache:
            sig, fs_ = wavread(fname)
            if self.fs != fs_:
                raise ValueError('sampling rate should be {0}, not {1}. '
                                 'please resample.'.format(self.fs, fs_))
            if len(sig.shape) > 1:
                # average the two channels down to mono
                warnings.warn('stereo audio: merging channels')
                sig = (sig[:, 0] + sig[:, 1]) / 2
            self.wav_cache[key] = sig
        return self.wav_cache[key]

    def _fill_noise_cache(self, X):
        # precompute the noise estimate for every file in X's first column
        for fname in X[:, 0]:
            self._extract_noise(fname)

    def _extract_noise(self, fname):
        """Memoized noise-spectrum estimate from the file's first frames."""
        # only the parameters that influence the spectrogram enter the key
        cfg = (('fs', self.fs),
               ('window_length', self.window_length),
               ('window_shift', self.window_shift),
               ('nfft', self.nfft),
               ('remove_dc', self.remove_dc),
               ('medfilt_t', self.medfilt_t),
               ('medfilt_s', self.medfilt_s),
               ('pre_emph', self.pre_emph))
        key = (fname, cfg)
        if not key in self.noise_cache:
            if self.n_noise_fr == 0:
                # noise estimation disabled
                self.noise_cache[key] = None
            else:
                sig = self._load_wav(fname)
                nsamples = (self.n_noise_fr + 2) * self.encoder.fshift
                # drop the first 2 frames before averaging
                spec = self.encoder.get_spectrogram(sig[:nsamples])[2:, :]
                noise = spec.mean(axis=0)
                # floor the estimate to avoid downstream blow-ups
                noise = np.clip(noise, 1e-4, np.inf)
                self.noise_cache[key] = noise
        return self.noise_cache[key]

    def _fill_feat_cache(self, X_keys):
        """Compute features for all (filename, start) pairs in parallel."""
        sigs = [self._load_wav(fname) for fname, _ in X_keys]
        noises = [self._extract_noise(fname) for fname, _ in X_keys]
        p = Parallel(n_jobs=self.n_jobs, verbose=0)(
            delayed(extract_features_at)(sig, noise, start,
                                         self.stacksize, self.encoder)
            for (fname, start), sig, noise in izip(X_keys, sigs, noises))
        r = {x_key: feat for x_key, feat in izip(X_keys, p)}
        key = self.get_key()
        self.feat_cache[key].update(r)

    def get_specs(self, X):
        """Return stacked features for every (filename, start) row of X,
        computing and caching any that are missing."""
        key = self.get_key()
        # list of [(filename, start)]
        X_keys = [(X[ix, 0], X[ix, 1]) for ix in xrange(X.shape[0])]
        if key in self.feat_cache:
            # check for missing keys
            missing_X_keys = [
                x_key for x_key in X_keys
                if not x_key in self.feat_cache[key]
            ]
            self._fill_feat_cache(missing_X_keys)
        else:
            self.feat_cache[key] = {}
            self._fill_feat_cache(X_keys)
        return np.vstack((self.feat_cache[key][x_key] for x_key in X_keys))

    def fit(self, X, y=None):
        """Load audio and optionally estimate mean and covar

        Parameters
        ----------
        X : ndarray with columns filename, start, end
        y :
        """
        r = self.get_specs(X)
        self.normalizer.fit(r)
        return self

    def transform(self, X, y=None):
        """Load audio and perform feature extraction.

        Parameters
        ----------
        X : ndarray
        """
        r = self.get_specs(X)
        r = self.normalizer.transform(r)
        # reinterpret the frame matrix as rows of `stacksize` consecutive
        # frames without copying
        return as_strided(r, shape=(r.shape[0] // self.stacksize,
                                    r.shape[1] * self.stacksize),
                          strides=(r.strides[0] * self.stacksize,
                                   r.strides[1]))

    def fit_transform(self, X, y=None):
        # fit the normalizer and transform in one pass (same strided view)
        r = self.get_specs(X)
        r = self.normalizer.fit_transform(r)
        return as_strided(r, shape=(r.shape[0] // self.stacksize,
                                    r.shape[1] * self.stacksize),
                          strides=(r.strides[0] * self.stacksize,
                                   r.strides[1]))
def process(folder,
            debug=False,
            htk_mfc=False,
            forcemfcext=False,
            stereo_wav=False,
            gammatones=False,
            spectrograms=False,
            filterbanks=False,
            sox=True):
    """ applies to all *.wav in folder """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    # BUGFIX: close 'wav_config' when done instead of leaking the handle
    with open('wav_config', 'r') as wcfg:
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    # Python 2 print statements converted to the print function throughout.
    print("MFC extension:", mfc_extension)
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print("You need Brian Hears", file=sys.stderr)
            print("http://www.briansimulator.org/docs/hears.html",
                  file=sys.stderr)
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            print("You need Pylab", file=sys.stderr)
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print("You need spectral (in the parent folder)", file=sys.stderr)
            print("https://github.com/mwv/spectral", file=sys.stderr)
            sys.exit(-1)
    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'  # temp fname with .wav for sox
            mfccfname = bdir + '/' + fname[:-4] + mfc_extension
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
                shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                sound = sound[:, 0] + sound[:, 1]  # for stereo wav, sum both channels
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # BUGFIX: numpy.save needs a binary handle ('wb', not 'w')
                with open(gammatonefname, 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound, NFFT=int(srate * SPECGRAM_WINDOW), Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'wb') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(nfilt=N_FBANKS,         # nb of filters in mel bank
                                      alpha=0.97,             # pre-emphasis
                                      do_dct=False,           # we do not want MFCCs
                                      fs=srate,               # sampling rate
                                      frate=FBANKS_RATE,      # frame rate
                                      wlen=FBANKS_WINDOW,     # window length
                                      nfft=1024,              # length of dft
                                      do_deltas=False,        # speed
                                      do_deltasdeltas=False)  # acceleration
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                                                    # deltas & deltasdeltas
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'wb') as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print("dealt with file", wavfname)
_, _, fs, nframes, _, _ = fid.getparams() sig = np.array(struct.unpack_from("%dh" % nframes, fid.readframes(nframes))) fid.close() return sig, fs FBANKS_WINDOW = 0.025 # 25ms FBANKS_RATE = 100 # 10ms N_FBANKS = 40 for wavfname in sys.argv[1:]: sound, srate = readwav(wavfname) fbanks = Spectral( nfilt=N_FBANKS, # nb of filters in mel bank alpha=0.97, # pre-emphasis do_dct=False, # we do not want MFCCs compression='log', fs=srate, # sampling rate #lowerf=50, # lower frequency frate=FBANKS_RATE, # frame rate wlen=FBANKS_WINDOW, # window length nfft=1024, # length of dft do_deltas=False, # speed do_deltasdeltas=False # acceleration ) fbank = fbanks.transform(sound) fbanksfname = wavfname[:-4] + '_fbanks.npy' with open(fbanksfname, 'w') as o_f: np.save(o_f, fbank)
def process(folder,
            debug=False,
            htk_mfc=False,
            forcemfcext=False,
            stereo_wav=False,
            gammatones=False,
            spectrograms=False,
            filterbanks=False,
            sox=True):
    """ applies to all *.wav in folder """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    # BUGFIX: 'wav_config' was opened and never closed; use a with-block
    with open('wav_config', 'r') as wcfg:
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    # Python 2 print statements converted to the print function throughout.
    print("MFC extension:", mfc_extension)
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print("You need Brian Hears", file=sys.stderr)
            print("http://www.briansimulator.org/docs/hears.html",
                  file=sys.stderr)
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            print("You need Pylab", file=sys.stderr)
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print("You need spectral (in the parent folder)", file=sys.stderr)
            print("https://github.com/mwv/spectral", file=sys.stderr)
            sys.exit(-1)
    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'  # temp fname with .wav for sox
            mfccfname = bdir + '/' + fname[:-4] + mfc_extension
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
                shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                sound = sound[:, 0] + sound[:, 1]  # for stereo wav, sum both channels
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # BUGFIX: numpy.save needs a binary handle ('wb', not 'w')
                with open(gammatonefname, 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound, NFFT=int(srate * SPECGRAM_WINDOW), Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'wb') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # convert to Mel filterbanks
                # BUGFIX: `== None` replaced by the identity test `is None`
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(
                        nfilt=N_FBANKS,         # nb of filters in mel bank
                        alpha=0.97,             # pre-emphasis
                        do_dct=False,           # we do not want MFCCs
                        fs=srate,               # sampling rate
                        frate=FBANKS_RATE,      # frame rate
                        wlen=FBANKS_WINDOW,     # window length
                        nfft=1024,              # length of dft
                        do_deltas=False,        # speed
                        do_deltasdeltas=False   # acceleration
                    )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                                                    # deltas & deltasdeltas
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'wb') as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print("dealt with file", wavfname)
def process(folder, debug=False, htk_mfcc=False, forcemfcext=False,
            stereo_wave=False, gammatones=False, spectograms=False,
            filterbanks=False, sox=True):
    """Prepare every *.WAV under *folder*: sox re-encode, HTK MFCCs and the
    selected gammatone / spectrogram / filterbank features."""
    mfc_extension = '.mfc_unnorm'
    # BUGFIX: 'wav_config' handle was leaked; close it with a with-block
    with open('wav_config', 'r') as wcfg:
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    # Python 2 print statements converted to the print function throughout.
    print("MFC Extension is", mfc_extension)
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print("You need Brian Hears", file=sys.stderr)
            sys.exit(-1)
    if spectograms:
        try:
            from pylab import specgram
        except ImportError:
            print('You need Pylab', file=sys.stderr)
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print('you need spectral (in the parent folder)', file=sys.stderr)
            # BUGFIX: without exiting, Spectral would be undefined below
            sys.exit(-1)
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.WAV':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'
            mfccfname = bdir + '/' + fname[:-4] + '.txt'
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                # BUGFIX: moving the temp file back over the converted wav
                # undid the sox call; keep the original as .rawaudio instead
                shutil.move(tempfname, rawfname)
            if htk_mfcc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            # BUGFIX: was `len(sound.shape == 2)`, a TypeError at runtime
            if stereo_wave and len(sound.shape) == 2:
                sound = sound[:, 0] + sound[:, 1]  # sum both channels
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, n_gmammatones_filters)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # BUGFIX: numpy.save needs a binary handle ('wb', not 'w')
                with open(gammatonefname, 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectograms:
                # BUGFIX: result was bound to `powersspec` but saved as
                # `powerspec` (NameError); use one consistent name.
                # NOTE(review): noverlap reuses specgram_window — other
                # variants use an overlap constant; confirm intent.
                powerspec, _, _, _ = specgram(
                    sound, NFFT=int(srate * specgram_window), Fs=srate,
                    noverlap=int(srate * specgram_window))
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'wb') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # BUGFIX: `== None` replaced by `is None`
                if fbanks is None:
                    fbanks = Spectral(nfilt=n_fbanks,
                                      alpha=0.97,
                                      do_dct=False,
                                      fs=srate,
                                      frate=fbanks_rate,
                                      wlen=fbanks_window,
                                      nfft=1024,
                                      do_deltas=False,
                                      do_deltasdeltas=False)
                fbank = fbanks.transform(sound)[0]
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'wb') as o_f:
                    npsave(o_f, fbank)
            print("Dealt with the file ", wavfname)
class FeatureLoader(TransformerMixin, BaseEstimator):
    """Scikit-learn style loader mapping (filename, start) index rows to
    stacked spectral feature vectors, with memoization of loaded audio,
    noise estimates and computed features."""

    def __init__(self, stacksize=40, normalize='mvn', n_noise_fr=0,
                 fs=16000, window_length=0.050, window_shift=0.010,
                 nfft=1024, scale='mel', lowerf=120, upperf=7000, nfilt=40,
                 taper_filt=True, compression='log', dct=False, nceps=13,
                 log_e=True, lifter=22, deltas=False, remove_dc=False,
                 medfilt_t=0, medfilt_s=(0, 0), noise_fr=0, pre_emph=0.97,
                 feat_cache=None, noise_cache=None, wav_cache=None,
                 n_jobs=1, verbose=False):
        # number of consecutive frames concatenated into one feature vector
        self.stacksize = stacksize
        self.normalize = normalize
        # choose the feature normalizer; unknown names fall back to a no-op
        if self.normalize == 'mvn':
            self.normalizer = StandardScaler()
        elif self.normalize == 'zca':
            self.normalizer = ZCA()
        elif self.normalize == 'minmax':
            self.normalizer = MinMaxScaler()
        else:
            self.normalizer = IdentityTransform()
        self.n_noise_fr = n_noise_fr
        self.fs = fs
        self.window_length = window_length
        self.window_shift = window_shift
        self.nfft = nfft
        self.scale = scale
        self.lowerf = lowerf
        self.upperf = upperf
        self.nfilt = nfilt
        self.taper_filt = taper_filt
        self.compression = compression
        self.dct = dct
        self.nceps = nceps
        self.log_e = log_e
        self.lifter = lifter
        self.deltas = deltas
        self.remove_dc = remove_dc
        self.medfilt_t = medfilt_t
        self.medfilt_s = medfilt_s
        self.noise_fr = noise_fr
        self.pre_emph = pre_emph
        self.n_jobs = n_jobs
        self.verbose = verbose
        # spectral front-end configured from the same parameters
        self.encoder = Spectral(
            fs=fs, window_length=window_length, window_shift=window_shift,
            nfft=nfft, scale=scale, lowerf=lowerf, upperf=upperf,
            nfilt=nfilt, taper_filt=taper_filt, compression=compression,
            dct=dct, nceps=nceps, log_e=log_e, lifter=lifter,
            deltas=deltas, remove_dc=remove_dc, medfilt_t=medfilt_t,
            medfilt_s=medfilt_s, noise_fr=noise_fr, pre_emph=pre_emph
        )
        # total dimensionality of one stacked feature vector
        self.D = self.encoder.n_features * self.stacksize
        # caches may be injected so several loaders can share them
        self.wav_cache = wav_cache if wav_cache else {}
        self.noise_cache = noise_cache if noise_cache else {}
        self.feat_cache = feat_cache if feat_cache else {}

    def clear_cache(self):
        """Drop all memoized audio, noise and feature data."""
        self.wav_cache = {}
        self.noise_cache = {}
        self.feat_cache = {}

    def get_params(self, deep=True):
        """Constructor parameters minus the runtime-only settings."""
        p = super(FeatureLoader, self).get_params()
        del p['n_jobs']
        del p['verbose']
        return p

    def get_key(self):
        """'Frozen' dictionary representation of this object's parameters.

        Used as key in caching.
        """
        p = self.get_params()
        # the caches themselves must not be part of the cache key
        del p['wav_cache']
        del p['noise_cache']
        del p['feat_cache']
        return tuple(sorted(p.items()))

    def _load_wav(self, fname):
        """ Memoized audio loader. """
        key = fname
        if not key in self.wav_cache:
            sig, fs_ = wavread(fname)
            if self.fs != fs_:
                raise ValueError('sampling rate should be {0}, not {1}. '
                                 'please resample.'.format(self.fs, fs_))
            if len(sig.shape) > 1:
                # average the two channels down to mono
                warnings.warn('stereo audio: merging channels')
                sig = (sig[:, 0] + sig[:, 1]) / 2
            self.wav_cache[key] = sig
        return self.wav_cache[key]

    def _fill_noise_cache(self, X):
        # precompute the noise estimate for every file in X's first column
        for fname in X[:, 0]:
            self._extract_noise(fname)

    def _extract_noise(self, fname):
        """Memoized noise-spectrum estimate from the file's first frames."""
        # only the parameters that influence the spectrogram enter the key
        cfg = (
            ('fs', self.fs),
            ('window_length', self.window_length),
            ('window_shift', self.window_shift),
            ('nfft', self.nfft),
            ('remove_dc', self.remove_dc),
            ('medfilt_t', self.medfilt_t),
            ('medfilt_s', self.medfilt_s),
            ('pre_emph', self.pre_emph)
        )
        key = (fname, cfg)
        if not key in self.noise_cache:
            if self.n_noise_fr == 0:
                # noise estimation disabled
                self.noise_cache[key] = None
            else:
                sig = self._load_wav(fname)
                nsamples = (self.n_noise_fr + 2) * self.encoder.fshift
                # drop the first 2 frames before averaging
                spec = self.encoder.get_spectrogram(sig[:nsamples])[2:, :]
                noise = spec.mean(axis=0)
                # floor the estimate to avoid downstream blow-ups
                noise = np.clip(noise, 1e-4, np.inf)
                self.noise_cache[key] = noise
        return self.noise_cache[key]

    def _fill_feat_cache(self, X_keys):
        """Compute features for all (filename, start) pairs in parallel."""
        sigs = [self._load_wav(fname) for fname, _ in X_keys]
        noises = [self._extract_noise(fname) for fname, _ in X_keys]
        p = Parallel(n_jobs=self.n_jobs, verbose=0)(
            delayed(extract_features_at)(
                sig, noise, start, self.stacksize, self.encoder)
            for (fname, start), sig, noise in izip(X_keys, sigs, noises)
        )
        r = {x_key: feat for x_key, feat in izip(X_keys, p)}
        key = self.get_key()
        self.feat_cache[key].update(r)

    def get_specs(self, X):
        """Return stacked features for every (filename, start) row of X,
        computing and caching any that are missing."""
        key = self.get_key()
        # list of [(filename, start)]
        X_keys = [(X[ix, 0], X[ix, 1]) for ix in xrange(X.shape[0])]
        if key in self.feat_cache:
            # check for missing keys
            missing_X_keys = [
                x_key for x_key in X_keys
                if not x_key in self.feat_cache[key]
            ]
            self._fill_feat_cache(missing_X_keys)
        else:
            self.feat_cache[key] = {}
            self._fill_feat_cache(X_keys)
        return np.vstack((self.feat_cache[key][x_key] for x_key in X_keys))

    def fit(self, X, y=None):
        """Load audio and optionally estimate mean and covar

        Parameters
        ----------
        X : ndarray with columns filename, start, end
        y :
        """
        r = self.get_specs(X)
        self.normalizer.fit(r)
        return self

    def transform(self, X, y=None):
        """Load audio and perform feature extraction.

        Parameters
        ----------
        X : ndarray
        """
        r = self.get_specs(X)
        r = self.normalizer.transform(r)
        # reinterpret the frame matrix as rows of `stacksize` consecutive
        # frames without copying
        return as_strided(
            r,
            shape=(r.shape[0]//self.stacksize, r.shape[1]*self.stacksize),
            strides=(r.strides[0]*self.stacksize, r.strides[1])
        )

    def fit_transform(self, X, y=None):
        # fit the normalizer and transform in one pass (same strided view)
        r = self.get_specs(X)
        r = self.normalizer.fit_transform(r)
        return as_strided(
            r,
            shape=(r.shape[0]//self.stacksize, r.shape[1]*self.stacksize),
            strides=(r.strides[0]*self.stacksize, r.strides[1])
        )
def __init__(self, stacksize=40, normalize='mvn', n_noise_fr=0, fs=16000,
             window_length=0.050, window_shift=0.010, nfft=1024,
             scale='mel', lowerf=120, upperf=7000, nfilt=40,
             taper_filt=True, compression='log', dct=False, nceps=13,
             log_e=True, lifter=22, deltas=False, remove_dc=False,
             medfilt_t=0, medfilt_s=(0, 0), noise_fr=0, pre_emph=0.97,
             feat_cache=None, noise_cache=None, wav_cache=None,
             n_jobs=1, verbose=False):
    """Configure normalization, the spectral encoder, frame stacking and
    (optionally shared) caches for audio, noise and features."""
    # number of consecutive frames concatenated into one feature vector
    self.stacksize = stacksize
    self.normalize = normalize
    # choose the feature normalizer; unknown names fall back to a no-op
    if self.normalize == 'mvn':
        self.normalizer = StandardScaler()
    elif self.normalize == 'zca':
        self.normalizer = ZCA()
    elif self.normalize == 'minmax':
        self.normalizer = MinMaxScaler()
    else:
        self.normalizer = IdentityTransform()
    self.n_noise_fr = n_noise_fr
    self.fs = fs
    self.window_length = window_length
    self.window_shift = window_shift
    self.nfft = nfft
    self.scale = scale
    self.lowerf = lowerf
    self.upperf = upperf
    self.nfilt = nfilt
    self.taper_filt = taper_filt
    self.compression = compression
    self.dct = dct
    self.nceps = nceps
    self.log_e = log_e
    self.lifter = lifter
    self.deltas = deltas
    self.remove_dc = remove_dc
    self.medfilt_t = medfilt_t
    self.medfilt_s = medfilt_s
    self.noise_fr = noise_fr
    self.pre_emph = pre_emph
    self.n_jobs = n_jobs
    self.verbose = verbose
    # spectral front-end configured from the same parameters
    self.encoder = Spectral(
        fs=fs, window_length=window_length, window_shift=window_shift,
        nfft=nfft, scale=scale, lowerf=lowerf, upperf=upperf,
        nfilt=nfilt, taper_filt=taper_filt, compression=compression,
        dct=dct, nceps=nceps, log_e=log_e, lifter=lifter,
        deltas=deltas, remove_dc=remove_dc, medfilt_t=medfilt_t,
        medfilt_s=medfilt_s, noise_fr=noise_fr, pre_emph=pre_emph
    )
    # total dimensionality of one stacked feature vector
    self.D = self.encoder.n_features * self.stacksize
    # caches may be injected so several loaders can share them
    self.wav_cache = wav_cache if wav_cache else {}
    self.noise_cache = noise_cache if noise_cache else {}
    self.feat_cache = feat_cache if feat_cache else {}
fid = wave.open(fname, 'r') _, _, fs, nframes, _, _ = fid.getparams() sig = np.array(struct.unpack_from("%dh" % nframes, fid.readframes(nframes))) fid.close() return sig, fs FBANKS_WINDOW = 0.025 # 25ms FBANKS_RATE = 100 # 10ms N_FBANKS = 40 for wavfname in sys.argv[1:]: sound, srate = readwav(wavfname) fbanks = Spectral(nfilt=N_FBANKS, # nb of filters in mel bank alpha=0.97, # pre-emphasis do_dct=False, # we do not want MFCCs compression='log', fs=srate, # sampling rate #lowerf=50, # lower frequency frate=FBANKS_RATE, # frame rate wlen=FBANKS_WINDOW, # window length nfft=1024, # length of dft do_deltas=False, # speed do_deltasdeltas=False # acceleration ) fbank = fbanks.transform(sound) fbanksfname = wavfname[:-4]+'_fbanks.npy' with open(fbanksfname, 'w') as o_f: np.save(o_f, fbank)