def extract_features(fname, bdir, sox, htk_mfc, mfc_extension, stereo_wav,
                     gammatones, spectrograms, filterbanks):
    """Extract the requested acoustic features for one wav file.

    Files whose name does not end in ``.wav`` are ignored.  Every output is
    written next to the input, named after it with a distinguishing suffix.

    Args:
        fname: file name (not a path) of the audio file inside ``bdir``.
        bdir: directory that contains ``fname``.
        sox: if true, rewrite the file through ``sox`` and keep the original
            bytes as ``<stem>.rawaudio``.
        htk_mfc: if true, run ``HCopy -C wav_config`` to produce
            ``<stem><mfc_extension>``.
        mfc_extension: extension used for the HCopy output.
        stereo_wav: if true, a two-dimensional signal is replaced by the
            average of its first two columns.
        gammatones: if true, save ``<stem>_gamma.npy``.
        spectrograms: if true, save ``<stem>_specgram.npy``.
        filterbanks: if true, save ``<stem>_fbanks.npy``.

    Returns:
        None.
    """
    if not fname.endswith('.wav'):
        return

    stem = bdir + '/' + fname[:-4]
    wavfname = bdir + '/' + fname

    if sox:
        # The temporary name keeps a .wav extension because sox relies on the
        # extension when the input has no header.
        tempfname = stem + '_temp.wav'
        shutil.move(wavfname, tempfname)
        call(['sox', tempfname, wavfname])
        shutil.move(tempfname, stem + '.rawaudio')

    if htk_mfc:
        call(['HCopy', '-C', 'wav_config', wavfname, stem + mfc_extension])

    sound, srate = readwav(wavfname)
    if stereo_wav and len(sound.shape) == 2:
        # Stereo input: average both channels into a single one.
        sound = 0.5 * (sound[:, 0] + sound[:, 1])

    if gammatones:
        tmp_snd = loadsound(wavfname)
        gamma_cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
        gamma_fb = Gammatone(tmp_snd, gamma_cf)
        # .npy is a binary format, so the handle must be opened in 'wb'.
        with open(stem + '_gamma.npy', 'wb') as o_f:
            npsave(o_f, gamma_fb.process())

    if spectrograms:
        powerspec, _, _, _ = specgram(
            sound,
            NFFT=int(srate * SPECGRAM_WINDOW),
            Fs=srate,
            noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
        with open(stem + '_specgram.npy', 'wb') as o_f:
            npsave(o_f, powerspec.T)

    if filterbanks:
        # Convert to Mel filterbanks.
        fbanks = Spectral(
            nfilt=N_FBANKS,         # nb of filters in mel bank
            alpha=0.97,             # pre-emphasis
            do_dct=False,           # we do not want MFCCs
            compression='log',
            fs=srate,               # sampling rate
            lowerf=50,              # lower frequency
            frate=FBANKS_RATE,      # frame rate
            wlen=FBANKS_WINDOW,     # window length
            nfft=1024,              # length of dft
            do_deltas=False,        # speed
            do_deltasdeltas=False   # acceleration
        )
        # TODO put that as option
        # Peak-normalise out of place and in floating point: an in-place
        # ``/=`` on integer samples cannot hold fractional results (NumPy
        # either truncates them or refuses the cast).
        peak = np.abs(sound).max(axis=0)
        peak = np.where(peak == 0, 1, peak)  # silent input: avoid 0/0
        sound = sound / peak.astype('float64')
        fbank = fbanks.transform(sound)
        with open(stem + '_fbanks.npy', 'wb') as o_f:
            npsave(o_f, fbank)

    # TODO wavelets scattergrams / scalograms
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("dealt with file %s" % wavfname)
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    """Build per-file feature matrices for the intervals listed in ``df``.

    Each distinct ``df.filename`` is read with ``wavread``.  Its ``Spectral``
    output (standardised along axis 0) and its ``VAD`` activations are
    optionally passed through ``roll_array`` and then sliced into the frame
    ranges derived from the ``start`` / ``end`` columns.

    Args:
        df: table with ``filename``, ``start`` and ``end`` columns, plus
            ``label`` when targets are wanted.
        label2ix: mapping from a row's ``label`` to its integer code.
        spec_kwargs: keyword arguments for ``Spectral``; must contain ``fs``.
        vad_kwargs: keyword arguments for ``VAD``.
        stacksize: when greater than 1, handed to ``roll_array``.
        frate: factor turning ``start`` / ``end`` into frame indices
            (presumably frames per second -- confirm against the encoder).
        return_y: request targets as well; ignored when ``df`` has no
            ``label`` column.

    Returns:
        ``X``, a dict mapping filename to a float32 array, or ``(X, y)``
        where ``y`` maps filename to the matching per-frame label codes.

    Raises:
        ValueError: a file's sample rate differs from ``spec_kwargs['fs']``.
    """
    # Targets are only produced when asked for AND available.
    want_targets = return_y and 'label' in df

    X = {}
    y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    do_stack = stacksize > 1

    for fname in df.filename.unique():
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            message = 'expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs)
            raise ValueError(message)

        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if do_stack:
            spec = roll_array(spec, stacksize)

        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if do_stack:
            vad = roll_array(vad, stacksize)

        chunks = []
        targets = []
        for _, row in df[df.filename == fname].iterrows():
            first = int(row.start * frate)
            last = int(row.end * frate)
            feat = np.hstack((spec[first:last], vad[first:last]))
            chunks.append(feat.astype(np.float32))
            if want_targets:
                code = label2ix[row.label]
                targets.append(np.ones(feat.shape[0], dtype=np.uint8) * code)

        X[fname] = np.vstack(chunks)
        if want_targets:
            y[fname] = np.hstack(targets)

    if want_targets:
        return X, y
    return X
def extract_features(df, label2ix, spec_kwargs, vad_kwargs, stacksize=1,
                     frate=100, return_y=False):
    """Compute spectral + VAD features for every interval in ``df``.

    For each distinct ``df.filename`` the signal is read with ``wavread``,
    encoded with ``Spectral`` (then standardised along axis 0) and with
    ``VAD``; both are optionally passed through ``roll_array``.  The rows of
    ``df`` for that file select frame ranges through their ``start`` and
    ``end`` values multiplied by ``frate``.

    Args:
        df: table with ``filename``, ``start``, ``end`` and, optionally,
            ``label`` columns.
        label2ix: mapping from ``label`` values to integer codes.
        spec_kwargs: keyword arguments for ``Spectral``; must contain ``fs``.
        vad_kwargs: keyword arguments for ``VAD``.
        stacksize: when greater than 1, handed to ``roll_array``.
        frate: multiplier applied to ``start`` / ``end`` to get frame
            indices (presumably the frame rate -- confirm with the encoder).
        return_y: also return targets, provided ``df`` has a ``label`` column.

    Returns:
        A dict ``X`` (filename -> float32 array), or the pair ``(X, y)``
        with ``y`` (filename -> per-frame label codes) when targets are
        produced.

    Raises:
        ValueError: a file's sample rate differs from ``spec_kwargs['fs']``.
    """
    if return_y:
        return_y = 'label' in df

    def _maybe_stack(frames):
        # Stacking is skipped entirely for a stack size of one or less.
        if stacksize > 1:
            return roll_array(frames, stacksize)
        return frames

    X = {}
    y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)

    for fname in df.filename.unique():
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError(
                'expected samplerate {}, got {}'.format(spec_kwargs['fs'], fs)
            )

        raw_spec = spectrum_encoder.transform(sig)
        spec = _maybe_stack((raw_spec - raw_spec.mean(0)) / raw_spec.std(0))

        activations = vad_encoder.activations(sig)
        vad = _maybe_stack(activations.reshape(activations.shape[0], -1))

        X_curr, y_curr = [], []
        file_rows = df[df.filename == fname]
        for _, row in file_rows.iterrows():
            frames = slice(int(row.start * frate), int(row.end * frate))
            feat = np.hstack((spec[frames], vad[frames]))
            X_curr.append(feat.astype(np.float32))
            if return_y:
                y_curr.append(
                    np.ones(feat.shape[0], dtype=np.uint8) * label2ix[row.label]
                )

        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)

    return (X, y) if return_y else X
def do_fbank(fname):
    """Return the Mel filterbank features of the wav file ``fname``.

    The file is read with ``wavfile.read`` and handed to a ``Spectral``
    encoder configured from ``N_FBANKS``, ``FBANKS_RATE`` and
    ``FBANKS_WINDOW``; the result of ``transform`` is returned as is.
    """
    srate, sound = wavfile.read(fname)
    config = dict(
        nfilt=N_FBANKS,         # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        do_dct=False,           # we do not want MFCCs
        fs=srate,               # sampling rate
        frate=FBANKS_RATE,      # frame rate
        wlen=FBANKS_WINDOW,     # window length
        nfft=1024,              # length of dft
        do_deltas=False,        # speed
        do_deltasdeltas=False,  # acceleration
    )
    return Spectral(**config).transform(sound)
def do_fbank(fname):
    """Return the filterbank features of wav file ``fname`` as float32.

    The signal is read with ``wavfile.read`` and encoded by ``Spectral``
    with 40 filters, a frame rate of 100 and a window length of 0.025; the
    output of ``transform`` is converted to a float32 array.
    """
    srate, sound = wavfile.read(fname)
    encoder_options = {
        'nfilt': 40,               # nb of filters in mel bank
        'alpha': 0.97,             # pre-emphasis
        'do_dct': False,           # we do not want MFCCs
        'fs': srate,               # sampling rate
        'frate': 100,              # frame rate
        'wlen': 0.025,             # window length
        'nfft': 1024,              # length of dft
        'do_deltas': False,        # speed
        'do_deltasdeltas': False,  # acceleration
    }
    encoder = Spectral(**encoder_options)
    features = encoder.transform(sound)
    return np.array(features, dtype='float32')
def do_fbank(fname): srate, sound = wavfile.read(fname) fbanks = Spectral( nfilt=N_FBANKS, # nb of filters in mel bank alpha=0.97, # pre-emphasis do_dct=False, # we do not want MFCCs fs=srate, # sampling rate frate=FBANKS_RATE, # frame rate wlen=FBANKS_WINDOW, # window length nfft=1024, # length of dft do_deltas=False, # speed do_deltasdeltas=False # acceleration ) fb = fbanks.transform(sound) print "did:", fname #print fbnk.shape return fb
def do_mfccs(fname):
    """Compute standard MFCCs from the wav file ``fname``.

    The signal is read with ``wavfile.read`` and encoded by ``Spectral``
    (40 filters, 13 cepstral coefficients, band limited to
    100 - 6855.4976); the result is returned as a float32 array.
    """
    srate, sound = wavfile.read(fname)
    settings = dict(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        fs=srate,               # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=512,               # length of dft
        ncep=13,                # nb of cepstral coefficients
        lowerf=100,
        upperf=6855.4976,
        do_deltas=False,        # speed
        do_deltasdeltas=False,  # acceleration
    )
    coefficients = Spectral(**settings).transform(sound)
    return np.array(coefficients, dtype='float32')
def do_mfccs(fname):
    """Compute standard MFCCs from a wav file.

    Reads ``fname`` with ``wavfile.read``, runs it through a ``Spectral``
    encoder set up for 13 cepstral coefficients over 40 filters, and
    returns the transform output converted to float32.
    """
    srate, sound = wavfile.read(fname)
    mfcc_encoder = Spectral(
        nfilt=40,              # nb of filters in mel bank
        alpha=0.97,            # pre-emphasis
        fs=srate,              # sampling rate
        frate=100,             # frame rate
        wlen=0.025,            # window length
        nfft=512,              # length of dft
        ncep=13,               # nb of cepstral coefficients
        lowerf=100,
        upperf=6855.4976,
        do_deltas=False,       # speed
        do_deltasdeltas=False  # acceleration
    )
    mfccs = mfcc_encoder.transform(sound)
    return np.array(mfccs, dtype='float32')
def do_fbank(fname):
    """Compute standard filterbanks from an audio file.

    ``fname`` is read with ``sf.read`` (which yields the signal first and
    the sample rate second) and encoded by ``Spectral`` with 40 filters;
    the transform output is returned as a float32 array.
    """
    sound, srate = sf.read(fname)
    options = dict(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        do_dct=False,           # we do not want MFCCs
        fs=srate,               # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=1024,              # length of dft
        do_deltas=False,        # speed
        do_deltasdeltas=False,  # acceleration
    )
    banks = Spectral(**options).transform(sound)
    return np.array(banks, dtype='float32')
def do_fbank(fname): fn = bdir + fname + '.wav' try: with open(fn[:-3] + 'npy', 'rb') as rfb: fb = np.load(rfb) except IOError: srate, sound = wavfile.read(fn) fbanks = Spectral(nfilt=N_FBANKS, # nb of filters in mel bank alpha=0.97, # pre-emphasis do_dct=False, # we do not want MFCCs fs=srate, # sampling rate frate=FBANKS_RATE, # frame rate wlen=FBANKS_WINDOW, # window length nfft=1024, # length of dft do_deltas=False, # speed do_deltasdeltas=False # acceleration ) fb = np.array(fbanks.transform(sound), dtype='float32') print "did:", fn #print fb.shape return fb
_, _, fs, nframes, _, _ = fid.getparams() sig = np.array(struct.unpack_from("%dh" % nframes, fid.readframes(nframes))) fid.close() return sig, fs FBANKS_WINDOW = 0.025 # 25ms FBANKS_RATE = 100 # 10ms N_FBANKS = 40 for wavfname in sys.argv[1:]: sound, srate = readwav(wavfname) fbanks = Spectral( nfilt=N_FBANKS, # nb of filters in mel bank alpha=0.97, # pre-emphasis do_dct=False, # we do not want MFCCs compression='log', fs=srate, # sampling rate #lowerf=50, # lower frequency frate=FBANKS_RATE, # frame rate wlen=FBANKS_WINDOW, # window length nfft=1024, # length of dft do_deltas=False, # speed do_deltasdeltas=False # acceleration ) fbank = fbanks.transform(sound) fbanksfname = wavfname[:-4] + '_fbanks.npy' with open(fbanksfname, 'w') as o_f: np.save(o_f, fbank)
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False,
            filterbanks=False, sox=True):
    """Extract features for every ``*.wav`` file found under ``folder``.

    The HTK configuration file ``wav_config`` is read from the current
    directory.  Each feature file is written next to its wav file.

    Args:
        folder: root directory walked recursively with ``os.walk``.
        debug: accepted for compatibility; not used.
        htk_mfc: run ``HCopy -C wav_config`` on each file.
        forcemfcext: always use the ``.mfc`` extension for HCopy output.
        stereo_wav: sum the first two channels of two-dimensional signals.
        gammatones: save ``<stem>_gamma.npy`` (needs Brian Hears).
        spectrograms: save ``<stem>_specgram.npy`` (needs Pylab).
        filterbanks: save ``<stem>_fbanks.npy`` (needs ``spectral``).
        sox: rewrite each wav through ``sox`` and keep the original bytes
            as ``<stem>.rawaudio``.

    Returns:
        None.  Exits the interpreter with status -1 when a library needed
        by a requested feature cannot be imported.
    """
    # Find out whether HCopy produces normalized MFCCs; otherwise note it in
    # the extension, since the whole corpus can be normalized later by
    # another script.
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:
        if any("ENORMALISE" in line for line in wcfg):
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    # Single pre-formatted argument: same output on Python 2 and 3.
    print("MFC extension: %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.stderr.write(
                "http://www.briansimulator.org/docs/hears.html\n")
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write("You need Pylab\n")
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write("You need spectral (in the parent folder)\n")
            sys.stderr.write("https://github.com/mwv/spectral\n")
            sys.exit(-1)

    # Run through all the folders and files under "folder", put a header on
    # the waves (saving the originals as .rawaudio) and use HCopy to produce
    # MFCC files according to the "wav_config" file.
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if not fname.endswith('.wav'):
                continue
            stem = bdir + '/' + fname[:-4]
            wavfname = bdir + '/' + fname

            if sox:
                # Temp name keeps .wav: w/o headers, sox uses the extension.
                tempfname = stem + '_temp.wav'
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                shutil.move(tempfname, stem + '.rawaudio')
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname,
                      stem + mfc_extension])

            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:
                # Sum both channels in floating point: adding them in the
                # file's integer sample type can wrap around.
                channels = sound.astype('float64')
                sound = channels[:, 0] + channels[:, 1]

            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is binary, hence 'wb'.
                with open(stem + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                with open(stem + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)

            if filterbanks:
                # Convert to Mel filterbanks.  The encoder is built once,
                # from the first file: parameters are assumed fixed.
                if fbanks is None:
                    fbanks = Spectral(
                        nfilt=N_FBANKS,         # nb of filters in mel bank
                        alpha=0.97,             # pre-emphasis
                        do_dct=False,           # we do not want MFCCs
                        fs=srate,               # sampling rate
                        frate=FBANKS_RATE,      # frame rate
                        wlen=FBANKS_WINDOW,     # window length
                        nfft=1024,              # length of dft
                        do_deltas=False,        # speed
                        do_deltasdeltas=False   # acceleration
                    )
                # First dimension is for deltas & deltasdeltas.
                fbank = fbanks.transform(sound)[0]
                with open(stem + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)

            # TODO wavelets scattergrams / scalograms
            print("dealt with file %s" % wavfname)
def process(folder, debug=False, htk_mfcc=False, forcemfcext=False,
            stereo_wave=False, gammatones=False, spectograms=False,
            filterbanks=False, sox=True):
    """Extract features for every ``*.WAV`` file found under ``folder``.

    The HTK configuration file ``wav_config`` is read from the current
    directory.  Each feature file is written next to its audio file.

    Args:
        folder: root directory walked recursively with ``os.walk``.
        debug: accepted for compatibility; not used.
        htk_mfcc: run ``HCopy -C wav_config``; output goes to ``<stem>.txt``.
        forcemfcext: force the reported MFC extension to ``.mfc``.
        stereo_wave: sum the first two channels of two-dimensional signals.
        gammatones: save ``<stem>_gamma.npy`` (needs Brian Hears).
        spectograms: save ``<stem>_specgram.npy`` (needs Pylab).
        filterbanks: save ``<stem>_fbanks.npy`` (needs ``spectral``).
        sox: rewrite each file through ``sox`` and keep the original bytes
            as ``<stem>.rawaudio``.

    Returns:
        None.  Exits the interpreter with status -1 when a library needed
        by a requested feature cannot be imported.
    """
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:
        if any("ENORMALISE" in line for line in wcfg):
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    # Single pre-formatted argument: same output on Python 2 and 3.
    print("MFC Extension is %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.exit(-1)
    if spectograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write('You need Pylab\n')
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write('you need spectral (in the parent folder)\n')
            # Stop here, as for the other optional libraries; carrying on
            # would only fail later on the missing ``Spectral`` name.
            sys.exit(-1)

    for bdir, _, files in os.walk(folder):
        for fname in files:
            if not fname.endswith('.WAV'):
                continue
            stem = bdir + '/' + fname[:-4]
            rawfname = stem + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = stem + '_temp.wav'
            mfccfname = stem + '.txt'

            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                # Keep the original bytes aside; moving the temp file back
                # onto wavfname would overwrite what sox just wrote.
                shutil.move(tempfname, rawfname)
            if htk_mfcc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])

            srate, sound = wavfile.read(wavfname)
            if stereo_wave and len(sound.shape) == 2:
                # Sum both channels in floating point: adding them in the
                # file's integer sample type can wrap around.
                channels = sound.astype('float64')
                sound = channels[:, 0] + channels[:, 1]

            if gammatones:
                tmp_snd = loadsound(wavfname)
                # NOTE(review): ``n_gmammatones_filters`` looks like a typo
                # for ``n_gammatones_filters``; its definition is not visible
                # here, so the name is left untouched -- confirm.
                gamma_cf = erbspace(20*Hz, 20*kHz, n_gmammatones_filters)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is binary, hence 'wb'.
                with open(stem + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectograms:
                # NOTE(review): noverlap is derived from the window length,
                # so it equals NFFT; an overlap constant was presumably
                # intended -- confirm.
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * specgram_window),
                    Fs=srate,
                    noverlap=int(srate * specgram_window))
                with open(stem + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)

            if filterbanks:
                # The encoder is built once, from the first file's rate.
                if fbanks is None:
                    fbanks = Spectral(
                        nfilt=n_fbanks,
                        alpha=0.97,
                        do_dct=False,
                        fs=srate,
                        frate=fbanks_rate,
                        wlen=fbanks_window,
                        nfft=1024,
                        do_deltas=False,
                        do_deltasdeltas=False)
                fbank = fbanks.transform(sound)[0]
                with open(stem + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)

            # Two spaces, matching what the original print statement emitted.
            print("Dealt with the file  %s" % wavfname)
fid = wave.open(fname, 'r') _, _, fs, nframes, _, _ = fid.getparams() sig = np.array(struct.unpack_from("%dh" % nframes, fid.readframes(nframes))) fid.close() return sig, fs FBANKS_WINDOW = 0.025 # 25ms FBANKS_RATE = 100 # 10ms N_FBANKS = 40 for wavfname in sys.argv[1:]: sound, srate = readwav(wavfname) fbanks = Spectral(nfilt=N_FBANKS, # nb of filters in mel bank alpha=0.97, # pre-emphasis do_dct=False, # we do not want MFCCs compression='log', fs=srate, # sampling rate #lowerf=50, # lower frequency frate=FBANKS_RATE, # frame rate wlen=FBANKS_WINDOW, # window length nfft=1024, # length of dft do_deltas=False, # speed do_deltasdeltas=False # acceleration ) fbank = fbanks.transform(sound) fbanksfname = wavfname[:-4]+'_fbanks.npy' with open(fbanksfname, 'w') as o_f: np.save(o_f, fbank)
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False,
            filterbanks=False, sox=True):
    """Apply feature extraction to all ``*.wav`` files under ``folder``.

    The HTK configuration file ``wav_config`` is read from the current
    directory.  Each feature file is written next to its wav file.

    Args:
        folder: root directory walked recursively with ``os.walk``.
        debug: accepted for compatibility; not used.
        htk_mfc: run ``HCopy -C wav_config`` on each file.
        forcemfcext: always use the ``.mfc`` extension for HCopy output.
        stereo_wav: sum the first two channels of two-dimensional signals.
        gammatones: save ``<stem>_gamma.npy`` (needs Brian Hears).
        spectrograms: save ``<stem>_specgram.npy`` (needs Pylab).
        filterbanks: save ``<stem>_fbanks.npy`` (needs ``spectral``).
        sox: rewrite each wav through ``sox`` and keep the original bytes
            as ``<stem>.rawaudio``.

    Returns:
        None.  Exits the interpreter with status -1 when a library needed
        by a requested feature cannot be imported.
    """
    # First find out if we produce normalized MFCCs; otherwise note it in the
    # extension, because the whole corpus can then be normalized with
    # another script.
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
                break
    if forcemfcext:
        mfc_extension = '.mfc'
    # Single pre-formatted argument: same output on Python 2 and 3.
    print("MFC extension: %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.stderr.write(
                "http://www.briansimulator.org/docs/hears.html\n")
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write("You need Pylab\n")
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write("You need spectral (in the parent folder)\n")
            sys.stderr.write("https://github.com/mwv/spectral\n")
            sys.exit(-1)

    # Run through all the folders and files in the path "folder", put a
    # header on the waves (saving the originals as .rawaudio) and use HCopy
    # to produce MFCC files according to the "wav_config" file.
    for bdir, _, files in os.walk(folder):
        wav_names = [name for name in files if name.endswith('.wav')]
        for fname in wav_names:
            stem = bdir + '/' + fname[:-4]
            wavfname = bdir + '/' + fname

            if sox:
                # Temp name keeps .wav: w/o headers, sox uses the extension.
                tempfname = stem + '_temp.wav'
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                shutil.move(tempfname, stem + '.rawaudio')
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname,
                      stem + mfc_extension])

            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:
                # For stereo wav, sum both channels.  Done in floating point
                # because adding them in the file's integer sample type can
                # wrap around.
                channels = sound.astype('float64')
                sound = channels[:, 0] + channels[:, 1]

            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is binary, hence 'wb'.
                with open(stem + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                with open(stem + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)

            if filterbanks:
                # Convert to Mel filterbanks.  The encoder is built once,
                # from the first file: parameters are assumed fixed.
                if fbanks is None:
                    fbanks = Spectral(
                        nfilt=N_FBANKS,         # nb of filters in mel bank
                        alpha=0.97,             # pre-emphasis
                        do_dct=False,           # we do not want MFCCs
                        fs=srate,               # sampling rate
                        frate=FBANKS_RATE,      # frame rate
                        wlen=FBANKS_WINDOW,     # window length
                        nfft=1024,              # length of dft
                        do_deltas=False,        # speed
                        do_deltasdeltas=False   # acceleration
                    )
                # First dimension is for deltas & deltasdeltas.
                fbank = fbanks.transform(sound)[0]
                with open(stem + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)

            # TODO wavelets scattergrams / scalograms
            print("dealt with file %s" % wavfname)