def getModulation(x, fmin, fmax, bands, sr):
    # Apply a gammatone filterbank to each channel of x (shape: samples x channels).
    cf = erbspace(fmin * Hz, fmax * Hz, bands)
    m = []
    for i in range(x.shape[1]):
        gfb = Gammatone(Sound(x[:, i], samplerate=sr * Hz), cf)
        m.append(gfb.process())
    return np.asarray(m)
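# Usage sketch for getModulation (illustrative, not from the original file):
# feed a synthetic two-channel signal through the per-channel gammatone
# filterbank. Assumes numpy and brian.hears (erbspace, Gammatone, Sound, Hz)
# are already imported, as the function above requires.
if __name__ == '__main__':
    demo = np.random.randn(16000, 2)  # 1 s of 2-channel noise at 16 kHz
    mod = getModulation(demo, fmin=26, fmax=6950, bands=12, sr=16000)
    assert mod.shape[0] == demo.shape[1]  # one filterbank output per input channel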
def define_f_bands(stt=180, stp=7000, n_bands=32, kind='log'):
    '''
    Defines log-, linear-, or ERB-spaced frequency bands...generally this is
    for auditory spectrogram extraction. Brian used 180 - 7000 Hz, so for now
    those are the defaults.

    INPUTS
    --------
        stt : int
            The starting frequency
        stp : int
            The end frequency
        n_bands : int
            The number of bands to calculate
        kind : string, ['log', 'lin', 'erb']
            What kind of spacing will we use for the frequency bands.
    '''
    if kind == 'log':
        aud_fs = np.logspace(np.log10(stt), np.log10(stp), n_bands).astype(int)
    elif kind == 'lin':
        aud_fs = np.linspace(stt, stp, n_bands).astype(int)
    elif kind == 'erb':
        aud_fs = hears.erbspace(stt * Hz, stp * Hz, n_bands)
    else:
        raise NameError("I don't know what kind of spacing that is")
    return aud_fs
def create_center_frequencies(stt=180, stp=7000, n_bands=32, kind='log'):
    '''
    Define center frequencies for spectrograms.

    Generally this is for auditory spectrogram extraction. Most auditory
    analysis uses 180 - 7000 Hz, so for now those are the defaults.

    Parameters
    ----------
    stt : float | int
        The starting frequency
    stp : float | int
        The end frequency
    n_bands : int
        The number of bands to calculate
    kind : 'log' | 'erb'
        Whether to use log or erb spacing

    Returns
    -------
    freqs : array, shape (n_frequencies,)
        An array of center frequencies.
    '''
    if kind == 'log':
        freqs = np.logspace(np.log10(stt), np.log10(stp), n_bands).astype(int)
    elif kind == 'erb':
        freqs = hears.erbspace(stt * Hz, stp * Hz, n_bands)
    else:
        raise NameError("I don't know what kind of spacing that is")
    return freqs
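# Quick check of create_center_frequencies (illustrative): 32 log-spaced
# center frequencies spanning the 180-7000 Hz defaults come back as a
# strictly increasing integer array.
if __name__ == '__main__':
    cfs = create_center_frequencies(180, 7000, 32, kind='log')
    assert len(cfs) == 32 and (np.diff(cfs) > 0).all()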
# def extract_features(fname, bdir):
def extract_features(fname, bdir, sox, htk_mfc, mfc_extension, stereo_wav,
                     gammatones, spectrograms, filterbanks):
    if fname[-4:] != '.wav':
        return
    rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
    wavfname = bdir + '/' + fname
    tempfname = bdir + '/' + fname[:-4] + '_temp.wav'  # temp fname with .wav for sox
    mfccfname = bdir + '/' + fname[:-4] + mfc_extension
    if sox:
        shutil.move(wavfname, tempfname)
        call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
        # call(['sox', '-G', tempfname, '-r 16k', wavfname])
        shutil.move(tempfname, rawfname)
    if htk_mfc:
        call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
    srate = 16000
    # srate, sound = wavfile.read(wavfname)
    sound, srate = readwav(wavfname)
    if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
        sound = 0.5 * (sound[:, 0] + sound[:, 1])  # for stereo wav, average both channels
    if gammatones:
        gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
        tmp_snd = loadsound(wavfname)
        gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
        gamma_fb = Gammatone(tmp_snd, gamma_cf)
        with open(gammatonefname, 'w') as o_f:
            npsave(o_f, gamma_fb.process())
    if spectrograms:
        powerspec, _, _, _ = specgram(sound,
                                      NFFT=int(srate * SPECGRAM_WINDOW),
                                      Fs=srate,
                                      noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
        specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
        with open(specgramfname, 'w') as o_f:
            npsave(o_f, powerspec.T)
    if filterbanks:
        # convert to Mel filterbanks
        fbanks = Spectral(nfilt=N_FBANKS,        # nb of filters in mel bank
                          alpha=0.97,            # pre-emphasis
                          do_dct=False,          # we do not want MFCCs
                          compression='log',
                          fs=srate,              # sampling rate
                          lowerf=50,             # lower frequency
                          frate=FBANKS_RATE,     # frame rate
                          wlen=FBANKS_WINDOW,    # window length
                          nfft=1024,             # length of dft
                          do_deltas=False,       # speed
                          do_deltasdeltas=False  # acceleration
                          )
        sound /= np.abs(sound).max(axis=0)  # TODO put that as option
        fbank = fbanks.transform(sound)
        fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
        with open(fbanksfname, 'w') as o_f:
            npsave(o_f, fbank)
    # TODO wavelets scattergrams / scalograms
    print "dealt with file", wavfname
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False):
    """ debug output? HCopy for MFCC? wav are stereo? produce gammatones? """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    wcfg = open('wav_config', 'r')
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print "MFC extension:", mfc_extension

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for d, ds, fs in os.walk(folder):
        for fname in fs:
            if fname[-4:] != '.wav':
                continue
            rawfname = d + '/' + fname[:-4] + '.rawaudio'
            wavfname = d + '/' + fname
            tempfname = d + '/' + fname[:-4] + '_temp.wav'  # temp fname with .wav for sox
            mfccfname = d + '/' + fname[:-4] + mfc_extension
            shutil.move(wavfname, tempfname)
            call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
            shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            sr = 16000
            sr, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                sound = sound[:, 1]  # for stereo wav, arbitrarily take the second channel
            if gammatones:
                from brian import Hz, kHz
                from brian.hears import loadsound, erbspace, Gammatone
                gammatonefname = d + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                fb = Gammatone(tmp_snd, cf)
                with open(gammatonefname, 'w') as of:
                    numpy.save(of, fb.process())
            if spectrograms:
                from pylab import specgram
                Pxx, freqs, bins, im = specgram(sound,
                                                NFFT=int(sr * SPECGRAM_WINDOW),
                                                Fs=sr,
                                                noverlap=int(sr * SPECGRAM_OVERLAP))
                specgramfname = d + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'w') as of:
                    numpy.save(of, Pxx.T)
            print "dealt with file", wavfname
def drnl(sound, n_channels=50, uncompressed=True):
    """ use predefined cochlear model, see Lopez-Poveda et al 2001 """
    # centre frequencies on the ERB scale (equivalent rectangular bandwidth)
    # between 100 and 8000 Hz
    cf = erbspace(100 * Hz, 8000 * Hz, n_channels)
    drnl_filter = DRNL(sound, cf, type='human')  # use DRNL model, see documentation
    print 'processing sound'
    out = drnl_filter.process()  # get array of channel activations
    if not uncompressed:
        out = out.clip(0.0)  # -> fast oscillations can't be downsampled otherwise
        out = resample(out, int(round(sound.nsamples / 1000.0)))  # downsample sound for memory reasons
    return out
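# Usage sketch for drnl (a sketch: assumes brian.hears' whitenoise generator
# and the ms unit are importable alongside the DRNL/erbspace already used
# above; the 100 ms duration is arbitrary).
if __name__ == '__main__':
    from brian import ms
    from brian.hears import whitenoise
    noise = whitenoise(100 * ms)  # short synthetic test sound
    act = drnl(noise, n_channels=50, uncompressed=True)
    assert act.shape[1] == 50  # one column of activations per ERB channel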
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False,
            filterbanks=False, sox=True):
    """ applies to all *.wav in folder """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    wcfg = open('wav_config', 'r')
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print "MFC extension:", mfc_extension

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print >> sys.stderr, "You need Brian Hears"
            print >> sys.stderr, "http://www.briansimulator.org/docs/hears.html"
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            print >> sys.stderr, "You need Pylab"
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print >> sys.stderr, "You need spectral (in the parent folder)"
            print >> sys.stderr, "https://github.com/mwv/spectral"
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'  # temp fname with .wav for sox
            mfccfname = bdir + '/' + fname[:-4] + mfc_extension
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
                shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                sound = sound[:, 0] + sound[:, 1]  # for stereo wav, sum both channels
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                with open(gammatonefname, 'w') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(sound,
                                              NFFT=int(srate * SPECGRAM_WINDOW),
                                              Fs=srate,
                                              noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'w') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:  # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(nfilt=N_FBANKS,        # nb of filters in mel bank
                                      alpha=0.97,            # pre-emphasis
                                      do_dct=False,          # we do not want MFCCs
                                      fs=srate,              # sampling rate
                                      frate=FBANKS_RATE,     # frame rate
                                      wlen=FBANKS_WINDOW,    # window length
                                      nfft=1024,             # length of dft
                                      do_deltas=False,       # speed
                                      do_deltasdeltas=False  # acceleration
                                      )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                                                    # deltas & deltasdeltas
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'w') as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print "dealt with file", wavfname
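# Typical invocation (hedged: 'timit/train' is a placeholder path; sox and
# HTK's HCopy must be on the PATH, and a 'wav_config' file must exist in the
# working directory):
#
#     process('timit/train', htk_mfc=True, gammatones=True,
#             spectrograms=True, filterbanks=True)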
def process(folder, debug=False, htk_mfcc=False, forcemfcext=False,
            stereo_wave=False, gammatones=False, spectograms=False,
            filterbanks=False, sox=True):
    mfc_extension = '.mfc_unnorm'
    wcfg = open('wav_config', 'r')
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print "MFC Extension is", mfc_extension
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print >> sys.stderr, "You need Brian Hears"
            sys.exit(-1)
    if spectograms:
        try:
            from pylab import specgram
        except ImportError:
            print >> sys.stderr, 'You need Pylab'
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print >> sys.stderr, 'you need spectral (in the parent folder)'
            sys.exit(-1)
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.WAV':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'
            mfccfname = bdir + '/' + fname[:-4] + '.txt'
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                shutil.move(tempfname, rawfname)  # keep the headerless original
            if htk_mfcc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wave and len(sound.shape) == 2:
                sound = sound[:, 0] + sound[:, 1]
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, n_gammatones_filters)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                with open(gammatonefname, 'w') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectograms:
                powerspec, _, _, _ = specgram(sound,
                                              NFFT=int(srate * specgram_window),
                                              Fs=srate,
                                              noverlap=int(srate * specgram_overlap))
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'w') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                if fbanks is None:
                    fbanks = Spectral(nfilt=n_fbanks, alpha=0.97, do_dct=False,
                                      fs=srate, frate=fbanks_rate,
                                      wlen=fbanks_window, nfft=1024,
                                      do_deltas=False, do_deltasdeltas=False)
                fbank = fbanks.transform(sound)[0]
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'w') as o_f:
                    npsave(o_f, fbank)
            print "Dealt with the file ", wavfname
# TODO load the following parameters from wav_config
SAMPLING_RATE = 16000  # Hz
MFCC_TIMESTEP = 10  # 10 ms
HAMMING_SIZE = 25  # 25 ms
N_MFCC_COEFFS = 39  # as in Mohamed et al. / Dahl et al. (Hinton group) papers
N_FILTERBANK_COEFFS = 40  # as in Acoustic Modeling using Deep Belief Networks,
                          # Mohamed et al.
TALKBOX_FBANKS = False
if TALKBOX_FBANKS:
    from scikits.talkbox.features import mfcc as tbmfcc
DEBUG = False
N_GAMMATONES = 50
# c.f. http://www.briansimulator.org/docs/hears.html
# and http://www.briansimulator.org/docs/examples-hears_approximate_gammatone.html#example-hears-approximate-gammatone
center_frequencies = erbspace(100 * Hz, 1000 * Hz, N_GAMMATONES)
TEST = True  # test numpy serialization

usage = """
python timit_to_numpy.py MLF_FILENAME.mlf [--gamma]
output files are MLF_FILENAME_xdata.npy, MLF_FILENAME_xfbank.npy,
MLF_FILENAME_xgamma.npy, and MLF_FILENAME_ylabels.npy
"""


def compute_speed_and_accel(x):
    tmp_diff = np.pad(np.diff(x, axis=0), ((0, 1), (0, 0)),
                      'constant', constant_values=(0.0, 0.0))
    tmp_accel = np.pad(np.diff(tmp_diff, axis=0), ((0, 1), (0, 0)),
                       'constant', constant_values=(0.0, 0.0))
    # stack static, speed and acceleration coefficients column-wise
    return np.concatenate((x, tmp_diff, tmp_accel), axis=1)
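# Sanity check for compute_speed_and_accel (illustrative, on synthetic
# features; relies on the column-wise stacking return above): deltas and
# delta-deltas triple the feature dimension.
if __name__ == '__main__':
    feats = np.random.randn(100, N_MFCC_COEFFS)
    full = compute_speed_and_accel(feats)
    assert full.shape == (100, 3 * N_MFCC_COEFFS)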
from brian.hears import erbspace, Gammatone, Sound
from brian import Hz
from scipy.signal import hilbert
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics.pairwise import paired_distances
from collections import OrderedDict
import sys
import os
import scipy
import numpy as np

# filterbank settings: gammatone (kG*), modulation (kM*), envelope bands (kE*)
kGfmin, kGfmax, kGbands = 26, 6950, 12
kMfmin, kMfmax, kMbands = 0.5, 100, 12
kEsr = 400
kSR = 16000
cfG = erbspace(kGfmin * Hz, kGfmax * Hz, kGbands)
cfM = erbspace(kMfmin * Hz, kMfmax * Hz, kMbands)
# kEfmin = [0.5, 4.5, 10.5, 20.5]
# kEfmax = [4. , 10., 20., 100. ]
kEfmin = cfM[:-1]
kEfmax = cfM[1:]


def getGammatone(x, fmin, fmax, bands, sr):
    cf = erbspace(fmin * Hz, fmax * Hz, bands)
    gfb = Gammatone(Sound(x, samplerate=sr * Hz), cf)
    gamma = gfb.process()
    return gamma
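# Usage sketch for getGammatone with the module constants above
# (illustrative: synthetic noise stands in for real audio).
if __name__ == '__main__':
    sig = np.random.randn(kSR)  # 1 s of noise at 16 kHz
    gamma = getGammatone(sig, kGfmin, kGfmax, kGbands, kSR)
    assert gamma.shape == (kSR, kGbands)  # (samples, gammatone bands)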
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False,
            filterbanks=False, sox=True):
    """ applies to all *.wav in folder """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = ".mfc_unnorm"
    wcfg = open("wav_config", "r")
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = ".mfc"
    if forcemfcext:
        mfc_extension = ".mfc"
    print "MFC extension:", mfc_extension

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print >> sys.stderr, "You need Brian Hears"
            print >> sys.stderr, "http://www.briansimulator.org/docs/hears.html"
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            print >> sys.stderr, "You need Pylab"
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append("../spectral")
            from spectral import Mel
        except ImportError:
            print >> sys.stderr, "You need spectral (in the parent folder)"
            print >> sys.stderr, "https://github.com/mwv/spectral"
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != ".wav":
                continue
            rawfname = bdir + "/" + fname[:-4] + ".rawaudio"
            wavfname = bdir + "/" + fname
            tempfname = bdir + "/" + fname[:-4] + "_temp.wav"  # temp fname with .wav for sox
            mfccfname = bdir + "/" + fname[:-4] + mfc_extension
            if sox:
                shutil.move(wavfname, tempfname)
                call(["sox", tempfname, wavfname])  # w/o headers, sox uses extension
                shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(["HCopy", "-C", "wav_config", wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                sound = sound[:, 0] + sound[:, 1]  # for stereo wav, sum both channels
            if gammatones:
                gammatonefname = bdir + "/" + fname[:-4] + "_gamma.npy"
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                with open(gammatonefname, "w") as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(sound,
                                              NFFT=int(srate * SPECGRAM_WINDOW),
                                              Fs=srate,
                                              noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                specgramfname = bdir + "/" + fname[:-4] + "_specgram.npy"
                with open(specgramfname, "w") as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:  # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Mel(nfilt=N_FBANKS,           # nb of filters in mel bank
                                 alpha=0.97,               # pre-emphasis
                                 fs=srate,                 # sampling rate
                                 frate=FBANKS_RATE,        # frame rate
                                 wlen=FBANKS_WINDOW,       # window length
                                 nfft=1024,                # length of dft
                                 mel_deltas=False,         # speed
                                 mel_deltasdeltas=False,   # acceleration
                                 )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                                                    # deltas & deltasdeltas
                fbanksfname = bdir + "/" + fname[:-4] + "_fbanks.npy"
                with open(fbanksfname, "w") as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print "dealt with file", wavfname
def __init__(self, which_set, frame_length, overlap=0, n_channels=64,
             frames_per_example=1, start=0, stop=None, audio_only=False,
             n_prev_phones=0, n_next_phones=0, samples_to_predict=1,
             filter_fn=None, rng=_default_seed,
             gtfb_data_path='/home/jfsantos/data/pylearn2data/timit/readable'):
    """
    Parameters
    ----------
    which_set : str
        Either "train", "valid" or "test"
    frame_length : int
        Number of acoustic samples contained in a frame
    overlap : int, optional
        Number of overlapping acoustic samples for two consecutive frames.
        Defaults to 0, meaning frames don't overlap.
    frames_per_example : int, optional
        Number of frames in a training example. Defaults to 1.
    start : int, optional
        Starting index of the sequences to use. Defaults to 0.
    stop : int, optional
        Ending index of the sequences to use. Defaults to `None`, meaning
        sequences are selected all the way to the end of the array.
    audio_only : bool, optional
        Whether to load only the raw audio and no auxiliary information.
        Defaults to `False`.
    rng : object, optional
        A random number generator used for picking random indices into the
        design matrix when choosing minibatches.
    """
    self.frame_length = frame_length
    self.overlap = overlap
    self.frames_per_example = frames_per_example
    self.offset = self.frame_length - self.overlap
    self.audio_only = audio_only
    self.n_prev_phones = n_prev_phones
    self.n_next_phones = n_next_phones
    self.samples_to_predict = samples_to_predict
    self.n_channels = n_channels

    # RNG initialization
    if hasattr(rng, 'random_integers'):
        self.rng = rng
    else:
        self.rng = numpy.random.RandomState(rng)

    self.fc = erbspace(80 * hertz, 5 * khertz, self.n_channels)

    # Load data from disk
    self._load_data(which_set, gtfb_data_path)
    # Standardize data
    for i, sequence in enumerate(self.raw_wav):
        self.raw_wav[i] = (sequence - TIMITGTFB._mean) / TIMITGTFB._std

    if filter_fn is not None:
        filter_fn = eval(filter_fn)
        indexes = filter_fn(self.speaker_info_list[self.speaker_id])
        self.raw_wav = self.raw_wav[indexes]
        if not self.audio_only:
            self.phones = self.phones[indexes]

    # Slice data
    if stop is not None:
        self.raw_wav = self.raw_wav[start:stop]
        if not self.audio_only:
            self.phones = self.phones[start:stop]
    else:
        self.raw_wav = self.raw_wav[start:]
        if not self.audio_only:
            self.phones = self.phones[start:]

    examples_per_sequence = [0]
    self.phone_rel_dur = []

    for sequence_id, samples_sequence in enumerate(self.raw_wav):
        if not self.audio_only:
            tot_n_frames = samples_sequence.shape[0]
            # Phones segmentation
            phones_sequence = self.phones[sequence_id]
            phone_list = numpy.asarray([k for k, g in
                                        itertools.groupby(phones_sequence)])
            phone_duration = [len(list(g)) for k, g in
                              itertools.groupby(phones_sequence)]
            phone_position = numpy.cumsum(phone_duration)
            frame_position = numpy.arange(0, tot_n_frames * self.overlap,
                                          self.overlap)
            seq_phones = numpy.empty((tot_n_frames,
                                      1 + self.n_prev_phones + self.n_next_phones),
                                     dtype=int)
            phone_rel_dur = numpy.empty(tot_n_frames, dtype=float)
            for frame in range(tot_n_frames):
                cur_phone_idx = (frame_position[frame] < phone_position).argmax()
                if cur_phone_idx == 0:
                    phone_rel_dur[frame] = frame_position[frame] / float(phone_duration[cur_phone_idx])
                else:
                    phone_rel_dur[frame] = (frame_position[frame] - phone_position[cur_phone_idx - 1]) / float(phone_duration[cur_phone_idx])
                if self.n_prev_phones > 0:
                    if cur_phone_idx - self.n_prev_phones < 0:
                        seq_phones[frame, 0:self.n_prev_phones] = 5  # code for silent frame
                    else:
                        seq_phones[frame, 0:self.n_prev_phones] = phone_list[cur_phone_idx - self.n_prev_phones:cur_phone_idx]  # prev phones
                if self.n_next_phones > 0:
                    if cur_phone_idx + self.n_next_phones >= len(phone_list):
                        seq_phones[frame, -self.n_next_phones:] = 5  # code for silent frame
                    else:
                        seq_phones[frame, -self.n_next_phones:] = phone_list[cur_phone_idx + 1:cur_phone_idx + self.n_next_phones + 1]  # next phones
                seq_phones[frame, self.n_prev_phones] = phone_list[cur_phone_idx]
            self.phone_rel_dur.append(phone_rel_dur)
            self.phones[sequence_id] = seq_phones  # TODO: look at this, does it force copying the data?

        # Sequence segmentation
        # s = Sound(samples_sequence, samplerate=16*khertz)
        # fb = Gammatone(s, self.fc)
        # y = fb.process()
        # channel_energy = []
        # # Compute energy per channel
        # for ch in range(self.n_channels):
        #     y_ch = segment_axis(y[:, ch], frame_length, overlap)
        #     y_energy = numpy.sum(y_ch ** 2, axis=1)
        #     channel_energy.append(y_energy)
        # channel_energy = numpy.vstack(channel_energy).T
        # channel_energy = samples_sequence
        # if self.n_next_phones == 0:
        #     self.raw_wav[sequence_id] = channel_energy[self.n_prev_phones:]
        # else:
        #     self.raw_wav[sequence_id] = channel_energy[self.n_prev_phones:-self.n_next_phones]

        # TODO: change me
        # Generate features/targets/phones/phonemes/words map
        num_frames = samples_sequence.shape[0] - (self.n_prev_phones + self.n_next_phones)
        num_examples = num_frames - self.frames_per_example
        examples_per_sequence.append(num_examples)

    self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
    self.samples_sequences = self.raw_wav
    # numpy.save('%s_gtfb_%sch.npy' % (which_set, str(self.n_channels)),
    #            self.samples_sequences)
    if not self.audio_only:
        self.phones_sequences = self.phones
    self.num_examples = self.cumulative_example_indexes[-1]

    # DataSpecs
    features_space = VectorSpace(dim=self.n_channels * self.frames_per_example)
    features_source = 'features'

    def features_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][example_index:example_index + self.frames_per_example].ravel())
        return rval

    targets_space = VectorSpace(dim=self.n_channels)
    targets_source = 'targets'

    def targets_map_fn(indexes):
        rval = []
        for sequence_index, example_index in self._fetch_index(indexes):
            rval.append(self.samples_sequences[sequence_index][example_index + self.frames_per_example])
        return rval

    space_components = [features_space, targets_space]
    source_components = [features_source, targets_source]
    map_fn_components = [features_map_fn, targets_map_fn]
    batch_components = [None, None]

    if not self.audio_only:
        num_phones = numpy.max([numpy.max(sequence)
                                for sequence in self.phones]) + 1
        phones_space = IndexSpace(max_labels=num_phones,
                                  dim=1 + self.n_prev_phones + self.n_next_phones,
                                  dtype=str(self.phones_sequences[0].dtype))
        phones_source = 'phones'

        def phones_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.phones_sequences[sequence_index][example_index].ravel())
            return rval

        phone_rel_dur_space = VectorSpace(dim=1)
        phone_rel_dur_source = 'phone_rel_dur'

        def phone_rel_dur_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.phone_rel_dur[sequence_index][example_index])
            return rval

        space_components.extend([phones_space, phone_rel_dur_space])
        source_components.extend([phones_source, phone_rel_dur_source])
        map_fn_components.extend([phones_map_fn, phone_rel_dur_map_fn])
        batch_components.extend([None, None])

    space = CompositeSpace(space_components)
    source = tuple(source_components)
    self.data_specs = (space, source)
    self.map_functions = tuple(map_fn_components)
    self.batch_buffers = batch_components

    # Defaults for iterators
    self._iter_mode = resolve_iterator_class('shuffled_sequential')
    self._iter_data_specs = (CompositeSpace((features_space, targets_space)),
                             (features_source, targets_source))
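# Example construction (a sketch: assumes pylearn2 is installed and the
# preprocessed TIMIT gammatone data is available under gtfb_data_path;
# the frame_length/overlap values are illustrative only):
#
#     dataset = TIMITGTFB('train', frame_length=160, overlap=80,
#                         n_channels=64, frames_per_example=1)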