def correct_and_check(self, path):
    """Silence the onset of the sound stored at *path* and check it is audible.

    The VTL-produced sounds often come with short bursts right at the onset
    of the sound. They interfere with learning, so the first 249 samples
    are set to zero and the file at *path* is overwritten with the result.
    Sometimes there is no sound at all; that information is handed back to
    the caller.

    path: location of the sound file. It is read with ``loadsound`` and
        saved back to the same location.

    Returns a true value when the probed samples (element 0 of every 200th
    sample from index 4000 up to index 12000) do not all equal zero. A
    sound too short to contain any probed sample is reported as not valid
    instead of raising an IndexError.
    """
    sound = loadsound(path)  # loadsound from brianhears
    onset = 249  # number of leading samples that get silenced
    sound[0:onset] = 0
    sound.save(path)

    # Look at the sound at different places: do they add up to something
    # other than zero? Never probe past the end of a short sound.
    probe_stop = min(12001, len(sound))
    energy = 0
    for here in range(4000, probe_stop, 200):
        energy += abs(sound[here][0])

    # Only non-silent sounds are valid.
    return energy != 0.
def ncc(audio):
    """Assemble the NCC network for the audio file *audio* and probe it.

    The file is loaded at 16 kHz, written to ``tmp.wav`` in the current
    directory and re-read through ``bh.loadsound``. An auditory-periphery
    network feeds a cepstral network, and a probe is attached to the
    cepstral network's output. The visible code ends after the probe is
    created, so nothing is returned from here.
    """
    sample_rate = 16000

    # load .wav file through brian.hears.loadsound()
    samples, _ = librosa.load(audio, sr=sample_rate)
    sf.write('tmp.wav', samples, sample_rate)
    audio = bh.loadsound('tmp.wav')
    t_audio = audio.size / sample_rate

    # set the observed frequencies: four points between 20 Hz and 8000 Hz,
    # evenly spaced on the mel scale
    mel_points = np.linspace(hz2mel(20), hz2mel(8000), 4)
    freqs = mel2hz(mel_points)
    print(freqs)

    # pass audio to the AuditoryPeriphery net, then that net to the
    # Cepstral net
    APnet = AudPeri(20, audio, freqs)
    Cepnet = Cep(20, freqs, 4, APnet)

    # the whole NCC network is the network returned by the Cepstral net
    NCC = Cepnet

    with NCC:
        # observe the output of the NCC network with a probe
        probe = nengo.Probe(NCC.output, synapse=0.01)
def extract_features(fname, bdir, sox, htk_mfc, mfc_extension, stereo_wav,
                     gammatones, spectrograms, filterbanks):
    """Extract the requested features for the single file ``bdir/fname``.

    Files whose name does not end in ``.wav`` are ignored (returns None
    immediately). All outputs are written next to the input file and share
    its base name.

    fname: file name inside *bdir*.
    bdir: directory holding the file.
    sox: run the file through ``sox``; the untouched original is kept as
        ``<name>.rawaudio``.
    htk_mfc: run ``HCopy -C wav_config`` to write ``<name><mfc_extension>``.
    mfc_extension: extension used for the HCopy output.
    stereo_wav: average the first two channels of a two-dimensional sample
        array.
    gammatones: save the gammatone filterbank output as ``<name>_gamma.npy``.
    spectrograms: save the transposed power spectrum returned by
        ``specgram`` as ``<name>_specgram.npy``.
    filterbanks: save the ``Spectral`` transform of the peak-normalised
        signal as ``<name>_fbanks.npy``.

    Returns None.
    """
    if fname[-4:] != '.wav':
        return

    base = bdir + '/' + fname[:-4]
    wavfname = bdir + '/' + fname

    if sox:
        tempfname = base + '_temp.wav'  # temp fname with .wav for sox
        shutil.move(wavfname, tempfname)
        call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
        shutil.move(tempfname, base + '.rawaudio')
    if htk_mfc:
        call(['HCopy', '-C', 'wav_config', wavfname, base + mfc_extension])

    sound, srate = readwav(wavfname)
    if stereo_wav and len(sound.shape) == 2:  # in mono sound is 1-D
        # Average both channels. ``sum`` accumulates narrow integer samples
        # in a wider type, so loud int16 material does not wrap around as
        # it would with ``sound[:, 0] + sound[:, 1]``.
        sound = 0.5 * sound[:, :2].sum(axis=1)

    if gammatones:
        tmp_snd = loadsound(wavfname)
        gamma_cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
        gamma_fb = Gammatone(tmp_snd, gamma_cf)
        # .npy is a binary format: the handle must be opened in binary mode
        with open(base + '_gamma.npy', 'wb') as o_f:
            npsave(o_f, gamma_fb.process())

    if spectrograms:
        powerspec, _, _, _ = specgram(
            sound,
            NFFT=int(srate * SPECGRAM_WINDOW),
            Fs=srate,
            noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
        with open(base + '_specgram.npy', 'wb') as o_f:
            npsave(o_f, powerspec.T)

    if filterbanks:
        # convert to Mel filterbanks
        fbanks = Spectral(nfilt=N_FBANKS,         # nb of filters in mel bank
                          alpha=0.97,             # pre-emphasis
                          do_dct=False,           # we do not want MFCCs
                          compression='log',
                          fs=srate,               # sampling rate
                          lowerf=50,              # lower frequency
                          frate=FBANKS_RATE,      # frame rate
                          wlen=FBANKS_WINDOW,     # window length
                          nfft=1024,              # length of dft
                          do_deltas=False,        # speed
                          do_deltasdeltas=False)  # acceleration
        # NOTE(review): in-place division assumes readwav returns floating
        # point samples and a non-silent signal - confirm.
        sound /= np.abs(sound).max(axis=0)  # TODO put that as option
        fbank = fbanks.transform(sound)
        with open(base + '_fbanks.npy', 'wb') as o_f:
            npsave(o_f, fbank)

    # TODO wavelets scattergrams / scalograms
    print("dealt with file %s" % wavfname)
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False):
    """Put headers on every ``*.wav`` below *folder* and extract features.

    Each file whose name ends in ``.wav`` is run through ``sox``; the
    untouched original is kept next to it as ``<name>.rawaudio``. Further
    outputs are written next to the wave file depending on the flags.

    folder: root directory, walked recursively with ``os.walk``.
    debug: accepted but not used by this function.
    htk_mfc: run ``HCopy -C wav_config`` to produce the MFCC file.
    forcemfcext: use the ``.mfc`` extension whatever ``wav_config`` says.
    stereo_wav: for two-dimensional sample arrays keep only channel 1.
    gammatones: save the Brian Hears gammatone filterbank output as
        ``<name>_gamma.npy``.
    spectrograms: save the transposed pylab ``specgram`` power spectrum as
        ``<name>_specgram.npy``.

    Reads ``wav_config`` from the current working directory. Returns None.
    """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:  # handle is released on every path
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print("MFC extension: %s" % mfc_extension)

    # optional dependencies are resolved once, before any file is touched
    if gammatones:
        from brian import Hz, kHz
        from brian.hears import loadsound, erbspace, Gammatone
    if spectrograms:
        from pylab import specgram

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for d, _, fs in os.walk(folder):
        for fname in fs:
            if fname[-4:] != '.wav':
                continue
            base = d + '/' + fname[:-4]
            wavfname = d + '/' + fname
            tempfname = base + '_temp.wav'  # temp fname with .wav for sox

            shutil.move(wavfname, tempfname)
            call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
            shutil.move(tempfname, base + '.rawaudio')
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname,
                      base + mfc_extension])

            sr, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is 1-D
                sound = sound[:, 1]  # for stereo wav, arbitrarily take channel 1

            if gammatones:
                tmp_snd = loadsound(wavfname)
                cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
                fb = Gammatone(tmp_snd, cf)
                # .npy is a binary format: open the handle in binary mode
                with open(base + '_gamma.npy', 'wb') as o_f:
                    numpy.save(o_f, fb.process())

            if spectrograms:
                Pxx, _, _, _ = specgram(
                    sound,
                    NFFT=int(sr * SPECGRAM_WINDOW),
                    Fs=sr,
                    noverlap=int(sr * SPECGRAM_OVERLAP))
                with open(base + '_specgram.npy', 'wb') as o_f:
                    numpy.save(o_f, Pxx.T)

            print("dealt with file %s" % wavfname)
def process(folder, debug=False, htk_mfc=False, forcemfcext=False,
            stereo_wav=False, gammatones=False, spectrograms=False,
            filterbanks=False, sox=True):
    """Extract features for all ``*.wav`` files below *folder*.

    folder: root directory, walked recursively with ``os.walk``.
    debug: accepted but not used by this function.
    htk_mfc: run ``HCopy -C wav_config`` to produce the MFCC file.
    forcemfcext: use the ``.mfc`` extension whatever ``wav_config`` says.
    stereo_wav: sum the first two channels of two-dimensional sample arrays.
    gammatones: save the Brian Hears gammatone filterbank output as
        ``<name>_gamma.npy``.
    spectrograms: save the transposed pylab ``specgram`` power spectrum as
        ``<name>_specgram.npy``.
    filterbanks: save element 0 of the ``Spectral`` transform as
        ``<name>_fbanks.npy``.
    sox: run each file through ``sox`` first; the untouched original is
        kept as ``<name>.rawaudio``.

    Reads ``wav_config`` from the current working directory. Exits the
    interpreter with status -1 when a requested optional dependency cannot
    be imported. Returns None.
    """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:  # handle is released on every path
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print("MFC extension: %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.stderr.write("http://www.briansimulator.org/docs/hears.html\n")
            sys.exit(-1)

    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write("You need Pylab\n")
            sys.exit(-1)

    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write("You need spectral (in the parent folder)\n")
            sys.stderr.write("https://github.com/mwv/spectral\n")
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            base = bdir + '/' + fname[:-4]
            wavfname = bdir + '/' + fname

            if sox:
                tempfname = base + '_temp.wav'  # temp fname with .wav for sox
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])  # w/o headers, sox uses extension
                shutil.move(tempfname, base + '.rawaudio')
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname,
                      base + mfc_extension])

            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is 1-D
                # Sum both channels. ``sum`` accumulates narrow integer
                # samples in a wider type, so int16 audio cannot wrap
                # around as it does with ``sound[:, 0] + sound[:, 1]``.
                sound = sound[:, :2].sum(axis=1)

            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is a binary format: open the handle in binary mode
                with open(base + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                with open(base + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)

            if filterbanks:
                # convert to Mel filterbanks; the extractor is built once,
                # from the sampling rate of the first file
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(
                        nfilt=N_FBANKS,         # nb of filters in mel bank
                        alpha=0.97,             # pre-emphasis
                        do_dct=False,           # we do not want MFCCs
                        fs=srate,               # sampling rate
                        frate=FBANKS_RATE,      # frame rate
                        wlen=FBANKS_WINDOW,     # window length
                        nfft=1024,              # length of dft
                        do_deltas=False,        # speed
                        do_deltasdeltas=False)  # acceleration
                # first dimension is for deltas & deltasdeltas
                fbank = fbanks.transform(sound)[0]
                with open(base + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)

            # TODO wavelets scattergrams / scalograms
            print("dealt with file %s" % wavfname)
def process(folder, debug=False, htk_mfcc=False, forcemfcext=False,
            stereo_wave=False, gammatones=False, spectograms=False,
            filterbanks=False, sox=True):
    """Extract features for all ``*.WAV`` files below *folder*.

    folder: root directory, walked recursively with ``os.walk``.
    debug: accepted but not used by this function.
    htk_mfcc: run ``HCopy -C wav_config`` and write ``<name>.txt``.
    forcemfcext: report ``.mfc`` as the MFC extension. The extension is
        only printed here; the HCopy output always ends in ``.txt``.
    stereo_wave: sum the first two channels of two-dimensional sample
        arrays.
    gammatones: save the Brian Hears gammatone filterbank output as
        ``<name>_gamma.npy``.
    spectograms: save the transposed pylab ``specgram`` power spectrum as
        ``<name>_specgram.npy``.
    filterbanks: save element 0 of the ``Spectral`` transform as
        ``<name>_fbanks.npy``.
    sox: run each file through ``sox`` first; the untouched original is
        kept as ``<name>.rawaudio``.

    Reads ``wav_config`` from the current working directory. Exits the
    interpreter with status -1 when a requested optional dependency cannot
    be imported. Returns None.
    """
    mfc_extension = '.mfc_unnorm'
    with open('wav_config', 'r') as wcfg:  # handle is released on every path
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print("MFC Extension is %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.exit(-1)

    if spectograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write('You need Pylab\n')
            sys.exit(-1)

    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            sys.stderr.write('you need spectral (in the parent folder)\n')
            # stop here like the other optional dependencies do; going on
            # would only fail later with a NameError on ``Spectral``
            sys.exit(-1)

    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.WAV':
                continue
            base = bdir + '/' + fname[:-4]
            wavfname = bdir + '/' + fname

            if sox:
                tempfname = base + '_temp.wav'
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                # Keep the original under .rawaudio. Moving it back onto
                # wavfname would overwrite the file sox has just written.
                shutil.move(tempfname, base + '.rawaudio')
            if htk_mfcc:
                call(['HCopy', '-C', 'wav_config', wavfname, base + '.txt'])

            srate, sound = wavfile.read(wavfname)
            if stereo_wave and len(sound.shape) == 2:
                # Sum both channels. ``sum`` accumulates narrow integer
                # samples in a wider type, so int16 audio cannot wrap
                # around as it does with ``sound[:, 0] + sound[:, 1]``.
                sound = sound[:, :2].sum(axis=1)

            if gammatones:
                tmp_snd = loadsound(wavfname)
                # NOTE(review): "n_gmammatones_filters" looks like a typo
                # for "n_gammatones_filters" - confirm which name the
                # module actually defines before renaming.
                gamma_cf = erbspace(20*Hz, 20*kHz, n_gmammatones_filters)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is a binary format: open the handle in binary mode
                with open(base + '_gamma.npy', 'wb') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectograms:
                # noverlap has to be smaller than NFFT; it used to be
                # computed from the window length as well.
                # NOTE(review): assumes the module defines specgram_overlap
                # next to specgram_window - confirm.
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * specgram_window),
                    Fs=srate,
                    noverlap=int(srate * specgram_overlap))
                with open(base + '_specgram.npy', 'wb') as o_f:
                    npsave(o_f, powerspec.T)

            if filterbanks:
                # the extractor is built once, from the sampling rate of
                # the first file
                if fbanks is None:
                    fbanks = Spectral(nfilt=n_fbanks,
                                      alpha=0.97,
                                      do_dct=False,
                                      fs=srate,
                                      frate=fbanks_rate,
                                      wlen=fbanks_window,
                                      nfft=1024,
                                      do_deltas=False,
                                      do_deltasdeltas=False)
                fbank = fbanks.transform(sound)[0]
                with open(base + '_fbanks.npy', 'wb') as o_f:
                    npsave(o_f, fbank)

            print("Dealt with the file  %s" % wavfname)
'group_speaker':group_speaker, 'pitch_var':self.speaker_pitch_rel[speaker], 'verbose':True } paths = { 'input_path':self.learner_path, 'wav_folder':self.output_path+'/'+vowel } synthesize.main(input_dict,paths) wavFile = synthesize_wav.main( params, speaker, simulation_name,pitch_var=0,len_var=1.0,verbose=self.verbose,self.rank=self.rank, different_output_path=self.output_folder) # wavFile = par_to_wav(params, speaker, simulation_name, verbose=self.verbose, self.rank=self.rank) # call parToWave to generate sound file if self.verbose: print 'wav file '+str(wavFile)+' produced' sound = loadsound(wavFile) # load sound file for brian.hears processing if self.verbose: print 'sound loaded' ############### Audio processing sound = correct_initial(sound) # call correct_initial to remove initial burst sound_resampled = get_resampled(sound) # call get_resampled to adapt generated sound to AN model sound_extended = get_extended(sound_resampled) # call get_extended to equalize duration of all sounds sound_extended.save(wavFile) # save current sound as sound file
def extract_from_mlf(mlf, do_gammatones):
    """Collect features and frame labels for every file listed in *mlf*.

    Lines starting with a double quote name a label file; the matching
    ``.mfc`` (HTK MFCC), filterbank and, optionally, gammatone features
    are loaded for it. Lines starting with a digit are ``start end state``
    entries; each one labels ``end - start`` frames (after conversion to
    frame indices) with ``state`` and the current speaker.

    mlf: path of the master label file. Outputs are written next to it,
        using the path without its last four characters as root name:
        ``_xdata.npy``, ``_xfbank.npy``, ``_ylabels.npy``,
        ``_yspeakers.npy`` and, with *do_gammatones*, ``_xgamma.npy``.
    do_gammatones: also compute, subsample and store gammatone features.

    Raises AssertionError when the number of labels and frames disagree.
    Calls ``sys.exit`` on a gammatone/MFCC length mismatch and at the end
    of the self check that runs when the module-level ``TEST`` is true.
    """
    # Per-file arrays are collected and concatenated once at the end;
    # growing the arrays with np.append inside the loop copies everything
    # read so far for every new file.
    x_parts = [np.ndarray((0, N_MFCC_COEFFS), dtype='float32')]
    fbank_parts = [np.ndarray((0, N_FILTERBANK_COEFFS), dtype='float32')]
    gamma_parts = [np.ndarray((0, N_GAMMATONES * 3), dtype='float32')]
    y = []
    y_spkr = []

    with open(mlf) as f:
        tmp_len_x = 0  # frames of the current file still without a label
        len_x = 0
        end = 0
        speaker_label = ''
        for line in f:
            line = line.rstrip('\n')
            if len(line) < 1:
                continue
            if line[0] == '"':
                assert tmp_len_x == 0, "the file above this one %s was mismatching x (%d frames) and y (%d frames) lengths by %d" % (
                    line, len_x, end, tmp_len_x)
                speaker_label = line.split('/')[-2]
                labname = line.strip('"')

                # load HTK's MFCC (read once, reused below)
                t = htkmfc.open(labname[:-3] + 'mfc')  # .lab -> .mfc
                mfcc = t.getall()
                n_frames = mfcc.shape[0]
                x_parts.append(mfcc)
                len_x = n_frames
                tmp_len_x = len_x

                if TALKBOX_FBANKS:  # do our own filterbanks TODO
                    fr, snd = wavfile.read(labname[:-3] + 'wav')  # .lab -> .wav
                    assert fr == SAMPLING_RATE, "SAMPLING_RATE is not what is found in the wav file"
                    _, fbank, _ = tbmfcc(snd,
                                         nwin=HAMMING_SIZE / 1000. * SAMPLING_RATE,
                                         nfft=2048,
                                         fs=SAMPLING_RATE,
                                         nceps=13)
                    fbank_parts.append(fbank)
                    assert n_frames == fbank.shape[0], "MFCC and filterbank not of the same length (not on the same sampling rate)"
                else:
                    # .npy is a binary format: open the handle in binary mode
                    with open(labname[:-4] + '_fbanks.npy', 'rb') as fbanksf:
                        fbank = np.load(fbanksf)
                    # it seems filterbanks obtained with spectral are a
                    # little longer at the end
                    if DEBUG:
                        print("cutting the last", fbank.shape[0] - n_frames,
                              "frames from the filterbank")
                    fbank = fbank[:n_frames]
                    fbank_parts.append(fbank)
                    assert n_frames == fbank.shape[0], "MFCC and filterbank not of the same length (not on the same sampling rate)"

                if do_gammatones:
                    # load the wav sound (with Brian)
                    sound = loadsound(labname[:-3] + 'wav')  # .lab -> .wav
                    # compute the gammatones (see Brian's doc)
                    bw = 10**(0.037 + 0.785 * log10(center_frequencies))
                    gammatone = ApproximateGammatone(sound, center_frequencies,
                                                     bw, order=3)
                    g = gammatone.process()
                    # subsample the gammatones at the same rate than the MFCC's
                    # (just for practicality so that they are aligned...)
                    n_samples = g.shape[0] * 1. / (n_frames + 1)  # TODO check "+1"
                    g_sub = subsample_apply_f(
                        g, n_samples, lambda z: np.sqrt(np.sum(np.square(z))))
                    # compute the delta and delta-delta of the subsampled gammatones
                    gamma_speed_accel = compute_speed_and_accel(g_sub)
                    tmp = gamma_speed_accel[:n_frames]  # TODO check
                    if tmp.shape[0] != n_frames:  # TODO remove
                        print(line)
                        print(tmp.shape)
                        print(mfcc.shape)
                        print(n_samples)
                        print(g.shape)
                        print("exiting because of the mismatch")
                        sys.exit(-1)
                    gamma_parts.append(tmp)

            elif line[0].isdigit():
                start, end, state = line.split()[:3]
                # HTK times -> frame indices. Floor division keeps the
                # bounds integral on Python 3, where "/" returns a float
                # that range() refuses.
                start = int((int(start) + 9999) // (MFCC_TIMESTEP * 10000))  # htk
                end = int((int(end) + 9999) // (MFCC_TIMESTEP * 10000))  # htk
                n_labelled = max(end - start, 0)
                tmp_len_x -= n_labelled
                y.extend([state] * n_labelled)
                y_spkr.extend([speaker_label] * n_labelled)

    x = np.concatenate(x_parts, axis=0)
    x_fbank = np.concatenate(fbank_parts, axis=0)
    x_gamma = np.concatenate(gamma_parts, axis=0)

    assert (len(y) == x.shape[0])
    assert (len(y_spkr) == x.shape[0])

    rootname = mlf[:-4]
    np.save(rootname + '_xdata.npy', x)
    np.save(rootname + '_xfbank.npy', x_fbank)
    if do_gammatones:
        np.save(rootname + '_xgamma.npy', x_gamma)
    yy = np.array(y)
    yy_spkr = np.array(y_spkr)
    np.save(rootname + '_ylabels.npy', yy)
    np.save(rootname + '_yspeakers.npy', yy_spkr)

    print("length x:", len(x), "length y:", len(y), "length y_spkr:",
          len(y_spkr))
    print("shape x:", x.shape, "shape yy:", yy.shape, "shape yy_spkr:",
          yy_spkr.shape)

    if TEST:
        # reload what was just written and compare with the in-memory data
        tx = np.load(rootname + '_xdata.npy')
        tx_fbank = np.load(rootname + '_xfbank.npy')
        if do_gammatones:
            tx_gamma = np.load(rootname + '_xgamma.npy')
        ty = np.load(rootname + '_ylabels.npy')
        ty_spkr = np.load(rootname + '_yspeakers.npy')
        if np.all(tx == x) and np.all(ty == yy) and np.all(ty_spkr == yy_spkr):
            assert_allclose(
                tx_fbank, x_fbank,
                err_msg="x_fbank and its serialized version are not allclose")
            if do_gammatones:
                assert_allclose(
                    tx_gamma, x_gamma,
                    err_msg="x_gamma and its serialized version are not allclose")
            print("SUCCESS: serialized and current in-memory arrays are equal")
            sys.exit(0)
        else:
            print("ERROR: serialized and current X (MFCC) or Y in-memory arrays differ!")
            print("x (MFCC):", np.all(tx == x))
            print("y (labels):", np.all(ty == yy))
            print("y (speakers):", np.all(ty_spkr == yy_spkr))
            sys.exit(-1)
def evaluate_environment(params, i_global, simulation_name, outputfolder, i_target=0, rank=1, speaker='adult', n_vow=5, normalize=False):
    """Synthesize a sound from *params* and rate it with the stored classifier.

    The sound is generated with ``parToWave``, cleaned up, resampled and
    extended, saved back to its wave file and copied to
    ``<outputfolder>data/vowel_<i_target>_<rank>.wav``. Its ``drnl`` output
    is then fed to the classifier unpickled from
    ``data/current_auditory_system.flow``.

    params: parameters handed to ``parToWave``.
    i_global: accepted but not used by this function.
    simulation_name: handed to ``parToWave``.
    outputfolder: prefix of the copy destination and folder handed to
        ``plot_reservoir_states``.
    i_target: index used in the copy's file name and for plotting.
    rank: handed to ``parToWave``; also part of the copy's file name.
    speaker: handed to ``parToWave``.
    n_vow: handed to ``plot_reservoir_states``.
    normalize: pass the classifier output through ``normalize_activity``.

    Progress messages and playback depend on the module-level ``output``
    and ``playback`` flags.

    Returns the result of ``get_confidences`` applied to the classifier
    output averaged over axis 0.
    """
    import shutil
    import sys

    folder = outputfolder

    ############### Sound generation
    if output:
        print('simulating vocal tract')
    # call parToWave to generate sound file
    wavFile = parToWave(params, speaker, simulation_name, verbose=output,
                        rank=rank)
    if output:
        print('wav file '+str(wavFile)+' produced')
    sound = loadsound(wavFile)  # load sound file for brian.hears processing
    if output:
        print('sound loaded')

    ############### Audio processing
    sound = correct_initial(sound)  # remove initial burst
    sound_resampled = get_resampled(sound)  # adapt generated sound to AN model
    sound_extended = get_extended(sound_resampled)  # equalize duration of all sounds
    sound_extended.save(wavFile)  # save current sound as sound file

    # Keep a copy of the sound. Copying from Python avoids building a shell
    # command from file names (which breaks on spaces); as with the
    # previous "cp" call, a failed copy is reported but not fatal.
    copy_name = folder+'data/vowel_'+str(i_target)+'_'+str(rank)+'.wav'
    try:
        shutil.copyfile(wavFile, copy_name)
    except EnvironmentError as err:
        sys.stderr.write('could not copy %s to %s: %s\n'
                         % (wavFile, copy_name, err))

    if playback:
        print('playing back...')
        sound_extended.play(sleep=True)  # play back sound file

    if output:
        print('sound acquired, preparing auditory processing')
    out = drnl(sound_extended)  # call drnl to get cochlear activation

    ############### Classifier evaluation
    flow_name = 'data/current_auditory_system.flow'
    # Pickles are binary data: open in binary mode, and let the context
    # manager close the file even when loading fails.
    # NOTE: unpickling executes code - only load classifier files you trust.
    with open(flow_name, 'rb') as flow_file:
        flow = cPickle.load(flow_file)

    # evaluate trained output units' responses for current item
    sample_vote_unnormalized = flow(out)
    if normalize:
        sample_vote = normalize_activity(sample_vote_unnormalized)
    else:
        sample_vote = sample_vote_unnormalized

    # average each output neuron's response over time
    mean_sample_vote = np.mean(sample_vote, axis=0)
    confidences = get_confidences(mean_sample_vote)

    plot_reservoir_states(flow, sample_vote, i_target, folder, n_vow, rank)

    return confidences
def main(args):
    """Generate perturbed samples of one vowel and store their responses.

    *args* is a mapping with the keys ``<vowel>``, ``--n_samples``,
    ``--n_channels``, ``--sigma``, ``--uncompressed``, ``--infant`` and
    ``--monotone``. For every sample a noisy copy of the vowel's initial
    parameters is drawn (redrawn until all values lie in [0, 1]), turned
    into a wave file under ``data/temp/<vowel>/``, post-processed, and its
    ``drnl`` output is dumped into a gzip file next to the wave file.
    """
    vowel = args["<vowel>"]
    n_samples = int(args["--n_samples"])
    n_channels = int(args["--n_channels"])
    sigma = float(args["--sigma"])
    uncompressed = args["--uncompressed"]
    infant = args["--infant"]
    monotone = args["--monotone"]

    print('generating ' + vowel + ' samples, infant mode: ' + str(infant))

    np.random.seed()  # numpy random seed w.r.t. global runtime

    # reduce the flag to a strict boolean and derive the speaker name
    infant = bool(infant)
    speaker = 'infant' if infant else 'adult'

    initial_params_r = get_initial_params_r(vowel, infant=infant)

    for i_global in xrange(n_samples):
        folder = 'data/temp/' + vowel + '/'
        if not os.path.isdir(folder):
            os.makedirs(folder)

        stem = folder + vowel + '_' + str(i_global)
        filename_act = stem + '.dat.gz'
        filename_wav = stem + '.wav'  # sound file name of current simulation

        # draw until every coordinate lies inside [0, 1]
        while True:
            noise = np.random.randn(16) * sigma  # standard normally distributed vector
            x = initial_params_r + noise  # add mutation, Eq. 37
            if not ((x < 0.0).any() or (x > 1.0).any()):
                break
            print('sample rejected. resampling.')

        params_tot = get_abs_coord(x, infant=infant)

        ############### Sound generation
        wav_file = par_to_wav(params_tot,
                              speaker=speaker,
                              simulation_name=vowel,
                              different_folder=filename_wav,
                              monotone=monotone)
        print('wav file ' + str(wav_file) + ' produced')

        sound = loadsound(wav_file)  # load sound file for brian.hears processing
        print('sound loaded')

        # remove the initial burst, adapt the sound to the AN model and
        # equalize the duration of all sounds, then overwrite the wave file
        sound_extended = get_extended(get_resampled(correct_initial(sound)))
        sound_extended.save(wav_file)

        print('sound acquired, preparing sound processing')

        ############### Audio processing
        # call drnl to get cochlear activation
        out = drnl(sound_extended, n_channels, uncompressed)

        print('writing auditory nerve response')
        # dump the numpy array into a new gzip file
        with gzip.open(filename_act, 'wb') as outputfile:
            out.dump(outputfile)

    print('done')
def process(
    folder,
    debug=False,
    htk_mfc=False,
    forcemfcext=False,
    stereo_wav=False,
    gammatones=False,
    spectrograms=False,
    filterbanks=False,
    sox=True,
):
    """Extract features for all ``*.wav`` files below *folder*.

    folder: root directory, walked recursively with ``os.walk``.
    debug: accepted but not used by this function.
    htk_mfc: run ``HCopy -C wav_config`` to produce the MFCC file.
    forcemfcext: use the ``.mfc`` extension whatever ``wav_config`` says.
    stereo_wav: sum the first two channels of two-dimensional sample arrays.
    gammatones: save the Brian Hears gammatone filterbank output as
        ``<name>_gamma.npy``.
    spectrograms: save the transposed pylab ``specgram`` power spectrum as
        ``<name>_specgram.npy``.
    filterbanks: save element 0 of the ``Mel`` transform as
        ``<name>_fbanks.npy``.
    sox: run each file through ``sox`` first; the untouched original is
        kept as ``<name>.rawaudio``.

    Reads ``wav_config`` from the current working directory. Exits the
    interpreter with status -1 when a requested optional dependency cannot
    be imported. Returns None.
    """
    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = ".mfc_unnorm"
    with open("wav_config", "r") as wcfg:  # handle is released on every path
        for line in wcfg:
            if "ENORMALISE" in line:
                mfc_extension = ".mfc"
    if forcemfcext:
        mfc_extension = ".mfc"
    print("MFC extension: %s" % mfc_extension)

    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            sys.stderr.write("You need Brian Hears\n")
            sys.stderr.write("http://www.briansimulator.org/docs/hears.html\n")
            sys.exit(-1)

    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            sys.stderr.write("You need Pylab\n")
            sys.exit(-1)

    fbanks = None
    if filterbanks:
        try:
            sys.path.append("../spectral")
            from spectral import Mel
        except ImportError:
            sys.stderr.write("You need spectral (in the parent folder)\n")
            sys.stderr.write("https://github.com/mwv/spectral\n")
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != ".wav":
                continue
            base = bdir + "/" + fname[:-4]
            wavfname = bdir + "/" + fname

            if sox:
                tempfname = base + "_temp.wav"  # temp fname with .wav for sox
                shutil.move(wavfname, tempfname)
                call(["sox", tempfname, wavfname])  # w/o headers, sox uses extension
                shutil.move(tempfname, base + ".rawaudio")
            if htk_mfc:
                call(["HCopy", "-C", "wav_config", wavfname,
                      base + mfc_extension])

            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is 1-D
                # Sum both channels. ``sum`` accumulates narrow integer
                # samples in a wider type, so int16 audio cannot wrap
                # around as it does with ``sound[:, 0] + sound[:, 1]``.
                sound = sound[:, :2].sum(axis=1)

            if gammatones:
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                # .npy is a binary format: open the handle in binary mode
                with open(base + "_gamma.npy", "wb") as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP),
                )  # TODO
                with open(base + "_specgram.npy", "wb") as o_f:
                    npsave(o_f, powerspec.T)

            if filterbanks:
                # convert to Mel filterbanks; the extractor is built once,
                # from the sampling rate of the first file
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Mel(
                        nfilt=N_FBANKS,  # nb of filters in mel bank
                        alpha=0.97,  # pre-emphasis
                        fs=srate,  # sampling rate
                        frate=FBANKS_RATE,  # frame rate
                        wlen=FBANKS_WINDOW,  # window length
                        nfft=1024,  # length of dft
                        mel_deltas=False,  # speed
                        mel_deltasdeltas=False,  # acceleration
                    )
                # first dimension is for deltas & deltasdeltas
                fbank = fbanks.transform(sound)[0]
                with open(base + "_fbanks.npy", "wb") as o_f:
                    npsave(o_f, fbank)

            # TODO wavelets scattergrams / scalograms
            print("dealt with file %s" % wavfname)
def main(args):
    """Produce noisy vowel samples together with their drnl responses.

    *args* maps the command line names ``<vowel>``, ``--n_samples``,
    ``--n_channels``, ``--sigma``, ``--uncompressed``, ``--infant`` and
    ``--monotone`` to their values. Each of the ``--n_samples`` iterations
    perturbs the vowel's initial parameters with Gaussian noise scaled by
    ``--sigma`` (redrawing while any value leaves [0, 1]), synthesizes a
    wave file below ``data/temp/<vowel>/``, post-processes it in place and
    dumps the ``drnl`` output into ``<name>.dat.gz``.
    """
    vowel = args["<vowel>"]
    n_samples = int(args["--n_samples"])
    n_channels = int(args["--n_channels"])
    sigma = float(args["--sigma"])
    uncompressed = args["--uncompressed"]
    infant = args["--infant"]
    monotone = args["--monotone"]

    print('generating ' + vowel + ' samples, infant mode: ' + str(infant))

    np.random.seed()  # numpy random seed w.r.t. global runtime

    # the flag decides the speaker; afterwards it is kept as a plain bool
    speaker = 'infant' if infant else 'adult'
    infant = (speaker == 'infant')

    initial_params_r = get_initial_params_r(vowel, infant=infant)

    for i_global in xrange(n_samples):
        folder = 'data/temp/'+vowel+'/'
        if not os.path.isdir(folder):
            os.makedirs(folder)

        name = folder + vowel + '_' + str(i_global)
        filename_act = name+'.dat.gz'
        filename_wav = name+'.wav'  # sound file name of current simulation

        # rejection sampling: keep drawing until x lies inside [0, 1]
        rejected = True
        while rejected:
            noise = np.random.randn(16) * sigma  # standard normally distributed vector
            x = initial_params_r + noise  # add mutation, Eq. 37
            too_low = (x < 0.0).any()
            rejected = too_low or (x > 1.0).any()
            if rejected:
                print('sample rejected. resampling.')

        params_tot = get_abs_coord(x, infant=infant)

        ############### Sound generation
        synth_options = {
            'speaker': speaker,
            'simulation_name': vowel,
            'different_folder': filename_wav,
            'monotone': monotone,
        }
        wav_file = par_to_wav(params_tot, **synth_options)
        print('wav file ' + str(wav_file) + ' produced')

        sound = loadsound(wav_file)  # load sound file for brian.hears processing
        print('sound loaded')

        sound = correct_initial(sound)  # remove initial burst
        sound_resampled = get_resampled(sound)  # adapt generated sound to AN model
        sound_extended = get_extended(sound_resampled)  # equalize duration of all sounds
        sound_extended.save(wav_file)  # save current sound as sound file

        print('sound acquired, preparing sound processing')

        ############### Audio processing
        # call drnl to get cochlear activation
        out = drnl(sound_extended, n_channels, uncompressed)

        print('writing auditory nerve response')
        # create and open new output file in gzip write mode
        with gzip.open(filename_act, 'wb') as outputfile:
            out.dump(outputfile)  # dump numpy array into output file

    print('done')
def evaluate_environment(params,
                         i_global,
                         simulation_name,
                         outputfolder,
                         i_target=0,
                         rank=1,
                         speaker='adult',
                         n_vow=5,
                         normalize=False):
    """Synthesize a sound from *params* and rate it with the stored classifier.

    The sound is generated with ``parToWave``, cleaned up, resampled and
    extended, saved back to its wave file and copied to
    ``<outputfolder>data/vowel_<i_target>_<rank>.wav``. Its ``drnl`` output
    is then fed to the classifier unpickled from
    ``data/current_auditory_system.flow``.

    params: parameters handed to ``parToWave``.
    i_global: accepted but not used by this function.
    simulation_name: handed to ``parToWave``.
    outputfolder: prefix of the copy destination and folder handed to
        ``plot_reservoir_states``.
    i_target: index used in the copy's file name and for plotting.
    rank: handed to ``parToWave``; also part of the copy's file name.
    speaker: handed to ``parToWave``.
    n_vow: handed to ``plot_reservoir_states``.
    normalize: pass the classifier output through ``normalize_activity``.

    Progress messages and playback depend on the module-level ``output``
    and ``playback`` flags.

    Returns the result of ``get_confidences`` applied to the classifier
    output averaged over axis 0.
    """
    import shutil
    import sys

    folder = outputfolder

    ############### Sound generation
    if output:
        print('simulating vocal tract')
    # call parToWave to generate sound file
    wavFile = parToWave(params,
                        speaker,
                        simulation_name,
                        verbose=output,
                        rank=rank)
    if output:
        print('wav file ' + str(wavFile) + ' produced')
    sound = loadsound(wavFile)  # load sound file for brian.hears processing
    if output:
        print('sound loaded')

    ############### Audio processing
    sound = correct_initial(sound)  # remove initial burst
    sound_resampled = get_resampled(sound)  # adapt generated sound to AN model
    sound_extended = get_extended(sound_resampled)  # equalize duration of all sounds
    sound_extended.save(wavFile)  # save current sound as sound file

    # Keep a copy of the sound. Copying from Python avoids building a shell
    # command from file names (which breaks on spaces); as with the
    # previous "cp" call, a failed copy is reported but not fatal.
    copy_name = (folder + 'data/vowel_' + str(i_target) + '_' + str(rank) +
                 '.wav')
    try:
        shutil.copyfile(wavFile, copy_name)
    except EnvironmentError as err:
        sys.stderr.write('could not copy %s to %s: %s\n'
                         % (wavFile, copy_name, err))

    if playback:
        print('playing back...')
        sound_extended.play(sleep=True)  # play back sound file

    if output:
        print('sound acquired, preparing auditory processing')
    out = drnl(sound_extended)  # call drnl to get cochlear activation

    ############### Classifier evaluation
    flow_name = 'data/current_auditory_system.flow'
    # Pickles are binary data: open in binary mode, and let the context
    # manager close the file even when loading fails.
    # NOTE: unpickling executes code - only load classifier files you trust.
    with open(flow_name, 'rb') as flow_file:
        flow = cPickle.load(flow_file)

    # evaluate trained output units' responses for current item
    sample_vote_unnormalized = flow(out)
    if normalize:
        sample_vote = normalize_activity(sample_vote_unnormalized)
    else:
        sample_vote = sample_vote_unnormalized

    # average each output neuron's response over time
    mean_sample_vote = np.mean(sample_vote, axis=0)
    confidences = get_confidences(mean_sample_vote)

    plot_reservoir_states(flow, sample_vote, i_target, folder, n_vow, rank)

    return confidences
def extract_from_mlf(mlf, do_gammatones):
    """Collect features and frame labels for every file listed in *mlf*.

    Lines starting with a double quote name a label file; the matching
    ``.mfc`` (HTK MFCC), filterbank and, optionally, gammatone features
    are loaded for it. Lines starting with a digit are ``start end state``
    entries; each one labels ``end - start`` frames (after conversion to
    frame indices) with ``state`` and the current speaker.

    mlf: path of the master label file. Outputs are written next to it,
        using the path without its last four characters as root name:
        ``_xdata.npy``, ``_xfbank.npy``, ``_ylabels.npy``,
        ``_yspeakers.npy`` and, with *do_gammatones*, ``_xgamma.npy``.
    do_gammatones: also compute, subsample and store gammatone features.

    Raises AssertionError when the number of labels and frames disagree.
    Calls ``sys.exit`` on a gammatone/MFCC length mismatch and at the end
    of the self check that runs when the module-level ``TEST`` is true.
    """
    # Per-file arrays are collected and concatenated once at the end;
    # growing the arrays with np.append inside the loop copies everything
    # read so far for every new file.
    x_parts = [np.ndarray((0, N_MFCC_COEFFS), dtype='float32')]
    fbank_parts = [np.ndarray((0, N_FILTERBANK_COEFFS), dtype='float32')]
    gamma_parts = [np.ndarray((0, N_GAMMATONES*3), dtype='float32')]
    y = []
    y_spkr = []

    with open(mlf) as f:
        tmp_len_x = 0  # frames of the current file still without a label
        len_x = 0
        end = 0
        speaker_label = ''
        for line in f:
            line = line.rstrip('\n')
            if len(line) < 1:
                continue
            if line[0] == '"':
                assert tmp_len_x == 0, "the file above this one %s was mismatching x (%d frames) and y (%d frames) lengths by %d" % (line, len_x, end, tmp_len_x)
                speaker_label = line.split('/')[-2]
                labname = line.strip('"')

                # load HTK's MFCC (read once, reused below)
                t = htkmfc.open(labname[:-3] + 'mfc')  # .lab -> .mfc
                mfcc = t.getall()
                n_frames = mfcc.shape[0]
                x_parts.append(mfcc)
                len_x = n_frames
                tmp_len_x = len_x

                if TALKBOX_FBANKS:  # do our own filterbanks TODO
                    fr, snd = wavfile.read(labname[:-3] + 'wav')  # .lab -> .wav
                    assert fr == SAMPLING_RATE, "SAMPLING_RATE is not what is found in the wav file"
                    _, fbank, _ = tbmfcc(snd,
                                         nwin=HAMMING_SIZE/1000.*SAMPLING_RATE,
                                         nfft=2048,
                                         fs=SAMPLING_RATE,
                                         nceps=13)
                    fbank_parts.append(fbank)
                    assert n_frames == fbank.shape[0], "MFCC and filterbank not of the same length (not on the same sampling rate)"
                else:
                    # .npy is a binary format: open the handle in binary mode
                    with open(labname[:-4] + '_fbanks.npy', 'rb') as fbanksf:
                        fbank = np.load(fbanksf)
                    # The former "fbank != None" test is gone: np.load
                    # always returns an array, and comparing an array with
                    # None has no single truth value.
                    # it seems filterbanks obtained with spectral are a
                    # little longer at the end
                    if DEBUG:
                        print("cutting the last %s frames from the filterbank"
                              % (fbank.shape[0] - n_frames))
                    fbank = fbank[:n_frames]
                    fbank_parts.append(fbank)
                    assert n_frames == fbank.shape[0], "MFCC and filterbank not of the same length (not on the same sampling rate)"

                if do_gammatones:
                    # load the wav sound (with Brian)
                    sound = loadsound(labname[:-3] + 'wav')  # .lab -> .wav
                    # compute the gammatones (see Brian's doc)
                    bw = 10**(0.037+0.785*log10(center_frequencies))
                    gammatone = ApproximateGammatone(sound, center_frequencies,
                                                     bw, order=3)
                    g = gammatone.process()
                    # subsample the gammatones at the same rate than the MFCC's
                    # (just for practicality so that they are aligned...)
                    n_samples = g.shape[0]*1./(n_frames + 1)  # TODO check "+1"
                    g_sub = subsample_apply_f(
                        g, n_samples, lambda z: np.sqrt(np.sum(np.square(z))))
                    # compute the delta and delta-delta of the subsampled gammatones
                    gamma_speed_accel = compute_speed_and_accel(g_sub)
                    tmp = gamma_speed_accel[:n_frames]  # TODO check
                    if tmp.shape[0] != n_frames:  # TODO remove
                        print(line)
                        print(tmp.shape)
                        print(mfcc.shape)
                        print(n_samples)
                        print(g.shape)
                        print("exiting because of the mismatch")
                        sys.exit(-1)
                    gamma_parts.append(tmp)

            elif line[0].isdigit():
                start, end, state = line.split()[:3]
                # HTK times -> frame indices; floor division gives the same
                # integers on Python 2 and 3
                start = int((int(start)+9999) // (MFCC_TIMESTEP * 10000))  # htk
                end = int((int(end)+9999) // (MFCC_TIMESTEP * 10000))  # htk
                n_labelled = max(end - start, 0)
                tmp_len_x -= n_labelled
                y.extend([state] * n_labelled)
                y_spkr.extend([speaker_label] * n_labelled)

    x = np.concatenate(x_parts, axis=0)
    x_fbank = np.concatenate(fbank_parts, axis=0)
    x_gamma = np.concatenate(gamma_parts, axis=0)

    assert(len(y) == x.shape[0])
    assert(len(y_spkr) == x.shape[0])

    rootname = mlf[:-4]
    np.save(rootname + '_xdata.npy', x)
    np.save(rootname + '_xfbank.npy', x_fbank)
    if do_gammatones:
        np.save(rootname + '_xgamma.npy', x_gamma)
    yy = np.array(y)
    yy_spkr = np.array(y_spkr)
    np.save(rootname + '_ylabels.npy', yy)
    np.save(rootname + '_yspeakers.npy', yy_spkr)

    print("length x: %s length y: %s length y_spkr: %s"
          % (len(x), len(y), len(y_spkr)))
    print("shape x: %s shape yy: %s shape yy_spkr: %s"
          % (x.shape, yy.shape, yy_spkr.shape))

    if TEST:
        # reload what was just written and compare with the in-memory data
        tx = np.load(rootname + '_xdata.npy')
        tx_fbank = np.load(rootname + '_xfbank.npy')
        if do_gammatones:
            tx_gamma = np.load(rootname + '_xgamma.npy')
        ty = np.load(rootname + '_ylabels.npy')
        ty_spkr = np.load(rootname + '_yspeakers.npy')
        if np.all(tx==x) and np.all(ty==yy) and np.all(ty_spkr==yy_spkr):
            assert_allclose(tx_fbank, x_fbank, err_msg="x_fbank and its serialized version are not allclose")
            if do_gammatones:
                assert_allclose(tx_gamma, x_gamma, err_msg="x_gamma and its serialized version are not allclose")
            print("SUCCESS: serialized and current in-memory arrays are equal")
            sys.exit(0)
        else:
            print("ERROR: serialized and current X (MFCC) or Y in-memory arrays differ!")
            print("x (MFCC): %s" % np.all(tx==x))
            print("y (labels): %s" % np.all(ty==yy))
            print("y (speakers): %s" % np.all(ty_spkr==yy_spkr))
            sys.exit(-1)