def train_auto(train, fun, transform, testdir, outdir, num_epochs=30,
               model="1.pkl", scale_factor=0.3, load=False, skip_train=False,
               skip_sep=False):
    """
    Trains a network built with "fun" with the data generated with "train"
    and then separates the files in "testdir", writing the result in "outdir".

    Parameters
    ----------
    train : callable, e.g. LargeDataset object
        The callable which generates training data for the network:
        inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features
        (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass through all
        examples in the dataset)
    model : string, optional
        The path where to save the trained model (Theano tensor containing
        the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor

    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """
    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')

    eps = 1e-8
    alpha = 0.001
    beta = 0.01
    beta_voc = 0.03

    network2 = fun(input_var=input_var2, batch_size=train.batch_size,
                   time_context=train.time_context, feat_size=train.input_size)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size, 1,
                                       train.time_context, train.input_size))

    voc = prediction2[:, 0:1, :, :] + eps * rand_num
    bas = prediction2[:, 1:2, :, :] + eps * rand_num
    dru = prediction2[:, 2:3, :, :] + eps * rand_num
    oth = prediction2[:, 3:4, :, :] + eps * rand_num

    mask1 = voc / (voc + bas + dru + oth)
    mask2 = bas / (voc + bas + dru + oth)
    mask3 = dru / (voc + bas + dru + oth)
    mask4 = oth / (voc + bas + dru + oth)

    vocals = mask1 * input_var2
    bass = mask2 * input_var2
    drums = mask3 * input_var2
    others = mask4 * input_var2

    # reconstruction terms per source, plus dissimilarity terms: the alpha
    # and beta terms measure how much an estimate matches the *other*
    # sources' targets and are subtracted from the loss below
    train_loss_recon_vocals = lasagne.objectives.squared_error(
        vocals, target_var2[:, 0:1, :, :])
    alpha_component = alpha * lasagne.objectives.squared_error(
        vocals, target_var2[:, 1:2, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        vocals, target_var2[:, 2:3, :, :])
    train_loss_recon_neg_voc = beta_voc * lasagne.objectives.squared_error(
        vocals, target_var2[:, 3:4, :, :])

    train_loss_recon_bass = lasagne.objectives.squared_error(
        bass, target_var2[:, 1:2, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        bass, target_var2[:, 0:1, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        bass, target_var2[:, 2:3, :, :])
    train_loss_recon_neg = beta * lasagne.objectives.squared_error(
        bass, target_var2[:, 3:4, :, :])

    train_loss_recon_drums = lasagne.objectives.squared_error(
        drums, target_var2[:, 2:3, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        drums, target_var2[:, 0:1, :, :])
    alpha_component += alpha * lasagne.objectives.squared_error(
        drums, target_var2[:, 1:2, :, :])
    train_loss_recon_neg += beta * lasagne.objectives.squared_error(
        drums, target_var2[:, 3:4, :, :])

    vocals_error = train_loss_recon_vocals.sum()
    drums_error = train_loss_recon_drums.sum()
    bass_error = train_loss_recon_bass.sum()
    negative_error = train_loss_recon_neg.sum()
    negative_error_voc = train_loss_recon_neg_voc.sum()
    alpha_component = alpha_component.sum()

    # minimising this drives each estimate towards its own target and away
    # from the other targets
    loss = abs(vocals_error + drums_error + bass_error - negative_error -
               alpha_component - negative_error_voc)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)
    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2, target_var2], loss,
                               updates=updates, allow_input_downcast=True)
    train_fn1 = theano.function(
        [input_var2, target_var2],
        [vocals_error, bass_error, drums_error, negative_error,
         alpha_component, negative_error_voc],
        allow_input_downcast=True)
    predict_function2 = theano.function(
        [input_var2], [vocals, bass, drums, others],
        allow_input_downcast=True)

    losser = []

    if not skip_train:
        logging.info("Training...")
        for epoch in range(num_epochs):
            train_err = 0
            train_batches = 0
            vocals_err = 0
            drums_err = 0
            bass_err = 0
            negative_err = 0
            alpha_err = 0
            beta_voc_err = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()

                jump = inputs.shape[2]
                inputs = np.reshape(inputs, (inputs.shape[0], 1,
                                             inputs.shape[1], inputs.shape[2]))
                # the generator concatenates the four sources along the
                # feature axis; split them into separate target channels
                targets = np.ndarray(shape=(inputs.shape[0], 4,
                                            inputs.shape[2], inputs.shape[3]))
                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None

                train_err += train_fn(inputs, targets)
                [vocals_erre, bass_erre, drums_erre, negative_erre,
                 alpha_erre, betae_voc] = train_fn1(inputs, targets)
                vocals_err += vocals_erre
                bass_err += bass_erre
                drums_err += drums_erre
                negative_err += negative_erre
                beta_voc_err += betae_voc
                alpha_err += alpha_erre
                train_batches += 1

            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            print(" training loss:\t\t{:.6f}".format(train_err / train_batches))
            print(" training loss for vocals:\t\t{:.6f}".format(
                vocals_err / train_batches))
            print(" training loss for bass:\t\t{:.6f}".format(
                bass_err / train_batches))
            print(" training loss for drums:\t\t{:.6f}".format(
                drums_err / train_batches))
            print(" Beta component:\t\t{:.6f}".format(
                negative_err / train_batches))
            print(" Beta component for voice:\t\t{:.6f}".format(
                beta_voc_err / train_batches))
            print(" alpha component:\t\t{:.6f}".format(
                alpha_err / train_batches))

            losser.append(train_err / train_batches)
            save_model(model, network2)

    if not skip_sep:
        logging.info("Separating")
        source = ['vocals', 'bass', 'drums', 'other']
        dev_directory = os.listdir(os.path.join(testdir, "Dev"))
        test_directory = os.listdir(os.path.join(testdir, "Test"))
        dirlist = []
        dirlist.extend(dev_directory)
        dirlist.extend(test_directory)
        for f in sorted(dirlist):
            if not f.startswith('.'):
                if f in dev_directory:
                    song = os.path.join(testdir, "Dev", f, "mixture.wav")
                else:
                    song = os.path.join(testdir, "Test", f, "mixture.wav")
                audioObj, sampleRate, bitrate = util.readAudioScipy(song)
                assert sampleRate == 44100, "Sample rate needs to be 44100"

                audio = (audioObj[:, 0] + audioObj[:, 1]) / 2  # downmix to mono
                audioObj = None

                mag, ph = transform.compute_file(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)

                batches, nchunks = util.generate_overlapadd(
                    mag, input_size=mag.shape[-1],
                    time_context=train.time_context, overlap=train.overlap,
                    batch_size=train.batch_size, sampleRate=sampleRate)

                output = []
                for batch in batches:
                    output.append(predict_function2(batch))
                output = np.array(output)
                mm = util.overlapadd_multi(output, batches, nchunks,
                                           overlap=train.overlap)

                # write the separated audio files
                if f in dev_directory:
                    dirout = os.path.join(outdir, "Dev", f)
                else:
                    dirout = os.path.join(outdir, "Test", f)
                if not os.path.exists(dirout):
                    os.makedirs(dirout)
                for i in range(mm.shape[0]):
                    audio_out = transform.compute_inverse(
                        mm[i, :len(ph)] / scale_factor, ph)
                    if len(audio_out) > len(audio):
                        audio_out = audio_out[:len(audio)]
                    util.writeAudioScipy(
                        os.path.join(dirout, source[i] + '.wav'),
                        audio_out, sampleRate, bitrate)
                    audio_out = None
                audio = None
    return losser
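
# ---------------------------------------------------------------------------
# Illustrative numpy sketch (not part of the original training code): the
# soft "ratio mask" used by every train_auto variant in this file. The
# network predicts one magnitude estimate per source; each estimate is
# normalised by the sum of all estimates, and the resulting mask is applied
# to the mixture spectrogram. Shapes and the epsilon are assumptions chosen
# for the example.
def _ratio_mask_demo():
    import numpy as np
    eps = 1e-8
    # (nsources, time, freq) network estimates and a mixture magnitude
    estimates = np.random.rand(4, 30, 513) + eps
    mixture = np.random.rand(30, 513)

    masks = estimates / estimates.sum(axis=0, keepdims=True)
    sources = masks * mixture  # broadcasts over the source axis
    # the masks sum to one, so the source estimates add back to the mixture
    assert np.allclose(sources.sum(axis=0), mixture)
    return sources
# ---------------------------------------------------------------------------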
def train_auto(fun, transform, testdir, outdir, testfile_list, testdir1,
               outdir1, testfile_list1, num_epochs=30, model="1.pkl",
               scale_factor=0.3, load=False, skip_train=False, skip_sep=False,
               path_transform_in=None, nsamples=40, batch_size=32,
               batch_memory=50, time_context=30, overlap=25, nprocs=4,
               mult_factor_in=0.3, mult_factor_out=0.3,
               timbre_model_path=None):
    """
    Trains a network built with "fun" with data generated by a
    LargeDatasetMask2 object and then separates the files listed in
    "testfile_list" (located in "testdir"), writing the result in "outdir".

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features
        (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass through all
        examples in the dataset)
    model : string, optional
        The path where to save the trained model (Theano tensor containing
        the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor

    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """
    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')

    # parameters for the score-informed separation
    nharmonics = 20
    interval = 50      # cents
    tuning_freq = 440  # Hz

    eps = 1e-18
    alpha = 0.001

    input_size = int(float(transform.frameSize) / 2 + 1)

    network2 = fun(input_var=input_var2, batch_size=batch_size,
                   time_context=time_context, feat_size=input_size,
                   nchannels=4)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(batch_size, 1, time_context,
                                       input_size))

    s1 = prediction2[:, 0:1, :, :]
    s2 = prediction2[:, 1:2, :, :]
    s3 = prediction2[:, 2:3, :, :]
    s4 = prediction2[:, 3:4, :, :]

    mask1 = s1 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask2 = s2 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask3 = s3 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask4 = s4 / (s1 + s2 + s3 + s4 + eps * rand_num)

    # the input channels hold the score-filtered mixture per instrument;
    # sum them to recover the mixture before applying the masks
    input_var = input_var2[:, 0:1, :, :] + input_var2[:, 1:2, :, :] + \
        input_var2[:, 2:3, :, :] + input_var2[:, 3:4, :, :]

    source1 = mask1 * input_var[:, 0:1, :, :]
    source2 = mask2 * input_var[:, 0:1, :, :]
    source3 = mask3 * input_var[:, 0:1, :, :]
    source4 = mask4 * input_var[:, 0:1, :, :]

    train_loss_recon1 = lasagne.objectives.squared_error(
        source1, target_var2[:, 0:1, :, :])
    train_loss_recon2 = lasagne.objectives.squared_error(
        source2, target_var2[:, 1:2, :, :])
    train_loss_recon3 = lasagne.objectives.squared_error(
        source3, target_var2[:, 2:3, :, :])
    train_loss_recon4 = lasagne.objectives.squared_error(
        source4, target_var2[:, 3:4, :, :])

    error1 = train_loss_recon1.sum()
    error2 = train_loss_recon2.sum()
    error3 = train_loss_recon3.sum()
    error4 = train_loss_recon4.sum()

    loss = abs(error1 + error2 + error3 + error4)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)
    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2, target_var2], loss,
                               updates=updates, allow_input_downcast=True)
    train_fn1 = theano.function([input_var2, target_var2],
                                [error1, error2, error3, error4],
                                allow_input_downcast=True)
    predict_function2 = theano.function(
        [input_var2], [source1, source2, source3, source4],
        allow_input_downcast=True)

    losser = []
    min_loss = 1e14
    training_steps = 0

    if not skip_train:
        logging.info("Training...")
        for epoch in range(num_epochs):
            # build a fresh data generator for each epoch
            train = LargeDatasetMask2(
                path_transform_in=path_transform_in, nsources=4,
                nsamples=nsamples, batch_size=batch_size,
                batch_memory=batch_memory, time_context=time_context,
                overlap=overlap, nprocs=nprocs,
                mult_factor_in=scale_factor, mult_factor_out=scale_factor,
                sampleRate=transform.sampleRate, pitch_code='e',
                nharmonics=20, pitch_norm=127.,
                tensortype=theano.config.floatX,
                timbre_model_path=timbre_model_path)
            train_err = 0
            train_batches = 0
            err1 = 0
            err2 = 0
            err3 = 0
            err4 = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target, masks = train()

                jump = inputs.shape[2]
                # apply the score-derived masks to the input, one channel
                # per instrument
                mask = np.empty(shape=(inputs.shape[0], 4, inputs.shape[1],
                                       inputs.shape[2]),
                                dtype=theano.config.floatX)
                mask[:, 0, :, :] = masks[:, :, :jump] * inputs
                mask[:, 1, :, :] = masks[:, :, jump:jump * 2] * inputs
                mask[:, 2, :, :] = masks[:, :, jump * 2:jump * 3] * inputs
                mask[:, 3, :, :] = masks[:, :, jump * 3:jump * 4] * inputs
                masks = None

                targets = np.empty(shape=(inputs.shape[0], 4, inputs.shape[1],
                                          inputs.shape[2]),
                                   dtype=theano.config.floatX)
                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None
                inputs = None

                train_err += train_fn(mask, targets)
                [e1, e2, e3, e4] = train_fn1(mask, targets)
                err1 += e1
                err2 += e2
                err3 += e3
                err4 += e4
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info(" training loss:\t\t{:.6f}".format(
                train_err / train_batches))
            logging.info(" training loss for bassoon:\t\t{:.6f}".format(
                err1 / train_batches))
            logging.info(" training loss for clarinet:\t\t{:.6f}".format(
                err2 / train_batches))
            logging.info(" training loss for saxophone:\t\t{:.6f}".format(
                err3 / train_batches))
            logging.info(" training loss for violin:\t\t{:.6f}".format(
                err4 / train_batches))

            losser.append(train_err / train_batches)
            # if (train_err / train_batches) < min_loss:
            #     min_loss = train_err / train_batches
            save_model(model, network2)
            # training_steps = training_steps + 1
            # num_epochs = int(np.ceil(float(num_epochs) / 5.))
            # if losser[-1] > min_loss:
            #     params = load_model(model)
            #     lasagne.layers.set_all_param_values(network2, params)
            #     updates = lasagne.updates.adam(loss, params1,
            #                                    learning_rate=0.0001)
            #     train_fn = theano.function([input_var2, target_var2], loss,
            #                                updates=updates,
            #                                allow_input_downcast=True)

    if not skip_sep:
        logging.info("Separating")
        # 'saxphone' matches the audio file naming in the test set
        sources = ['bassoon', 'clarinet', 'saxphone', 'violin']
        sources_midi = ['bassoon', 'clarinet', 'saxophone', 'violin']

        train = LargeDatasetMask2(
            path_transform_in=path_transform_in, nsources=4,
            batch_size=batch_size, batch_memory=batch_memory,
            time_context=time_context, overlap=overlap, nprocs=nprocs,
            mult_factor_in=scale_factor, mult_factor_out=scale_factor,
            sampleRate=transform.sampleRate, pitch_code='e', nharmonics=20,
            pitch_norm=127., tensortype=theano.config.floatX,
            timbre_model_path=timbre_model_path)

        for f in testfile_list:
            # allocate the melody array for the longest score among sources
            nelem_g = 1
            for i in range(len(sources)):
                ng = util.getMidiNum(sources_midi[i] + '_b',
                                     os.path.join(testdir, f), 0, 40.0)
                nelem_g = np.maximum(ng, nelem_g)
            melody = np.zeros((len(sources), int(nelem_g), 2 * nharmonics + 3))

            for i in range(len(sources)):
                filename = os.path.join(testdir, f,
                                        f + '-' + sources[i] + '.wav')
                audioObj, sampleRate, bitrate = util.readAudioScipy(filename)
                assert sampleRate == 44100, "Sample rate needs to be 44100"

                nframes = int(np.ceil(len(audioObj) /
                                      np.double(transform.hopSize))) + 2
                if i == 0:
                    audio = np.zeros(audioObj.shape[0])
                audio = audio + audioObj
                audioObj = None

                tmp = util.expandMidi(
                    sources_midi[i] + '_b', os.path.join(testdir, f), 0, 40.0,
                    interval, tuning_freq, nharmonics, sampleRate,
                    transform.hopSize, transform.frameSize, 0.2, 0.2, nframes,
                    0.5)
                melody[i, :tmp.shape[0], :] = tmp
                tmp = None

            mag, ph = transform.compute_file(audio, phase=True)
            mag = scale_factor * mag.astype(np.float32)
            jump = mag.shape[-1]

            masks_temp = train.filterSpec(mag, melody, 0, nframes)
            masks = np.ones((train.ninst, mag.shape[0], mag.shape[1]))
            masks[0, :, :] = masks_temp[:, :jump] * mag
            masks[1, :, :] = masks_temp[:, jump:jump * 2] * mag
            masks[2, :, :] = masks_temp[:, jump * 2:jump * 3] * mag
            masks[3, :, :] = masks_temp[:, jump * 3:jump * 4] * mag
            mag = None
            masks_temp = None

            batches, nchunks = util.generate_overlapadd(
                masks, input_size=masks.shape[-1],
                time_context=train.time_context, overlap=train.overlap,
                batch_size=train.batch_size, sampleRate=44100)
            masks = None

            output = []
            for batch in batches:
                output.append(predict_function2(batch))
            output = np.array(output)
            mm = util.overlapadd_multi(output, batches, nchunks,
                                       overlap=train.overlap)

            for i in range(len(sources)):
                audio_out = transform.compute_inverse(
                    mm[i, :len(ph)] / scale_factor, ph)
                if len(audio_out) > len(audio):
                    audio_out = audio_out[:len(audio)]
                util.writeAudioScipy(
                    os.path.join(outdir, f + '-' + sources[i] + '.wav'),
                    audio_out, sampleRate, bitrate)
                audio_out = None

        # style = ['fast', 'slow', 'original']
        # style_midi = ['_fast20', '_slow20', '_original']
        # if not os.path.exists(outdir1):
        #     os.makedirs(outdir1)
        # for s in range(len(style)):
        #     for f in testfile_list1:
        #         nelem_g = 1
        #         for i in range(len(sources)):
        #             ng = util.getMidiNum(sources_midi[i] + '_g' + style_midi[s],
        #                                  os.path.join(testdir1, f), 0, 40.0)
        #             nelem_g = np.maximum(ng, nelem_g)
        #         melody = np.zeros((len(sources), int(nelem_g), 2 * nharmonics + 3))
        #         for i in range(len(sources)):
        #             filename = os.path.join(testdir1, f,
        #                                     f + '_' + style[s] + '_' + sources_midi[i] + '.wav')
        #             audioObj, sampleRate, bitrate = util.readAudioScipy(filename)
        #             assert sampleRate == 44100, "Sample rate needs to be 44100"
        #             nframes = int(np.ceil(len(audioObj) / np.double(transform.hopSize))) + 2
        #             if i == 0:
        #                 audio = np.zeros(audioObj.shape[0])
        #             audio = audio + audioObj
        #             audioObj = None
        #             tmp = util.expandMidi(sources_midi[i] + '_g' + style_midi[s],
        #                                   os.path.join(testdir1, f), 0, 40.0,
        #                                   interval, tuning_freq, nharmonics,
        #                                   sampleRate, transform.hopSize,
        #                                   transform.frameSize, 0.2, 0.2, nframes)
        #             melody[i, :tmp.shape[0], :] = tmp
        #             tmp = None
        #         mag, ph = transform.compute_file(audio, phase=True)
        #         mag = scale_factor * mag.astype(np.float32)
        #         jump = mag.shape[-1]
        #         masks_temp = train.filterSpec(mag, melody, 0, nframes)
        #         masks = np.ones((train.ninst, mag.shape[0], mag.shape[1]))
        #         masks[0, :, :] = masks_temp[:, :jump] * mag
        #         masks[1, :, :] = masks_temp[:, jump:jump * 2] * mag
        #         masks[2, :, :] = masks_temp[:, jump * 2:jump * 3] * mag
        #         masks[3, :, :] = masks_temp[:, jump * 3:jump * 4] * mag
        #         mag = None
        #         masks_temp = None
        #         batches, nchunks = util.generate_overlapadd(
        #             masks, input_size=masks.shape[-1],
        #             time_context=train.time_context, overlap=train.overlap,
        #             batch_size=train.batch_size, sampleRate=44100)
        #         masks = None
        #         output = []
        #         for batch in batches:
        #             output.append(predict_function2(batch))
        #         output = np.array(output)
        #         mm = util.overlapadd_multi(output, batches, nchunks,
        #                                    overlap=train.overlap)
        #         for i in range(len(sources)):
        #             audio_out = transform.compute_inverse(mm[i, :len(ph)] / scale_factor, ph)
        #             if len(audio_out) > len(audio):
        #                 audio_out = audio_out[:len(audio)]
        #             filename = os.path.join(outdir1,
        #                                     f + '_' + style[s] + '_' + sources_midi[i] + '.wav')
        #             util.writeAudioScipy(filename, audio_out, sampleRate, bitrate)
        #             audio_out = None
    return losser
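
# ---------------------------------------------------------------------------
# Illustrative numpy sketch (an assumption-laden stand-in, not the repo's
# util.expandMidi / LargeDatasetMask2.filterSpec implementation): how a score
# can be turned into a time-frequency mask by marking a band of "interval"
# cents around each of the first "nharmonics" harmonics of the notated pitch
# in every frame. The held C4 and all shapes are made up for the example.
def _score_informed_mask_demo():
    import numpy as np
    sample_rate, frame_size = 44100, 2048
    freqs = np.arange(frame_size // 2 + 1) * sample_rate / float(frame_size)
    nframes, nharmonics, interval = 100, 20, 50  # interval in cents

    f0 = np.full(nframes, 261.63)        # dummy score: a held C4
    ratio = 2.0 ** (interval / 1200.0)   # half-width of each harmonic band
    mask = np.zeros((nframes, len(freqs)))
    for t in range(nframes):
        for h in range(1, nharmonics + 1):
            fh = h * f0[t]
            mask[t, (freqs >= fh / ratio) & (freqs <= fh * ratio)] = 1.0
    return mask  # multiply with a magnitude spectrogram to filter it
# ---------------------------------------------------------------------------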
def train_auto(train, fun, transform, testdir, outdir, testfile_list,
               testdir1, outdir1, testfile_list1, num_epochs=30,
               model="1.pkl", scale_factor=0.3, load=False, skip_train=False,
               skip_sep=False):
    """
    Trains a network built with "fun" with the data generated with "train"
    and then separates the files listed in "testfile_list" (located in
    "testdir") and "testfile_list1" (located in "testdir1"), writing the
    results in "outdir" and "outdir1".

    Parameters
    ----------
    train : callable, e.g. LargeDataset object
        The callable which generates training data for the network:
        inputs, target = train()
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features
        (see compute_features.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass through all
        examples in the dataset)
    model : string, optional
        The path where to save the trained model (Theano tensor containing
        the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor

    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """
    logging.info("Building Autoencoder")
    input_var2 = T.tensor4('inputs')
    target_var2 = T.tensor4('targets')

    eps = 1e-18
    alpha = 0.001

    network2 = fun(input_var=input_var2, batch_size=train.batch_size,
                   time_context=train.time_context, feat_size=train.input_size)

    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network2, params)

    prediction2 = lasagne.layers.get_output(network2, deterministic=True)

    rand_num = np.random.uniform(size=(train.batch_size, 1,
                                       train.time_context, train.input_size))

    s1 = prediction2[:, 0:1, :, :]
    s2 = prediction2[:, 1:2, :, :]
    s3 = prediction2[:, 2:3, :, :]
    s4 = prediction2[:, 3:4, :, :]

    mask1 = s1 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask2 = s2 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask3 = s3 / (s1 + s2 + s3 + s4 + eps * rand_num)
    mask4 = s4 / (s1 + s2 + s3 + s4 + eps * rand_num)

    source1 = mask1 * input_var2[:, 0:1, :, :]
    source2 = mask2 * input_var2[:, 0:1, :, :]
    source3 = mask3 * input_var2[:, 0:1, :, :]
    source4 = mask4 * input_var2[:, 0:1, :, :]

    train_loss_recon1 = lasagne.objectives.squared_error(
        source1, target_var2[:, 0:1, :, :])
    train_loss_recon2 = lasagne.objectives.squared_error(
        source2, target_var2[:, 1:2, :, :])
    train_loss_recon3 = lasagne.objectives.squared_error(
        source3, target_var2[:, 2:3, :, :])
    train_loss_recon4 = lasagne.objectives.squared_error(
        source4, target_var2[:, 3:4, :, :])

    error1 = train_loss_recon1.sum()
    error2 = train_loss_recon2.sum()
    error3 = train_loss_recon3.sum()
    error4 = train_loss_recon4.sum()

    loss = abs(error1 + error2 + error3 + error4)

    params1 = lasagne.layers.get_all_params(network2, trainable=True)
    updates = lasagne.updates.adadelta(loss, params1)

    train_fn = theano.function([input_var2, target_var2], loss,
                               updates=updates, allow_input_downcast=True)
    train_fn1 = theano.function([input_var2, target_var2],
                                [error1, error2, error3, error4],
                                allow_input_downcast=True)
    predict_function2 = theano.function(
        [input_var2], [source1, source2, source3, source4],
        allow_input_downcast=True)

    losser = []

    if not skip_train:
        logging.info("Training...")
        for epoch in range(num_epochs):
            train_err = 0
            train_batches = 0
            err1 = 0
            err2 = 0
            err3 = 0
            err4 = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()

                jump = inputs.shape[2]
                targets = np.ndarray(shape=(inputs.shape[0], 4,
                                            inputs.shape[1], inputs.shape[2]))
                inputs = np.reshape(inputs, (inputs.shape[0], 1,
                                             inputs.shape[1], inputs.shape[2]))
                targets[:, 0, :, :] = target[:, :, :jump]
                targets[:, 1, :, :] = target[:, :, jump:jump * 2]
                targets[:, 2, :, :] = target[:, :, jump * 2:jump * 3]
                targets[:, 3, :, :] = target[:, :, jump * 3:jump * 4]
                target = None

                train_err += train_fn(inputs, targets)
                [e1, e2, e3, e4] = train_fn1(inputs, targets)
                err1 += e1
                err2 += e2
                err3 += e3
                err4 += e4
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info(" training loss:\t\t{:.6f}".format(
                train_err / train_batches))
            logging.info(" training loss for bassoon:\t\t{:.6f}".format(
                err1 / train_batches))
            logging.info(" training loss for clarinet:\t\t{:.6f}".format(
                err2 / train_batches))
            logging.info(" training loss for saxophone:\t\t{:.6f}".format(
                err3 / train_batches))
            logging.info(" training loss for violin:\t\t{:.6f}".format(
                err4 / train_batches))

            losser.append(train_err / train_batches)
            save_model(model, network2)

    if not skip_sep:
        logging.info("Separating")
        # 'saxphone' matches the audio file naming in the test set
        sources = ['bassoon', 'clarinet', 'saxphone', 'violin']
        sources_midi = ['bassoon', 'clarinet', 'saxophone', 'violin']

        for f in testfile_list:
            for i in range(len(sources)):
                filename = os.path.join(testdir, f,
                                        f + '-' + sources[i] + '.wav')
                audioObj, sampleRate, bitrate = util.readAudioScipy(filename)
                assert sampleRate == 44100, "Sample rate needs to be 44100"

                nframes = int(np.ceil(len(audioObj) /
                                      np.double(transform.hopSize))) + 2
                if i == 0:
                    audio = np.zeros(audioObj.shape[0])
                audio = audio + audioObj
                audioObj = None

            mag, ph = transform.compute_file(audio, phase=True)
            mag = scale_factor * mag.astype(np.float32)

            batches, nchunks = util.generate_overlapadd(
                mag, input_size=mag.shape[-1],
                time_context=train.time_context, overlap=train.overlap,
                batch_size=train.batch_size, sampleRate=44100)

            output = []
            for batch in batches:
                output.append(predict_function2(batch))
            output = np.array(output)
            mm = util.overlapadd_multi(output, batches, nchunks,
                                       overlap=train.overlap)

            for i in range(len(sources)):
                audio_out = transform.compute_inverse(
                    mm[i, :len(ph)] / scale_factor, ph)
                if len(audio_out) > len(audio):
                    audio_out = audio_out[:len(audio)]
                util.writeAudioScipy(
                    os.path.join(outdir, f + '-' + sources[i] + '.wav'),
                    audio_out, sampleRate, bitrate)
                audio_out = None

        style = ['fast', 'slow', 'original']
        if not os.path.exists(outdir1):
            os.makedirs(outdir1)
        for s in style:
            for f in testfile_list1:
                for i in range(len(sources)):
                    filename = os.path.join(
                        testdir1, f,
                        f + '_' + s + '_' + sources_midi[i] + '.wav')
                    audioObj, sampleRate, bitrate = util.readAudioScipy(
                        filename)
                    assert sampleRate == 44100, \
                        "Sample rate needs to be 44100"

                    nframes = int(np.ceil(len(audioObj) /
                                          np.double(transform.hopSize))) + 2
                    if i == 0:
                        audio = np.zeros(audioObj.shape[0])
                    audio = audio + audioObj
                    audioObj = None

                mag, ph = transform.compute_file(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)

                batches, nchunks = util.generate_overlapadd(
                    mag, input_size=mag.shape[-1],
                    time_context=train.time_context, overlap=train.overlap,
                    batch_size=train.batch_size, sampleRate=44100)

                output = []
                for batch in batches:
                    output.append(predict_function2(batch))
                output = np.array(output)
                mm = util.overlapadd_multi(output, batches, nchunks,
                                           overlap=train.overlap)

                for i in range(len(sources)):
                    audio_out = transform.compute_inverse(
                        mm[i, :len(ph)] / scale_factor, ph)
                    if len(audio_out) > len(audio):
                        audio_out = audio_out[:len(audio)]
                    filename = os.path.join(
                        outdir1,
                        f + '_' + s + '_' + sources_midi[i] + '.wav')
                    util.writeAudioScipy(filename, audio_out, sampleRate,
                                         bitrate)
                    audio_out = None
    return losser
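
# ---------------------------------------------------------------------------
# Illustrative sketch (not the util.generate_overlapadd / overlapadd_multi
# API): how the separation stages above chop a magnitude spectrogram into
# overlapping fixed-length chunks for the network and reassemble the
# estimates afterwards. The frame counts and the averaging scheme are
# assumptions for the purpose of the example.
def _chunk_and_reassemble_demo():
    import numpy as np
    time_context, overlap = 30, 25   # frames per chunk, frames shared
    hop = time_context - overlap     # chunk hop size in frames
    mag = np.random.rand(100, 513)   # (frames, freq bins), dummy data

    # split into overlapping chunks along the time axis
    starts = range(0, mag.shape[0] - time_context + 1, hop)
    chunks = np.array([mag[s:s + time_context] for s in starts])

    # "process" each chunk (identity here; the real code runs the network)
    processed = chunks.copy()

    # overlap-add back, averaging frames covered by several chunks
    out = np.zeros_like(mag)
    norm = np.zeros((mag.shape[0], 1))
    for s, c in zip(starts, processed):
        out[s:s + time_context] += c
        norm[s:s + time_context] += 1.0
    out /= np.maximum(norm, 1.0)

    # frames covered by at least one chunk are reconstructed exactly
    covered = max(starts) + time_context
    assert np.allclose(out[:covered], mag[:covered])
    return out
# ---------------------------------------------------------------------------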
def train_auto(fun, train, transform, testdir, outdir, num_epochs=30,
               model="1.pkl", scale_factor=0.3, load=False, skip_train=False,
               skip_sep=False, chunk_size=60, chunk_overlap=2, nsamples=40,
               batch_size=32, batch_memory=50, time_context=30, overlap=25,
               nprocs=4, mult_factor_in=0.3, mult_factor_out=0.3):
    """
    Trains a network built with "fun" with the data generated with "train"
    and then separates the files in "testdir", writing the result in "outdir".

    Parameters
    ----------
    fun : lasagne network object, Theano tensor
        The network to be trained
    transform : transformFFT object
        The Transform object which was used to compute the features
        (see compute_features_DSD100.py)
    testdir : string, optional
        The directory where the files to be separated are located
    outdir : string, optional
        The directory where to write the separated files
    num_epochs : int, optional
        The number of epochs to train for (one epoch is one pass through all
        examples in the dataset)
    model : string, optional
        The path where to save the trained model (Theano tensor containing
        the network)
    scale_factor : float, optional
        Scale the magnitude of the files to be separated with this factor

    Returns
    -------
    losser : list
        The losses for each epoch, stored in a list
    """
    logging.info("Building Autoencoder")
    input_var = T.tensor4('inputs')
    target_var = T.tensor4('targets')

    theano_rng = RandomStreams(128)
    eps = 1e-12

    sources = ['vocals', 'bass', 'drums', 'other']
    nchannels = int(train.channels_in)
    nsources = int(train.channels_out / train.channels_in)
    print('nchannels: ', nchannels)
    print('nsources: ', nsources)

    input_size = int(float(transform.frameSize) / 2 + 1)

    rand_num = theano_rng.normal(size=(batch_size, nsources, time_context,
                                       input_size),
                                 avg=0.0, std=0.1,
                                 dtype=theano.config.floatX)

    net = fun(input_var=input_var, batch_size=batch_size,
              time_context=time_context, feat_size=input_size,
              nchannels=nchannels, nsources=nsources)
    network = net['l_out']
    if load:
        params = load_model(model)
        lasagne.layers.set_all_param_values(network, params)

    prediction = lasagne.layers.get_output(network, deterministic=True)

    sourceall = []
    errors_insts = []
    loss = 0
    sep_chann = []

    # prediction layout example for 2 sources in 2 channels:
    # channels 0, 1 -> source 0 in channels 0 and 1
    # channels 2, 3 -> source 1 in channels 0 and 1
    for j in range(nchannels):
        masksum = T.sum(prediction[:, j::nchannels, :, :], axis=1)
        temp = T.tile(masksum.dimshuffle(0, 'x', 1, 2), (1, nsources, 1, 1))
        mask = prediction[:, j::nchannels, :, :] / (temp + eps * rand_num)
        source = mask * T.tile(input_var[:, j:j + 1, :, :],
                               (1, nsources, 1, 1)) + eps * rand_num
        sourceall.append(source)
        sep_chann.append(source)
        train_loss_recon = lasagne.objectives.squared_error(
            source, target_var[:, j::nchannels, :, :])
        errors_inst = abs(train_loss_recon.sum(axis=(0, 2, 3)))
        errors_insts.append(errors_inst)
        loss = loss + abs(train_loss_recon.sum())

    params1 = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params1)

    train_fn_mse = theano.function([input_var, target_var], loss,
                                   updates=updates,
                                   allow_input_downcast=True)
    train_fn1 = theano.function([input_var, target_var], errors_insts,
                                allow_input_downcast=True)

    # ---------- ILD loss condition ----------
    rand_num2 = theano_rng.normal(size=(batch_size, nsources, time_context,
                                        input_size),
                                  avg=0.0, std=0.1,
                                  dtype=theano.config.floatX)

    # estimate: level ratio between the two channel estimates, in dB
    interaural_spec_est = sep_chann[0] / (sep_chann[1] + eps * rand_num2)
    alpha_est = 20 * T.log10(abs(interaural_spec_est + eps * rand_num2))
    alpha_est_mean = alpha_est.mean(axis=(0, 1, 2))

    # ground truth
    interaural_spec_gt = target_var[:, 0::nchannels, :, :] / (
        target_var[:, 1::nchannels, :, :] + eps * rand_num2)
    alpha_gt = 20 * T.log10(abs(interaural_spec_gt + eps * rand_num2))
    alpha_gt_mean = alpha_gt.mean(axis=(0, 1, 2))  # a one-dimensional vector

    train_loss_ild = lasagne.objectives.squared_error(alpha_est_mean,
                                                      alpha_gt_mean)
    loss = loss + abs(train_loss_ild.sum()) / 500
    # ----------------------------------------

    predict_function = theano.function([input_var], sourceall,
                                       allow_input_downcast=True)

    losser = []

    if not skip_train:
        logging.info("Training stage 1 (mse)...")
        for epoch in range(num_epochs):
            train_err = 0
            train_batches = 0
            errs = np.zeros((nchannels, nsources))
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_mse(inputs, target)
                errs += np.array(train_fn1(inputs, target))
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info(" training loss:\t\t{:.6f}".format(
                train_err / train_batches))
            for j in range(nchannels):
                for i in range(nsources):
                    logging.info(
                        " training loss for " + sources[i] + " in mic " +
                        str(j) + ":\t\t{:.6f}".format(
                            errs[j][i] / train_batches))

            model_noILD = model[:-4] + '_noILD' + model[-4:]
            print('model_noILD: ', model_noILD)
            save_model(model_noILD, network)
            losser.append(train_err / train_batches)

        # stage 2: fine-tune with the ILD term added to the loss
        params = load_model(model_noILD)
        lasagne.layers.set_all_param_values(network, params)
        params1 = lasagne.layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.adadelta(loss, params1)
        train_fn_ILD = theano.function([input_var, target_var], loss,
                                       updates=updates,
                                       allow_input_downcast=True)

        logging.info("Training stage 2 (ILD)...")
        for epoch in range(int(num_epochs / 2)):
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in range(train.iteration_size):
                inputs, target = train()
                train_err += train_fn_ILD(inputs, target)
                train_batches += 1

            logging.info("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            logging.info(" training loss:\t\t{:.6f}".format(
                train_err / train_batches))

            save_model(model, network)
            losser.append(train_err / train_batches)

    if not skip_sep:
        logging.info("Separating")
        # 'db' is the root folder of the database (containing 'Mixtures'),
        # expected to be defined at module level
        subsets = ['Dev', 'Test']
        for sub in subsets:
            for d in sorted(os.listdir(os.path.join(db, 'Mixtures', sub))):
                print(os.path.join(os.path.sep, db, 'Mixtures', sub, d,
                                   'mixture.wav'))
                audio, sampleRate, bitrate = util.readAudioScipy(
                    os.path.join(os.path.sep, db, 'Mixtures', sub, d,
                                 'mixture.wav'))
                nsamples = audio.shape[0]
                sep_audio = np.zeros((nsamples, len(sources), audio.shape[1]))

                mag, ph = transform.compute_transform(audio, phase=True)
                mag = scale_factor * mag.astype(np.float32)
                nframes = mag.shape[-2]

                batches_mag, nchunks = util.generate_overlapadd(
                    mag, input_size=mag.shape[-1],
                    time_context=train.time_context, overlap=train.overlap,
                    batch_size=train.batch_size, sampleRate=sampleRate)
                mag = None

                output = []
                for b in range(len(batches_mag)):
                    output.append(predict_function(batches_mag[b]))
                output = np.array(output)

                for j in range(audio.shape[1]):
                    mm = util.overlapadd_multi(
                        np.swapaxes(output[:, j:j + 1, :, :, :, :], 1, 3),
                        batches_mag, nchunks, overlap=train.overlap)
                    for i in range(len(sources)):
                        audio_out = transform.compute_inverse(
                            mm[i, :ph.shape[1], :] / scale_factor, ph[j])
                        sep_audio[:, i, j] = audio_out[:len(sep_audio)]

                print('Saving separation: ', outdir)
                if not os.path.exists(os.path.join(outdir)):
                    os.makedirs(os.path.join(outdir))
                    print('Creating model folder')
                if not os.path.exists(os.path.join(outdir, 'Sources')):
                    os.makedirs(os.path.join(outdir, 'Sources'))
                    print('Creating Sources folder: ',
                          os.path.join(outdir, 'Sources'))
                if not os.path.exists(os.path.join(outdir, 'Sources', sub)):
                    os.makedirs(os.path.join(outdir, 'Sources', sub))
                    print('Creating subset folder')
                if not os.path.exists(os.path.join(outdir, 'Sources', sub, d)):
                    os.makedirs(os.path.join(outdir, 'Sources', sub, d))
                    print('Creating song folder',
                          os.path.join(outdir, 'Sources', sub, d))
                for i in range(len(sources)):
                    print('Final audio file: ', i,
                          os.path.join(outdir, 'Sources', sub, d,
                                       sources[i] + '.wav'),
                          'nsamples: ', nsamples,
                          'len sep_audio :', len(sep_audio))
                    util.writeAudioScipy(
                        os.path.join(outdir, 'Sources', sub, d,
                                     sources[i] + '.wav'),
                        sep_audio[:nsamples, i, :], sampleRate, bitrate)
    return losser
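
# ---------------------------------------------------------------------------
# Illustrative numpy sketch of the interaural level difference (ILD) term
# added in stage 2 above: the level ratio between the two channel estimates
# is compared, in dB, with the ratio of the ground-truth channels. Shapes and
# the epsilon are assumptions for the example; the training code computes the
# same quantity symbolically with Theano.
def _ild_loss_demo():
    import numpy as np
    eps = 1e-12
    # (batch, nsources, time, freq) estimates for channels 0 and 1
    est_ch0, est_ch1 = np.random.rand(2, 8, 4, 30, 513) + eps
    gt_ch0, gt_ch1 = np.random.rand(2, 8, 4, 30, 513) + eps

    # per-frequency mean interaural level difference, in dB
    ild_est = (20 * np.log10(np.abs(est_ch0 / est_ch1))).mean(axis=(0, 1, 2))
    ild_gt = (20 * np.log10(np.abs(gt_ch0 / gt_ch1))).mean(axis=(0, 1, 2))

    # squared error between the two ILD curves, scaled as in the code above
    return np.sum((ild_est - ild_gt) ** 2) / 500
# ---------------------------------------------------------------------------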