def run_nn(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,cfg_file,processed_first,next_config_file):
    """Process the current data chunk as described by ``cfg_file``.

    While the current chunk is being processed, the chunk described by
    ``next_config_file`` is read in a background thread.

    Args:
        data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict:
            Data of the current chunk as returned by the previous call. They
            are ignored (re-read from ``cfg_file``) when ``processed_first``
            is true.
        cfg_file: Path of the chunk-specific config file.
        processed_first: When true, the current chunk is read here instead of
            being taken from the arguments.
        next_config_file: Config file of the next chunk to pre-load.

    Returns:
        ``[data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict]``
        for the *next* chunk, with ``data_set`` converted to a float tensor.

    Raises:
        SystemExit: with status 1 when ``cfg_file`` does not exist.
        ValueError: when ``to_do`` is not 'train', 'valid' or 'forward'.
        RuntimeError: when a reader thread did not deliver a complete chunk.
    """

    def _to_tensor(np_data):
        # Converting numpy tensors into pytorch tensors and putting them on
        # the GPU if specified.
        tensor = torch.from_numpy(np_data).float()
        if not(save_gpumem) and use_cuda:
            tensor = tensor.cuda()
        return tensor

    def _unpack_chunk(chunk, source_cfg):
        # The reader appends six entries to the shared list; fewer means the
        # reader thread died before finishing.
        if len(chunk) < 6:
            raise RuntimeError('reading the chunk described by %s failed' % (source_cfg,))
        # order: data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set
        return chunk[0], chunk[1], chunk[2], chunk[3], chunk[4], chunk[5]

    # Reading chunk-specific cfg file (first argument-mandatory file)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
        # A missing config is a failure, so report it with a non-zero status.
        sys.exit(1)

    config = configparser.ConfigParser()
    config.read(cfg_file)

    # Setting torch seed
    seed = int(config['exp']['seed'])
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Reading config parameters
    output_folder = config['exp']['out_folder']
    use_cuda = strtobool(config['exp']['use_cuda'])
    multi_gpu = strtobool(config['exp']['multi_gpu'])
    save_gpumem = strtobool(config['exp']['save_gpumem'])
    is_production = strtobool(config['exp']['production'])

    to_do = config['exp']['to_do']
    info_file = config['exp']['out_info']

    model = config['model']['model'].split('\n')

    forward_outs = config['forward']['forward_out'].split(',')
    forward_normalize_post = list(map(strtobool,config['forward']['normalize_posteriors'].split(',')))
    forward_count_files = config['forward']['normalize_with_counts_from'].split(',')
    require_decodings = list(map(strtobool,config['forward']['require_decoding'].split(',')))

    if to_do == 'train':
        batch_size = int(config['batches']['batch_size_train'])
    elif to_do == 'valid':
        batch_size = int(config['batches']['batch_size_valid'])
    elif to_do == 'forward':
        batch_size = 1
    else:
        # Fail early instead of hitting an unbound batch_size further down.
        raise ValueError("unsupported to_do value %r (expected 'train', 'valid' or 'forward')" % (to_do,))

    # ***** Reading the Data********
    if processed_first:
        # Reading all the features and labels for this chunk
        shared_list = []
        p = threading.Thread(target=read_lab_fea, args=(cfg_file,is_production,shared_list,output_folder,))
        p.start()
        p.join()
        data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set = _unpack_chunk(shared_list, cfg_file)
        data_set = _to_tensor(data_set)

    # Reading all the features and labels for the next chunk (in background)
    shared_list = []
    p = threading.Thread(target=read_lab_fea, args=(next_config_file,is_production,shared_list,output_folder,))
    p.start()

    # Reading model and initialize networks
    inp_out_dict = fea_dict
    [nns,costs] = model_init(inp_out_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do)

    # optimizers initialization
    optimizers = optimizer_init(nns,config,arch_dict)

    # pre-training and multi-gpu init
    for net in nns.keys():
        pt_file_arch = config[arch_dict[net][0]]['arch_pretrain_file']
        if pt_file_arch != 'none':
            if use_cuda:
                checkpoint_load = torch.load(pt_file_arch)
            else:
                # Without CUDA, remap storages saved from a GPU onto the CPU.
                checkpoint_load = torch.load(pt_file_arch, map_location='cpu')
            nns[net].load_state_dict(checkpoint_load['model_par'])
            if net in optimizers:
                optimizers[net].load_state_dict(checkpoint_load['optimizer_par'])
                optimizers[net].param_groups[0]['lr'] = float(config[arch_dict[net][0]]['arch_lr']) # loading lr of the cfg file for pt
        if multi_gpu:
            nns[net] = torch.nn.DataParallel(nns[net])

    post_file = {}
    log_priors = {}
    if to_do == 'forward':
        for out_id, out_name in enumerate(forward_outs):
            if require_decodings[out_id]:
                out_file = info_file.replace('.info','_'+out_name+'_to_decode.ark')
            else:
                out_file = info_file.replace('.info','_'+out_name+'.ark')
            post_file[out_name] = open_or_fd(out_file,output_folder,'wb')
            if forward_normalize_post[out_id]:
                # The counts do not change between batches: read them once.
                counts = load_counts(forward_count_files[out_id])
                log_priors[out_id] = np.log(counts/np.sum(counts))

    # check automatically if the model is sequential
    seq_model = is_sequential_dict(config,arch_dict)

    # ***** Minibatch Processing loop********
    if seq_model or to_do == 'forward':
        N_snt = len(data_name)
        N_batches = int(N_snt/batch_size)
    else:
        N_ex_tr = data_set.shape[0]
        N_batches = int(N_ex_tr/batch_size)

    beg_batch = 0
    end_batch = batch_size

    snt_index = 0
    beg_snt = 0

    start_time = time.time()

    # array of sentence lengths
    arr_snt_len = shift(shift(data_end_index, -1,0)-data_end_index,1,0)
    arr_snt_len[0] = data_end_index[0]

    loss_sum = 0
    err_sum = 0

    inp_dim = data_set.shape[1]

    # Pre-bound so that the memory clean-up below also works with zero batches.
    inp = None
    outs_dict = None

    try:
        for i in range(N_batches):

            max_len = 0

            if seq_model:
                max_len = int(max(arr_snt_len[snt_index:snt_index+batch_size]))
                inp = torch.zeros(max_len,batch_size,inp_dim).contiguous()

                for k in range(batch_size):
                    snt_len = data_end_index[snt_index]-beg_snt
                    N_zeros = max_len-snt_len

                    # Appending a random number of initial zeros, the others are at the end.
                    N_zeros_left = random.randint(0,N_zeros)

                    # randomizing could have a regularization effect
                    inp[N_zeros_left:N_zeros_left+snt_len,k,:] = data_set[beg_snt:beg_snt+snt_len,:]

                    beg_snt = data_end_index[snt_index]
                    snt_index = snt_index+1
            else:
                # features and labels for batch i
                if to_do != 'forward':
                    inp = data_set[beg_batch:end_batch,:].contiguous()
                else:
                    snt_len = data_end_index[snt_index]-beg_snt
                    inp = data_set[beg_snt:beg_snt+snt_len,:].contiguous()
                    beg_snt = data_end_index[snt_index]
                    snt_index = snt_index+1

            # use cuda
            if use_cuda:
                inp = inp.cuda()

            if to_do == 'train':
                # Forward input, with autograd graph active
                outs_dict = forward_model(fea_dict,lab_dict,arch_dict,model,nns,costs,inp,inp_out_dict,max_len,batch_size,to_do,forward_outs)

                for opt in optimizers.keys():
                    optimizers[opt].zero_grad()

                outs_dict['loss_final'].backward()

                # Gradient Clipping (th 0.1)
                #for net in nns.keys():
                #    torch.nn.utils.clip_grad_norm_(nns[net].parameters(), 0.1)

                for opt in optimizers.keys():
                    if not(strtobool(config[arch_dict[opt][0]]['arch_freeze'])):
                        optimizers[opt].step()
            else:
                with torch.no_grad(): # Forward input without autograd graph (save memory)
                    outs_dict = forward_model(fea_dict,lab_dict,arch_dict,model,nns,costs,inp,inp_out_dict,max_len,batch_size,to_do,forward_outs)

            if to_do == 'forward':
                for out_id, out_name in enumerate(forward_outs):
                    out_save = outs_dict[out_name].data.cpu().numpy()

                    if out_id in log_priors:
                        out_save = out_save-log_priors[out_id]

                    # save the output (batch_size is 1 here, so batch i is sentence i)
                    write_mat(output_folder,post_file[out_name], out_save, data_name[i])
            else:
                loss_sum = loss_sum+outs_dict['loss_final'].detach()
                err_sum = err_sum+outs_dict['err_final'].detach()

            # update it to the next batch
            beg_batch = end_batch
            end_batch = beg_batch+batch_size

            # Progress bar
            if to_do == 'train':
                status_string = "Training | (Batch "+str(i+1)+"/"+str(N_batches)+")"
                if i != N_batches-1:
                    # the running loss is left out of the very last update
                    status_string = status_string+" | L:" +str(round(loss_sum.cpu().item()/(i+1),3))
            elif to_do == 'valid':
                status_string = "Validating | (Batch "+str(i+1)+"/"+str(N_batches)+")"
            else:
                status_string = "Forwarding | (Batch "+str(i+1)+"/"+str(N_batches)+")"

            progress(i, N_batches, status=status_string)

        elapsed_time_chunk = time.time() - start_time
    finally:
        # Close the posterior files even when a batch fails.
        for out_handle in post_file.values():
            out_handle.close()

    loss_tot = None
    err_tot = None
    if to_do != 'forward':
        # Averages are only reported for train/valid; skipping them in forward
        # mode avoids a division by zero on a chunk without sentences.
        loss_tot = loss_sum/N_batches
        err_tot = err_sum/N_batches

    # clearing memory
    del inp, outs_dict, data_set

    # save the model
    if to_do == 'train':
        for net in nns.keys():
            checkpoint = {}
            if multi_gpu:
                checkpoint['model_par'] = nns[net].module.state_dict()
            else:
                checkpoint['model_par'] = nns[net].state_dict()

            if net in optimizers:
                checkpoint['optimizer_par'] = optimizers[net].state_dict()
            else:
                checkpoint['optimizer_par'] = dict()

            out_file = info_file.replace('.info','_'+arch_dict[net][0]+'.pkl')
            torch.save(checkpoint, out_file)

    # Write info file
    with open(info_file, "w") as text_file:
        text_file.write("[results]\n")
        if to_do != 'forward':
            text_file.write("loss=%s\n" % loss_tot.cpu().numpy())
            text_file.write("err=%s\n" % err_tot.cpu().numpy())
        text_file.write("elapsed_time_chunk=%f\n" % elapsed_time_chunk)

    # Getting the data for the next chunk (read in parallel)
    p.join()
    data_name, data_end_index, fea_dict, lab_dict, arch_dict, data_set = _unpack_chunk(shared_list, next_config_file)
    data_set = _to_tensor(data_set)

    return [data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]
def main(self,rank):
    """Run one process of the parameter-server experiment.

    The options returned by ``read_conf()`` select the mode (``do_training``,
    ``do_eval``, ``do_forward``). The total number of processes is read from
    ``sys.argv[3]``; rank 0 runs the parameter server when training, so the
    training data is split between ``world_size - 1`` workers.

    In the non-forward mode the function runs ``num_epoch`` epochs; from the
    second epoch on each epoch starts with a pass over the development set.
    At the end a checkpoint (when training) and an ``.info`` file are written.

    Args:
        self: Unused.
        rank: Rank of this process (anything ``int()`` accepts).

    Raises:
        ValueError: for an unknown ``NN_type`` or, when training, a world size
            that leaves no worker.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

    options = read_conf()

    do_training = bool(int(options.do_training))
    do_eval = bool(int(options.do_eval))
    do_forward = bool(int(options.do_forward))

    # Forward runs on GPU 0; otherwise the GPU index is the process rank minus one.
    if do_forward:
        gpu_index = 0
    else:
        gpu_index = dist.get_rank()-1
    torch.cuda.set_device(gpu_index)
    device = "cuda:{}".format(gpu_index)

    PS = Parameter_Server()
    if int(rank) == 0 and do_training:
        PS.ps_server(rank)

    world_size = int(sys.argv[3])
    n_workers = world_size-1

    fea_scp = options.fea_scp
    fea_opts = options.fea_opts
    lab_folder = options.lab_folder
    lab_opts = options.lab_opts

    # NOTE(review): the development-set location is hard-coded.
    dev_fea_scp = "/home/slave3/kaldi/egs/timit/s5/pytorch-kaldi/exp/mfcc_shu/dev_split.000"
    dev_fea_opts = "apply-cmvn --utt2spk=ark:$KALDI_ROOT/egs/timit/s5/data/dev/utt2spk ark:$PYTORCH_EXP/mfcc_shu/dev_cmvn_speaker.ark ark:- ark:- | add-deltas --delta-order=2 ark:- ark:- |"
    dev_lab_folder = '/home/slave3/kaldi/egs/timit/s5/exp/dnn4_pretrain-dbn_dnn_ali_dev'
    dev_lab_opts = 'ali-to-pdf'

    out_file = options.out_file
    count_file = options.count_file
    pt_file = options.pt_file

    left = int(options.cw_left)
    right = int(options.cw_right)
    seed = int(options.seed)
    use_cuda = bool(int(options.use_cuda))
    multi_gpu = bool(int(options.multi_gpu))
    NN_type = options.NN_type

    batch_size = int(options.batch_size)
    lr = float(options.lr)
    save_gpumem = int(options.save_gpumem)

    if NN_type == 'RNN':
        from neural_nets import RNN as ann
        rnn = 1
    elif NN_type == 'LSTM':
        from neural_nets import LSTM as ann
        rnn = 1
    elif NN_type == 'GRU':
        from neural_nets import GRU as ann
        rnn = 1
    elif NN_type == 'MLP':
        from neural_nets import MLP as ann
        rnn = 0
    else:
        raise ValueError("unsupported NN_type %r (expected 'RNN', 'LSTM', 'GRU' or 'MLP')" % (NN_type,))

    options.input_dim = 429
    options.num_classes = 1944
    net = ann(options)
    if use_cuda:
        net.cuda(device=device)

    # Accumulated user-CPU times reported in the info file.
    sum_update_time = 0
    sum_shu_time = 0
    sum_model_time = 0
    sum_load_time = 0
    sum_val_time = 0
    sum_epoch_time = 0
    # Per-batch values; they keep 0 when the timed step is skipped.
    update_time = 0
    model_time = 0
    load_time = 0

    _, st_train_time = timestamp(), resource_usage(RUSAGE_SELF)

    torch.manual_seed(seed)
    random.seed(seed)
    print("[INFO] Batch size: ",batch_size)

    if rnn or do_eval or do_forward:
        seed = -1

    _, st_data_time = timestamp(), resource_usage(RUSAGE_SELF)

    if do_forward:
        dev_data_name = [0]
    else:
        [dev_data_name,dev_data_set_ori,dev_data_end_index] = load_chunk(dev_fea_scp,dev_fea_opts,dev_lab_folder,dev_lab_opts,left,right,-1)

    [data_name,data_set_ori,data_end_index] = load_chunk(fea_scp,fea_opts,lab_folder,lab_opts,left,right,seed)

    if do_training:
        if n_workers < 1:
            raise ValueError("training needs at least one worker besides the server (world size %d)" % world_size)
        if n_workers == 1:
            print("Partition data 1")
        else:
            print("partition data %d" % n_workers)
            worker = int(rank)
            if 1 <= worker <= n_workers:
                # Equal contiguous shards; the last worker also takes the remainder.
                data_len = int(len(data_set_ori)/n_workers)
                shard_start = (worker-1)*data_len
                shard_stop = worker*data_len if worker < n_workers else None
                data_set_ori = data_set_ori[shard_start:shard_stop]

    end_data_time,_ = resource_usage(RUSAGE_SELF), timestamp()
    data_time = end_data_time.ru_utime - st_data_time.ru_utime
    print("data generate time: ", data_time)
    print(np.shape(data_set_ori))

    if not(save_gpumem):
        data_set = torch.from_numpy(data_set_ori).float().cuda(device=device)
    else:
        data_set = torch.from_numpy(data_set_ori).float()

    if not do_forward:
        if not(save_gpumem):
            dev_data_set = torch.from_numpy(dev_data_set_ori).float().cuda(device=device)
        else:
            dev_data_set = torch.from_numpy(dev_data_set_ori).float()

    # The last column of the data holds the label, the other columns the features.
    N_fea = data_set.shape[1]-1
    options.input_dim = N_fea
    N_out = int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1)
    options.num_classes = N_out

    if multi_gpu:
        net = nn.DataParallel(net)

    optimizer_worker = optim.SGD(net.parameters(), lr=lr)

    if do_forward:
        if pt_file != 'none':
            checkpoint_load = torch.load(pt_file)
            net.load_state_dict(checkpoint_load['model_par'])
            optimizer_worker.load_state_dict(checkpoint_load['optimizer_par'])
            optimizer_worker.param_groups[0]['lr'] = lr

    dev_N_snt = len(dev_data_name)
    N_snt = len(data_name)

    if do_training:
        print("do training")
        net.train()
        test_flag = 0
        N_batches = int((N_snt/batch_size)/n_workers)
    else:
        N_batches = int(N_snt/batch_size)

    if rnn == 0:
        N_ex_tr = data_set.shape[0]
        N_batches = int(N_ex_tr/batch_size)

    if do_eval:
        N_batches = N_snt
        net.eval()
        test_flag = 1
        batch_size = 1

    if do_forward:
        post_file = kaldi_io.open_or_fd(out_file,'wb')
        counts = load_counts(count_file)
        # The counts are fixed, so the log-prior is computed once.
        log_prior = np.log(counts/np.sum(counts))

    beg_batch = 0
    end_batch = beg_batch+batch_size

    snt_index = 0
    beg_snt = 0

    loss_sum = 0
    err_sum = 0

    temp_err = 0
    dev_err_sum_tot = 0
    dev_N_batches = 0
    num_epoch = 24

    main_class = MAIN_CLASS()

    def _make_batch(i, snt_index, beg_snt, beg_batch, end_batch):
        # Builds the input/label pair of batch ``i`` and returns it together
        # with the advanced sentence cursors.
        if do_training:
            if rnn == 1:
                max_len = data_end_index[snt_index+batch_size-1]-data_end_index[snt_index+batch_size-2]
                inp = Variable(torch.zeros(max_len,batch_size,N_fea)).contiguous()
                lab = Variable(torch.zeros(max_len,batch_size)).contiguous().long()

                for k in range(batch_size):
                    snt_len = data_end_index[snt_index]-beg_snt
                    N_zeros = max_len-snt_len
                    # A random share of the zero padding goes in front of the sentence.
                    N_zeros_left = random.randint(0,N_zeros)
                    inp[N_zeros_left:N_zeros_left+snt_len,k,:] = data_set[beg_snt:beg_snt+snt_len,0:N_fea]
                    lab[N_zeros_left:N_zeros_left+snt_len,k] = data_set[beg_snt:beg_snt+snt_len,-1]

                    beg_snt = data_end_index[snt_index]
                    snt_index = snt_index+1
            else:
                inp = Variable(data_set[beg_batch:end_batch,0:N_fea]).contiguous().cuda(device=device)
                lab = Variable(data_set[beg_batch:end_batch,N_fea]).contiguous().long().cuda(device=device)

        if do_eval:
            # One sentence per batch.
            end_snt = data_end_index[i]
            inp = Variable(data_set[beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
            lab = Variable(data_set[beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
            if rnn == 1:
                inp = inp.view(inp.shape[0],1,inp.shape[1])
                lab = lab.view(lab.shape[0],1)
            beg_snt = data_end_index[i]

        return inp, lab, snt_index, beg_snt

    if do_forward:
        for i in range(N_batches):
            inp, lab, snt_index, beg_snt = _make_batch(i, snt_index, beg_snt, beg_batch, end_batch)

            [loss,err,pout] = net(inp,lab,test_flag,rank)

            if multi_gpu:
                loss = loss.mean()
                err = err.mean()

            if rnn == 1:
                pout = pout.view(pout.shape[0]*pout.shape[1],pout.shape[2])
            if int(rank) == 0:
                kaldi_io.write_mat(post_file, pout.data.cpu().numpy()-log_prior, data_name[i])

            if do_training:
                # Only ``optimizer_worker`` exists in this function.
                optimizer_worker.zero_grad()
                loss.backward()
                optimizer_worker.step()

            loss_sum = loss_sum+loss.data
            err_sum = err_sum+err.data

            beg_batch = end_batch
            end_batch = beg_batch+batch_size
    else:
        m = 0
        # Flag used for the main pass; validation overrides it temporarily.
        main_test_flag = test_flag

        for e in range(num_epoch):
            print("Batch size: ",m)
            _, st_epoch_time = timestamp(), resource_usage(RUSAGE_SELF)

            if e > 0:
                # ---- pass over the development set ----
                dev_N_batches = dev_N_snt
                if e > 1:
                    temp_err = dev_err_sum_tot

                net.eval()
                test_flag = 1
                dev_loss_sum = 0
                dev_err_sum = 0
                dev_beg_snt = 0

                _, st_val_time = timestamp(), resource_usage(RUSAGE_SELF)
                for j in range(dev_N_batches):
                    end_snt = dev_data_end_index[j]
                    dev_inp = Variable(dev_data_set[dev_beg_snt:end_snt,0:N_fea],volatile=True).contiguous().cuda(device=device)
                    dev_lab = Variable(dev_data_set[dev_beg_snt:end_snt,N_fea],volatile=True).contiguous().long().cuda(device=device)
                    if rnn == 1:
                        # Reshape the development tensors themselves (not the
                        # last training batch).
                        dev_inp = dev_inp.view(dev_inp.shape[0],1,dev_inp.shape[1])
                        dev_lab = dev_lab.view(dev_lab.shape[0],1)
                    dev_beg_snt = dev_data_end_index[j]

                    [dev_loss,dev_err,dev_pout] = net(dev_inp,dev_lab,test_flag,rank)

                    dev_loss_sum = dev_loss_sum+dev_loss.data
                    dev_err_sum = dev_err_sum+dev_err.data

                end_val_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                val_time = end_val_time.ru_utime - st_val_time.ru_utime
                sum_val_time = sum_val_time+val_time

                print('[INFO] EPOCH: %d, In Worker: %d, val_Err: %0.3f, val_loss: %0.3f, val_time: %0.3f' % ((e+1), int(rank),dev_err_sum/dev_N_batches, dev_loss_sum/dev_N_batches, sum_val_time))
                dev_err_sum_tot = dev_err_sum/dev_N_batches

                if e > 1:
                    threshold = (temp_err-dev_err_sum_tot)/dev_err_sum_tot
                    if threshold < 0.0005:
                        # NOTE(review): only the local value changes; it is not
                        # written into optimizer_worker here.
                        lr = lr * 0.5

            net.train()
            # Validation forced test_flag to 1; restore the flag of the main pass.
            test_flag = main_test_flag

            beg_batch = 0
            end_batch = beg_batch+batch_size
            beg_snt = 0
            # Restart the sentence cursor together with the batch cursors.
            snt_index = 0

            _, st_shu_time = timestamp(), resource_usage(RUSAGE_SELF)
            np.random.shuffle(data_set_ori)

            if not(save_gpumem):
                data_set = torch.from_numpy(data_set_ori).float().cuda(device=device)
            else:
                data_set = torch.from_numpy(data_set_ori).float()

            N_fea = data_set.shape[1]-1
            options.input_dim = N_fea
            N_out = int(data_set[:,N_fea].max()-data_set[:,N_fea].min()+1)
            options.num_classes = N_out
            end_shu_time,_ = resource_usage(RUSAGE_SELF), timestamp()
            shu_time = end_shu_time.ru_utime - st_shu_time.ru_utime
            sum_shu_time = sum_shu_time+shu_time

            loss_sum = 0
            err_sum = 0

            for i in range(N_batches):
                _, st_load_time = timestamp(), resource_usage(RUSAGE_SELF)
                end_load_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                load_time = end_load_time.ru_utime - st_load_time.ru_utime

                inp, lab, snt_index, beg_snt = _make_batch(i, snt_index, beg_snt, beg_batch, end_batch)

                [loss,err,pout] = net(inp,lab,test_flag,rank)

                if multi_gpu:
                    loss = loss.mean()
                    err = err.mean()

                if do_training:
                    optimizer_worker.zero_grad()
                    loss.backward()

                    _,st_update_time = timestamp(), resource_usage(RUSAGE_SELF)
                    main_class.ensure_shared_params(net,rank)
                    end_update_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                    update_time = end_update_time.ru_utime-st_update_time.ru_utime

                    _,st_model_time = timestamp(), resource_usage(RUSAGE_SELF)
                    end_model_time,_ = resource_usage(RUSAGE_SELF), timestamp()
                    model_time = end_model_time.ru_utime-st_model_time.ru_utime

                sum_update_time = sum_update_time + update_time
                sum_load_time = sum_load_time+load_time
                sum_model_time = sum_model_time+model_time

                loss_sum = loss_sum+loss.data
                err_sum = err_sum+err.data

                if i%100 == 0 and i != 0:
                    print('[INFO] EPOCH: %d, Batch: %d, In Worker: %d, Err: %0.3f, loss: %0.3f, update_time: %0.3f, load_time: %0.3f' % ((e+1),i, int(rank),err_sum/i, loss_sum/i,sum_update_time,sum_load_time))

                beg_batch = end_batch
                end_batch = beg_batch+batch_size
                m = m+1

            end_epoch_time,_ = resource_usage(RUSAGE_SELF), timestamp()
            epoch_time = end_epoch_time.ru_utime - st_epoch_time.ru_utime
            sum_epoch_time = sum_epoch_time+epoch_time

            # Checkpoint after every epoch.
            if do_training:
                checkpoint = {'model_par': net.state_dict(), 'optimizer_par' : optimizer_worker.state_dict()}
                torch.save(checkpoint,options.out_file)

    loss_tot = loss_sum/(N_batches)
    err_tot = err_sum/(N_batches)

    end_train_time,_ = resource_usage(RUSAGE_SELF), timestamp()
    train_time = end_train_time.ru_utime - st_train_time.ru_utime

    if do_training:
        checkpoint = {'model_par': net.state_dict(), 'optimizer_par' : optimizer_worker.state_dict()}
        torch.save(checkpoint,options.out_file)

    info_file = out_file.replace(".pkl",".info")

    with open(info_file, "a") as inf:
        inf.write("model_in=%s\n" %(pt_file))
        inf.write("fea_in=%s\n" %(fea_scp))
        inf.write("loss=%f\n" %(loss_tot))
        inf.write("err=%f\n" %(err_tot))
        inf.write("all_time=%f\n" %(train_time))
        inf.write("shu_time=%f\n" %(sum_shu_time))
        inf.write("model load time=%f\n" %(sum_load_time))
        inf.write("gradient send time=%f\n" %(sum_update_time))
        inf.write("val data calculate time=%f\n" %(sum_val_time))
        inf.write("data generate time=%f\n" %(data_time))
        inf.write("model update time=%f\n" %(sum_model_time))
        inf.write("epoch time=%f\n" %((sum_epoch_time-sum_load_time-sum_update_time-sum_model_time-sum_val_time)/num_epoch))
        inf.write("training time=%f\n" %(train_time-sum_load_time-sum_update_time-sum_val_time-data_time-sum_model_time-sum_shu_time))

    if do_forward:
        post_file.close()
# Gradient Clipping (th 0.1) #for net in nns.keys(): # torch.nn.utils.clip_grad_norm_(nns[net].parameters(), 0.1) for opt in optimizers.keys(): if not (strtobool(config[arch_dict[opt][0]]['arch_freeze'])): optimizers[opt].step() if to_do == 'forward': for out_id in range(len(forward_outs)): out_save = outs_dict[forward_outs[out_id]].data.cpu().numpy() if forward_normalize_post[out_id]: # read the config file counts = load_counts(forward_count_files[out_id]) out_save = out_save - np.log(counts / np.sum(counts)) # save the output kaldi_io.write_mat(post_file[forward_outs[out_id]], out_save, data_name[i]) else: loss_sum = loss_sum + outs_dict['loss_final'].detach() err_sum = err_sum + outs_dict['err_final'].detach() # update it to the next batch beg_batch = end_batch end_batch = beg_batch + batch_size # Progress bar if to_do == 'train':
# set the next epoch learning rate for param_group in optimizer.param_groups: param_group['lr'] = lr # ---EVALUATION OF TEST---# beg_snt = 0 err_sum = 0.0 loss_sum = 0.0 n_te_snt = len(te_name) net.eval() if ep == N_ep: # set folder for posteriors ark post_file = kaldi_io.open_or_fd(options.out_folder + '/pout_test.ark', 'wb') counts = load_counts(count_file) for i in range(n_te_snt): end_snt = te_end_index[i] inp = Variable(te_set[beg_snt:end_snt, 0:N_fea], volatile=True) lab = Variable(te_set[beg_snt:end_snt, N_fea], volatile=True) if save_gpumem and use_cuda: inp = inp.cuda() lab = lab.cuda() [loss, err, pout, pred] = net(inp, lab) if ep == N_ep: # writing the ark containing the normalized posterior probabilities (needed for kaldi decoding)
net.train() test_flag=0 N_batches=int(N_snt/batch_size) if rnn==0: N_ex_tr=data_set.shape[0] N_batches=int(N_ex_tr/batch_size) if do_eval: N_batches=N_snt net.eval() test_flag=1 batch_size=1 if do_forward: post_file=kaldi_io.open_or_fd(out_file,'wb') counts = load_counts(count_file) beg_batch=0 end_batch=batch_size snt_index=0 beg_snt=0 loss_sum=0 err_sum=0 for i in range(N_batches):
def run_nn_refac01(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,cfg_file,processed_first,next_config_file):
    """Process the current data chunk as described by ``cfg_file``.

    Inputs and references are kept as two separate tensors
    (``data_set['input']`` / ``data_set['ref']``) with their own sentence end
    indices (``data_end_index['fea']`` / ``data_end_index['lab']``). When
    ``next_config_file`` is given, the next chunk is read while the current
    one is processed.

    Args:
        data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict:
            Data of the current chunk as returned by the previous call; the
            data entries are re-read from ``cfg_file`` when
            ``processed_first`` is true.
        cfg_file: Path of the chunk-specific config file.
        processed_first: When true, the current chunk is read here.
        next_config_file: Config file of the next chunk, or ``None``.

    Returns:
        ``[data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict]``
        for the next chunk, or a list of six ``None`` when
        ``next_config_file`` is ``None``.

    Raises:
        SystemExit: with status 1 when ``cfg_file`` does not exist.
        ValueError: when ``to_do`` is not 'train', 'valid' or 'forward'.
    """

    def _read_chunk_specific_config(cfg_file):
        # Parses the chunk config; a missing file ends the program with an error status.
        if not os.path.exists(cfg_file):
            sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
            sys.exit(1)
        config = configparser.ConfigParser()
        config.read(cfg_file)
        return config

    def _get_batch_size_from_config(config, to_do):
        # Batch size of the requested step; forward runs one sentence at a time.
        if to_do == 'train':
            return int(config['batches']['batch_size_train'])
        if to_do == 'valid':
            return int(config['batches']['batch_size_valid'])
        if to_do == 'forward':
            return 1
        raise ValueError("unsupported to_do value %r (expected 'train', 'valid' or 'forward')" % (to_do,))

    def _initialize_random_seed(config):
        # Seeds torch, random and numpy with the seed of the config.
        seed = int(config['exp']['seed'])
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

    def _load_model_and_optimizer(fea_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do):
        # Builds networks and optimizers and restores pre-trained parameters.
        inp_out_dict = fea_dict
        nns, costs = model_init(inp_out_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do)
        optimizers = optimizer_init(nns,config,arch_dict)
        for net in nns.keys():
            pt_file_arch = config[arch_dict[net][0]]['arch_pretrain_file']
            if pt_file_arch != 'none':
                if use_cuda:
                    checkpoint_load = torch.load(pt_file_arch)
                else:
                    checkpoint_load = torch.load(pt_file_arch, map_location='cpu')
                nns[net].load_state_dict(checkpoint_load['model_par'])
                if net in optimizers:
                    optimizers[net].load_state_dict(checkpoint_load['optimizer_par'])
                    optimizers[net].param_groups[0]['lr'] = float(config[arch_dict[net][0]]['arch_lr']) # loading lr of the cfg file for pt
            if multi_gpu:
                nns[net] = torch.nn.DataParallel(nns[net])
        return nns, costs, optimizers, inp_out_dict

    def _open_forward_output_files_and_get_file_handles(forward_outs, require_decodings, info_file, output_folder):
        # Opens one posterior file per forward output, keyed by output name.
        post_file = {}
        for out_id, out_name in enumerate(forward_outs):
            if require_decodings[out_id]:
                out_file = info_file.replace('.info','_'+out_name+'_to_decode.ark')
            else:
                out_file = info_file.replace('.info','_'+out_name+'.ark')
            post_file[out_name] = open_or_fd(out_file,output_folder,'wb')
        return post_file

    def _get_batch_config(data_set_input, seq_model, to_do, data_name, batch_size):
        # Number of batches: per sentence for sequential/forward runs, per example otherwise.
        N_snt = None
        N_ex_tr = None
        if seq_model or to_do == 'forward':
            N_snt = len(data_name)
            N_batches = int(N_snt/batch_size)
        else:
            N_ex_tr = data_set_input.shape[0]
            N_batches = int(N_ex_tr/batch_size)
        return N_snt, N_ex_tr, N_batches

    def _prepare_input(snt_index, batch_size, inp_dim, ref_dim, beg_snt_fea, beg_snt_lab, data_end_index_fea, data_end_index_lab, beg_batch, end_batch, seq_model, arr_snt_len_fea, arr_snt_len_lab, data_set_inp, data_set_ref, use_cuda):
        # Builds the input and reference tensors of the next batch and returns
        # them with the advanced sentence cursors. ``to_do`` comes from the
        # enclosing function.

        def _zero_padding(inp, ref, max_len_fea, max_len_lab, data_end_index_fea, data_end_index_lab, data_set_inp, data_set_ref, beg_snt_fea, beg_snt_lab, snt_index, k):
            # Copies sentence ``snt_index`` into column ``k`` of the padded batch tensors.
            snt_len_fea = data_end_index_fea[snt_index] - beg_snt_fea
            snt_len_lab = data_end_index_lab[snt_index] - beg_snt_lab
            N_zeros_fea = max_len_fea - snt_len_fea
            N_zeros_lab = max_len_lab - snt_len_lab
            if N_zeros_fea == N_zeros_lab:
                # Same amount of padding on both sides: shift input and
                # reference by the same random offset.
                N_zeros_fea_left = random.randint(0,N_zeros_fea)
                N_zeros_lab_left = N_zeros_fea_left
            else:
                N_zeros_fea_left = 0
                N_zeros_lab_left = 0
            inp[N_zeros_fea_left:N_zeros_fea_left+snt_len_fea,k,:] = data_set_inp[beg_snt_fea:beg_snt_fea+snt_len_fea,:]
            ref[N_zeros_lab_left:N_zeros_lab_left+snt_len_lab,k,:] = data_set_ref[beg_snt_lab:beg_snt_lab+snt_len_lab,:]
            return inp, ref, snt_len_fea, snt_len_lab

        if len(data_set_ref.shape) == 1:
            # The slicing below needs a 2-D reference; ``view`` is a tensor
            # method (``shape`` itself has none).
            data_set_ref = data_set_ref.view((data_set_ref.shape[0], 1))

        # Defaults for the branches that do no padding or work per batch.
        max_len_fea = 0
        max_len_lab = 0
        snt_len_fea = 0
        snt_len_lab = 0

        if seq_model:
            max_len_fea = int(max(arr_snt_len_fea[snt_index:snt_index+batch_size]))
            max_len_lab = int(max(arr_snt_len_lab[snt_index:snt_index+batch_size]))
            inp = torch.zeros(max_len_fea,batch_size,inp_dim).contiguous()
            ref = torch.zeros(max_len_lab,batch_size,ref_dim).contiguous()
            for k in range(batch_size):
                inp, ref, snt_len_fea, snt_len_lab = _zero_padding(inp, ref, max_len_fea, max_len_lab, data_end_index_fea, data_end_index_lab, data_set_inp, data_set_ref, beg_snt_fea, beg_snt_lab, snt_index, k)
                beg_snt_fea = data_end_index_fea[snt_index]
                beg_snt_lab = data_end_index_lab[snt_index]
                snt_index = snt_index + 1
        elif to_do != 'forward':
            # NOTE(review): assumes input and reference rows are aligned
            # one-to-one in non-sequential mode - confirm against the reader.
            inp = data_set_inp[beg_batch:end_batch,:].contiguous()
            ref = data_set_ref[beg_batch:end_batch,:].contiguous()
        else:
            snt_len_fea = data_end_index_fea[snt_index] - beg_snt_fea
            snt_len_lab = data_end_index_lab[snt_index] - beg_snt_lab
            inp = data_set_inp[beg_snt_fea:beg_snt_fea+snt_len_fea,:].contiguous()
            ref = data_set_ref[beg_snt_lab:beg_snt_lab+snt_len_lab,:].contiguous()
            beg_snt_fea = data_end_index_fea[snt_index]
            beg_snt_lab = data_end_index_lab[snt_index]
            snt_index = snt_index + 1

        if use_cuda:
            inp = inp.cuda()
            ref = ref.cuda()
        return inp, ref, max_len_fea, max_len_lab, snt_len_fea, snt_len_lab, beg_snt_fea, beg_snt_lab, snt_index

    def _optimization_step(optimizers, outs_dict, config, arch_dict):
        # Back-propagates the final loss and steps every optimizer that is not frozen.
        for opt in optimizers.keys():
            optimizers[opt].zero_grad()
        outs_dict['loss_final'].backward()
        for opt in optimizers.keys():
            if not(strtobool(config[arch_dict[opt][0]]['arch_freeze'])):
                optimizers[opt].step()

    def _update_progress_bar(to_do, i, N_batches, loss_sum):
        # Prints the progress line for batch ``i``.
        if to_do == 'train':
            status_string = "Training | (Batch "+str(i+1)+"/"+str(N_batches)+")"
            if i != N_batches-1:
                # the running loss is left out of the very last update
                status_string = status_string+" | L:" +str(round(loss_sum.cpu().item()/(i+1),3))
        elif to_do == 'valid':
            status_string = "Validating | (Batch "+str(i+1)+"/"+str(N_batches)+")"
        else:
            status_string = "Forwarding | (Batch "+str(i+1)+"/"+str(N_batches)+")"
        progress(i, N_batches, status=status_string)

    def _write_info_file(info_file, to_do, loss_tot, err_tot, elapsed_time_chunk):
        # Writes the [results] section; loss and error are skipped in forward mode.
        with open(info_file, "w") as text_file:
            text_file.write("[results]\n")
            if to_do != 'forward':
                text_file.write("loss=%s\n" % loss_tot.cpu().numpy())
                text_file.write("err=%s\n" % err_tot.cpu().numpy())
            text_file.write("elapsed_time_chunk=%f\n" % elapsed_time_chunk)

    def _save_model(to_do, nns, multi_gpu, optimizers, info_file, arch_dict):
        # After training, stores one checkpoint per network next to the info file.
        if to_do != 'train':
            return
        for net in nns.keys():
            checkpoint = {}
            if multi_gpu:
                checkpoint['model_par'] = nns[net].module.state_dict()
            else:
                checkpoint['model_par'] = nns[net].state_dict()
            if net in optimizers:
                checkpoint['optimizer_par'] = optimizers[net].state_dict()
            else:
                checkpoint['optimizer_par'] = dict()
            out_file = info_file.replace('.info','_'+arch_dict[net][0]+'.pkl')
            torch.save(checkpoint, out_file)

    def _get_dim_from_data_set(data_set_inp, data_set_ref):
        # Width of the input and of the reference (1 for a 1-D reference).
        inp_dim = data_set_inp.shape[1]
        ref_dim = 1
        if len(data_set_ref.shape) > 1:
            ref_dim = data_set_ref.shape[1]
        return inp_dim, ref_dim

    config = _read_chunk_specific_config(cfg_file)

    # Imported after the config check so that a missing config is reported
    # before any project module is loaded.
    from data_io import read_lab_fea_refac01 as read_lab_fea
    from utils import forward_model_refac01 as forward_model

    _initialize_random_seed(config)

    output_folder = config['exp']['out_folder']
    use_cuda = strtobool(config['exp']['use_cuda'])
    multi_gpu = strtobool(config['exp']['multi_gpu'])
    to_do = config['exp']['to_do']
    info_file = config['exp']['out_info']
    model = config['model']['model'].split('\n')
    forward_outs = config['forward']['forward_out'].split(',')
    forward_normalize_post = list(map(strtobool,config['forward']['normalize_posteriors'].split(',')))
    forward_count_files = config['forward']['normalize_with_counts_from'].split(',')
    require_decodings = list(map(strtobool,config['forward']['require_decoding'].split(',')))
    save_gpumem = strtobool(config['exp']['save_gpumem'])
    is_production = strtobool(config['exp']['production'])
    batch_size = _get_batch_size_from_config(config, to_do)

    if processed_first:
        shared_list = list()
        read_next_chunk_into_shared_list_with_subprocess(read_lab_fea, shared_list, cfg_file, is_production, output_folder, wait_for_process=True)
        data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, arch_dict, data_set_dict = extract_data_from_shared_list(shared_list)
        data_set_inp, data_set_ref = convert_numpy_to_torch(data_set_dict, save_gpumem, use_cuda)
    else:
        data_set_inp = data_set['input']
        data_set_ref = data_set['ref']
        data_end_index_fea = data_end_index['fea']
        data_end_index_lab = data_end_index['lab']

    shared_list = list()
    data_loading_process = None
    if next_config_file is not None:
        data_loading_process = read_next_chunk_into_shared_list_with_subprocess(read_lab_fea, shared_list, next_config_file, is_production, output_folder, wait_for_process=False)

    nns, costs, optimizers, inp_out_dict = _load_model_and_optimizer(fea_dict,model,config,arch_dict,use_cuda,multi_gpu,to_do)

    post_file = {}
    log_priors = {}
    if to_do == 'forward':
        post_file = _open_forward_output_files_and_get_file_handles(forward_outs, require_decodings, info_file, output_folder)
        for out_id in range(len(forward_outs)):
            if forward_normalize_post[out_id]:
                # The counts do not change between batches: read them once.
                counts = load_counts(forward_count_files[out_id])
                log_priors[out_id] = np.log(counts/np.sum(counts))

    seq_model = is_sequential_dict(config,arch_dict)
    N_snt, N_ex_tr, N_batches = _get_batch_config(data_set_inp, seq_model, to_do, data_name, batch_size)

    beg_batch = 0
    end_batch = batch_size
    snt_index = 0
    beg_snt_fea = 0
    beg_snt_lab = 0

    arr_snt_len_fea = shift(shift(data_end_index_fea, -1,0) - data_end_index_fea,1,0)
    arr_snt_len_lab = shift(shift(data_end_index_lab, -1,0) - data_end_index_lab,1,0)
    arr_snt_len_fea[0] = data_end_index_fea[0]
    arr_snt_len_lab[0] = data_end_index_lab[0]

    data_set_inp_dim, data_set_ref_dim = _get_dim_from_data_set(data_set_inp, data_set_ref)

    loss_sum = 0
    err_sum = 0

    # Pre-bound so that the clean-up below also works with zero batches.
    inp = None
    ref = None
    outs_dict = None

    start_time = time.time()
    try:
        for i in range(N_batches):
            inp, ref, max_len_fea, max_len_lab, snt_len_fea, snt_len_lab, beg_snt_fea, beg_snt_lab, snt_index = _prepare_input(snt_index, batch_size, data_set_inp_dim, data_set_ref_dim, beg_snt_fea, beg_snt_lab, data_end_index_fea, data_end_index_lab, beg_batch, end_batch, seq_model, arr_snt_len_fea, arr_snt_len_lab, data_set_inp, data_set_ref, use_cuda)

            if to_do == 'train':
                outs_dict = forward_model(fea_dict, lab_dict, arch_dict, model, nns, costs, inp, ref, inp_out_dict, max_len_fea, max_len_lab, batch_size, to_do, forward_outs)
                _optimization_step(optimizers, outs_dict, config, arch_dict)
            else:
                with torch.no_grad():
                    outs_dict = forward_model(fea_dict, lab_dict, arch_dict, model, nns, costs, inp, ref, inp_out_dict, max_len_fea, max_len_lab, batch_size, to_do, forward_outs)

            if to_do == 'forward':
                for out_id, out_name in enumerate(forward_outs):
                    out_save = outs_dict[out_name].data.cpu().numpy()
                    if out_id in log_priors:
                        out_save = out_save-log_priors[out_id]
                    # batch_size is 1 here, so batch i is sentence i
                    write_mat(output_folder,post_file[out_name], out_save, data_name[i])
            else:
                loss_sum = loss_sum+outs_dict['loss_final'].detach()
                err_sum = err_sum+outs_dict['err_final'].detach()

            beg_batch = end_batch
            end_batch = beg_batch+batch_size

            _update_progress_bar(to_do, i, N_batches, loss_sum)

        elapsed_time_chunk = time.time() - start_time
    finally:
        # Close the posterior files even when a batch fails.
        for out_handle in post_file.values():
            out_handle.close()

    loss_tot = None
    err_tot = None
    if to_do != 'forward':
        # Averages are only reported for train/valid; skipping them in forward
        # mode avoids a division by zero on a chunk without sentences.
        loss_tot = loss_sum/N_batches
        err_tot = err_sum/N_batches

    del inp, ref, outs_dict, data_set_inp_dim, data_set_ref_dim

    _save_model(to_do, nns, multi_gpu, optimizers, info_file, arch_dict)
    _write_info_file(info_file, to_do, loss_tot, err_tot, elapsed_time_chunk)

    if data_loading_process is None:
        return [None,None,None,None,None,None]

    data_loading_process.join()
    data_name, data_end_index_fea, data_end_index_lab, fea_dict, lab_dict, arch_dict, data_set_dict = extract_data_from_shared_list(shared_list)
    data_set_inp, data_set_ref = convert_numpy_to_torch(data_set_dict, save_gpumem, use_cuda)
    data_set = {'input': data_set_inp, 'ref': data_set_ref}
    data_end_index = {'fea': data_end_index_fea,'lab': data_end_index_lab}
    return [data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]