# NOTE: the training routines in this file come from different modules of the same
# ADMM-pruning project. The imports below cover the standard-library and PyTorch names
# referenced explicitly in this file; project-local helpers (admm, read_lab_fea,
# model_init, forward_model, ...) are assumed importable from the repository.
import configparser
import math
import os
import random
import sys
import threading
import time
from distutils.util import strtobool

import numpy as np
import torch
import torch.nn.functional as F

import admm  # project-local module: ADMM(), admm_update(), append_admm_loss(), masking(), ...


def train(config, ADMM, device, train_loader, optimizer, epoch):
    config.model.train()
    adv_loss = None
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        if config.gpu is not None:
            data = data.cuda(config.gpu, non_blocking=True)
            target = target.cuda(config.gpu, non_blocking=True)

        optimizer.zero_grad()
        nat_output, adv_output, pert_inputs = config.model(data, target)
        nat_loss = F.cross_entropy(nat_output, target)
        adv_loss = F.cross_entropy(adv_output, target)

        if config.admm:
            admm.admm_update(config, ADMM, device, train_loader, optimizer, epoch, data, batch_idx)  # update Z and U
            adv_loss, admm_loss, mixed_loss = admm.append_admm_loss(config, ADMM, adv_loss)  # append admm loss

        if config.admm:
            mixed_loss.backward()
        else:
            adv_loss.backward()
            # nat_loss.backward()

        if config.masked_progressive:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.zero_masks:
                        W.grad *= config.zero_masks[name]

        if config.masked_retrain:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.masks:
                        W.grad *= config.masks[name]

        optimizer.step()

        if batch_idx % config.print_freq == 0:
            print("nat_cross_entropy loss: {}  adv_cross_entropy loss: {}".format(nat_loss, adv_loss))
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), adv_loss.item()))
def train(lr, epoch=0):
    # NOTE: `model`, `corpus`, `train_data`, `criterion`, `args`, `stage`, `ADMM` and
    # `config` are not parameters here; the original script relies on module-level globals.
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)

        # if args.admm:
        if stage == 'admm':
            ce_loss = loss
            admm.admm_update(args, ADMM, model, None, None, None, epoch, None, batch)  # update Z and U
            ce_loss, admm_loss, mixed_loss = admm.append_admm_loss(args, ADMM, model, ce_loss)  # append admm loss
            loss = mixed_loss

        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        if stage == 'masked_retrain':
            for name, W in model.named_parameters():
                if name in config.masks:
                    W.grad.data *= config.masks[name]

        # Manual SGD update. `p.data.add_(-lr, p.grad.data)` uses the old two-argument
        # overload; on recent PyTorch versions the equivalent call is
        # `p.data.add_(p.grad.data, alpha=-lr)`.
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
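# For reference: `repackage_hidden`, used above to cut the backprop graph at batch
# boundaries, is not defined in this file. A minimal sketch, matching the helper from the
# PyTorch word-language-model example that this loop appears to be based on:
def repackage_hidden(h):
    """Wrap hidden states in new Tensors to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)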
def train(config, ADMM, device, train_loader, criterion, optimizer, scheduler, epoch):
    config.model.train()
    ce_loss = None
    for batch_idx, (data, target) in enumerate(train_loader):
        # adjust learning rate
        if config.admm:
            admm.admm_adjust_learning_rate(optimizer, epoch, config)
        else:
            if scheduler is not None:
                scheduler.step()

        data, target = data.to(device), target.to(device)
        if config.gpu is not None:
            data = data.cuda(config.gpu, non_blocking=True)
            target = target.cuda(config.gpu, non_blocking=True)

        if config.mixup:
            data, target_a, target_b, lam = mixup_data(data, target, config.alpha)

        optimizer.zero_grad()
        output = config.model(data)

        if config.mixup:
            ce_loss = mixup_criterion(criterion, output, target_a, target_b, lam, config.smooth)
        else:
            ce_loss = criterion(output, target, smooth=config.smooth)

        if config.admm:
            admm.admm_update(config, ADMM, device, train_loader, optimizer, epoch, data, batch_idx)  # update Z and U
            ce_loss, admm_loss, mixed_loss = admm.append_admm_loss(config, ADMM, ce_loss)  # append admm loss

        if config.admm:
            mixed_loss.backward()
        else:
            ce_loss.backward()

        if config.masked_progressive:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.zero_masks:
                        W.grad *= config.zero_masks[name]

        if config.masked_retrain:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.masks:
                        W.grad *= config.masks[name]

        optimizer.step()

        if batch_idx % config.print_freq == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), ce_loss.item()))
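# For reference: `mixup_data` / `mixup_criterion`, called in the loops above and below, are
# not defined in this file. A minimal sketch following the standard mixup recipe; note that
# the calls above also pass a project-specific `smooth` label-smoothing flag and a `criterion`
# that accepts it, which this sketch omits:
def mixup_data(x, y, alpha=1.0):
    """Mix pairs of examples with a Beta(alpha, alpha) coefficient; return both label sets."""
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
    index = torch.randperm(x.size(0), device=x.device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    return mixed_x, y, y[index], lam


def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Loss for mixed inputs: the same convex combination applied to the two targets."""
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)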
def train(train_loader, config, ADMM, criterion, optimizer, scheduler, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    config.model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # adjust learning rate
        if config.admm:
            admm.admm_adjust_learning_rate(optimizer, epoch, config)
        else:
            scheduler.step()

        input = input.cuda(config.gpu, non_blocking=True)
        target = target.cuda(config.gpu)
        data = input

        if config.mixup:
            input, target_a, target_b, lam = mixup_data(input, target, config.alpha)

        # compute output
        output = config.model(input)

        if config.mixup:
            ce_loss = mixup_criterion(criterion, output, target_a, target_b, lam, config.smooth)
        else:
            ce_loss = criterion(output, target, smooth=config.smooth)

        if config.admm:
            # NOTE: `device` is not a parameter of this function; the original file relies
            # on a module-level global here.
            admm.admm_update(config, ADMM, device, train_loader, optimizer, epoch, data, i)  # update Z and U
            ce_loss, admm_loss, mixed_loss = admm.append_admm_loss(config, ADMM, ce_loss)  # append admm loss

        # measure accuracy and record loss
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        losses.update(ce_loss.item(), input.size(0))
        top1.update(acc1[0], input.size(0))
        top5.update(acc5[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if config.admm:
            mixed_loss.backward()
        else:
            ce_loss.backward()

        if config.masked_progressive:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.zero_masks:
                        W.grad *= config.zero_masks[name]

        if config.masked_retrain:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.masks:
                        W.grad *= config.masks[name]

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))
            print("cross_entropy loss: {}".format(ce_loss))
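# For reference: `accuracy(output, target, topk=(1, 5))`, used above and below, is not
# defined in this file. A minimal sketch following the usual top-k helper from the PyTorch
# ImageNet example (returns percentages, one tensor per requested k):
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy (as a percentage) for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        _, pred = output.topk(maxk, 1, True, True)  # indices of the k most likely classes
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res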
def train(train_loader, criterion, optimizer, epoch, config):
    # NOTE: `scheduler`, `ADMM` and `device` are not parameters of this function; the
    # original file relies on module-level globals for them.
    batch_time = AverageMeter()
    data_time = AverageMeter()
    nat_losses = AverageMeter()
    adv_losses = AverageMeter()
    nat_loss = 0
    adv_loss = 0
    nat_top1 = AverageMeter()
    adv_top1 = AverageMeter()

    # switch to train mode
    config.model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # adjust learning rate
        if config.admm:
            admm.admm_adjust_learning_rate(optimizer, epoch, config)
        else:
            scheduler.step()

        if config.gpu is not None:
            input = input.cuda(config.gpu, non_blocking=True)
            target = target.cuda(config.gpu, non_blocking=True)

        if config.mixup:
            input, target_a, target_b, lam = mixup_data(input, target, config.alpha)

        # compute output
        nat_output, adv_output, pert_inputs = config.model(input, target)

        if config.mixup:
            adv_loss = mixup_criterion(criterion, adv_output, target_a, target_b, lam, config.smooth)
            nat_loss = mixup_criterion(criterion, nat_output, target_a, target_b, lam, config.smooth)
        else:
            adv_loss = criterion(adv_output, target, smooth=config.smooth)
            nat_loss = criterion(nat_output, target, smooth=config.smooth)

        if config.admm:
            admm.admm_update(config, ADMM, device, train_loader, optimizer, epoch, input, i)  # update Z and U
            adv_loss, admm_loss, mixed_loss = admm.append_admm_loss(config, ADMM, adv_loss)  # append admm loss

        # measure accuracy and record loss
        nat_acc1, _ = accuracy(nat_output, target, topk=(1, 5))
        adv_acc1, _ = accuracy(adv_output, target, topk=(1, 5))

        nat_losses.update(nat_loss.item(), input.size(0))
        adv_losses.update(adv_loss.item(), input.size(0))
        adv_top1.update(adv_acc1[0], input.size(0))
        nat_top1.update(nat_acc1[0], input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        if config.admm:
            mixed_loss.backward()
        else:
            adv_loss.backward()

        if config.masked_progressive:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.zero_masks:
                        W.grad *= config.zero_masks[name]

        if config.masked_retrain:
            with torch.no_grad():
                for name, W in config.model.named_parameters():
                    if name in config.masks:
                        # masks[name] is a boolean array marking weights above the pruning threshold
                        W.grad *= config.masks[name]

        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Nat_Loss {nat_loss.val:.4f} ({nat_loss.avg:.4f})\t'
                  'Nat_Acc@1 {nat_top1.val:.3f} ({nat_top1.avg:.3f})\t'
                  'Adv_Loss {adv_loss.val:.4f} ({adv_loss.avg:.4f})\t'
                  'Adv_Acc@1 {adv_top1.val:.3f} ({adv_top1.avg:.3f})\t'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, nat_loss=nat_losses, nat_top1=nat_top1,
                      adv_loss=adv_losses, adv_top1=adv_top1))
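# For reference: `AverageMeter`, used by the loops above to track running statistics, is not
# defined in this file. A minimal sketch with the val/avg/sum/count fields assumed by the
# print format strings above:
class AverageMeter(object):
    """Keep the latest value and a running average of a scalar quantity."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count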
def run_admm(data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict, cfg_file,
             processed_first, next_config_file, ADMM, masks, ep, ck):
    # This function processes the current chunk using the information in cfg_file.
    # In parallel, the next chunk is loaded into CPU memory.

    # Reading chunk-specific cfg file (first argument - mandatory file)
    if not os.path.exists(cfg_file):
        sys.stderr.write('ERROR: The config file %s does not exist!\n' % cfg_file)
        sys.exit(0)
    else:
        config = configparser.ConfigParser()
        config.read(cfg_file)

    # Setting torch seed
    seed = int(config['exp']['seed'])
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # Reading config parameters
    output_folder = config['exp']['out_folder']
    multi_gpu = strtobool(config['exp']['multi_gpu'])
    to_do = config['exp']['to_do']
    info_file = config['exp']['out_info']
    model = config['model']['model'].split('\n')
    forward_outs = config['forward']['forward_out'].split(',')
    forward_normalize_post = list(map(strtobool, config['forward']['normalize_posteriors'].split(',')))
    forward_count_files = config['forward']['normalize_with_counts_from'].split(',')
    require_decodings = list(map(strtobool, config['forward']['require_decoding'].split(',')))
    use_cuda = strtobool(config['exp']['use_cuda'])
    save_gpumem = strtobool(config['exp']['save_gpumem'])
    is_production = strtobool(config['exp']['production'])

    if to_do == 'train':
        batch_size = int(config['batches']['batch_size_train'])
    if to_do == 'valid':
        batch_size = int(config['batches']['batch_size_valid'])
    if to_do == 'forward':
        batch_size = 1

    # ***** Reading the Data *****
    if processed_first:
        # the ADMM initialization work is done here as well

        # Reading all the features and labels for this chunk
        shared_list = []
        p = threading.Thread(target=read_lab_fea, args=(cfg_file, is_production, shared_list, output_folder,))
        p.start()
        p.join()

        data_name = shared_list[0]
        data_end_index = shared_list[1]
        fea_dict = shared_list[2]
        lab_dict = shared_list[3]
        arch_dict = shared_list[4]
        data_set = shared_list[5]

        # converting numpy tensors into pytorch tensors and putting them on GPUs if specified
        if not save_gpumem and use_cuda:
            data_set = torch.from_numpy(data_set).float().cuda()
        else:
            data_set = torch.from_numpy(data_set).float()

    # Reading all the features and labels for the next chunk
    shared_list = []
    p = threading.Thread(target=read_lab_fea, args=(next_config_file, is_production, shared_list, output_folder,))
    p.start()

    # Reading model and initializing networks
    inp_out_dict = fea_dict

    [nns, costs] = model_init(inp_out_dict, model, config, arch_dict, use_cuda, multi_gpu, to_do)

    if processed_first:
        ADMM = admm.ADMM(config, nns)

    # optimizers initialization
    optimizers = optimizer_init(nns, config, arch_dict)

    # pre-training and multi-gpu init
    for net in nns.keys():
        pt_file_arch = config[arch_dict[net][0]]['arch_pretrain_file']
        if pt_file_arch != 'none':
            checkpoint_load = torch.load(pt_file_arch)
            nns[net].load_state_dict(checkpoint_load['model_par'])
            optimizers[net].load_state_dict(checkpoint_load['optimizer_par'])
            optimizers[net].param_groups[0]['lr'] = float(config[arch_dict[net][0]]['arch_lr'])  # loading lr of the cfg file for pt
        if multi_gpu:
            nns[net] = torch.nn.DataParallel(nns[net])

    if to_do == 'forward':
        post_file = {}
        for out_id in range(len(forward_outs)):
            if require_decodings[out_id]:
                out_file = info_file.replace('.info', '_' + forward_outs[out_id] + '_to_decode.ark')
            else:
                out_file = info_file.replace('.info', '_' + forward_outs[out_id] + '.ark')
            post_file[forward_outs[out_id]] = open_or_fd(out_file, output_folder, 'wb')

    if strtobool(config['exp']['retrain']) and processed_first and strtobool(config['exp']['masked_progressive']):
        # make sure small weights are pruned and confirm the accuracy
        print("<============ masking both weights and gradients for retrain")
        masks = admm.masking(config, ADMM, nns)
        print("<============ all masking statistics")
        masks = admm.zero_masking(config, nns)
        print("<============ testing sparsity before retrain")
        admm.test_sparsity(config, nns, ADMM)

    if strtobool(config['exp']['masked_progressive']) and processed_first and strtobool(config['exp']['admm']):
        masks = admm.zero_masking(config, nns)

    # check automatically if the model is sequential
    seq_model = is_sequential_dict(config, arch_dict)

    # ***** Minibatch Processing loop *****
    if seq_model or to_do == 'forward':
        N_snt = len(data_name)
        N_batches = int(N_snt / batch_size)
    else:
        N_ex_tr = data_set.shape[0]
        N_batches = int(N_ex_tr / batch_size)

    beg_batch = 0
    end_batch = batch_size
    snt_index = 0
    beg_snt = 0

    start_time = time.time()

    # array of sentence lengths
    arr_snt_len = shift(shift(data_end_index, -1, 0) - data_end_index, 1, 0)
    arr_snt_len[0] = data_end_index[0]

    loss_sum = 0
    err_sum = 0

    inp_dim = data_set.shape[1]
    for i in range(N_batches):

        max_len = 0

        if seq_model:
            max_len = int(max(arr_snt_len[snt_index:snt_index + batch_size]))
            inp = torch.zeros(max_len, batch_size, inp_dim).contiguous()

            for k in range(batch_size):
                snt_len = data_end_index[snt_index] - beg_snt
                N_zeros = max_len - snt_len

                # Append a random number of initial zeros; the others go at the end.
                # Randomizing could have a regularization effect.
                N_zeros_left = random.randint(0, N_zeros)
                inp[N_zeros_left:N_zeros_left + snt_len, k, :] = data_set[beg_snt:beg_snt + snt_len, :]

                beg_snt = data_end_index[snt_index]
                snt_index = snt_index + 1
        else:
            # features and labels for batch i
            if to_do != 'forward':
                inp = data_set[beg_batch:end_batch, :].contiguous()
            else:
                snt_len = data_end_index[snt_index] - beg_snt
                inp = data_set[beg_snt:beg_snt + snt_len, :].contiguous()
                beg_snt = data_end_index[snt_index]
                snt_index = snt_index + 1

        # use cuda
        if use_cuda:
            inp = inp.cuda()

        if to_do == 'train':
            # Forward input, with autograd graph active
            outs_dict = forward_model(fea_dict, lab_dict, arch_dict, model, nns, costs, inp,
                                      inp_out_dict, max_len, batch_size, to_do, forward_outs)

            if strtobool(config['exp']['admm']):
                batch_idx = i + ck
                admm.admm_update(config, ADMM, nns, ep, batch_idx)  # update Z and U
                outs_dict['loss_final'], admm_loss, mixed_loss = admm.append_admm_loss(
                    config, ADMM, nns, outs_dict['loss_final'])  # append admm loss

            for opt in optimizers.keys():
                optimizers[opt].zero_grad()

            if strtobool(config['exp']['admm']):
                mixed_loss.backward()
            else:
                outs_dict['loss_final'].backward()

            if strtobool(config['exp']['masked_progressive']) and not strtobool(config['exp']['retrain']):
                with torch.no_grad():
                    for net in nns.keys():
                        for name, W in nns[net].named_parameters():
                            if name in masks:
                                W.grad *= masks[name]
                        break

            if strtobool(config['exp']['retrain']):
                with torch.no_grad():
                    for net in nns.keys():
                        for name, W in nns[net].named_parameters():
                            if name in masks:
                                W.grad *= masks[name]
                        break

            # Gradient Clipping (th 0.1)
            # for net in nns.keys():
            #     torch.nn.utils.clip_grad_norm_(nns[net].parameters(), 0.1)

            for opt in optimizers.keys():
                if not strtobool(config[arch_dict[opt][0]]['arch_freeze']):
                    optimizers[opt].step()
        else:
            with torch.no_grad():  # Forward input without autograd graph (saves memory)
                outs_dict = forward_model(fea_dict, lab_dict, arch_dict, model, nns, costs, inp,
                                          inp_out_dict, max_len, batch_size, to_do, forward_outs)

        if to_do == 'forward':
            for out_id in range(len(forward_outs)):
                out_save = outs_dict[forward_outs[out_id]].data.cpu().numpy()

                if forward_normalize_post[out_id]:
                    # read the count file and normalize the posteriors
                    counts = load_counts(forward_count_files[out_id])
                    out_save = out_save - np.log(counts / np.sum(counts))

                # save the output
                write_mat(output_folder, post_file[forward_outs[out_id]], out_save, data_name[i])
        else:
            loss_sum = loss_sum + outs_dict['loss_final'].detach()
            err_sum = err_sum + outs_dict['err_final'].detach()

        # update indices for the next batch
        beg_batch = end_batch
        end_batch = beg_batch + batch_size

        # Progress bar
        if to_do == 'train':
            status_string = "Training | (Batch " + str(i + 1) + "/" + str(N_batches) + ")" + \
                            " | L:" + str(round(loss_sum.cpu().item() / (i + 1), 3))
            if i == N_batches - 1:
                status_string = "Training | (Batch " + str(i + 1) + "/" + str(N_batches) + ")"
        if to_do == 'valid':
            status_string = "Validating | (Batch " + str(i + 1) + "/" + str(N_batches) + ")"
        if to_do == 'forward':
            status_string = "Forwarding | (Batch " + str(i + 1) + "/" + str(N_batches) + ")"

        progress(i, N_batches, status=status_string)

    elapsed_time_chunk = time.time() - start_time

    loss_tot = loss_sum / N_batches
    err_tot = err_sum / N_batches

    # clearing memory
    del inp, outs_dict, data_set

    # save the model
    if to_do == 'train':
        for net in nns.keys():
            checkpoint = {}
            if multi_gpu:
                checkpoint['model_par'] = nns[net].module.state_dict()
            else:
                checkpoint['model_par'] = nns[net].state_dict()

            checkpoint['optimizer_par'] = optimizers[net].state_dict()

            out_file = info_file.replace('.info', '_' + arch_dict[net][0] + '.pkl')
            torch.save(checkpoint, out_file)

    if to_do == 'forward':
        for out_name in forward_outs:
            post_file[out_name].close()

    # Write info file
    with open(info_file, "w") as text_file:
        text_file.write("[results]\n")
        if to_do != 'forward':
            text_file.write("loss=%s\n" % loss_tot.cpu().numpy())
            text_file.write("err=%s\n" % err_tot.cpu().numpy())
        text_file.write("elapsed_time_chunk=%f\n" % elapsed_time_chunk)

    text_file.close()

    # Getting the data for the next chunk (read in parallel)
    p.join()
    data_name = shared_list[0]
    data_end_index = shared_list[1]
    fea_dict = shared_list[2]
    lab_dict = shared_list[3]
    arch_dict = shared_list[4]
    data_set = shared_list[5]

    # converting numpy tensors into pytorch tensors and putting them on GPUs if specified
    if not save_gpumem and use_cuda:
        data_set = torch.from_numpy(data_set).float().cuda()
    else:
        data_set = torch.from_numpy(data_set).float()

    return [data_name, data_set, data_end_index, fea_dict, lab_dict, arch_dict, masks, ADMM]