def train(self, dataset, dataset_eval=None):

    # Obtain needed information
    data_size = dataset.data_size
    token_size = dataset.token_size
    ans_size = dataset.ans_size
    pretrained_emb = dataset.pretrained_emb

    # Define the MCAN model
    net = Net(self.__C, pretrained_emb, token_size, ans_size)
    net.cuda()
    net.train()

    # Define the Question-only model
    qnet = QNet(self.__C, pretrained_emb, token_size, ans_size)
    qnet.cuda()
    qnet.train()

    # Watch net & qnet
    wandb.watch(net)
    wandb.watch(qnet)

    # Define the multi-gpu training if needed
    if self.__C.N_GPU > 1:
        net = nn.DataParallel(net, device_ids=self.__C.DEVICES)

    # Define the binary cross entropy losses
    # loss_fn = torch.nn.BCELoss(size_average=False).cuda()
    loss_qm = torch.nn.BCELoss(reduction='sum').cuda()
    loss_qo = torch.nn.BCELoss(reduction='sum').cuda()

    # Load checkpoint if resume training
    if self.__C.RESUME:  # default -> False
        print(' ========== Resume training')

        if self.__C.CKPT_PATH is not None:
            print('Warning: you are now using CKPT_PATH args, '
                  'CKPT_VERSION and CKPT_EPOCH will not work')
            path = self.__C.CKPT_PATH
        else:
            path = self.__C.CKPTS_PATH + \
                   'ckpt_' + self.__C.CKPT_VERSION + \
                   '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

        # Load the network parameters
        print('Loading ckpt {}'.format(path))
        ckpt = torch.load(path)
        print('Finish!')
        net.load_state_dict(ckpt['state_dict'])

        # Load the optimizer parameters
        # params = list(net.parameters()) + list(qnet.parameters())
        optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
        optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
        optim.optimizer.load_state_dict(ckpt['optimizer'])

        # The checkpoint only stores the MCAN optimizer state, so create a fresh
        # optimizer for qnet here (otherwise optim_q is undefined when resuming).
        optim_q = get_optim(self.__C, qnet, data_size)

        start_epoch = self.__C.CKPT_EPOCH
    else:
        if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
            shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
        os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

        # params = net.parameters() + qnet.parameters()
        optim = get_optim(self.__C, net, data_size)
        optim_q = get_optim(self.__C, qnet, data_size)
        start_epoch = 0

    loss_sum = 0
    L_qo_sum = 0
    L_qm_sum = 0
    named_params = list(net.named_parameters()) + list(qnet.named_parameters())
    grad_norm = np.zeros(len(named_params))

    # Define multi-thread dataloader
    if self.__C.SHUFFLE_MODE in ['external']:
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=False,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )
    else:
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=True,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )

    # Training script
    for epoch in range(start_epoch, self.__C.MAX_EPOCH):

        # Save log information
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'nowTime: ' +
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
            '\n'
        )
        logfile.close()

        # Learning Rate Decay
        if epoch in self.__C.LR_DECAY_LIST:
            adjust_lr(optim, self.__C.LR_DECAY_R)
            adjust_lr(optim_q, self.__C.LR_DECAY_R)

        # Externally shuffle
        if self.__C.SHUFFLE_MODE == 'external':
            shuffle_list(dataset.ans_list)

        time_start = time.time()

        # Iteration
        for step, (img_feat_iter, ques_ix_iter, ans_iter) in enumerate(dataloader):
            optim.zero_grad()
            optim_q.zero_grad()

            img_feat_iter = img_feat_iter.cuda()
            ques_ix_iter = ques_ix_iter.cuda()
            ans_iter = ans_iter.cuda()

            for accu_step in range(self.__C.GRAD_ACCU_STEPS):
                sub_img_feat_iter = \
                    img_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_ques_ix_iter = \
                    ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_ans_iter = \
                    ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                             (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                out, q_emb, lang_feat_mask = net(sub_img_feat_iter, sub_ques_ix_iter)
                pred_qo, q_out = qnet(q_emb, lang_feat_mask)

                # print(pred_qo.shape, sub_ans_iter.shape)
                # print(torch.argmax(sub_ans_iter.long(), dim=1))
                ans_idx = torch.argmax(sub_ans_iter.long(), dim=1)
                pred_idx = torch.argmax(pred_qo.long(), dim=1)  # predicted answer index from QO

                qo_scale = pred_qo.detach().clone()
                for i in range(self.__C.SUB_BATCH_SIZE):
                    if ans_idx[i] == pred_idx[i]:
                        qo_scale[i, :] = torch.ones(3129)  # 3129 = answer vocabulary size

                # Fused loss: scale the fusion logits by the question-only scores
                # (a standalone toy version of this fused loss follows this function)
                L_qo = loss_qo(q_out, sub_ans_iter)
                L_qm = loss_qm(torch.sigmoid(out * torch.sigmoid(qo_scale)), sub_ans_iter)
                # L_qo = loss_qo(q_out, sub_ans_iter)
                # L_qm = loss_qm(torch.sigmoid(out * torch.sigmoid(pred_qo)), sub_ans_iter)
                loss = L_qo + L_qm

                # Only a mean-reduced loss needs to be divided by GRAD_ACCU_STEPS.
                # Removing this line wouldn't change our results because of how the
                # Adam optimizer rescales gradients, but it would be necessary with SGD.
                # loss /= self.__C.GRAD_ACCU_STEPS
                loss.backward()

                loss_sum += loss.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS
                L_qo_sum += L_qo.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS
                L_qm_sum += L_qm.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS

                wandb.log({
                    "Training loss": loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                    "Question only loss": L_qo.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                    "Fusion loss": L_qm.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE
                })

                # Tracking training loss
                if self.__C.VERBOSE:  # print loss every step -> True
                    if dataset_eval is not None:
                        mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val']
                    else:
                        mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test']

                    print(
                        "\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % (
                            self.__C.VERSION,
                            epoch + 1,
                            step,
                            int(data_size / self.__C.BATCH_SIZE),
                            mode_str,
                            loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                            optim._rate
                        ),
                        end=' '
                    )

            # Gradient norm clipping
            if self.__C.GRAD_NORM_CLIP > 0:
                nn.utils.clip_grad_norm_(net.parameters(), self.__C.GRAD_NORM_CLIP)

            # Save the gradient information
            for name in range(len(named_params)):
                norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                    if named_params[name][1].grad is not None else 0
                grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS
                # print('Param %-3s Name %-80s Grad_Norm %-20s' %
                #       (str(grad_wt),
                #        params[grad_wt][0],
                #        str(norm_v)))

            optim.step()
            optim_q.step()

        time_end = time.time()
        print('Finished in {}s'.format(int(time_end - time_start)))
        # print('')

        epoch_finish = epoch + 1

        # Save checkpoint
        state = {
            'state_dict': net.state_dict(),
            'optimizer': optim.optimizer.state_dict(),
            'lr_base': optim.lr_base
        }
        torch.save(
            state,
            self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
            '/epoch' + str(epoch_finish) + '.pkl'
        )

        # Logging
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'epoch = ' + str(epoch_finish) +
            ' Q loss = ' + str(L_qo_sum / data_size) +
            ' fusion loss = ' + str(L_qm_sum / data_size) +
            ' loss = ' + str(loss_sum / data_size) +
            '\n' +
            'lr = ' + str(optim._rate) +
            '\n\n'
        )
        logfile.close()

        # Eval after every epoch
        if dataset_eval is not None:
            self.eval(dataset_eval, state_dict=net.state_dict(), valid=True)

        # if self.__C.VERBOSE:
        #     logfile = open(
        #         self.__C.LOG_PATH +
        #         'log_run_' + self.__C.VERSION + '.txt',
        #         'a+'
        #     )
        #     for name in range(len(named_params)):
        #         logfile.write(
        #             'Param %-3s Name %-80s Grad_Norm %-25s\n' % (
        #                 str(name),
        #                 named_params[name][0],
        #                 str(grad_norm[name] / data_size * self.__C.BATCH_SIZE)
        #             )
        #         )
        #     logfile.write('\n')
        #     logfile.close()

        loss_sum = 0
        L_qo_sum = 0
        L_qm_sum = 0
        grad_norm = np.zeros(len(named_params))
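# A minimal standalone sketch of the fused loss computed above, run on dummy
# tensors so it can be checked without the dataset or the models. The sub-batch
# size (2) is an assumption; the answer-vocabulary size (3129) is taken from the
# loop above. `out` stands in for the fusion logits from net(...) and `pred_qo`
# for the question-only scores from qnet(...).
import torch

def fused_loss_demo():
    sub_batch, ans_size = 2, 3129
    out = torch.randn(sub_batch, ans_size)       # fusion logits
    pred_qo = torch.rand(sub_batch, ans_size)    # question-only scores
    sub_ans = torch.rand(sub_batch, ans_size)    # soft VQA answer targets in [0, 1]

    qo_scale = pred_qo.detach().clone()
    loss_qm = torch.nn.BCELoss(reduction='sum')
    # Same form as L_qm above: BCE on the question-scaled, squashed fusion logits
    return loss_qm(torch.sigmoid(out * torch.sigmoid(qo_scale)), sub_ans)

if __name__ == '__main__':
    print(fused_loss_demo().item())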
def train(self, dataset, dataset_eval=None):

    # 1.3 Before training starts, obtain the needed information: dataset size,
    #     question token vocabulary size, answer vocabulary size, and the
    #     pretrained word-embedding matrix
    data_size = dataset.data_size
    token_size = dataset.token_size  # 18405
    ans_size = dataset.ans_size
    pretrained_emb = dataset.pretrained_emb

    # 1.4 With that information, define the MCAN model; it outputs the fused
    #     multi-modal feature proj_feat
    net = Net(self.__C, pretrained_emb, token_size, ans_size)
    net.cuda()
    net.train()  # 1.5 switch the model to training mode (what comes before this step, and what after?)

    # Define multi-GPU training if needed
    if self.__C.N_GPU > 1:
        net = nn.DataParallel(net, device_ids=self.__C.DEVICES)

    loss_fn = torch.nn.BCELoss(reduction='sum').cuda()

    # Load the checkpoint if resuming training
    if self.__C.RESUME:
        print(' ========== Resume training ==========')

        if self.__C.CKPT_PATH is not None:
            print('Warning: you are now using CKPT_PATH args, '
                  'CKPT_VERSION and CKPT_EPOCH will not work')
            path = self.__C.CKPT_PATH  # CKPT_PATH must point to a checkpoint here; it cannot be None
        else:
            path = self.__C.CKPTS_PATH + \
                   'ckpt_' + self.__C.CKPT_VERSION + \
                   '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

        # Load the network parameters
        print('Loading ckpt {}'.format(path))
        ckpt = torch.load(path)
        print('Parameters loaded!')
        # ............ what data does state_dict store here? ............
        net.load_state_dict(ckpt['state_dict'])

        # Load the optimizer parameters
        optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
        optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
        optim.optimizer.load_state_dict(ckpt['optimizer'])

        # epoch
        start_epoch = self.__C.CKPT_EPOCH

    # Otherwise, train from scratch
    else:
        if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
            shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
        os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

        optim = get_optim(self.__C, net, data_size)
        start_epoch = 0

    loss_sum = 0
    named_params = list(net.named_parameters())  # (name, parameter) pairs
    grad_norm = np.zeros(len(named_params))      # accumulated gradient norms

    # Define the multi-worker dataloader
    if self.__C.SHUFFLE_MODE in ['external']:
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=False,
            num_workers=self.__C.NUM_WORKERS,  # number of worker processes
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )
    else:
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=True,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )

    # Training loop (MAX_EPOCH is set to 1 here)
    for epoch in range(start_epoch, self.__C.MAX_EPOCH):

        # Save log information
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        # Write the log entry
        logfile.write(
            'nowTime: ' +
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
            '\n'
        )
        logfile.close()

        # Learning rate decay
        if epoch in self.__C.LR_DECAY_LIST:
            adjust_lr(optim, self.__C.LR_DECAY_R)

        # Externally shuffle
        if self.__C.SHUFFLE_MODE == 'external':
            shuffle_list(dataset.ans_list)

        time_start = time.time()

        # Iterate over image features, question token indices, and answers
        for step, (img_feat_iter, ques_ix_iter, ans_iter) in enumerate(dataloader):
            optim.zero_grad()  # clear accumulated gradients

            img_feat_iter = img_feat_iter.cuda()
            ques_ix_iter = ques_ix_iter.cuda()
            ans_iter = ans_iter.cuda()

            # GRAD_ACCU_STEPS accumulates gradients to work around limited GPU memory;
            # it effectively enlarges the batch size. With batch_size = 6 and 24 samples
            # in total, there are 24 / 6 = 4 parameter updates; with grad_accu_steps = 2
            # each forward pass only sees 6 / 2 = 3 samples, yet the number of updates
            # stays the same. (A standalone sketch of this pattern follows this function.)
            for accu_step in range(self.__C.GRAD_ACCU_STEPS):
                sub_img_feat_iter = \
                    img_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_ques_ix_iter = \
                    ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_ans_iter = \
                    ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                             (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                pred = net(
                    sub_img_feat_iter,  # [5, 100, 2048]
                    sub_ques_ix_iter    # [5, 14]
                )

                loss = loss_fn(pred, sub_ans_iter)
                # only a mean-reduced loss would need to be divided by grad_accu_steps
                loss.backward()  # backpropagate to compute (and accumulate) gradients
                loss_sum += loss.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS

                # Print the loss of every training step
                if self.__C.VERBOSE:
                    if dataset_eval is not None:
                        mode_str = self.__C.SPLIT['train']
                    else:
                        mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['train']

                    print(
                        "\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % (
                            self.__C.VERSION,
                            epoch + 1,
                            step,
                            int(data_size / self.__C.BATCH_SIZE),
                            mode_str,
                            loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                            optim._rate
                        ),
                        end=' '
                    )

            # Gradient norm clipping
            if self.__C.GRAD_NORM_CLIP > 0:
                nn.utils.clip_grad_norm_(net.parameters(), self.__C.GRAD_NORM_CLIP)

            # Save the gradient information
            for name in range(len(named_params)):
                norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                    if named_params[name][1].grad is not None else 0
                grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS

            optim.step()

        # with open('One_epoch_data.txt', 'w') as F:
        #     F.write(net.state_dict() + optim.optimizer.state_dict() + optim.lr_base)

        time_end = time.time()
        print('Finished in {}s'.format(int(time_end - time_start)))
        # print('')

        epoch_finish = epoch + 1

        # Save checkpoint
        state = {
            'state_dict': net.state_dict(),
            'optimizer': optim.optimizer.state_dict(),
            'lr_base': optim.lr_base
        }
        print("=========== state of the trained model =====")
        print(state)
        torch.save(
            state,
            self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
            '/epoch' + str(epoch_finish) + '.pkl'
        )

        # Open the log file and record the epoch summary
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'epoch = ' + str(epoch_finish) +
            ' loss = ' + str(loss_sum / data_size) +
            '\n' +
            'lr = ' + str(optim._rate) +
            '\n\n'
        )
        logfile.close()

        # Evaluate the model after every epoch
        if dataset_eval is not None:
            self.eval(dataset_eval, state_dict=net.state_dict(), valid=True)

        loss_sum = 0
        grad_norm = np.zeros(len(named_params))
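# A self-contained sketch of the gradient-accumulation pattern described in the
# comments above, using hypothetical numbers: a batch of 6 split into
# GRAD_ACCU_STEPS = 2 sub-batches of 3. Because backward() accumulates gradients,
# the single optimizer step after the inner loop sees the gradient of the whole
# batch. The tiny Linear model and soft targets are stand-ins, not project code.
import torch

def grad_accumulation_demo():
    torch.manual_seed(0)
    model = torch.nn.Linear(8, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = torch.nn.BCELoss(reduction='sum')

    x = torch.randn(6, 8)   # full batch
    y = torch.rand(6, 4)    # soft targets, as in VQA
    grad_accu_steps, sub_batch = 2, 3

    optimizer.zero_grad()
    for accu_step in range(grad_accu_steps):
        sub_x = x[accu_step * sub_batch:(accu_step + 1) * sub_batch]
        sub_y = y[accu_step * sub_batch:(accu_step + 1) * sub_batch]
        loss = loss_fn(torch.sigmoid(model(sub_x)), sub_y)
        loss.backward()     # gradients are summed across sub-batches
    optimizer.step()        # one update for the whole batch of 6

if __name__ == '__main__':
    grad_accumulation_demo()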
def train(self, dataset):
    net = Net(
        self.__C,
    )
    net.cuda()
    net.train()

    # Create checkpoint directory
    if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
        shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
    os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

    loader_params = {'batch_size': 16, 'num_gpus': 1}
    dataloader = TheLoader.from_dataset(dataset, **loader_params)

    loss_sum = 0
    named_params = list(net.named_parameters())
    grad_norm = np.zeros(len(named_params))

    loss_fn = torch.nn.NLLLoss().cuda()

    # Load checkpoint if resume training
    if self.__C.RESUME:
        print(' ========== Resume training')
        path = self.__C.CKPTS_PATH + \
               'ckpt_' + self.__C.CKPT_VERSION + \
               '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

        # Load the network parameters
        print('Loading ckpt {}'.format(path))
        ckpt = torch.load(path)
        print('Finish!')
        net.load_state_dict(ckpt['state_dict'])

        # Load the optimizer parameters
        optim = get_optim(self.__C, net, len(dataloader), ckpt['lr_base'])
        optim._step = int(len(dataloader) / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
        optim.optimizer.load_state_dict(ckpt['optimizer'])
        start_epoch = self.__C.CKPT_EPOCH
    else:
        optim = get_optim(self.__C, net, len(dataloader))
        start_epoch = 0

    for epoch in range(start_epoch, self.__C.MAX_EPOCH):
        print("Training epoch...", epoch)

        # Learning Rate Decay
        if epoch in self.__C.LR_DECAY_LIST:
            adjust_lr(optim, self.__C.LR_DECAY_R)

        time_start = time.time()
        print("time_start:", time_start)

        pred_argmax = []
        # (a plausible time_batch helper is sketched after this function)
        for b, (time_per_batch, batch) in enumerate(time_batch(dataloader)):
            optim.zero_grad()

            x, goldsentence = net(**batch)
            goldsentence = goldsentence[:, 1:]
            x = x[:, :31, :]
            pred_argmax = np.argmax(x.cpu().data.numpy(), axis=2)

            loss = loss_fn(x.permute(0, 2, 1), goldsentence)
            loss /= self.__C.GRAD_ACCU_STEPS
            loss.backward()
            loss_sum += loss.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS

            mode_str = self.__C.SPLIT['train']
            print("\r[version %s][epoch %2d][%s] loss: %.4f, lr: %.2e" % (
                self.__C.VERSION,
                epoch + 1,
                mode_str,
                loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                optim._rate
            ), end=' ')

            # Gradient norm clipping
            if self.__C.GRAD_NORM_CLIP > 0:
                nn.utils.clip_grad_norm_(
                    net.parameters(),
                    self.__C.GRAD_NORM_CLIP
                )

            # Save the gradient information
            for name in range(len(named_params)):
                norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                    if named_params[name][1].grad is not None else 0
                grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS

            optim.step()

        time_end = time.time()
        print('Finished in {}s'.format(int(time_end - time_start)))

        epoch_finish = epoch + 1
        loss_sum = 0
        grad_norm = np.zeros(len(named_params))

        # Save checkpoint
        state = {
            'state_dict': net.state_dict(),
            'optimizer': optim.optimizer.state_dict(),
            'lr_base': optim.lr_base
        }
        torch.save(
            state,
            self.__C.CKPTS_PATH +
            'ckpt_' + self.__C.VERSION +
            '/epoch' + str(epoch_finish) + '.pkl'
        )
        print("Gold sentence: ", str(goldsentence.cpu().data))
        print("A sample prediction: ", pred_argmax)
        print("Checkpoint saved. ")
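# `time_batch` is not defined in this file. A plausible implementation -- an
# assumption, not the project's actual helper -- is a generator that yields the
# time spent producing each batch together with the batch itself, which matches
# the `for b, (time_per_batch, batch) in enumerate(time_batch(dataloader))`
# usage above. Sketched here under a distinct name to avoid shadowing the real one.
import time

def time_batch_sketch(iterable):
    start = time.time()
    for batch in iterable:
        elapsed = time.time() - start   # seconds spent fetching this batch
        yield elapsed, batch
        start = time.time()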
def train(self, dataset, dataset_eval=None):

    # Obtain needed information
    data_size = dataset.data_size
    token_size = dataset.token_size
    ans_size = dataset.ans_size
    pretrained_emb = dataset.pretrained_emb

    # Define the MCAN model
    net = Net(
        self.__C,
        pretrained_emb,
        token_size,
        ans_size
    )
    net.cuda()
    net.train()

    # Define the multi-gpu training if needed
    if self.__C.N_GPU > 1:
        net = nn.DataParallel(net, device_ids=self.__C.DEVICES)

    # Define the binary cross entropy loss
    # loss_fn = torch.nn.BCELoss(size_average=False).cuda()
    loss_fn = torch.nn.BCELoss(reduction='sum').cuda()

    # Load checkpoint if resume training
    if self.__C.RESUME:
        print(' ========== Resume training')

        if self.__C.CKPT_PATH is not None:
            print('Warning: you are now using CKPT_PATH args, '
                  'CKPT_VERSION and CKPT_EPOCH will not work')
            path = self.__C.CKPT_PATH
        else:
            path = self.__C.CKPTS_PATH + \
                   'ckpt_' + self.__C.CKPT_VERSION + \
                   '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

        # Load the network parameters
        print('Loading ckpt {}'.format(path))
        ckpt = torch.load(path)
        print('Finish!')
        net.load_state_dict(ckpt['state_dict'])

        # Load the optimizer parameters
        # (a hypothetical warm-up wrapper with this interface is sketched after this function)
        optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
        optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
        optim.optimizer.load_state_dict(ckpt['optimizer'])

        start_epoch = self.__C.CKPT_EPOCH
    else:
        if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
            shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
        os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

        optim = get_optim(self.__C, net, data_size)
        start_epoch = 0

    loss_sum = 0
    named_params = list(net.named_parameters())
    grad_norm = np.zeros(len(named_params))

    # Define multi-thread dataloader
    if self.__C.SHUFFLE_MODE in ['external']:
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=False,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )
    else:
        dataloader = Data.DataLoader(
            dataset,
            batch_size=self.__C.BATCH_SIZE,
            shuffle=True,
            num_workers=self.__C.NUM_WORKERS,
            pin_memory=self.__C.PIN_MEM,
            drop_last=True
        )

    # Training script
    for epoch in range(start_epoch, self.__C.MAX_EPOCH):

        # Save log information
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'nowTime: ' +
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
            '\n'
        )
        logfile.close()

        # Learning Rate Decay
        if epoch in self.__C.LR_DECAY_LIST:
            adjust_lr(optim, self.__C.LR_DECAY_R)

        # Externally shuffle
        if self.__C.SHUFFLE_MODE == 'external':
            shuffle_list(dataset.ans_list)

        time_start = time.time()

        # Iteration
        for step, (
                img_feat_iter,
                ques_ix_iter,
                ans_iter,
                fact_idx_iter,
        ) in enumerate(dataloader):

            optim.zero_grad()

            img_feat_iter = img_feat_iter.cuda()
            ques_ix_iter = ques_ix_iter.cuda()
            ans_iter = ans_iter.cuda()
            fact_idx_iter = fact_idx_iter.cuda()

            for accu_step in range(self.__C.GRAD_ACCU_STEPS):
                sub_img_feat_iter = \
                    img_feat_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_ques_ix_iter = \
                    ques_ix_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                 (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_ans_iter = \
                    ans_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                             (accu_step + 1) * self.__C.SUB_BATCH_SIZE]
                sub_fact_idx_iter = \
                    fact_idx_iter[accu_step * self.__C.SUB_BATCH_SIZE:
                                  (accu_step + 1) * self.__C.SUB_BATCH_SIZE]

                pred = net(
                    sub_img_feat_iter,
                    sub_ques_ix_iter,
                    sub_fact_idx_iter,
                )

                loss = loss_fn(pred, sub_ans_iter)
                loss /= self.__C.GRAD_ACCU_STEPS
                loss.backward()
                loss_sum += loss.cpu().data.numpy() * self.__C.GRAD_ACCU_STEPS

                if self.__C.VERBOSE:
                    if dataset_eval is not None:
                        mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val']
                    else:
                        mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test']

                    print("\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % (
                        self.__C.VERSION,
                        epoch + 1,
                        step,
                        int(data_size / self.__C.BATCH_SIZE),
                        mode_str,
                        loss.cpu().data.numpy() / self.__C.SUB_BATCH_SIZE,
                        optim._rate
                    ), end=' ')

            # Gradient norm clipping
            if self.__C.GRAD_NORM_CLIP > 0:
                nn.utils.clip_grad_norm_(
                    net.parameters(),
                    self.__C.GRAD_NORM_CLIP
                )

            # Save the gradient information
            for name in range(len(named_params)):
                norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy() \
                    if named_params[name][1].grad is not None else 0
                grad_norm[name] += norm_v * self.__C.GRAD_ACCU_STEPS
                # print('Param %-3s Name %-80s Grad_Norm %-20s' %
                #       (str(grad_wt),
                #        params[grad_wt][0],
                #        str(norm_v)))

            optim.step()

        time_end = time.time()
        print('Finished in {}s'.format(int(time_end - time_start)))
        # print('')

        epoch_finish = epoch + 1

        # Save checkpoint
        state = {
            'state_dict': net.state_dict(),
            'optimizer': optim.optimizer.state_dict(),
            'lr_base': optim.lr_base
        }
        torch.save(
            state,
            self.__C.CKPTS_PATH +
            'ckpt_' + self.__C.VERSION +
            '/epoch' + str(epoch_finish) + '.pkl'
        )

        # Logging
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'epoch = ' + str(epoch_finish) +
            ' loss = ' + str(loss_sum / data_size) +
            '\n' +
            'lr = ' + str(optim._rate) +
            '\n\n'
        )
        logfile.close()

        # Eval after every epoch
        if dataset_eval is not None:
            self.eval(
                dataset_eval,
                state_dict=net.state_dict(),
                valid=True
            )

        # if self.__C.VERBOSE:
        #     logfile = open(
        #         self.__C.LOG_PATH +
        #         'log_run_' + self.__C.VERSION + '.txt',
        #         'a+'
        #     )
        #     for name in range(len(named_params)):
        #         logfile.write(
        #             'Param %-3s Name %-80s Grad_Norm %-25s\n' % (
        #                 str(name),
        #                 named_params[name][0],
        #                 str(grad_norm[name] / data_size * self.__C.BATCH_SIZE)
        #             )
        #         )
        #     logfile.write('\n')
        #     logfile.close()

        loss_sum = 0
        grad_norm = np.zeros(len(named_params))
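# The training code above relies on an optimizer wrapper exposing `_step`, `_rate`,
# `lr_base`, and an inner `optimizer`, plus an `adjust_lr` helper. The project's own
# implementation is not reproduced here; the sketch below is a hypothetical minimal
# wrapper with the same interface (warm-up to a base learning rate), so the resume
# logic (restoring `_step` and the inner optimizer state) can be read against
# something concrete. Names carry a `_sketch`/`Sketch` suffix to avoid clashing
# with the real helpers.
import torch

class WarmupOptimSketch(object):
    def __init__(self, params, lr_base, warmup_steps=2000):
        self.lr_base = lr_base
        self.warmup_steps = warmup_steps
        self._step = 0                      # number of optimizer steps taken so far
        self._rate = 0.                     # current learning rate
        self.optimizer = torch.optim.Adam(params, lr=0)

    def step(self):
        self._step += 1
        # Linear warm-up to lr_base, then hold
        self._rate = self.lr_base * min(1.0, self._step / float(self.warmup_steps))
        for group in self.optimizer.param_groups:
            group['lr'] = self._rate
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()

def adjust_lr_sketch(optim, decay_r):
    # Permanently decay the base learning rate, as done at LR_DECAY_LIST epochs above
    optim.lr_base *= decay_r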
def train(self, dataset, dataset_eval=None):
    super_time_start = time.time()

    # Obtain needed information
    data_size = dataset.data_size
    token_size = dataset.token_size
    ans_size = dataset.ans_size
    pretrained_emb = dataset.pretrained_emb

    # Define the MCAN model
    net = Net(self.__C, pretrained_emb, token_size, ans_size)
    net.cuda()
    net.train()

    # Define the binary cross entropy loss
    loss_fn = torch.nn.BCELoss(reduction='sum').cuda()

    # Load checkpoint if resume training
    if self.__C.RESUME:
        print('========== Resume training')

        if self.__C.CKPT_PATH is not None:
            print(
                'Warning: you are now using CKPT_PATH args, '
                'CKPT_VERSION and CKPT_EPOCH will not work'
            )
            path = self.__C.CKPT_PATH
        else:
            path = self.__C.CKPTS_PATH + 'ckpt_' + self.__C.CKPT_VERSION \
                   + '/epoch' + str(self.__C.CKPT_EPOCH) + '.pkl'

        # Load the network parameters
        print('========== Loading ckpt {}'.format(path))
        ckpt = torch.load(path)
        print('========== Finished!')
        net.load_state_dict(ckpt['state_dict'])

        # Load the optimizer parameters
        optim = get_optim(self.__C, net, data_size, ckpt['lr_base'])
        optim._step = int(data_size / self.__C.BATCH_SIZE * self.__C.CKPT_EPOCH)
        optim.optimizer.load_state_dict(ckpt['optimizer'])
        start_epoch = self.__C.CKPT_EPOCH
    else:
        if ('ckpt_' + self.__C.VERSION) in os.listdir(self.__C.CKPTS_PATH):
            shutil.rmtree(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)
        os.mkdir(self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION)

        optim = get_optim(self.__C, net, data_size)
        start_epoch = 0

    loss_sum = 0
    named_params = list(net.named_parameters())
    grad_norm = np.zeros(len(named_params))

    # Define multi-thread dataloader
    dataloader = Data.DataLoader(
        dataset,
        batch_size=self.__C.BATCH_SIZE,
        shuffle=False,
        num_workers=self.__C.NUM_WORKERS,
        pin_memory=self.__C.PIN_MEM,
        drop_last=True
    )

    # Training script
    for epoch in range(start_epoch, self.__C.MAX_EPOCH):
        epoch_finish = epoch + 1

        # Save log information
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'nowTime: ' +
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') +
            '\n'
        )
        logfile.close()

        # Learning Rate Decay
        if epoch in self.__C.LR_DECAY_LIST:
            adjust_lr(optim, self.__C.LR_DECAY_R)

        # Externally shuffle
        shuffle_list(dataset.ans_list)

        time_start = time.time()

        # Iteration
        for step, (img_feat_iter, ques_ix_iter, ans_iter) in enumerate(dataloader):
            optim.zero_grad()

            img_feat_iter = img_feat_iter.cuda()
            ques_ix_iter = ques_ix_iter.cuda()
            ans_iter = ans_iter.cuda()

            pred = net(img_feat_iter, ques_ix_iter)

            loss = loss_fn(pred, ans_iter)
            loss.backward()
            loss_sum += loss.cpu().data.numpy()

            if self.__C.VERBOSE:  # print loss every step
                if dataset_eval is not None:
                    mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['val']
                else:
                    mode_str = self.__C.SPLIT['train'] + '->' + self.__C.SPLIT['test']

                print(
                    "\r[version %s][epoch %2d][step %4d/%4d][%s] loss: %.4f, lr: %.2e" % (
                        self.__C.VERSION,
                        epoch_finish,
                        step,
                        int(data_size / self.__C.BATCH_SIZE),
                        mode_str,
                        loss.cpu().data.numpy() / self.__C.BATCH_SIZE,
                        optim._rate
                    ),
                    end=' '
                )

            # Save the gradient information
            for name in range(len(named_params)):
                if named_params[name][1].grad is not None:
                    norm_v = torch.norm(named_params[name][1].grad).cpu().data.numpy()
                else:
                    norm_v = 0
                grad_norm[name] += norm_v

            optim.step()

        time_end = time.time()
        print('========== Finished in {}s'.format(int(time_end - time_start)))

        # Save checkpoint
        # (a reload sketch for this checkpoint format follows this function)
        state = {
            'state_dict': net.state_dict(),
            'optimizer': optim.optimizer.state_dict(),
            'lr_base': optim.lr_base
        }
        torch.save(
            state,
            self.__C.CKPTS_PATH + 'ckpt_' + self.__C.VERSION +
            '/epoch' + str(epoch_finish) + '.pkl'
        )

        # Logging
        logfile = open(
            self.__C.LOG_PATH + 'log_run_' + self.__C.VERSION + '.txt',
            'a+'
        )
        logfile.write(
            'epoch = ' + str(epoch_finish) +
            ' loss = ' + str(loss_sum / data_size) +
            '\n' +
            'lr = ' + str(optim._rate) +
            '\n\n'
        )
        logfile.close()

        # Eval after every epoch
        if dataset_eval is not None:
            self.eval(dataset_eval, state_dict=net.state_dict(), valid=True)

        loss_sum = 0
        grad_norm = np.zeros(len(named_params))

    print('========== Total Training time is {}s'.format(
        int(time.time() - super_time_start)))
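# The checkpoints saved above hold three entries: the model weights, the inner
# optimizer state, and the warm-up base learning rate. The sketch below shows one
# way such a checkpoint could be reloaded outside the RESUME branch (for example
# to evaluate an earlier epoch). The function name and arguments are hypothetical;
# `net` and `optim` are assumed to be the same Net / get_optim objects used above.
import torch

def load_checkpoint_sketch(net, optim, path, batch_size, data_size, ckpt_epoch):
    ckpt = torch.load(path)
    net.load_state_dict(ckpt['state_dict'])
    optim.optimizer.load_state_dict(ckpt['optimizer'])
    optim.lr_base = ckpt['lr_base']
    # Fast-forward the warm-up schedule to where training stopped,
    # mirroring the optim._step assignment in the RESUME branch above
    optim._step = int(data_size / batch_size * ckpt_epoch)
    return ckpt_epoch  # epoch to resume from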