def model_update(model, optimizer, criterions, x, y):
    """Run one multi-scale optimisation pass on a single batch.

    Iterates over the module-level ``size_rates`` scales, rescales the batch,
    and back-propagates the summed loss at every scale.  The prediction and
    loss that are returned are the ones computed at the native scale
    (``rate == 1``).

    Args:
        model: segmentation network; called on the (rescaled) images and
            expected to return a sequence of prediction maps, one per
            criterion in ``criterions``.
        optimizer: optimizer stepped once per scale.
        criterions: sequence of loss modules.
        x: input image batch (moved to the module-level ``DEVICE``).
        y: ground-truth mask batch; a channel dim is added via ``unsqueeze(1)``.

    Returns:
        ``(prediction, loss)`` computed at ``rate == 1``.
        NOTE(review): if 1 is ever removed from ``size_rates`` these names are
        never bound and the return raises UnboundLocalError.
    """
    x = x.to(DEVICE)
    y = y.to(DEVICE)
    for rate in size_rates:  # ``size_rates`` / ``DEVICE`` are module globals
        optimizer.zero_grad()
        # trainsize = int(round(2624*rate/32)*32)
        if rate != 1:
            # F.interpolate replaces the deprecated F.upsample (same semantics).
            images = F.interpolate(x, scale_factor=rate, mode='bilinear',
                                   align_corners=True)
            gts = F.interpolate(y.unsqueeze(1), scale_factor=rate,
                                mode='bilinear', align_corners=True)
        else:
            images = x
            gts = y.unsqueeze(1)
        # Call the module (runs hooks) rather than .forward() directly.
        predictions = model(images)
        # BUG FIX: the original comprehension was
        #   sum([criterion(predictions[i], gts) for criterion in criterions])
        # where ``i`` is undefined inside the comprehension.  Pair each
        # criterion with the prediction map at the same index instead.
        loss = sum(criterion(predictions[i], gts)
                   for i, criterion in enumerate(criterions))
        loss.backward()
        clip_gradient(optimizer, 0.5)
        optimizer.step()
        if rate == 1:
            prediction_to_return = predictions[-1]
            loss_to_return = torch.mean(loss)  # scalar: mean is a no-op, kept for parity
    return prediction_to_return, loss_to_return
def train(train_loader, model, optimizer, epoch, test_path, best):
    """One training epoch for the two-headed (attention + detection) model.

    Multi-scale training: each batch is optimised at 0.75x / 1x / 1.25x of
    ``opt.trainsize``.  After the epoch the model is validated and the best
    checkpoint is written to ``snapshots/<opt.train_save>/``.

    Args:
        train_loader: iterable of ``(images, gts)`` batches.
        model: network returning ``(attention_map, detection_map)``.
        optimizer: optimizer stepped once per scale per batch.
        epoch: current epoch number (logging only).
        test_path: validation data path forwarded to ``validation``.
        best: best mean-Dice score seen so far.

    Returns:
        The (possibly updated) best mean-Dice score.
    """
    model.train()
    size_rates = [0.75, 1, 1.25]
    loss_attention_record, loss_detection_record = AvgMeter(), AvgMeter()
    criterion = WIoUBCELoss()
    for i, pack in enumerate(train_loader, start=1):
        for rate in size_rates:
            optimizer.zero_grad()
            images, gts = pack
            images = Variable(images).cuda()
            gts = Variable(gts).cuda()
            # Round the rescaled side to a multiple of 32 (network stride).
            trainsize = int(round(opt.trainsize * rate / 32) * 32)
            if rate != 1:
                # F.interpolate replaces the deprecated F.upsample.
                images = F.interpolate(images, size=(trainsize, trainsize),
                                       mode='bilinear', align_corners=True)
                gts = F.interpolate(gts, size=(trainsize, trainsize),
                                    mode='bilinear', align_corners=True)
            attention_map, detection_map = model(images)
            loss1 = criterion(attention_map, gts)
            loss2 = criterion(detection_map, gts)
            loss = loss1 + loss2
            loss.backward()
            clip_gradient(optimizer, opt.clip)
            optimizer.step()
            # Record only the native-scale loss so the meters are comparable.
            if rate == 1:
                loss_attention_record.update(loss1.data, opt.batchsize)
                loss_detection_record.update(loss2.data, opt.batchsize)
        if i % 20 == 0 or i == total_step:
            print(
                f'{datetime.now()} Epoch [{epoch}/{opt.epoch}], Step [{i}/{total_step}], \
Loss [attention_loss: {loss_attention_record.show()}, detection_loss: {loss_detection_record.show()}]'
            )
            train_logger.info(
                f'{datetime.now()} Epoch [{epoch}/{opt.epoch}], Step [{i}/{total_step}], \
Loss [attention_loss: {loss_attention_record.show()}, detection_loss: {loss_detection_record.show()}]'
            )
    save_path = 'snapshots/{}/'.format(opt.train_save)
    os.makedirs(save_path, exist_ok=True)
    if (epoch + 1) % 1 == 0:  # validate every epoch
        meandice = validation(model, test_path)
        if meandice > best:
            best = meandice
            torch.save(model.state_dict(), save_path + 'HarDMSEG-best.pth')
            print('[Saving Snapshots:]', save_path + 'HarDMSEG-best.pth', meandice)
            # BUG FIX: the log line said "HarGMSEG-best" (typo) — keep it in
            # sync with the checkpoint file actually written.
            train_logger.info(
                f'[Saving Snapshots: {save_path + "HarDMSEG-best.pth"} {meandice}]'
            )
    return best
def train(train_loader, model, optimizer, epochs, batch_size, train_size, clip, test_path):
    """Full training loop (all epochs) for the single-output model.

    Multi-scale training at 0.75x / 1x / 1.25x of ``train_size``; validates
    after every epoch, keeps the best-Dice checkpoint, and additionally
    snapshots at epochs 50/60/70.

    Args:
        train_loader: iterable of ``(images, gts)`` batches.
        model: network returning a single prediction map.
        optimizer: optimizer; its LR is adjusted per epoch via ``adjust_lr``.
        epochs: upper bound of the epoch range.
            NOTE(review): ``range(1, epochs)`` runs ``epochs - 1`` iterations —
            the "# 99 epoch" comment suggests this is intentional; confirm.
        batch_size: batch size used for the loss meter.
        train_size: base training resolution.
        clip: gradient-clipping threshold.
        test_path: validation data path forwarded to ``validation``.
    """
    best_dice_score = 0
    for epoch in range(1, epochs):  # 99 epoch
        # ``lr`` is read from module scope here — TODO confirm it is defined.
        adjust_lr(optimizer, lr, epoch, 0.1, 200)
        model.train()
        size_rates = [0.75, 1, 1.25]
        loss_record = AvgMeter()
        criterion = WIoUBCELoss()
        for i, pack in enumerate(train_loader, start=1):
            for rate in size_rates:
                optimizer.zero_grad()
                images, gts = pack
                images = Variable(images).cuda()
                gts = Variable(gts).cuda()
                # Round the rescaled side to a multiple of 32 (network stride).
                trainsize = int(round(train_size * rate / 32) * 32)
                if rate != 1:
                    # F.interpolate replaces the deprecated F.upsample.
                    images = F.interpolate(images, size=(trainsize, trainsize),
                                           mode='bilinear', align_corners=True)
                    gts = F.interpolate(gts, size=(trainsize, trainsize),
                                        mode='bilinear', align_corners=True)
                # predict
                predict_maps = model(images)
                loss = criterion(predict_maps, gts)
                loss.backward()
                clip_gradient(optimizer, clip)
                optimizer.step()
                # Record only the native-scale loss.
                if rate == 1:
                    loss_record.update(loss.data, batch_size)
            if i % 20 == 0 or i == total_step:
                print(
                    f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], Loss: {loss_record.show()}'
                )
                train_logger.info(
                    f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], Loss: {loss_record.show()}'
                )
        save_path = 'checkpoints/'
        os.makedirs(save_path, exist_ok=True)
        if (epoch + 1) % 1 == 0:  # validate every epoch
            meandice = validation(model, test_path)
            print(f'meandice: {meandice}')
            train_logger.info(f'meandice: {meandice}')
            if meandice > best_dice_score:
                best_dice_score = meandice
                torch.save(model.state_dict(), save_path + 'effnetv2pd.pth')
                print('[Saving Snapshots:]', save_path + 'effnetv2pd.pth', meandice)
        # Fixed-epoch snapshots for later comparison.
        if epoch in [50, 60, 70]:
            file_ = 'effnetv2pd_' + str(epoch) + '.pth'
            torch.save(model.state_dict(), save_path + file_)
            print('[Saving Snapshots:]', save_path + file_, meandice)
def train(train_loader, model, optimizer, epoch, test_path):
    """One training epoch for the single-lateral-map HarD-MSEG variant.

    Multi-scale training at 0.75x / 1x / 1.25x of ``opt.trainsize``; after the
    epoch the model is evaluated and a "best" snapshot is written.

    Args:
        train_loader: iterable of ``(images, gts)`` batches.
        model: network returning one lateral map.
        optimizer: optimizer stepped once per scale per batch.
        epoch: current epoch number (logging / snapshot naming).
        test_path: evaluation data path forwarded to ``test``.
    """
    model.train()
    # ---- multi-scale training ----
    size_rates = [0.75, 1, 1.25]
    # Only the lateral-5 meter is used; the original also created
    # loss_record2/3/4 which were never read (removed as dead locals).
    loss_record5 = AvgMeter()
    for i, pack in enumerate(train_loader, start=1):
        for rate in size_rates:
            optimizer.zero_grad()
            # ---- data prepare ----
            images, gts = pack
            images = Variable(images).cuda()
            gts = Variable(gts).cuda()
            # ---- rescale (side rounded to a multiple of 32) ----
            trainsize = int(round(opt.trainsize * rate / 32) * 32)
            if rate != 1:
                # F.interpolate replaces the deprecated F.upsample.
                images = F.interpolate(images, size=(trainsize, trainsize),
                                       mode='bilinear', align_corners=True)
                gts = F.interpolate(gts, size=(trainsize, trainsize),
                                    mode='bilinear', align_corners=True)
            # ---- forward ----
            #lateral_map_5, lateral_map_4, lateral_map_3, lateral_map_2 = model(images)
            lateral_map_5 = model(images)
            # ---- loss function ----
            loss5 = structure_loss(lateral_map_5, gts)
            #loss = loss2 + 0.4*loss3 + 0.4*loss4 + 0.2*loss5   # TODO: try different weights for loss
            loss = loss5
            # ---- backward ----
            loss.backward()
            clip_gradient(optimizer, opt.clip)
            optimizer.step()
            # ---- recording loss (native scale only) ----
            if rate == 1:
                loss_record5.update(loss5.data, opt.batchsize)
        # ---- train visualization ----
        if i % 20 == 0 or i == total_step:
            print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], '
                  ' lateral-5: {:0.4f}]'.format(datetime.now(), epoch,
                                                opt.epoch, i, total_step,
                                                loss_record5.show()))
    save_path = 'snapshots/{}/'.format(opt.train_save)
    os.makedirs(save_path, exist_ok=True)
    # NOTE(review): ``best`` is reset to 0 on every call, so the comparison
    # below is always against 0 and a snapshot is saved every epoch; if a
    # cross-epoch best is intended, ``best`` should be passed in/returned
    # (interface kept unchanged here).
    best = 0
    if (epoch + 1) % 1 == 0:  # evaluate every epoch
        meandice = test(model, test_path)
        if meandice > best:
            best = meandice
            torch.save(model.state_dict(),
                       save_path + str(best) + 'HarD-MSEG-best.pth')
            print('[Saving Snapshot:]', save_path + 'HarD-MSEG-best.pth', meandice)
def train(train_loader, model, optimizer, epochs, batch_size, train_size, clip, test_path):
    """Full training loop for the four-lateral-map (PraNet-style) model.

    Multi-scale training at 0.75x / 1x / 1.25x of ``train_size``; the four
    lateral maps are supervised with equal weights.  Validates after every
    epoch and keeps the best-Dice checkpoint.

    Args:
        train_loader: iterable of ``(images, gts)`` batches.
        model: network returning four lateral maps (5, 4, 3, 2).
        optimizer: optimizer; LR adjusted per epoch via ``adjust_lr``.
        epochs: upper bound of the epoch range (``range(1, epochs)`` runs
            ``epochs - 1`` iterations — NOTE(review): confirm intended).
        batch_size: batch size used by the loss meters.
        train_size: base training resolution.
        clip: gradient-clipping threshold.
        test_path: validation data path forwarded to ``validation``.
    """
    best_dice_score = 0
    for epoch in range(1, epochs):
        # ``lr`` is read from module scope here — TODO confirm it is defined.
        adjust_lr(optimizer, lr, epoch, 0.1, 200)
        for param in optimizer.param_groups:
            print(param['lr'])
        model.train()
        size_rates = [0.75, 1, 1.25]
        loss_record2, loss_record3, loss_record4, loss_record5 = AvgMeter(
        ), AvgMeter(), AvgMeter(), AvgMeter()
        criterion = WIoUBCELoss()
        for i, pack in enumerate(train_loader, start=1):
            for rate in size_rates:
                optimizer.zero_grad()
                images, gts = pack
                images = Variable(images).cuda()
                gts = Variable(gts).cuda()
                # Round the rescaled side to a multiple of 32 (network stride).
                trainsize = int(round(train_size * rate / 32) * 32)
                if rate != 1:
                    # F.interpolate replaces the deprecated F.upsample.
                    images = F.interpolate(images, size=(trainsize, trainsize),
                                           mode='bilinear', align_corners=True)
                    gts = F.interpolate(gts, size=(trainsize, trainsize),
                                        mode='bilinear', align_corners=True)
                # predict
                lateral_map_5, lateral_map_4, lateral_map_3, lateral_map_2 = model(
                    images)
                loss5 = criterion(lateral_map_5, gts)
                loss4 = criterion(lateral_map_4, gts)
                loss3 = criterion(lateral_map_3, gts)
                loss2 = criterion(lateral_map_2, gts)
                loss = loss2 + loss3 + loss4 + loss5
                loss.backward()
                clip_gradient(optimizer, clip)
                optimizer.step()
                # Record only the native-scale losses.
                if rate == 1:
                    loss_record2.update(loss2.data, batch_size)
                    loss_record3.update(loss3.data, batch_size)
                    loss_record4.update(loss4.data, batch_size)
                    loss_record5.update(loss5.data, batch_size)
            if i % 20 == 0 or i == total_step:
                print(f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], [lateral-2: {loss_record2.show()}, lateral-3: {loss_record3.show()}, lateral-4: {loss_record4.show()}, lateral-5: {loss_record5.show()},]')
                train_logger.info(
                    f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], [lateral-2: {loss_record2.show()}, lateral-3: {loss_record3.show()}, lateral-4: {loss_record4.show()}, lateral-5: {loss_record5.show()},]')
        save_path = 'checkpoints/'
        os.makedirs(save_path, exist_ok=True)
        if (epoch + 1) % 1 == 0:  # validate every epoch
            meandice = validation(model, test_path)
            print(f'meandice: {meandice}')
            train_logger.info(f'meandice: {meandice}')
            if meandice > best_dice_score:
                best_dice_score = meandice
                torch.save(model.state_dict(), save_path + 'PraHarDNet.pth')
                print('[Saving Snapshots:]', save_path + 'PraHarDNet.pth', meandice)
def train(train_loader, encoder, decoder, criterion, enc_optimizer, dec_optimizer, epoch):
    '''
    Performs one epoch's training.

    Decodes English and Chinese captions jointly from video features and
    optimises the summed cross-entropy of both streams.

    Args:
        train_loader: yields (encap, zhcap, video, caplen_en, caplen_zh,
            enrefs, zhrefs) batches.
        encoder: video encoder; returns (init_hidden, vid_out).
        decoder: bilingual decoder; returns (scores_en, scores_zh).
        criterion: cross-entropy criterion over flattened token scores.
        enc_optimizer: optimizer for the encoder, or None when the encoder
            is frozen.
        dec_optimizer: optimizer for the decoder.
        epoch: current epoch number (unused here; kept for the caller).

    Returns:
        Mean loss per batch over the epoch (0 for an empty loader).
    '''
    encoder.train()
    decoder.train()
    avg_loss = 0
    cnt = 0  # BUG FIX: bind before the loop so an empty loader cannot leave it undefined
    for cnt, (encap, zhcap, video, caplen_en, caplen_zh, enrefs,
              zhrefs) in enumerate(train_loader, 1):
        encap, zhcap, video, caplen_en, caplen_zh = encap.cuda(), zhcap.cuda(
        ), video.cuda(), caplen_en.cuda(), caplen_zh.cuda()
        init_hidden, vid_out = encoder(
            video
        )  # fea: decoder input from encoder, should be of size (mb, encout_dim) = (mb, decoder_dim)
        scores_en, scores_zh = decoder(
            encap, zhcap, init_hidden, vid_out, args.MAX_INPUT_LENGTH,
            teacher_forcing_ratio=args.teacher_ratio)
        # Since we decoded starting with <start>, the targets are all words
        # after <start>, up to <end>.
        targets_en = encap[:, 1:]
        loss_en = criterion(
            scores_en[:, 1:].contiguous().view(-1, decoder.vocab_size_en),
            targets_en.contiguous().view(-1))
        targets_zh = zhcap[:, 1:]
        loss_zh = criterion(
            scores_zh[:, 1:].contiguous().view(-1, decoder.vocab_size_zh),
            targets_zh.contiguous().view(-1))
        # Back prop.
        dec_optimizer.zero_grad()
        if enc_optimizer is not None:
            enc_optimizer.zero_grad()
        loss = loss_en + loss_zh
        loss.backward()
        # Clip gradients
        if args.grad_clip is not None:
            clip_gradient(dec_optimizer, args.grad_clip)
            # BUG FIX: the encoder clip/step ran unconditionally even though
            # enc_optimizer may be None (only zero_grad was guarded).
            if enc_optimizer is not None:
                clip_gradient(enc_optimizer, args.grad_clip)
        # Update weights
        dec_optimizer.step()
        if enc_optimizer is not None:
            enc_optimizer.step()
        # Keep track of metrics
        avg_loss += loss.item()
    return avg_loss / cnt if cnt else 0
def training_epoch(self):
    """Run one training epoch over ``self.trainloader``.

    Accumulates per-key losses from ``model.training_step`` and, every
    ``print_per_iter`` iterations, prints/logs the window averages and
    resets the accumulators.  Checkpoints via ``self.checkpoint``.
    """
    self.model.train()
    running_loss = {}
    running_time = 0
    for i, batch in enumerate(self.trainloader):
        self.optimizer.zero_grad()
        start_time = time.time()
        loss, loss_dict = self.model.training_step(batch)
        # Skip degenerate batches (zero or non-finite loss) without stepping.
        if loss.item() == 0 or not torch.isfinite(loss):
            continue
        loss.backward()
        # if self.clip_grad is not None:
        clip_gradient(self.optimizer, self.clip_grad)
        self.optimizer.step()
        end_time = time.time()
        for key, value in loss_dict.items():
            # Accumulate per-key loss over the print window.
            running_loss[key] = running_loss.get(key, 0) + value
        running_time += end_time - start_time
        # Global iteration index, resumable via start_iter.
        self.iters = self.start_iter + len(self.trainloader) * self.epoch + i + 1
        if self.iters % self.print_per_iter == 0:
            # Convert window sums to window means.
            for key in running_loss.keys():
                running_loss[key] /= self.print_per_iter
                running_loss[key] = np.round(running_loss[key], 5)
            loss_string = '{}'.format(running_loss)[1:-1].replace("'", '').replace(",", ' ||')
            print("[{}|{}] [{}|{}] || {} || Time: {:10.4f}s".format(
                self.epoch, self.num_epochs, self.iters, self.num_iters,
                loss_string, running_time))
            # BUG FIX: running_loss['T'] was divided by print_per_iter a
            # second time here, although the loop above already averaged it.
            # NOTE(review): assumes training_step always emits a 'T' key —
            # confirm against the model.
            self.logging({"Training Loss/Batch": running_loss['T'], })
            running_loss = {}
            running_time = 0
        if (self.iters % self.checkpoint.save_per_iter == 0
                or self.iters == self.num_iters - 1):
            self.checkpoint.save(self.model, epoch=self.epoch, iters=self.iters)
def train(train_loader, model, optimizer, epoch, logger):
    """One training epoch for the keypoint-detection model.

    Args:
        train_loader: yields (imgs, boxes, labels, keypoints) batches.
        model: detection model; called as ``model(imgs, [targets])`` and
            returns the training loss.
        optimizer: optimizer stepped once per batch.
        epoch: current epoch number (logging only).
        logger: logger used for periodic status lines.

    Returns:
        Average loss over the epoch.
    """
    model.train()  # train mode (dropout and batchnorm is used)

    losses = AverageMeter()

    # Batches
    for i, (imgs, boxes, labels, keypoints) in enumerate(train_loader):
        # Move to GPU, if available
        imgs = imgs.type(torch.FloatTensor).to(device)  # [N, 3, 320, 320]
        # NOTE(review): boxes/labels/keypoints are left on CPU — confirm the
        # model moves targets to the right device itself.
        targets = dict()
        targets['boxes'] = boxes
        targets['labels'] = labels
        targets['keypoints'] = keypoints
        # (removed leftover debug prints of boxes/labels/keypoints sizes)

        # Forward prop.
        loss = model(imgs, [targets])  # [N, 3, 320, 320]

        # Back prop.
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients
        clip_gradient(optimizer, grad_clip)

        # Update weights
        optimizer.step()

        # Keep track of metrics
        losses.update(loss.item())

        # Print status
        if i % print_freq == 0:
            status = 'Epoch: [{0}][{1}/{2}]\t' \
                     'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                         epoch, i, len(train_loader), loss=losses)
            logger.info(status)

    return losses.avg
def train(train_loader, model, optimizer, epoch):
    """One training epoch for C2FNet (single lateral-3 output).

    Multi-scale training at 0.75x / 1x / 1.25x of ``opt.trainsize``; a
    snapshot is written every 5 epochs.

    Args:
        train_loader: iterable of ``(images, gts)`` batches.
        model: network returning one lateral map.
        optimizer: optimizer stepped once per scale per batch.
        epoch: current epoch number (logging / snapshot naming).
    """
    model.train()
    # ---- multi-scale training ----
    size_rates = [0.75, 1, 1.25]
    loss_record3 = AvgMeter()
    for i, pack in enumerate(train_loader, start=1):
        for rate in size_rates:
            optimizer.zero_grad()
            # ---- data prepare ----
            images, gts = pack
            images = Variable(images).cuda()
            gts = Variable(gts).cuda()
            # ---- rescale (side rounded to a multiple of 32) ----
            trainsize = int(round(opt.trainsize * rate / 32) * 32)
            if rate != 1:
                # F.interpolate replaces the deprecated F.upsample.
                images = F.interpolate(images, size=(trainsize, trainsize),
                                       mode='bilinear', align_corners=True)
                gts = F.interpolate(gts, size=(trainsize, trainsize),
                                    mode='bilinear', align_corners=True)
            # ---- forward ----
            lateral_map_3 = model(images)
            # ---- loss function ----
            loss3 = structure_loss(lateral_map_3, gts)
            loss = loss3
            # ---- backward ----
            loss.backward()
            clip_gradient(optimizer, opt.clip)
            optimizer.step()
            # ---- recording loss (native scale only) ----
            if rate == 1:
                loss_record3.update(loss3.data, opt.batchsize)
        # ---- train visualization ----
        if i % 20 == 0 or i == total_step:
            print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], '
                  '[lateral-3: {:.4f}]'.
                  format(datetime.now(), epoch, opt.epoch, i, total_step,
                         loss_record3.show()))
    save_path = 'checkpoints/{}/'.format(opt.train_save)
    os.makedirs(save_path, exist_ok=True)
    if (epoch + 1) % 5 == 0:
        torch.save(model.state_dict(), save_path + 'C2FNet-%d.pth' % epoch)
        print('[Saving Snapshot:]', save_path + 'C2FNet-%d.pth' % epoch)
def train(train_loader, model, optimizer, epoch, lr):
    """One training epoch for the ROI-based tracking/re-id model.

    Combines a smooth-L1 box-regression loss with re-id classification and
    triplet losses, and periodically pushes boxes/heatmaps/metrics to the
    ``vis`` dashboard.

    NOTE(review): ``loss.data[0]`` / ``reg_loss.data[0]`` etc. are the
    pre-0.4 PyTorch scalar idiom; on modern PyTorch this raises — this code
    targets a legacy torch version.
    """
    avg = AverageMeter()
    model.train()

    def is_valid_number(x):
        # Guards the optimizer step against NaN/Inf/exploded losses.
        return not (math.isnan(x) or math.isinf(x) or x > 1e4)

    for it, dataItems in enumerate(train_loader):
        end = time.time()
        # dataItems[0]/[1]: the two image streams, concatenated along batch.
        ims = Variable(torch.cat((dataItems[0], dataItems[1]), dim=0)).cuda()
        # dataItems[2]/[3]: ROIs for each stream.
        rois = torch.cat((dataItems[2], dataItems[3]), dim=0)
        # Batch index for every ROI: args.sample_num ROIs per image.
        idx = torch.FloatTensor(
            np.concatenate([[i] * args.sample_num
                            for i in range(len(ims))]).astype(np.float32))
        # ROIs in (batch_idx, x1, y1, x2, y2) layout expected by the model.
        rois_idx = Variable(
            torch.cat([idx.view(-1, 1), rois], dim=1).contiguous()).cuda()
        # dataItems[4]/[5]: identity targets; dataItems[6]: regression targets.
        id_targets = Variable(torch.cat((dataItems[4], dataItems[5]),
                                        dim=0)).cuda()
        reg_targets = Variable(dataItems[6]).cuda()
        reg_pred, logits_list, corr_feat, local_feats_list = model(
            ims, rois_idx)  # (reg_pred, soft_feat, corr_feat, reid_feat)
        # loss
        reg_loss = _smooth_l1_loss(reg_pred, reg_targets)
        reid_feat = torch.cat(local_feats_list, dim=1)
        cls_loss, trp_loss = criterion._combine(reid_feat, logits_list,
                                                id_targets)
        # args.lam weights the regression term against the re-id terms.
        loss = cls_loss + trp_loss + args.lam * reg_loss
        optimizer.zero_grad()
        loss.backward()
        clip_gradient(model, 10)
        # Backward always runs, but the step is skipped on invalid losses.
        if is_valid_number(loss.data[0]):
            optimizer.step()
        batch_time = time.time() - end
        # eval
        fscore = evaluator.eval_combine(reid_feat, id_targets)
        # First half of the ROIs are anchors, second half are targets.
        N = int(len(rois) / 2)
        anchor_rois = rois_idx[:N, 1:].contiguous().view(-1, 4).contiguous()
        pred = decoding(anchor_rois,
                        reg_pred)  # Variable, Variable -> Variable
        reg_acc = evaluator.eval_reg(
            pred, rois_idx[N:, 1:].contiguous().view(-1, 4).contiguous())
        avg.update(batch_time=batch_time,
                   loss=loss.data[0],
                   reg_loss=reg_loss.data[0],
                   cls_loss=cls_loss.data[0],
                   trp_loss=trp_loss.data[0],
                   reg_acc=reg_acc,
                   cls_acc=fscore)
        if (it + 1) % args.print_freq == 0:
            # Visualise one anchor/target pair and its predicted boxes.
            show_img_with_bbox(
                ims[0], ims[args.b], rois_idx[:args.sample_num, 1:],
                rois_idx[args.sample_num * args.b:args.sample_num *
                         (args.b + 1), 1:], pred[:args.sample_num])
            t = corr_feat[0].data.cpu().numpy()
            # 3x3 grid of correlation channels; presumably a 17x17 channel
            # layout (0/8/16 picks corners and centres) — TODO confirm.
            sample_idx = [
                0, 8, 16, 17 * 8, 17 * 8 + 8, 17 * 8 + 16, 17 * 16,
                17 * 16 + 8, 17 * 16 + 16
            ]
            heatmap = np.zeros((3 * 60, 3 * 80))
            for i in range(3):
                for j in range(3):
                    crop_patch = t[sample_idx[i * 3 + j]]
                    # Normalise each patch to [-1, 1]; 1e-6 avoids div-by-zero.
                    max_abs_value = max(
                        max(abs(np.max(crop_patch)), abs(np.min(crop_patch))),
                        1e-6)
                    heatmap[i * 60:(i + 1) * 60,
                            j * 80:(j + 1) * 80] = SKITransform.resize(
                                crop_patch / max_abs_value, (60, 80))[::-1]
            vis.img_heatmap('heatmap', heatmap)
            vis.plot_many({
                'loss': avg.avg('loss'),
                'reg_loss': avg.avg('reg_loss'),
                'cls_loss': avg.avg('cls_loss'),
                'reg_acc': avg.avg('reg_acc'),
                'cls_acc': avg.avg('cls_acc'),
                'trp_loss': avg.avg('trp_loss')
            })
            # ETA from the latest batch time only (not a running average).
            left_time = (len(train_loader) - it) * batch_time / 3600.  # hour
            log_str = 'Epoch: [{0}][{1}/{2}] lr: {lr:.6f} {batch_time:s} \n' \
                      '{loss:s} \t {reg_loss:s} \t {cls_loss:s} \t {trp_loss:s} \n' \
                      '{reg_acc:s} \t {cls_acc:s} \t remaining time:{left_time:.4}h\n'.format(
                          epoch, it + 1, len(train_loader), lr=lr,
                          batch_time=avg.batch_time, loss=avg.loss,
                          reg_loss=avg.reg_loss, cls_loss=avg.cls_loss,
                          trp_loss=avg.trp_loss, reg_acc=avg.reg_acc,
                          cls_acc=avg.cls_acc, left_time=left_time)
            vis.log(log_str, win='train_log')
            logger.info(log_str)
def train(opt):
    """Main training loop for the captioning model (XE + self-critical RL).

    Supports resuming from ``opt.start_from``, scheduled sampling, a Noam-style
    LR schedule for the cross-entropy phase, and switching to self-critical
    sequence training (REINFORCE with CIDEr/BLEU reward) after
    ``opt.self_critical_after`` epochs.  Periodically evaluates on the
    validation split and checkpoints model/optimizer/infos/histories.
    """
    opt.use_att = utils.if_use_att(opt.caption_model)
    loader = DataLoader(opt)
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    td_summary_writer = td.writer.SummaryWriter(opt.ckpt_path)
    infos = {
        'iter': 0,
        'epoch': 0,
        'loader_state_dict': None,
        'vocab': loader.get_vocab(),
    }
    histories = {}
    if opt.start_from is not None:
        # open old infos and check if models are compatible
        with open(os.path.join(opt.start_from, 'infos.pkl'), 'rb') as f:
            infos = cPickle.load(f, encoding='latin-1')
            saved_model_opt = infos['opt']
            need_be_same = [
                "caption_model", "rnn_type", "rnn_size", "num_layers",
                "embed_weight_file"
            ]
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(
                    opt
                )[checkme], "Command line argument and saved model disagree on '%s' " % checkme
        if os.path.isfile(os.path.join(opt.start_from, 'histories.pkl')):
            with open(os.path.join(opt.start_from, 'histories.pkl'),
                      'rb') as f:
                histories = cPickle.load(f, encoding='latin-1')
    # NOTE(review): the .get() defaults are immediately overwritten by the
    # direct [] lookups below — the second pair is redundant (both keys exist
    # in the fresh infos dict above).
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    iteration = infos['iter']
    epoch = infos['epoch']
    # For back compatibility
    if 'iterators' in infos:
        infos['loader_state_dict'] = {
            split: {
                'index_list': infos['split_ix'][split],
                'iter_counter': infos['iterators'][split]
            }
            for split in ['train', 'val', 'test']
        }
    loader.load_state_dict(infos['loader_state_dict'])
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})
    # NOTE(review): best_val_score is only bound when load_best_score == 1,
    # but it is read unconditionally at checkpoint time — NameError otherwise.
    if opt.load_best_score == 1:
        best_val_score = infos.get('best_val_score', None)
    model = models.setup(opt)
    model.cuda()
    update_lr_flag = True
    # Assure in training mode
    model.train()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    # Noam-style warmup/decay optimizer for the cross-entropy phase.
    optimizer = utils.NewNoamOpt(optim.Adam(filter(lambda p: p.requires_grad,
                                                   model.parameters()),
                                            lr=0,
                                            betas=(0.9, 0.98),
                                            eps=1e-9),
                                 max_lr=opt.learning_rate,
                                 warmup=opt.newnoamopt_warmup,
                                 batchsize=opt.batch_size,
                                 decay_start=opt.newnoamopt_decay,
                                 datasize=len(loader.dataset.split_ix['train']))
    # When resuming directly into the self-critical phase, use plain Adam.
    if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=opt.learning_rate,
                               betas=(opt.optim_alpha, opt.optim_beta),
                               eps=opt.optim_epsilon,
                               weight_decay=opt.weight_decay)
    params = list(model.named_parameters())
    grad_norm = np.zeros(len(params))
    loss_sum = 0
    while True:
        # Manual LR schedule for the self-critical phase of these two models.
        if opt.self_critical_after != -1 and epoch >= opt.self_critical_after and update_lr_flag and opt.caption_model in [
                'svbase', 'umv'
        ]:
            print('start self critical')
            if epoch >= 15 and epoch < 20 and opt.learning_rate_decay_start >= 0:
                opt.current_lr = opt.learning_rate
            elif epoch >= 20 and opt.learning_rate_decay_start >= 0:
                opt.current_lr = opt.learning_rate / 2.0
            utils.set_lr(optimizer, opt.current_lr)
            update_lr_flag = False
        # Assign the scheduled sampling prob
        if epoch > opt.scheduled_sampling_start and opt.scheduled_sampling_start >= 0:
            frac = (epoch - opt.scheduled_sampling_start
                    ) // opt.scheduled_sampling_increase_every
            opt.ss_prob = min(opt.scheduled_sampling_increase_prob * frac,
                              opt.scheduled_sampling_max_prob)
            model.ss_prob = opt.ss_prob
        # If start self critical training
        if opt.self_critical_after != -1 and epoch >= opt.self_critical_after:
            sc_flag = True
            opt.embed_weight_requires_grad = True
            init_cider_scorer(opt.cached_tokens)
            init_bleu_scorer()
        else:
            sc_flag = False
            opt.embed_weight_requires_grad = False
        start = time.time()
        # Load data from train split (0)
        data = loader.get_batch('train')
        print('Read data:', time.time() - start)
        torch.cuda.synchronize()
        start = time.time()
        num_bbox, att_feats = data['num_bbox'].cuda(), data['att_feats'].cuda()
        labels = data['labels'].cuda()
        masks = data['masks'].cuda()
        optimizer.zero_grad()
        if not sc_flag:
            # Cross-entropy training: shift targets/masks past <start>.
            loss = crit(model(att_feats, num_bbox, labels), labels[:, 1:],
                        masks[:, 1:])
        else:
            # Self-critical: sample, score against greedy baseline, REINFORCE.
            gen_result, sample_logprobs = model.sample(att_feats, num_bbox,
                                                       opt={'sample_max': 0})
            reward = get_self_critical_reward(model, att_feats, num_bbox,
                                              data, gen_result)
            loss = rl_crit(sample_logprobs, gen_result,
                           torch.from_numpy(reward).float().cuda())
        loss.backward()
        utils.clip_gradient(optimizer, opt.grad_clip)
        # Accumulate per-parameter gradient norms for diagnostics.
        for grad_wt in range(len(params)):
            norm_v = torch.norm(params[grad_wt][1].grad).cpu().data.numpy(
            ) if params[grad_wt][1].grad is not None else 0
            grad_norm[grad_wt] += norm_v
        # NewNoamOpt.step takes the epoch; plain Adam takes no argument.
        if not sc_flag:
            optimizer.step(epoch)
        else:
            optimizer.step()
        train_loss = loss.item()
        loss_sum += train_loss
        torch.cuda.synchronize()
        end = time.time()
        if not sc_flag:
            print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                .format(iteration, epoch, train_loss, end - start))
        else:
            print("lr {} iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}" \
                .format(opt.current_lr, iteration, epoch, np.mean(reward[:, 0]), end - start))
        # Update the iteration and epoch
        iteration += 1
        if sc_flag:
            del gen_result
            del sample_logprobs
        # The loader signals an epoch boundary via bounds['wrapped'].
        if data['bounds']['wrapped']:
            epoch += 1
            update_lr_flag = True
        # Write the training loss summary
        if (iteration % opt.losses_log_every == 0):
            if opt.noamopt:
                opt.current_lr = optimizer.rate()
            elif not sc_flag:
                opt.current_lr = optimizer.rate(epoch)
            if td is not None:
                td_summary_writer.add_scalar('train_loss', train_loss,
                                             iteration)
                td_summary_writer.add_scalar('learning_rate', opt.current_lr,
                                             iteration)
                td_summary_writer.add_scalar('scheduled_sampling_prob',
                                             model.ss_prob, iteration)
                if sc_flag:
                    td_summary_writer.add_scalar('avg_reward',
                                                 np.mean(reward[:, 0]),
                                                 iteration)
                # tf_summary_writer.flush()
            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr
            ss_prob_history[iteration] = model.ss_prob
        # make evaluation on validation set, and save model
        if (iteration % opt.save_checkpoint_every == 0):
            # eval model
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, loader, eval_kwargs)
            # Write validation result into summary
            if td is not None:
                td_summary_writer.add_scalar('validation loss', val_loss,
                                             iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        td_summary_writer.add_scalar(k, v, iteration)
                # tf_summary_writer.flush()
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }
            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.ckpt_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.ckpt_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['loader_state_dict'] = loader.state_dict()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.ckpt_path, 'infos.pkl'),
                          'wb') as f:
                    cPickle.dump(infos, f)
                # NOTE(review): periodic dump writes 'histories_.pkl' while the
                # resume path reads 'histories.pkl' — confirm this mismatch.
                with open(os.path.join(opt.ckpt_path, 'histories_.pkl'),
                          'wb') as f:
                    cPickle.dump(histories, f)
                if best_flag:
                    checkpoint_path = os.path.join(opt.ckpt_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.ckpt_path, 'infos-best.pkl'),
                              'wb') as f:
                        cPickle.dump(infos, f)
            loss_sum = 0
            grad_norm = np.zeros(len(params))
        # Stop if reaching max epochs
        if epoch >= opt.max_epochs and opt.max_epochs != -1:
            # Final evaluation + checkpoint (mirrors the periodic block above).
            eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
            eval_kwargs.update(vars(opt))
            val_loss, predictions, lang_stats = eval_utils.eval_split(
                model, loader, eval_kwargs)
            # Write validation result into summary
            if td is not None:
                td_summary_writer.add_scalar('validation loss', val_loss,
                                             iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        td_summary_writer.add_scalar(k, v, iteration)
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }
            # Save model if is improving on validation result
            if opt.language_eval == 1:
                current_score = lang_stats['CIDEr']
            else:
                current_score = -val_loss
            best_flag = False
            if True:  # if true
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                checkpoint_path = os.path.join(opt.ckpt_path, 'model.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.ckpt_path, 'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)
                # Dump miscalleous informations
                infos['iter'] = iteration
                infos['epoch'] = epoch
                infos['loader_state_dict'] = loader.state_dict()
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                histories['ss_prob_history'] = ss_prob_history
                with open(os.path.join(opt.ckpt_path, 'infos.pkl'),
                          'wb') as f:
                    cPickle.dump(infos, f)
                with open(os.path.join(opt.ckpt_path, 'histories.pkl'),
                          'wb') as f:
                    cPickle.dump(histories, f)
                if best_flag:
                    checkpoint_path = os.path.join(opt.ckpt_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(os.path.join(opt.ckpt_path, 'infos-best.pkl'),
                              'wb') as f:
                        cPickle.dump(infos, f)
            break
        # Free the RL-phase tensors before the next batch to cap peak memory.
        if sc_flag:
            del loss
            del reward
            del att_feats
            del num_bbox
            del labels
            del masks
            del data
def train(config, train_loader, model, criterion, optimizer, epoch, output_dir,
          tb_log_dir, writer_dict):
    """One training epoch for the saliency model with gradient accumulation.

    Gradients are accumulated over ``config.TRAIN.ITER_NUM`` batches (each
    batch loss is divided by ITER_NUM) before a single optimizer step.
    NOTE(review): if len(train_loader) is not a multiple of ITER_NUM, the
    trailing partial accumulation is never stepped and its gradients leak
    into the next call — confirm whether that is acceptable.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    tit = 0  # accumulation counter: batches since the last optimizer step
    for i, (input, smap, fixmap, info) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        # Only clear gradients at the start of an accumulation window.
        if tit == 0:
            optimizer.zero_grad()
        # compute output
        outputs = model(input)
        smap = smap.cuda(non_blocking=True)
        fixmap = fixmap.cuda(non_blocking=True)
        # With the identity sampler the model returns a tuple; take the maps.
        if config.DATASET.SAMPLER == "RandomIdentitySampler":
            outputs = outputs[0]
        loss, other_info = criterion(outputs, smap, fixmap)
        # Optional online hard-example mining over the loss map.
        if config.TRAIN.OHEM != -1:
            loss = ohem(loss, config.TRAIN.OHEM)
        # measure accuracy and record loss (record the un-scaled loss)
        losses.update(loss.item(),
                      len(input) if isinstance(input, list) else input.size(0))
        # Scale so the accumulated gradient equals the mean over the window.
        loss /= config.TRAIN.ITER_NUM
        loss.backward()
        tit += 1
        if tit == config.TRAIN.ITER_NUM:
            # compute gradient and do update step
            if config.TRAIN.CLIP_GRAD != 0:
                clip_gradient(model, config.TRAIN.CLIP_GRAD)
            optimizer.step()
            tit = 0

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % config.PRINT_FREQ == 0:
            num_instances = len(input) if isinstance(input,
                                                     list) else input.size(0)
            msg = 'Epoch: [{0}][{1}/{2}]\t' \
                  'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
                  'Speed {speed:.1f} samples/s\t' \
                  'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
                  'Loss {loss.val:.5f} ({loss.avg:.5f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      speed=num_instances/batch_time.val,
                      data_time=data_time, loss=losses)
            if other_info is not None:
                msg += other_info
            logger.info(msg)

            writer = writer_dict['writer']
            global_steps = writer_dict['train_global_steps']
            writer.add_scalar('train_loss', losses.val, global_steps)
            writer_dict['train_global_steps'] = global_steps + 1
def train(train_loader, encoder, decoder, criterion, optimizer, epoch, writer):
    """
    Performs one epoch's training.

    Encodes images into attribute/feature tensors, decodes captions with
    attention, and optimises cross-entropy plus the doubly-stochastic
    attention regulariser (weighted by the module-level ``alpha_c``).
    Progress (loss, top-5 accuracy) is written to ``writer``.
    """
    encoder.train()
    decoder.train()  # train mode (dropout and batchnorm is used)

    total_step = len(train_loader)

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()

    # Batches
    for i, (imgs, caps, caplens, tags_target) in enumerate(train_loader):
        data_time.update(time.time() - start)

        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)
        #tags_target = tags_target.to(device)

        attributes, imgs_features = encoder(imgs)
        # loss2 = criterion2(attributes, tags_target)
        # Decoder sorts the batch by caption length; sort_ind tracks the order.
        scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(
            attributes, imgs_features, caps, caplens)

        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]

        # Remove timesteps that we didn't decode at, or are pads
        # pack_padded_sequence is an easy trick to do this
        # (padding is dropped before the loss, so pads do not affect it)
        scores, _ = pack_padded_sequence(scores,
                                         decode_lengths,
                                         batch_first=True)
        targets, _ = pack_padded_sequence(targets,
                                          decode_lengths,
                                          batch_first=True)

        # Calculate loss
        loss = criterion(scores, targets)  # + loss2
        # Doubly-stochastic attention regularisation: push per-pixel attention
        # weights to sum to 1 over time.
        loss += alpha_c * ((1. - alphas.sum(dim=1))**2).mean()

        # Back prop.
        optimizer.zero_grad()
        # loss2.backward(retain_graph=True)
        loss.backward()

        # Clip gradients
        if grad_clip is not None:
            clip_gradient(optimizer, grad_clip)

        # Update weights
        optimizer.step()

        # Keep track of metrics
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)

        start = time.time()

        # Print status
        if i % print_freq == 0:
            writer.add_scalars('train', {
                'loss': loss.item(),
                'mAp': top5accs.val
            }, epoch * total_step + i)
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top5=top5accs))
def train(opt):
    """Full training driver: builds the loader and models, optionally resumes
    from a checkpoint, then runs the epoch/iteration loop with step-wise LR
    decay, TensorBoard logging, periodic validation and checkpointing.

    :param opt: argparse-style namespace carrying all hyper-parameters
                (start_from, id, max_epochs, learning_rate*, checkpoint_path,
                 save_checkpoint_every, losses_log_every, language_eval, ...).
    """
    loader = get_loader(opt, 'train')
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length
    summary_writer = tensorboardX.SummaryWriter()
    infos = {}
    histories = {}
    if opt.start_from is not None:
        infos_path = os.path.join(opt.start_from, 'infos_' + opt.id + '.pkl')
        histories_path = os.path.join(opt.start_from,
                                      'histories_' + opt.id + '.pkl')
        # open infos and check if models are compatible
        with open(infos_path, 'rb') as f:
            infos = pickle.load(f)
            saved_model_opt = infos['opt']
            need_be_same = ['hidden_size']
            for checkme in need_be_same:
                assert vars(saved_model_opt)[checkme] == vars(opt)[checkme],\
                    "Command line argument and saved model disagree on %s" % (checkme)
        if os.path.isfile(histories_path):
            with open(histories_path, 'rb') as f:
                histories = pickle.load(f)

    iteration = infos.get('iter', 0)
    current_epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    # BUGFIX: best_val_score must always be bound. Previously it was assigned
    # only when opt.load_best_score == 1, so the first checkpoint comparison
    # below raised UnboundLocalError in every other configuration.
    best_val_score = None
    if opt.load_best_score == 1:
        best_val_score = infos.get("best_val_score", None)

    encoder = Encoder()
    decoder = Decoder(opt)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    criterion = utils.LanguageModelCriterion().to(device)
    optimizer = optim.Adam(decoder.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    if vars(opt).get('start_from', None) is not None:
        optimizer_path = os.path.join(opt.start_from, 'optimizer.pth')
        optimizer.load_state_dict(torch.load(optimizer_path))

    total_step = len(loader)
    start = time.time()
    # Hoisted loop invariant: the normalizer never changes between batches.
    transform = transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
    for epoch in range(current_epoch, opt.max_epochs):
        # Step-wise LR decay once we pass learning_rate_decay_start.
        if epoch > opt.learning_rate_decay_start and \
                opt.learning_rate_decay_start >= 0:
            frac = (epoch - opt.learning_rate_decay_start
                    ) // opt.learning_rate_decay_every
            decay_factor = opt.learning_rate_decay_rate**frac
            opt.current_lr = opt.learning_rate * decay_factor
            utils.set_lr(optimizer, opt.current_lr)
            print("learing rate change form {} to {}".format(
                opt.learning_rate, opt.current_lr))
        else:
            opt.current_lr = opt.learning_rate

        # NOTE(review): enumerate(loader, iteration) only offsets the counter;
        # it does not skip already-seen batches when resuming — verify intent.
        for i, data in enumerate(loader, iteration):
            if i > total_step - 1:
                iteration = 0
                break
            imgs = []
            for k in range(data['imgs'].shape[0]):
                img = torch.tensor(data['imgs'][k], dtype=torch.float)
                img = transform(img)
                imgs.append(img)
            imgs = torch.stack(imgs, dim=0).to(device)
            labels = torch.tensor(data['labels'].astype(np.int32),
                                  dtype=torch.long).to(device)
            masks = torch.tensor(data['masks'], dtype=torch.float).to(device)

            # Encoder is frozen here: only the decoder is optimized.
            with torch.no_grad():
                features = encoder(imgs)
            preds = decoder(features, labels)
            # Targets are all words after <start>; masks drop the padding.
            loss = criterion(preds, labels[:, 1:], masks[:, 1:])
            optimizer.zero_grad()
            loss.backward()
            utils.clip_gradient(optimizer, opt.grad_clip)
            optimizer.step()
            train_loss = loss.item()
            print("iter: {}/{} (epoch {}), train loss = {:.3f}, time/batch = {}"\
                .format(i, total_step, epoch, train_loss,
                        utils.get_duration(start)))

            log_iter = i + epoch * total_step
            # write training loss summary
            if (i % opt.losses_log_every) == 0:
                summary_writer.add_scalar('train_loss', train_loss, log_iter)
                summary_writer.add_scalar('learning_rate', opt.current_lr,
                                          log_iter)

            # make evaluation on validation set, and save model
            if (i % opt.save_checkpoint_every == 0):
                #eval model
                eval_kwargs = {'split': 'val', 'dataset': opt.input_json}
                eval_kwargs.update(vars(opt))
                val_loss,\
                    predictions,\
                    lang_stats = eval_utils.eval_split(encoder, decoder,
                                                       criterion, opt,
                                                       eval_kwargs)
                summary_writer.add_scalar('valaidation loss', val_loss,
                                          log_iter)
                if lang_stats is not None:
                    for metric, score in lang_stats.items():
                        summary_writer.add_scalar(metric, score, log_iter)
                val_result_history[i] = {
                    "loss": val_loss,
                    "lang_stats": lang_stats,
                    "predictions": predictions
                }
                if opt.language_eval == 1:
                    current_score = lang_stats['CIDEr']
                else:
                    current_score = -val_loss.item()

                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                if not os.path.exists(opt.checkpoint_path):
                    os.makedirs(opt.checkpoint_path)
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model.pth')
                torch.save(decoder.state_dict(), checkpoint_path)
                print("model saved to {}".format(checkpoint_path))
                optimizer_path = os.path.join(opt.checkpoint_path,
                                              'optimizer.pth')
                torch.save(optimizer.state_dict(), optimizer_path)

                # Dump miscalleous informations
                infos['iter'] = i + 1
                infos['epoch'] = epoch
                infos['best_val_score'] = best_val_score
                infos['opt'] = opt
                infos['vocab'] = loader.ix_to_word
                histories['val_result_history'] = val_result_history
                histories['loss_history'] = loss_history
                histories['lr_history'] = lr_history
                infos_path = os.path.join(opt.checkpoint_path,
                                          'infos_' + opt.id + '.pkl')
                histories_path = os.path.join(opt.checkpoint_path,
                                              'histories_' + opt.id + '.pkl')
                with open(infos_path, 'wb') as f:
                    pickle.dump(infos, f)
                print("infos saved into {}".format(infos_path))
                with open(histories_path, 'wb') as f:
                    pickle.dump(histories, f)
                print('histories saved into {}'.format(histories_path))
                if best_flag:
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(decoder.state_dict(), checkpoint_path)
                    print("model saved to {}".format(checkpoint_path))
                    with open(
                            os.path.join(opt.checkpoint_path,
                                         'infos_' + opt.id + '-best.pkl'),
                            'wb') as f:
                        pickle.dump(infos, f)
    summary_writer.close()
def train(train_loader, model, optimizer, epoch, save_label):
    """Train the 4-output segmentation model for one epoch with multi-scale
    inputs, and optionally save a snapshot.

    :param train_loader: DataLoader yielding (images, gts) pairs
    :param model: network returning four lateral prediction maps
    :param optimizer: optimizer stepped once per (batch, scale)
    :param epoch: current epoch index (for logging and snapshot names)
    :param save_label: when 1, save model weights at the end of the epoch
    :return: the last logged loss values (loss22, loss33, loss44, loss55);
             these stay 0 if the logging condition never fired.

    NOTE(review): depends on module-level globals opt, total_step, AvgMeter,
    structure_loss, clip_gradient — confirm they exist at file scope.
    """
    model.train()
    # ---- multi-scale training ----
    size_rates = [0.75, 1, 1.25]
    loss_record2, loss_record3, loss_record4, loss_record5 = AvgMeter(
    ), AvgMeter(), AvgMeter(), AvgMeter()
    loss22, loss33, loss44, loss55 = 0, 0, 0, 0
    for i, pack in enumerate(train_loader, start=1):
        for rate in size_rates:
            optimizer.zero_grad()
            # ---- data prepare ----
            images, gts = pack
            images = Variable(images).cuda()
            gts = Variable(gts).cuda()
            # ---- rescale ----
            # Scaled side length is snapped to a multiple of 32.
            trainsize = int(round(opt.trainsize * rate / 32) * 32)
            if rate != 1:
                images = F.upsample(images,
                                    size=(trainsize, trainsize),
                                    mode='bilinear',
                                    align_corners=True)
                gts = F.upsample(gts,
                                 size=(trainsize, trainsize),
                                 mode='bilinear',
                                 align_corners=True)
            # ---- forward ----
            lateral_map_5, lateral_map_4, lateral_map_3, lateral_map_2 = model(
                images)
            # ---- loss function ----
            loss5 = structure_loss(lateral_map_5, gts)
            loss4 = structure_loss(lateral_map_4, gts)
            loss3 = structure_loss(lateral_map_3, gts)
            loss2 = structure_loss(lateral_map_2, gts)
            loss = loss2 + loss3 + loss4 + loss5  # TODO: try different weights for loss
            # ---- backward ----
            loss.backward()
            clip_gradient(optimizer, opt.clip)
            optimizer.step()
            # ---- recording loss ----
            # Only the native-scale pass contributes to the running averages.
            if rate == 1:
                loss_record2.update(loss2.data, opt.batchsize)
                loss_record3.update(loss3.data, opt.batchsize)
                loss_record4.update(loss4.data, opt.batchsize)
                loss_record5.update(loss5.data, opt.batchsize)
        # ---- train visualization ----
        if i % 20 == 0 or i == total_step:
            loss22 = float(loss_record2.show().cuda().data.cpu().numpy())
            loss33 = float(loss_record3.show().cuda().data.cpu().numpy())
            loss44 = float(loss_record4.show().cuda().data.cpu().numpy())
            loss55 = float(loss_record5.show().cuda().data.cpu().numpy())
            print(
                '{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], '
                '[lateral-2: {:.4f}, lateral-3: {:0.4f}, lateral-4: {:0.4f}, lateral-5: {:0.4f}]'
                .format(datetime.now(), epoch, opt.epoch, i, total_step,
                        loss22, loss33, loss44, loss55))
    save_path = 'snapshots/{}/'.format(opt.train_save)
    # print("save_path is:",save_path)
    os.makedirs(save_path, exist_ok=True)
    # if (epoch+1) % 10 == 0:
    if save_label == 1:
        torch.save(model.state_dict(),
                   save_path + 'model-%d.pth' % (epoch + 1))
        print('[Saving Snapshot:]', save_path + 'model-%d.pth' % (epoch + 1))
    return loss22, loss33, loss44, loss55
def fit( self, train_loader, is_val=False, test_loader=None, img_size=352, start_from=0, num_epochs=200, batchsize=16, clip=0.5, fold=4, ): size_rates = [0.75, 1, 1.25] rate = 1 test_fold = f"fold{fold}" start = timeit.default_timer() for epoch in range(start_from, num_epochs): self.net.train() loss_all, loss_record2, loss_record3, loss_record4, loss_record5 = ( AvgMeter(), AvgMeter(), AvgMeter(), AvgMeter(), AvgMeter(), ) for i, pack in enumerate(train_loader, start=1): self.optimizer.zero_grad() # ---- data prepare ---- images, gts = pack # images, gts, paths, oriimgs = pack images = Variable(images).cuda() gts = Variable(gts).cuda() lateral_map_5 = self.net(images) loss5 = self.loss(lateral_map_5, gts) loss5.backward() clip_gradient(self.optimizer, clip) self.optimizer.step() if rate == 1: loss_record5.update(loss5.data, batchsize) self.writer.add_scalar( "Loss5", loss_record5.show(), (epoch - 1) * len(train_loader) + i, ) total_step = len(train_loader) if i % 25 == 0 or i == total_step: self.logger.info( "{} Epoch [{:03d}/{:03d}], with lr = {}, Step [{:04d}/{:04d}],\ [loss_record5: {:.4f}]".format( datetime.now(), epoch, epoch, self.optimizer.param_groups[0]["lr"], i, total_step, loss_record5.show(), )) if is_val: self.val(test_loader, epoch) os.makedirs(self.save_dir, exist_ok=True) if (epoch + 1) % 3 == 0 and epoch > self.save_from or epoch == 23: torch.save( { "model_state_dict": self.net.state_dict(), "lr": self.optimizer.param_groups[0]["lr"], }, os.path.join(self.save_dir, "PraNetDG-" + test_fold + "-%d.pth" % epoch), ) self.logger.info( "[Saving Snapshot:]" + os.path.join(self.save_dir, "PraNetDG-" + test_fold + "-%d.pth" % epoch)) self.scheduler.step() self.writer.flush() self.writer.close() end = timeit.default_timer() self.logger.info("Training cost: " + str(end - start) + "seconds")
def train_model(self, net, criterion, optimizer, scheduler, trainloader, valloader, testloader, logger, writer, path):
    """Train `net` for self.num_epochs epochs with optional warmup and
    mixup/cutmix augmentation, logging to `logger` and TensorBoard `writer`,
    then run validation (and optionally test) after each epoch.

    :param net: the network to train
    :param criterion: classification loss
    :param optimizer: optimizer; its first param group's lr is used as base lr
    :param scheduler: per-epoch LR scheduler
    :param trainloader/valloader/testloader: data loaders
    :param logger: text logger
    :param writer: TensorBoard SummaryWriter
    :param path: forwarded to eval_model/test_model
    """
    use_cuda = torch.cuda.is_available()
    ## if or not pre: warm-start from a pretrained checkpoint (epoch resets to 0)
    if self.pretrained:
        checkpoint = torch.load(os.path.join(self.pretrained))
        net.load_state_dict(checkpoint['net'])
        print("==> Loaded checkpoint from pretrained model-'{}'".format(self.pretrained))
    ## resume_from: continue a previous run at its saved epoch
    elif self.resume_from:
        print('Loading weight...')
        checkpoint = torch.load(os.path.join(self.resume_from))
        net.load_state_dict(checkpoint['net'])
        acc = checkpoint['acc']
        cur_epoch = checkpoint['epoch']
        current_epoch = cur_epoch
        print("=> Loaded checkpoint='{}' (epoch={})".format(self.resume_from, current_epoch))
    else:
        resume_from = 0
        current_epoch = 0
        print('==> Building model..')
    # NOTE(review): when self.pretrained is truthy, current_epoch is never
    # assigned before the range() below — potential UnboundLocalError; confirm.
    idx_ter = 0  # global iteration counter for the per-iteration scalars
    for epoch in range(current_epoch, self.num_epochs):
        print('Epoch {}/{}'.format(epoch, self.num_epochs))
        global Train_acc  # declared but never assigned in this method
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        current_lr = optimizer.param_groups[0]['lr']
        #warmup_steps = 500
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            ## warmup: linearly ramp the lr over the first warmup_steps batches of epoch 0
            if batch_idx <= self.warmup_steps and epoch == 0:
                warmup_percent_done = (batch_idx + 1) / (self.warmup_steps + 1)
                warmup_lr = float(self.init_lr * warmup_percent_done)
                current_lr = warmup_lr
                set_lr(optimizer, current_lr)
            else:
                current_lr = current_lr  # no-op: keep the current lr
            if use_cuda:
                inputs, targets = inputs.cuda(device = self.device_ids[0]),\
                    targets.cuda(device = self.device_ids[0])
            inputs, targets = Variable(inputs), Variable(targets)
            ## mixup or cutmix (disabled for the final 20 epochs)
            if self.mixup or self.cutmix:
                if epoch <= self.num_epochs - 20:
                    if self.mixup and (not self.cutmix):
                        x_batch, y_batch_a, y_batch_b, lam = mixup_data_radio(inputs, targets, alpha = 0.25, mix_radio = 0.5)
                    elif self.cutmix and (not self.mixup):
                        x_batch, y_batch_a, y_batch_b, lam = cutmix_data_radio(inputs, targets, alpha = 0.25, mix_radio = 0.5)
                    elif self.mixup and self.cutmix:
                        # Both enabled: pick one of the two at random per batch.
                        x_batch, y_batch_a, y_batch_b, lam = cutmix_data_radio(inputs, targets, alpha=0.15, mix_radio=0.5)\
                            if np.random.rand() > 0.5 else mixup_data_radio(inputs, targets, alpha=0.15, mix_radio=0.5)
                    ## save mix_img
                    if self.save_mix_results:
                        self.save_mix_images(x_batch = x_batch, epoch = epoch, batch_idx = batch_idx, end_epoch = 0, end_batch_idx = 5)
                    outputs = net(x_batch.cuda())
                    loss = mixup_criterion(criterion, outputs, y_batch_a.cuda(device=self.device_ids[0]), y_batch_b.cuda(device=self.device_ids[0]), lam)
                    optimizer.zero_grad()
                    loss.backward()
                    clip_gradient(optimizer, 0.1)
                    optimizer.step()
                    train_loss += loss.item()
                    _, predicted = torch.max(outputs.data, 1)
                    total += targets.size(0)
                    lam = torch.tensor(lam, dtype=torch.float32)
                    # Mixed-label accuracy: credit split between both targets by lam.
                    correct += lam * predicted.eq(y_batch_a.data).cpu().sum() + (1 - lam) * predicted.eq(y_batch_b.data).cpu().sum()
                else:
                    # Last 20 epochs: plain training without mixing.
                    optimizer.zero_grad()
                    outputs = net(inputs)
                    loss = criterion(outputs, targets)
                    loss.backward()
                    clip_gradient(optimizer, 0.1)
                    optimizer.step()
                    train_loss += loss.item()
                    _, predicted = torch.max(outputs.data, 1)
                    total += targets.size(0)
                    correct += predicted.eq(targets.data).cpu().sum()
            else:
                # No augmentation path.
                optimizer.zero_grad()
                outputs = net(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                clip_gradient(optimizer, 0.1)
                optimizer.step()
                train_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += targets.size(0)
                correct += predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * float(correct)/total
            ##logging
            if batch_idx % 20 == 0:
                train_log = ('Epoch[%d/%d] Batch[%d] lr: %.6f, Training_Loss=%.4f, Train_Accuracy=%.4f' % (epoch + 1, self.num_epochs, batch_idx, current_lr, train_loss / (batch_idx + 1), accuracy))
                logger.info(train_log)
                idx_ter += 1
                writer.add_scalar('Accuracy/iter_train', accuracy, idx_ter)
                writer.add_scalar('Loss/iter_train', train_loss / (batch_idx + 1), idx_ter)
        scheduler.step()
        train_acc = 100.*correct/total
        train_epoch_loss = train_loss / (batch_idx+1)
        epoch_log = ('Epoch[%d] Training-Accuracy=%.4f, Train-Loss=%.4f'%(epoch + 1, train_acc, train_epoch_loss))
        logger.info(epoch_log)
        # NOTE(review): this logs the LAST BATCH's `accuracy`, not the epoch
        # average `train_acc` computed just above — looks like a bug; confirm.
        writer.add_scalar('Accuracy/epoch_train', accuracy, epoch+1)
        writer.add_scalar('Loss/epoch_train', train_epoch_loss, epoch+1)
        ## val or test
        # NOTE(review): `evel_only` looks like a typo for `eval_only`; it must
        # match the attribute name set in __init__, which is out of view.
        if not self.evel_only:
            print('Starting valing..')
            self.eval_model(valloader, criterion, net, epoch, logger, writer, path)
            print('Starting testing...')
            self.test_model(testloader, criterion, net, epoch, logger, writer, path)
        else:
            print('Only valing..')
            self.eval_model(valloader, criterion, net, epoch, logger, writer, path)
def pred_err_loop(current_lr, pred_err_dataset, pred_err_map):
    """Repeatedly re-train on the samples the network misclassified.

    Each pass runs every currently-misclassified sample through the network
    again; samples that are *still* wrong are collected into a fresh dataset
    for the next pass. Stops when no errors remain or after
    opt.pred_err_epoch_max passes.

    :param current_lr: learning rate; 2x this value bounds gradient clipping
    :param pred_err_dataset: dataset of previously misclassified samples
    :param pred_err_map: per-class counts of the misclassifications
    :return: None

    Uses module-level globals: opt, optimizer, criterion, put_through_net,
    target_type, RawDataSet, Prefetcher, utils, time.
    """
    pass_no = 1
    max_passes = opt.pred_err_epoch_max
    pred_err_dataset_temp = None
    while len(pred_err_dataset) > 0 and pass_no <= max_passes:
        print("pred_err_loop:%d, err_num:%d, err_map:%s, current_lr:%s" %
              (pass_no, len(pred_err_dataset), pred_err_map, str(current_lr)))
        pass_no += 1
        pred_err_map = [0] * 7
        running_loss = 0
        n_correct = 0
        n_seen = 0
        tick = time.time()
        batch_no = 0
        # Swap datasets: train on the previous pass's failures while
        # accumulating this pass's failures into a brand-new dataset.
        del pred_err_dataset_temp
        pred_err_dataset_temp = pred_err_dataset
        pred_err_dataset = RawDataSet()
        retry_loader = torch.utils.data.DataLoader(pred_err_dataset_temp,
                                                   batch_size=opt.bs,
                                                   shuffle=True)
        retry_prefetcher = Prefetcher(retry_loader)
        inputs, targets = retry_prefetcher.next()
        while inputs is not None:
            optimizer.zero_grad()
            outputs = put_through_net(inputs, targets)
            loss = criterion(outputs, targets)
            loss.backward()
            utils.clip_gradient(optimizer, 2 * current_lr)
            optimizer.step()
            running_loss += float(loss.data)
            _, predicted = torch.max(outputs.data, 1)
            # Resolve the ground-truth class index for either label format.
            if target_type == 'ls':
                ground_value = targets.data
            elif target_type == 'fa':
                _, ground_value = torch.max(targets.data, 1)
            for idx in range(len(predicted)):
                if predicted[idx] != ground_value[idx]:
                    pred_err_dataset.add(inputs[idx], targets[idx])
                    pred_err_map[ground_value[idx].item()] += 1
            n_seen += targets.size(0)
            n_correct += predicted.eq(ground_value.data).cpu().sum()
            acc_pct = float(n_correct) / float(n_seen) * 100.
            elapsed = time.time() - tick
            utils.progress_bar(
                batch_no, len(retry_loader),
                'Time: %.2fs | Loss: %.3f | Acc: %.3f%% (%d/%d)' %
                (elapsed, running_loss / (batch_no + 1), acc_pct, n_correct,
                 n_seen))
            # Drop references early to release GPU memory.
            del loss
            del inputs
            del outputs
            del predicted
            inputs, targets = retry_prefetcher.next()
            batch_no += 1
    del pred_err_dataset
    del pred_err_dataset_temp
def train(train_loader, model, optimizer, epoch, save_path, writer, cur_loss):
    """Train for one epoch with TensorBoard scalar/image logging; on
    KeyboardInterrupt, save a checkpoint before re-raising.

    :param train_loader: DataLoader yielding (images, gts)
    :param model: network returning 4 prediction maps (3 auxiliary + 1 final)
    :param optimizer: optimizer stepped once per batch
    :param epoch: current epoch index
    :param save_path: directory prefix for checkpoint files
    :param writer: TensorBoard SummaryWriter
    :param cur_loss: loss function applied to each prediction map

    NOTE(review): relies on module-level globals step, opt, total_step,
    clip_gradient, make_grid, logging — confirm they exist at file scope.
    """
    global step  # global iteration counter shared across epochs
    model.train()
    loss_all = 0
    epoch_step = 0
    try:
        for i, (images, gts) in enumerate(train_loader, start=1):
            optimizer.zero_grad()
            images = images.cuda()
            gts = gts.cuda()
            preds = model(images)
            # Deep supervision: first three maps are auxiliary, last is final.
            loss_init = cur_loss(preds[0], gts) + cur_loss(
                preds[1], gts) + cur_loss(preds[2], gts)
            loss_final = cur_loss(preds[3], gts)
            loss = loss_init + loss_final
            loss.backward()
            clip_gradient(optimizer, opt.clip)
            optimizer.step()
            step += 1
            epoch_step += 1
            loss_all += loss.data
            if i % 10 == 0 or i == total_step or i == 1:
                print(
                    '{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], Total_loss: {:.4f} Loss1: {:.4f} Loss2: {:0.4f}'
                    .format(datetime.now(), epoch, opt.epoch, i, total_step,
                            loss.data, loss_init.data, loss_final.data))
                logging.info(
                    '[Train Info]:Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], '
                    'Total_loss: {:.4f} Loss1: {:.4f} Loss2: {:0.4f}'.format(
                        epoch, opt.epoch, i, total_step, loss.data,
                        loss_init.data, loss_final.data))
                # TensorboardX-Loss
                writer.add_scalars('Loss_Statistics', {
                    'Loss_init': loss_init.data,
                    'Loss_final': loss_final.data,
                    'Loss_total': loss.data
                }, global_step=step)
                # TensorboardX-Training Data (first sample of the batch)
                grid_image = make_grid(images[0].clone().cpu().data, 1,
                                       normalize=True)
                writer.add_image('RGB', grid_image, step)
                grid_image = make_grid(gts[0].clone().cpu().data, 1,
                                       normalize=True)
                writer.add_image('GT', grid_image, step)
                # TensorboardX-Outputs: sigmoid, then min-max normalize to [0,1]
                res = preds[0][0].clone()
                res = res.sigmoid().data.cpu().numpy().squeeze()
                res = (res - res.min()) / (res.max() - res.min() + 1e-8)
                writer.add_image('Pred_init',
                                 torch.tensor(res),
                                 step,
                                 dataformats='HW')
                res = preds[3][0].clone()
                res = res.sigmoid().data.cpu().numpy().squeeze()
                res = (res - res.min()) / (res.max() - res.min() + 1e-8)
                writer.add_image('Pred_final',
                                 torch.tensor(res),
                                 step,
                                 dataformats='HW')
        # Per-epoch average loss (assumes the loader is non-empty).
        loss_all /= epoch_step
        logging.info(
            '[Train Info]: Epoch [{:03d}/{:03d}], Loss_AVG: {:.4f}'.format(
                epoch, opt.epoch, loss_all))
        writer.add_scalar('Loss-epoch', loss_all, global_step=epoch)
        if epoch % 10 == 0:
            torch.save(model.state_dict(),
                       save_path + 'Net_epoch_{}.pth'.format(epoch))
    except KeyboardInterrupt:
        # Save a rescue checkpoint before propagating the interrupt.
        print('Keyboard Interrupt: save model and exit.')
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        torch.save(model.state_dict(),
                   save_path + 'Net_epoch_{}.pth'.format(epoch + 1))
        print('Save checkpoints successfully!')
        raise
def train(args, model, optimizer, dataloader_train, total):
    """Multi-scale training loop with per-epoch validation and best-Dice
    checkpointing.

    :param args: namespace with trainsize, clip, lr, decay_rate, decay_epoch,
                 num_epochs, use_gpu, testdataset, net_work
    :param model: network returning two prediction maps
    :param optimizer: optimizer for model
    :param dataloader_train: DataLoader yielding (data, label)
    :param total: step count used in the logging condition
    """
    # Dicedict = {'CVC-300': [], 'CVC-ClinicDB': [], 'Kvasir': [], 'CVC-ColonDB': [], 'ETIS-LaribPolypDB': [],
    #             'test': []}
    Dicedict = {"CVC-ClinicDB-612-Test": [],
                "CVC-ClinicDB-612-Valid": [],
                "CVC-ColonDB-300": [],
                'test': []}
    best_dice = 0
    best_epo = 0
    # Removed unused locals: BCE, criterion, loss_record1..5 were never read.
    for epoch in range(1, args.num_epochs + 1):
        u.adjust_lr(optimizer, args.lr, epoch, args.decay_rate,
                    args.decay_epoch)
        # Multi-scale training; larger scales help small-object accuracy.
        size_rates = [0.75, 1, 1.25]
        model.train()
        loss_record = []
        for i, (data, label) in enumerate(dataloader_train, start=1):
            # Move to GPU once per batch (was redundantly redone per scale).
            if torch.cuda.is_available() and args.use_gpu:
                data = Variable(data).cuda()
                label = Variable(label).cuda()
            for rate in size_rates:
                # BUGFIX: rescale into fresh tensors instead of overwriting
                # `data`/`label`. Previously the 0.75-scale pass replaced the
                # originals, so the subsequent rate==1 pass silently trained
                # on 0.75-scale inputs.
                trainsize = int(round(args.trainsize * rate / 32) * 32)
                if rate != 1:
                    imgs = F.upsample(data,
                                      size=(trainsize, trainsize),
                                      mode='bilinear',
                                      align_corners=True)
                    gts = F.upsample(label,
                                     size=(trainsize, trainsize),
                                     mode='bilinear',
                                     align_corners=True)
                else:
                    imgs = data
                    gts = label
                # Standard three training steps: forward, backward, update.
                optimizer.zero_grad()
                prediction1, prediction2 = model(imgs)
                # Combined BCE + Dice loss on both outputs.
                loss = u.bce_dice(prediction1, gts) + u.bce_dice(
                    prediction2, gts)
                loss.backward()
                u.clip_gradient(optimizer, args.clip)
                optimizer.step()
                loss_record.append(loss.item())
            # ---- train visualization ----
            if i % 20 == 0 or i == total:
                loss_train_mean = np.mean(loss_record)
                print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], '
                      '[loss for train : {:.4f}]'.format(
                          datetime.now(), epoch, args.num_epochs, i,
                          len(dataloader_train), loss_train_mean))
        # Per-epoch validation on every configured test split.
        if (epoch + 1) % 1 == 0:
            for dataset in args.testdataset:
                dataset_dice = valid(model, dataset, args)
                print("dataset:{},Dice:{:.4f}".format(dataset, dataset_dice))
                Dicedict[dataset].append(dataset_dice)
            meandice = valid(model, 'test', args)
            print("dataset:{},Dice:{:.4f}".format("test", meandice))
            Dicedict['test'].append(meandice)
            if meandice > best_dice:
                best_dice = meandice
                best_epo = epoch
                checkpoint_dir = "./checkpoint"
                # Robustness: create the directory before saving into it.
                os.makedirs(checkpoint_dir, exist_ok=True)
                filename = 'model_{}_{:03d}_{:.4f}.pth.tar'.format(
                    args.net_work, epoch, best_dice)
                checkpointpath = os.path.join(checkpoint_dir, filename)
                torch.save(model.state_dict(), checkpointpath)
                print('############# Saving best ##########################################BestAvgDice:{}'.format(best_dice))
            print('bestepo:{:03d} ,bestdice :{:.4f}'.format(best_epo, best_dice))
def fit(self,
        train_loader,
        is_val=False,
        test_loader=None,
        img_size=352,
        start_from=0,
        num_epochs=200,
        batchsize=16,
        clip=0.5,
        fold=4):
    """Multi-scale training loop: each batch is trained at 0.75x/1x/1.25x,
    with per-step TensorBoard logging at the native scale, optional
    validation, and periodic checkpointing.

    :param train_loader: DataLoader yielding (images, gts)
    :param is_val: run self.val() after every epoch when True
    :param test_loader: loader forwarded to self.val()
    :param img_size: base training resolution; scaled sizes snap to x32
    :param start_from: first epoch index
    :param num_epochs: last epoch index (exclusive)
    :param batchsize: weight used when updating the loss meters
    :param clip: gradient-clipping bound
    :param fold: cross-validation fold id used in checkpoint names
    """
    size_rates = [0.75, 1, 1.25]
    test_fold = f'fold{fold}'
    start = timeit.default_timer()
    for epoch in range(start_from, num_epochs):
        self.net.train()
        # Only the total and the deepest-map meters are actually read;
        # the unused loss_record2..4 meters were removed.
        loss_all, loss_record5 = AvgMeter(), AvgMeter()
        for i, pack in enumerate(train_loader, start=1):
            for rate in size_rates:
                self.optimizer.zero_grad()
                # ---- data prepare ----
                images, gts = pack
                images = Variable(images).cuda()
                gts = Variable(gts).cuda()
                trainsize = int(round(img_size * rate / 32) * 32)
                if rate != 1:
                    images = F.upsample(images,
                                        size=(trainsize, trainsize),
                                        mode='bilinear',
                                        align_corners=True)
                    gts = F.upsample(gts,
                                     size=(trainsize, trainsize),
                                     mode='bilinear',
                                     align_corners=True)
                lateral_map_5, lateral_map_4, lateral_map_3, lateral_map_2 = self.net(
                    images)
                # Only the deepest map is supervised in this configuration.
                loss5 = self.loss(lateral_map_5, gts)
                loss = loss5
                loss.backward()
                clip_gradient(self.optimizer, clip)
                self.optimizer.step()
                # Log metrics only for the native-scale pass.
                if rate == 1:
                    loss_record5.update(loss5.data, batchsize)
                    loss_all.update(loss.data, batchsize)
                    self.writer.add_scalar(
                        "Loss5", loss_record5.show(),
                        (epoch - 1) * len(train_loader) + i)
                    self.writer.add_scalar(
                        "Loss", loss_all.show(),
                        (epoch - 1) * len(train_loader) + i)
            total_step = len(train_loader)
            if i % 25 == 0 or i == total_step:
                # BUGFIX: the second epoch placeholder previously printed
                # `epoch` twice instead of the epoch budget.
                self.logger.info(
                    '{} Epoch [{:03d}/{:03d}], with lr = {}, Step [{:04d}/{:04d}],\
                    [loss_record5: {:.4f}]'.format(
                        datetime.now(), epoch, num_epochs,
                        self.optimizer.param_groups[0]["lr"], i, total_step,
                        loss_record5.show()))
        if (is_val):
            self.val(test_loader, epoch)
        os.makedirs(self.save_dir, exist_ok=True)
        # Save every 3rd epoch after save_from, plus a fixed snapshot at 23.
        if (epoch + 1) % 3 == 0 and epoch > self.save_from or epoch == 23:
            torch.save(
                {
                    "model_state_dict": self.net.state_dict(),
                    "lr": self.optimizer.param_groups[0]["lr"]
                },
                os.path.join(self.save_dir,
                             'PraNetDG-' + test_fold + '-%d.pth' % epoch))
            self.logger.info(
                '[Saving Snapshot:]' +
                os.path.join(self.save_dir,
                             'PraNetDG-' + test_fold + '-%d.pth' % epoch))
        self.scheduler.step()
    self.writer.flush()
    self.writer.close()
    end = timeit.default_timer()
    self.logger.info("Training cost: " + str(end - start) + 'seconds')
def train(train_loader, miml, decoder, criterion, miml_optimizer, decoder_optimizer, epoch, writer):
    """
    Performs one epoch's training of the MIML encoder + attribute decoder.

    :param train_loader: DataLoader yielding (imgs, caps, caplens)
    :param miml: MIML attribute extractor
    :param decoder: caption decoder consuming the extracted attributes
    :param criterion: word-level loss applied to the packed scores/targets
    :param miml_optimizer: optimizer for miml's weights, or None to freeze it
    :param decoder_optimizer: optimizer for the decoder's weights
    :param epoch: epoch number (used only for logging)
    :param writer: TensorBoard writer for scalar summaries

    NOTE(review): relies on module-level globals device, grad_clip,
    print_freq, AverageMeter, clip_gradient, accuracy.
    """
    decoder.train()  # train mode (dropout and batchnorm is used)
    miml.train()
    total_step = len(train_loader)
    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy
    start = time.time()
    # Batches
    for i, (imgs, caps, caplens) in enumerate(train_loader):
        data_time.update(time.time() - start)
        # Move to GPU, if available
        imgs = imgs.to(device)
        caps = caps.to(device)
        caplens = caplens.to(device)
        # Forward prop.
        attrs = miml(imgs)
        scores, caps_sorted, decode_lengths, sort_ind = decoder(
            attrs, caps, caplens)
        # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
        targets = caps_sorted[:, 1:]
        # Remove timesteps that we didn't decode at, or are pads.
        # pack_padded_sequence is an easy trick to do this: padding is dropped
        # before the loss, so pads never affect the loss or accuracy.
        scores, _ = pack_padded_sequence(scores,
                                         decode_lengths,
                                         batch_first=True)
        targets, _ = pack_padded_sequence(targets,
                                          decode_lengths,
                                          batch_first=True)
        # Calculate loss
        loss = criterion(scores, targets)
        # Back prop.
        # BUGFIX: guard every use of miml_optimizer consistently. Previously
        # only the clipping was guarded by `is not None`, while zero_grad()
        # and step() were unconditional and would crash first when the MIML
        # branch is frozen (miml_optimizer=None).
        decoder_optimizer.zero_grad()
        if miml_optimizer is not None:
            miml_optimizer.zero_grad()
        loss.backward()
        # Clip gradients
        if grad_clip is not None:
            clip_gradient(decoder_optimizer, grad_clip)
            if miml_optimizer is not None:
                clip_gradient(miml_optimizer, grad_clip)
        # Update weights
        decoder_optimizer.step()
        if miml_optimizer is not None:
            miml_optimizer.step()
        # Keep track of metrics (weighted by the number of decoded words)
        top5 = accuracy(scores, targets, 5)
        losses.update(loss.item(), sum(decode_lengths))
        top5accs.update(top5, sum(decode_lengths))
        batch_time.update(time.time() - start)
        start = time.time()
        # Print status
        if i % print_freq == 0:
            writer.add_scalars('train', {
                'loss': loss.item(),
                'mAp': top5accs.val
            }, epoch * total_step + i)
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch,
                      i,
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                      top5=top5accs))
def train(epoch):
    '''Train one epoch, accelerated with a DataPrefetcher; misclassified
    samples are collected and re-trained via pred_err_loop afterwards.

    NOTE(review): relies on module-level globals opt, net, optimizer,
    criterion, train_prefetcher, train_loader, train_acc_map, target_type,
    put_through_net, write_history, RawDataSet, utils — confirm at file scope.
    '''
    print("---Train---")
    # Decay the learning rate based on how many epochs have been trained.
    if epoch >= opt.lrd_se > 0:
        frac = ((epoch - opt.lrd_se) // opt.lrd_s) + 1
        decay_factor = opt.lrd_r**frac
        current_lr = opt.lr * decay_factor
        # current_lr = opt.lr * decay_rate ^ ((epoch - first decay epoch) // epochs per decay)
        utils.set_lr(optimizer, current_lr)  # set the learning rate
    else:
        current_lr = opt.lr
        if epoch < opt.lre_je:
            current_lr *= 1.5  # boost early epochs to counter slow initial convergence
    print('learning_rate: %s' % str(current_lr))
    global Train_acc
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    # Collect samples the network gets wrong, for the error-replay loop below.
    pred_err_dataset = RawDataSet()
    pred_err_map = [0, 0, 0, 0, 0, 0, 0]
    cur_train_acc = 0.
    time_start = time.time()
    batch_idx = 0
    inputs, targets = train_prefetcher.next()
    while inputs is not None:
        optimizer.zero_grad()
        outputs = put_through_net(inputs, targets)
        # print("outputs:", outputs)
        # print("targets:", targets)
        loss = criterion(outputs, targets)
        loss.backward()
        utils.clip_gradient(
            optimizer, 2 * current_lr
        )  # guard against exploding gradients; see https://blog.csdn.net/u010814042/article/details/76154391
        optimizer.step()
        train_loss += float(loss.data)
        _, predicted = torch.max(
            outputs.data, 1)  # with the dim argument, torch.max() returns (max_value, max_value_index)
        # Resolve the ground-truth class index for either label format.
        if target_type == 'ls':
            ground_value = targets.data
        elif target_type == 'fa':
            _, ground_value = torch.max(targets.data, 1)
        # print("predicted:", predicted)
        # print("ground_value:", ground_value)
        for i in range(len(predicted)):
            if predicted[i] == ground_value[i]:
                train_acc_map[predicted[i].item()] += 1
            else:
                pred_err_dataset.add(inputs[i], targets[i])
                pred_err_map[ground_value[i].item()] += 1
        total += targets.size(0)
        correct += predicted.eq(ground_value.data).cpu().sum()
        # print("equal: ", predicted.eq(ground_value.data).cpu())
        cur_train_acc = float(correct) / float(total) * 100.
        time_end = time.time()
        duration = time_end - time_start
        utils.progress_bar(
            batch_idx, len(train_loader),
            'Time: %.2fs | Loss: %.3f | Acc: %.3f%% (%d/%d)' %
            (duration, train_loss / (batch_idx + 1), cur_train_acc, correct,
             total))
        # Drop references to release GPU memory before the next batch.
        del loss
        del inputs
        del outputs
        del predicted
        inputs, targets = train_prefetcher.next()
        batch_idx += 1
    Train_acc = cur_train_acc
    write_history('Train', epoch, cur_train_acc, train_loss / (batch_idx + 1),
                  None)
    # Re-train the misclassified samples with a reduced learning rate.
    pred_err_loop(current_lr / opt.pred_err_lr_decay, pred_err_dataset,
                  pred_err_map)
    del pred_err_dataset