def inference_classification(self):
    """Run instance-level inference on the test set and log the mean accuracy."""
    self.model.eval()
    self.model.module.mode = 0  # instance-only prediction mode
    val_accuracy = AverageMeter()
    with torch.no_grad():
        final_itr = tqdm(self.test_loader, ncols=80,
                         desc='Inference (instance) ...')
        for i, (input, labels) in enumerate(final_itr):
            input = input.to(self.device)
            labels = labels.to(self.device)
            logits = self.model(input)[0]
            preds = self.model.module.pooling.predictions(logits)
            accuracy = (preds == labels).sum().item() / labels.shape[0]
            val_accuracy.append(accuracy)
            final_itr.set_description(
                '--- (test) | Accuracy: {:.3f} :'.format(val_accuracy.avg()))
    acc = val_accuracy.avg()
    with open(os.path.join(self.logdir, 'meanscores.csv'), 'w') as fp:
        fp.write('Accuracy: {:.4f} \n'.format(acc))
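# Sketch of the AverageMeter interface assumed by the classification code
# above (append()/avg()). Note that train() below relies on a different
# meter variant with a weighted update() and an .avg attribute, and
# loop_finetune() on yet another with a window size and time_preds(); this
# minimal version only matches the append()/avg() usage and is an
# assumption, not the repository's actual class.
class AverageMeter:
    """Tracks a stream of values and reports their mean."""

    def __init__(self):
        self.values = []

    def append(self, value):
        # record one observation, e.g. a per-batch accuracy
        self.values.append(value)

    def avg(self):
        # mean of all recorded values; 0 if nothing was recorded yet
        return sum(self.values) / len(self.values) if self.values else 0.0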
def train(train_loader, net, optim, curr_epoch, scaler):
    """
    Runs the training loop for one epoch.

    train_loader: data loader for training
    net: the network
    optim: MPI-wrapped optimizer
    curr_epoch: current epoch
    scaler: AMP gradient scaler
    return: mean training loss, mean batch time, total epoch time
    """
    full_bt = time.perf_counter()
    net.train()
    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    optim.last_batch = len(train_loader) - 1
    btimes = []
    batch_time = time.perf_counter()
    for i, data in enumerate(train_loader):
        lr_warmup(optim, curr_epoch, i, len(train_loader), max_lr=0.4)
        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs, 3, 713, 713)
        # gts = (bs, 713, 713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = images.cuda(), gts.cuda(), scale_float.cuda()
        inputs = {'images': images, 'gts': gts}
        optim.zero_grad()
        if args.amp:
            with amp.autocast():
                main_loss = net(inputs)
            log_main_loss = main_loss.clone().detach_()
            # non-blocking all-reduce of the logged loss; completed below,
            # so it overlaps with the backward pass
            log_wait = optim.comm.Iallreduce(MPI.IN_PLACE, log_main_loss,
                                             MPI.SUM)
            scaler.scale(main_loss).backward()
        else:
            main_loss = net(inputs)
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
            log_wait = None
            main_loss.backward()
        # the scaler update happens inside optim.step()
        optim.step()
        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0
        if log_wait is not None:
            log_wait.Wait()
            log_main_loss = log_main_loss / args.world_size
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)
        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.local_optimizer.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)
        metrics = {
            'loss': train_main_loss.avg,
            'lr': optim.local_optimizer.param_groups[-1]['lr']
        }
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)
        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return
        btimes.append(time.perf_counter() - batch_time)
        batch_time = time.perf_counter()
    if args.benchmarking:
        # average the final epoch loss across all ranks for reporting
        train_loss_tens = torch.tensor(train_main_loss.avg)
        optim.comm.Allreduce(MPI.IN_PLACE, train_loss_tens, MPI.SUM)
        train_loss_tens = train_loss_tens.to(torch.float)
        train_loss_tens /= float(optim.comm.size)
        train_main_loss.avg = train_loss_tens.item()
    return (train_main_loss.avg, torch.mean(torch.tensor(btimes)),
            time.perf_counter() - full_bt)
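# The lr_warmup helper called at the top of train() is defined elsewhere in
# the repository. A minimal sketch of a linear warmup consistent with that
# call signature is given below; the warmup length and the use of
# optim.local_optimizer are assumptions, not the repository's actual code.
def lr_warmup(optim, curr_epoch, curr_iter, iters_per_epoch,
              max_lr=0.4, warmup_epochs=5):
    """Linearly ramp the learning rate up to max_lr over the first epochs."""
    total_warmup_iters = warmup_epochs * iters_per_epoch
    global_iter = curr_epoch * iters_per_epoch + curr_iter
    if global_iter >= total_warmup_iters:
        return  # warmup finished; the regular schedule takes over
    lr = max_lr * (global_iter + 1) / total_warmup_iters
    for group in optim.local_optimizer.param_groups:
        group['lr'] = lr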
def _train_epoch(self, epoch):
    logits_losses = AverageMeter()
    bag_losses = AverageMeter()
    center_losses = AverageMeter()
    train_accuracy = AverageMeter()
    bag_accuracy = AverageMeter()
    self.center_loss.train()
    self.model.train()
    self.model.module.mode = 1  # combined mode (instance & bag prediction)
    self.adjust_lr_staircase(
        self.optimizer.param_groups,
        [0.001, 0.01],  # initial values for features and pooling
        epoch + 1,
        [10, 15, 17],  # epochs at which the rate is decayed
        0.1  # multiplicative decay factor
    )
    pbar = tqdm(self.train_loader, ncols=160, desc=' ')
    for i, (inputs, labels, all_labels) in enumerate(pbar):
        inputs = inputs.to(self.device)
        labels = labels.to(self.device)
        all_labels = all_labels.view(-1).to(self.device).long()
        self.optimizer.zero_grad()
        self.optimizerpool.zero_grad()
        # get features and logits
        inst_logits, inst_feat, bag_embed, bag_logits = self.model(inputs)
        loss_soft = self.model.module.pooling.loss(inst_logits, all_labels)
        loss_bag = self.model.module.pooling.loss(bag_logits, labels)
        # default: clustering instances
        # loss_center = self.center_loss(inst_feat, all_labels)
        # alternative: clustering bags
        loss_center = self.center_loss(bag_embed, labels)
        # alpha, lambda and bag weight
        loss = 1.0 * loss_soft + 1.0 * loss_center + 1.0 * loss_bag
        preds_bag = self.model.module.pooling.predictions(bag_logits)
        preds = self.model.module.pooling.predictions(inst_logits)
        accuracy = (preds == all_labels).sum().item() / all_labels.shape[0]
        accuracy_bag = (preds_bag == labels).sum().item() / labels.shape[0]
        logits_losses.append(loss_soft.item())
        center_losses.append(loss_center.item())
        bag_losses.append(loss_bag.item())
        train_accuracy.append(accuracy)
        bag_accuracy.append(accuracy_bag)
        loss.backward()
        self.optimizer.step()
        for param in self.center_loss.parameters():
            # undo the loss weighting so the centers are updated with the
            # raw center-loss gradient (must match the weight used above)
            param.grad.data *= (1. / 1.0)
        self.optimizerpool.step()
        pbar.set_description(
            '--- (train) | Loss(I): {:.4f} | Loss(C): {:.4f} | Loss(B): {:.4f}'
            ' | ACC(I): {:.3f} | ACC(B): {:.3f} :'.format(
                logits_losses.avg(), center_losses.avg(), bag_losses.avg(),
                train_accuracy.avg(), bag_accuracy.avg()))
    step = epoch + 1
    self.writer.add_scalar('training/loss_i', logits_losses.avg(), step)
    self.writer.add_scalar('training/loss_c', center_losses.avg(), step)
    self.writer.add_scalar('training/loss_b', bag_losses.avg(), step)
    self.writer.add_scalar('training/accuracy', train_accuracy.avg(), step)
    self.writer.add_scalar('training/accuracy_bag', bag_accuracy.avg(), step)
    print()
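# adjust_lr_staircase is called above with per-group base rates, the decay
# milestones and a decay factor. A minimal sketch consistent with that call
# is given below (an assumption about the helper, not its actual code):
# each param group keeps its base rate until a milestone epoch is passed,
# then is multiplied by `factor` once per milestone reached.
def adjust_lr_staircase(self, param_groups, base_lrs, epoch, milestones,
                        factor):
    # number of decay steps already reached at this epoch
    num_decays = sum(1 for m in milestones if epoch >= m)
    for group, base_lr in zip(param_groups, base_lrs):
        group['lr'] = base_lr * (factor ** num_decays)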
def loop_finetune(self, pretrained_state, max_lp, max_ep, rewarded_ep,
                  op_cfg, sc_cfg, candidate_ratio, finetune_lsmooth):
    self.g_tb_lg.add_scalars('probs', self.agent.get_prob_dict(), -1)
    self.g_tb_lg.add_histogram('probs_dist', self.agent.get_prob_tensor(), -1)
    [
        self.g_tb_lg.add_scalar('ppo_step', self.agent.max_training_times, t)
        for t in [-1, max_lp, -1]
    ]
    max_it = self.auged_sub_train_iters
    loader = self.auged_sub_train_ld
    # assert max_it == len(loader)
    agent_param_his = []
    best_rewards_mean = 0
    best_rewards_lp = 0
    best_agent_state = {}
    candidate_ep = max(round(max_ep * candidate_ratio), 1)
    loop_speed = AverageMeter(4)
    crit = self.criterion if finetune_lsmooth else F.cross_entropy
    for lp in range(max_lp):
        lp_str = f'%{len(str(max_lp))}d' % (lp + 1)
        lp_start_t = time.time()
        # restore the pretrained weights and optimizer state at the start of
        # every loop so each augmentation policy is finetuned from the same
        # starting point
        self.model.load_state_dict(pretrained_state['model'])
        self.model.train()
        op, sc = self.create_op_sc(self.model, op_cfg, sc_cfg, max_it)
        op: torch.optim.optimizer.Optimizer
        op.load_state_dict(pretrained_state['op'])
        epoch_speed = AverageMeter(1)
        acc1s = []
        for ep in range(max_ep):
            ep_str = f'%{len(str(max_ep))}d' % (ep + 1)
            ep_start_t = time.time()
            for it, (inp, tar, op_indices) in enumerate(loader):
                global_it = ep * max_it + it
                self.agent.record(op_indices)
                inp, tar = inp.cuda(), tar.cuda()
                loss = crit(self.model(inp), tar)
                op.zero_grad()
                loss.backward()
                if self.model_grad_clip is not None:
                    total_norm = torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.model_grad_clip)
                else:
                    total_norm = -233  # sentinel value: no clipping applied
                clipped_norm = torch.cat([
                    p.grad.data.view(-1) for p in self.model.parameters()
                ]).abs_().norm()
                sc.step()  # sc.step() before op.step()
                lr = sc.get_lr()[0]
                clipped_lr = lr * (clipped_norm / total_norm)  # for inspection
                op.step()
            val_loss, val_acc1, val_acc5 = self.val()
            self.model.train()
            acc1s.append(val_acc1)
            if lp % 20 == 0:
                if ep == 0:
                    self.lg.info(f'==> at {self.exp_root}')
                self.g_tb_lg.add_scalars('rk0_ft_v_loss',
                                         {f'loop_{lp}': val_loss}, ep)
                self.g_tb_lg.add_scalars('rk0_ft_v_acc1',
                                         {f'loop_{lp}': val_acc1}, ep)
                self.g_tb_lg.add_scalars('rk0_ft_v_acc5',
                                         {f'loop_{lp}': val_acc5}, ep)
            epoch_speed.avg = time.time() - ep_start_t
            remain_time, finish_time = epoch_speed.time_preds(max_ep - ep - 1)
            self.lg.info(f'lp[{lp_str}/{max_lp}], ep[{ep_str}/{max_ep}]'
                         f' vacc1: {float(val_acc1):5.2f},'
                         f' verr1: {float(100. - val_acc1):5.2f},'
                         f' time cost: {time.time() - ep_start_t:.3f},'
                         f' op_freq.s: {self.agent.op_freq.sum()},'
                         f' rem-t: {remain_time} ({finish_time})')
        # the reward is the mean of the best `rewarded_ep` top-1 accuracies
        # among the last `candidate_ep` epochs
        acc1s = acc1s[-candidate_ep:]
        rewarded_acc1s = sorted(acc1s)[-rewarded_ep:]
        reward = sum(rewarded_acc1s) / len(rewarded_acc1s)
        rewards = sync_vals(self.dist, reward, fmt=None)
        rewards_mean = rewards.mean().item()
        if self.agent.initial_baseline is not None:
            d = f'{rewards_mean - self.agent.initial_baseline:.3f}'
        else:
            d = None
        if rewards_mean > best_rewards_mean:
            best_rewards_mean = rewards_mean
            best_rewards_lp = lp - 1
            best_agent_state = self.agent.state_dict()
            best_agent_state = {
                'first_param': best_agent_state['first_param'].data.clone(),
                'second_param': best_agent_state['second_param'].data.clone()
            }
            if self.dist.is_master() and d is not None:
                # keep only the latest best-agent checkpoint
                for root, dirs, files in os.walk(self.best_agent_ckpt_root):
                    for f in files:
                        os.remove(os.path.join(root, f))
                torch.save(
                    {
                        'lp': lp,
                        'agent': best_agent_state,
                    },
                    os.path.join(self.best_agent_ckpt_root,
                                 f'after_lp{best_rewards_lp}_d{d}.pth.tar'))
        if lp == 0:
            self.agent.set_baselines(initial_baseline=rewards_mean,
                                     running_baseline=reward)
            [
                self.g_tb_lg.add_scalars('reward',
                                         {'g_ini_bsln': rewards_mean}, t)
                for t in [0, max_lp // 2, max_lp - 1]
            ]
            [
                self.l_tb_lg.add_scalars(
                    'reward', {f'rk{self.dist.rank}_ini_run_bsln': reward}, t)
                for t in [0, max_lp // 2, max_lp - 1]
            ]
        ppo_step_times = self.agent.step(reward=reward)
        self.g_tb_lg.add_scalar('agent_lr',
                                self.agent.scheduler.get_lr()[0], lp)
        loop_speed.update(time.time() - lp_start_t)
        remain_time, finish_time = loop_speed.time_preds(max_lp - lp - 1)
        self.lg.info(f'==> loop[{lp_str}/{max_lp}],'
                     f' time cost: {(time.time() - lp_start_t) / 60:.2f} min,'
                     f' rem-t[{remain_time}] ({finish_time}),'
                     f' rew={rewards}')
        if self.dist.is_master():
            agent_param_his.append(self.agent.get_params_as_list())
        self.g_tb_lg.add_scalar('ppo_step', ppo_step_times, lp)
        self.l_tb_lg.add_scalars(
            'reward',
            {f'rk{self.dist.rank}_run_bsln': self.agent.running_baseline}, lp)
        self.l_tb_lg.add_scalars(
            'advance', {f'rk{self.dist.rank}_adv': self.agent.advance_val}, lp)
        self.g_tb_lg.add_scalars('probs', self.agent.get_prob_dict(), lp)
        self.g_tb_lg.add_histogram('probs_dist',
                                   self.agent.get_prob_tensor(), lp)
        if self.dist.is_master():
            torch.save(
                {
                    'lp': lp,
                    'agent': self.agent.state_dict(),
                },
                os.path.join(
                    self.agents_ckpt_root,
                    f'lp{lp}_d{d}_rew_mean{rewards_mean:.2f}.pth.tar'))
        torch.cuda.empty_cache()
        if self.dist.is_master():
            f_name = os.path.join(self.ckpt_root, 'agent_param_his.json')
            self.lg.info(f'dump agent params into {f_name}')
            with open(f_name, 'w') as fp:
                json.dump(agent_param_his, fp)
            # if lp == 0:
            #     self.lg.info(f'dumped list[0]: {agent_param_his[0]}')
        self.dist.barrier()
        if not os.path.exists(self.early_stop_root):
            break
    [
        self.meta_tb_lg.add_scalar('best_rew_mean', best_rewards_mean, t)
        for t in [0, best_rewards_lp, max_lp]
    ]
    [
        self.g_tb_lg.add_scalar('best_rew_mean', best_rewards_mean, t)
        for t in [0, best_rewards_lp, max_lp]
    ]
    return {'lp': best_rewards_lp, 'agent': best_agent_state}
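# sync_vals gathers the scalar reward from every rank so each worker can
# compute the loop-level mean. A hypothetical sketch over a plain
# torch.distributed backend is shown below; the `dist` wrapper's world_size
# attribute and the fmt handling are assumptions, not the repository's code.
import torch
import torch.distributed as tdist

def sync_vals(dist, val, fmt=None):
    t = torch.tensor([float(val)], device='cuda')
    gathered = [torch.zeros_like(t) for _ in range(dist.world_size)]
    tdist.all_gather(gathered, t)  # one scalar per rank
    out = torch.cat(gathered)      # shape: (world_size,)
    return out if fmt is None else [fmt % v for v in out.tolist()]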