Example #1
    def inference_classification(self):
        self.model.eval()
        self.model.module.mode = 0
        val_accuracy = AverageMeter()

        with torch.no_grad():
            final_itr = tqdm(self.test_loader, ncols=80, desc='Inference (instance) ...')

            for i, (inputs, labels) in enumerate(final_itr):
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)

                logits = self.model(inputs)[0]
                preds  = self.model.module.pooling.predictions(logits)

                accuracy = (preds == labels).sum().item() / labels.shape[0]
                val_accuracy.append(accuracy)

                final_itr.set_description(
                    '--- (test) | Accuracy: {:.3f}  :'.format(val_accuracy.avg()))

        acc = val_accuracy.avg()
        with open(os.path.join(self.logdir, 'meanscores.csv'), 'w') as fp:
            fp.write('Accuracy: {:.4f} \n'.format(acc))
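
These snippets rely on an AverageMeter helper that is not shown. Below is a minimal sketch consistent with the append()/avg() interface used in Examples #1 and #3; it is an assumption for illustration, not the project's actual implementation.

class AverageMeter:
    """Keeps a list of values and reports their running mean."""

    def __init__(self):
        self.values = []

    def append(self, value):
        self.values.append(value)

    def avg(self):
        # guard against division by zero before the first append
        return sum(self.values) / len(self.values) if self.values else 0.0
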
Example #2
def train(train_loader, net, optim, curr_epoch, scaler):
    """
    Runs the training loop per epoch
    train_loader: Data loader for train
    net: thet network
    optimizer: optimizer
    curr_epoch: current epoch
    return:
    """
    full_bt = time.perf_counter()
    net.train()

    train_main_loss = AverageMeter()
    start_time = None
    warmup_iter = 10
    optim.last_batch = len(train_loader) - 1
    btimes = []
    batch_time = time.perf_counter()
    for i, data in enumerate(train_loader):
        lr_warmup(optim, curr_epoch, i, len(train_loader), max_lr=0.4)

        # keep resetting the timer during the warm-up iterations
        if i <= warmup_iter:
            start_time = time.time()
        # inputs = (bs,3,713,713)
        # gts    = (bs,713,713)
        images, gts, _img_name, scale_float = data
        batch_pixel_size = images.size(0) * images.size(2) * images.size(3)
        images, gts, scale_float = (images.cuda(), gts.cuda(),
                                    scale_float.cuda())
        inputs = {'images': images, 'gts': gts}
        optim.zero_grad()
        if args.amp:
            with amp.autocast():
                main_loss = net(inputs)
                log_main_loss = main_loss.clone().detach_()
                # torch.distributed.all_reduce(log_main_loss,
                #                              torch.distributed.ReduceOp.SUM)
                log_wait = optim.comm.Iallreduce(MPI.IN_PLACE, log_main_loss,
                                                 MPI.SUM)
                # log_main_loss = log_main_loss / args.world_size
            # train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            scaler.scale(main_loss).backward()
        else:
            main_loss = net(inputs)
            main_loss = main_loss.mean()
            log_main_loss = main_loss.clone().detach_()
            log_wait = None
            #train_main_loss.update(log_main_loss.item(), batch_pixel_size)
            main_loss.backward()

        # the scaler update is within the optim step
        optim.step()

        if i >= warmup_iter:
            curr_time = time.time()
            batches = i - warmup_iter + 1
            batchtime = (curr_time - start_time) / batches
        else:
            batchtime = 0

        if log_wait is not None:
            log_wait.Wait()
        log_main_loss = log_main_loss / args.world_size
        train_main_loss.update(log_main_loss.item(), batch_pixel_size)

        msg = ('[epoch {}], [iter {} / {}], [train main loss {:0.6f}],'
               ' [lr {:0.6f}] [batchtime {:0.3g}]')
        msg = msg.format(curr_epoch, i + 1, len(train_loader),
                         train_main_loss.avg,
                         optim.local_optimizer.param_groups[-1]['lr'],
                         batchtime)
        logx.msg(msg)

        metrics = {
            'loss': train_main_loss.avg,
            'lr': optim.local_optimizer.param_groups[-1]['lr']
        }
        curr_iter = curr_epoch * len(train_loader) + i
        logx.metric('train', metrics, curr_iter)

        if i >= 10 and args.test_mode:
            del data, inputs, gts
            return
        btimes.append(time.perf_counter() - batch_time)
        batch_time = time.perf_counter()

    if args.benchmarking:
        train_loss_tens = torch.tensor(train_main_loss.avg)
        optim.comm.Allreduce(MPI.IN_PLACE, train_loss_tens, MPI.SUM)
        train_loss_tens = train_loss_tens.to(torch.float)
        train_loss_tens /= float(optim.comm.size)
        train_main_loss.avg = train_loss_tens.item()

    return (train_main_loss.avg, torch.mean(torch.tensor(btimes)),
            time.perf_counter() - full_bt)
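
Example #2 calls an lr_warmup helper that is defined elsewhere in the codebase. The sketch below is a hypothetical linear warm-up matching the call site above; base_lr and warmup_epochs are assumed parameters, and the local_optimizer attribute is taken from its use later in the loop.

def lr_warmup(optim, curr_epoch, curr_iter, iters_per_epoch,
              max_lr=0.4, base_lr=0.04, warmup_epochs=5):
    """Linearly ramp the LR from base_lr up to max_lr over warmup_epochs."""
    total_warmup_iters = warmup_epochs * iters_per_epoch
    step = curr_epoch * iters_per_epoch + curr_iter
    if step >= total_warmup_iters:
        return
    lr = base_lr + (max_lr - base_lr) * step / total_warmup_iters
    # param groups live on the wrapped local optimizer (see the logging code above)
    for group in optim.local_optimizer.param_groups:
        group['lr'] = lr
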
Example #3
    def _train_epoch(self, epoch):
        logits_losses = AverageMeter()
        bag_losses = AverageMeter()
        center_losses = AverageMeter()
        train_accuracy = AverageMeter()
        bag_accuracy = AverageMeter()

        self.center_loss.train()
        self.model.train()
        self.model.module.mode = 1  # combined mode (instance & bag prediction)

        self.adjust_lr_staircase(
            self.optimizer.param_groups,
            [0.001, 0.01],  # initial values for features and pooling 
            epoch + 1,
            [10, 15, 17],  # set the steps to adjust accordingly
            0.1  # reduce by this value
        )
        pbar = tqdm(self.train_loader, ncols=160, desc=' ')
        for i, (inputs, labels, all_labels) in enumerate(pbar):

            inputs = inputs.to(self.device)
            labels = labels.to(self.device)
            all_labels = all_labels.view(-1).to(self.device).long()

            self.optimizer.zero_grad()
            self.optimizerpool.zero_grad()

            # get features and logits
            inst_logits, inst_feat, bag_embed, bag_logits = self.model(inputs)

            loss_soft = self.model.module.pooling.loss(inst_logits, all_labels)
            loss_bag = self.model.module.pooling.loss(bag_logits, labels)

            # default : clustering instances
            #loss_center = self.center_loss(inst_feat, all_labels)
            # other : clustering bags / instances
            loss_center = self.center_loss(bag_embed, labels)
            # alpha, lambda and bag weight
            loss = 1.0 * loss_soft + loss_center * 1.0 + loss_bag * 1.0

            preds_bag = self.model.module.pooling.predictions(bag_logits)
            preds = self.model.module.pooling.predictions(inst_logits)
            accuracy = (preds == all_labels).sum().item() / all_labels.shape[0]
            accuracy_bag = (preds_bag == labels).sum().item() / labels.shape[0]

            loss_cen = loss_center.item()
            loss_val = loss_soft.item()
            loss_slide = loss_bag.item()
            logits_losses.append(loss_val)
            center_losses.append(loss_cen)
            bag_losses.append(loss_slide)
            train_accuracy.append(accuracy)
            bag_accuracy.append(accuracy_bag)

            loss.backward()
            self.optimizer.step()
            for param in self.center_loss.parameters():
                # center loss weight should match as in the loss function
                param.grad.data *= (1. / 1.0)
            self.optimizerpool.step()

            pbar.set_description(
                '--- (train) | Loss(I): {:.4f} | Loss(C): {:.4f} | Loss(B): {:.4f} | ACC(I): {:.3f} | ACC(B): {:.3f} :'
                .format(logits_losses.avg(), center_losses.avg(),
                        bag_losses.avg(), train_accuracy.avg(),
                        bag_accuracy.avg()))

        step = epoch + 1
        self.writer.add_scalar('training/loss_i', logits_losses.avg(), step)
        self.writer.add_scalar('training/loss_c', center_losses.avg(), step)
        self.writer.add_scalar('training/loss_b', bag_losses.avg(), step)
        self.writer.add_scalar('training/accuracy', train_accuracy.avg(), step)
        self.writer.add_scalar('training/accuracy_bag', bag_accuracy.avg(),
                               step)
        print()
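
The adjust_lr_staircase call above passes per-group base rates, the 1-based epoch, a list of decay epochs, and a decay factor. A minimal sketch of what such a staircase schedule typically does (hypothetical; the project's version may differ):

def adjust_lr_staircase(param_groups, base_lrs, epoch, decay_epochs, factor):
    """Scale each group's base LR by `factor` once per decay epoch already reached."""
    num_decays = sum(epoch >= e for e in decay_epochs)
    for group, base_lr in zip(param_groups, base_lrs):
        group['lr'] = base_lr * (factor ** num_decays)
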
Example #4
    def loop_finetune(self, pretrained_state, max_lp, max_ep, rewarded_ep,
                      op_cfg, sc_cfg, candidate_ratio, finetune_lsmooth):
        self.g_tb_lg.add_scalars('probs', self.agent.get_prob_dict(), -1)
        self.g_tb_lg.add_histogram('probs_dist', self.agent.get_prob_tensor(),
                                   -1)
        for t in (-1, max_lp, -1):
            self.g_tb_lg.add_scalar('ppo_step', self.agent.max_training_times,
                                    t)

        max_it = self.auged_sub_train_iters
        loader = self.auged_sub_train_ld
        # assert max_it == len(loader)
        agent_param_his = []

        best_rewards_mean = 0
        best_rewards_lp = 0
        best_agent_state = {}
        candidate_ep = max(round(max_ep * candidate_ratio), 1)
        loop_speed = AverageMeter(4)
        crit = self.criterion if finetune_lsmooth else F.cross_entropy
        for lp in range(max_lp):
            lp_str = f'%{len(str(max_lp))}d' % (lp + 1)
            lp_start_t = time.time()
            self.model.load_state_dict(pretrained_state['model'])
            self.model.train()
            op, sc = self.create_op_sc(self.model, op_cfg, sc_cfg, max_it)
            op: torch.optim.optimizer.Optimizer
            op.load_state_dict(pretrained_state['op'])

            epoch_speed = AverageMeter(1)
            acc1s = []
            for ep in range(max_ep):
                ep_str = f'%{len(str(max_ep))}d' % (ep + 1)
                ep_start_t = time.time()
                for it, (inp, tar, op_indices) in enumerate(loader):
                    global_it = ep * max_it + it
                    self.agent.record(op_indices)
                    inp, tar = inp.cuda(), tar.cuda()
                    loss = crit(self.model(inp), tar)
                    op.zero_grad()
                    loss.backward()
                    if self.model_grad_clip is not None:
                        total_norm = torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.model_grad_clip)
                    else:
                        total_norm = -233  # sentinel: gradient clipping disabled
                    clipped_norm = torch.cat([
                        p.grad.data.view(-1) for p in self.model.parameters()
                    ]).abs_().norm()

                    sc.step()  # sc.step() before op.step()
                    lr = sc.get_lr()[0]
                    clipped_lr = lr * (clipped_norm / total_norm)
                    op.step()

                val_loss, val_acc1, val_acc5 = self.val()
                self.model.train()
                acc1s.append(val_acc1)
                if lp % 20 == 0:
                    if ep == 0:
                        self.lg.info(f'==> at {self.exp_root}')
                    self.g_tb_lg.add_scalars('rk0_ft_v_loss',
                                             {f'loop_{lp}': val_loss}, ep)
                    self.g_tb_lg.add_scalars('rk0_ft_v_acc1',
                                             {f'loop_{lp}': val_acc1}, ep)
                    self.g_tb_lg.add_scalars('rk0_ft_v_acc5',
                                             {f'loop_{lp}': val_acc5}, ep)

                epoch_speed.avg = time.time() - ep_start_t
                remain_time, finish_time = epoch_speed.time_preds(
                    max_ep - ep - 1)
                self.lg.info(f'lp[{lp_str}/{max_lp}], ep[{ep_str}/{max_ep}]'
                             f' vacc1: {float(val_acc1):5.2f},'
                             f' verr1: {float(100.-val_acc1):5.2f},'
                             f' time cost: {time.time()-ep_start_t:.3f},'
                             f' op_freq.s: {self.agent.op_freq.sum()},'
                             f' rem-t: {remain_time} ({finish_time})')

            acc1s = acc1s[-candidate_ep:]
            rewarded_acc1s = sorted(acc1s)[-rewarded_ep:]
            reward = sum(rewarded_acc1s) / len(rewarded_acc1s)
            rewards = sync_vals(self.dist, reward, fmt=None)
            rewards_mean = rewards.mean().item()
            if self.agent.initial_baseline is not None:
                d = f'{rewards_mean-self.agent.initial_baseline:.3f}'
            else:
                d = None

            if rewards_mean > best_rewards_mean:
                best_rewards_mean = rewards_mean
                best_rewards_lp = lp - 1
                best_agent_state = self.agent.state_dict()
                best_agent_state = {
                    'first_param':
                    best_agent_state['first_param'].data.clone(),
                    'second_param':
                    best_agent_state['second_param'].data.clone()
                }

                if self.dist.is_master() and d is not None:
                    for root, dirs, files in os.walk(
                            self.best_agent_ckpt_root):
                        for f in files:
                            os.remove(os.path.join(root, f))
                    torch.save(
                        {'lp': lp, 'agent': best_agent_state},
                        os.path.join(self.best_agent_ckpt_root,
                                     f'after_lp{best_rewards_lp}_d{d}.pth.tar'))

            if lp == 0:
                self.agent.set_baselines(initial_baseline=rewards_mean,
                                         running_baseline=reward)
                for t in (0, max_lp // 2, max_lp - 1):
                    self.g_tb_lg.add_scalars('reward',
                                             {'g_ini_bsln': rewards_mean}, t)
                    self.l_tb_lg.add_scalars(
                        'reward', {f'rk{self.dist.rank}_ini_run_bsln': reward},
                        t)

            ppo_step_times = self.agent.step(reward=reward)
            self.g_tb_lg.add_scalar('agent_lr',
                                    self.agent.scheduler.get_lr()[0], lp)

            loop_speed.update(time.time() - lp_start_t)
            remain_time, finish_time = loop_speed.time_preds(max_lp - lp - 1)
            self.lg.info(
                f'==> loop[{lp_str}/{max_lp}],'
                f' time cost: {(time.time()-lp_start_t) / 60:.2f} min,'
                f' rem-t[{remain_time}] ({finish_time}),'
                f' rew={rewards}')
            if self.dist.is_master():
                agent_param_his.append(self.agent.get_params_as_list())

            self.g_tb_lg.add_scalar('ppo_step', ppo_step_times, lp)
            self.l_tb_lg.add_scalars(
                'reward',
                {f'rk{self.dist.rank}_run_bsln': self.agent.running_baseline},
                lp)
            self.l_tb_lg.add_scalars(
                'advance', {f'rk{self.dist.rank}_adv': self.agent.advance_val},
                lp)

            self.g_tb_lg.add_scalars('probs', self.agent.get_prob_dict(), lp)
            self.g_tb_lg.add_histogram('probs_dist',
                                       self.agent.get_prob_tensor(), lp)

            if self.dist.is_master():
                torch.save(
                    {
                        'lp': lp,
                        'agent': self.agent.state_dict(),
                    },
                    os.path.join(
                        self.agents_ckpt_root,
                        f'lp{lp}_d{d}_rew_mean{rewards_mean:.2f}.pth.tar'))

            torch.cuda.empty_cache()

            if self.dist.is_master():
                f_name = os.path.join(self.ckpt_root, 'agent_param_his.json')
                self.lg.info(f'dump agent params into {f_name}')
                with open(f_name, 'w') as fp:
                    json.dump(agent_param_his, fp)
                # if lp == 0:
                #     self.lg.info(f'dumped list[0]: {agent_param_his[0]}')

            self.dist.barrier()
            if not os.path.exists(self.early_stop_root):
                break

        for t in (0, best_rewards_lp, max_lp):
            self.meta_tb_lg.add_scalar('best_rew_mean', best_rewards_mean, t)
            self.g_tb_lg.add_scalar('best_rew_mean', best_rewards_mean, t)
        return {'lp': best_rewards_lp, 'agent': best_agent_state}
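
Example #4 uses a different AverageMeter variant: a sliding window over recent timings (AverageMeter(4)) with update(), a writable avg attribute, and a time_preds() helper that returns ETA strings. A hypothetical reconstruction of that interface, for illustration only:

import time
from collections import deque


class AverageMeter:
    """Sliding-window mean of the last `window` values, plus simple ETA helpers."""

    def __init__(self, window):
        self.values = deque(maxlen=window)
        self.avg = 0.0

    def update(self, value):
        self.values.append(value)
        self.avg = sum(self.values) / len(self.values)

    def time_preds(self, remaining_steps):
        # estimate remaining seconds from the windowed average step time
        remain_secs = remaining_steps * self.avg
        remain_time = time.strftime('%H:%M:%S', time.gmtime(remain_secs))
        finish_time = time.strftime('%m-%d %H:%M',
                                    time.localtime(time.time() + remain_secs))
        return remain_time, finish_time
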