def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
        """Compute the weighted cross-entropy classification loss.

        Every dict in ``targets`` must provide a "labels" tensor of dim
        [nb_target_boxes]. Prediction slots selected by ``indices`` receive the
        label of their matched target; all remaining slots are filled with the
        value ``self.num_classes``.

        Returns a dict holding 'loss_ce' and, when ``log`` is true, also
        'class_error'. ``num_boxes`` is accepted but not read by this method.
        """
        assert 'pred_logits' in outputs
        logits = outputs['pred_logits']

        matched_slots = self._get_src_permutation_idx(indices)
        # Labels of the matched targets, concatenated across the batch.
        per_image_labels = [
            tgt["labels"][tgt_idx] for tgt, (_, tgt_idx) in zip(targets, indices)
        ]
        matched_labels = torch.cat(per_image_labels)

        # Begin with the fill value everywhere, then overwrite matched slots.
        class_map = torch.full(logits.shape[:2],
                               self.num_classes,
                               dtype=torch.int64,
                               device=logits.device)
        class_map[matched_slots] = matched_labels

        # cross_entropy wants the class dimension second, hence the transpose.
        losses = {
            'loss_ce':
            F.cross_entropy(logits.transpose(1, 2), class_map,
                            self.empty_weight)
        }
        if not log:
            return losses

        # TODO this should probably be a separate loss, not hacked in this one here
        top1 = accuracy(logits[matched_slots], matched_labels)[0]
        losses['class_error'] = 100 - top1
        return losses
Example #2
0
def test(load=True, logger=None, epoch=None):
    """Evaluate the module-level ``net`` on ``test_loader`` and log the results.

    Args:
        load: when true, restore ``net`` from ``save_dir``/model.tar first.
        logger: logger to write to; when ``None`` a logger named ``config`` is
            configured that also writes to ``save_dir``/test.log (truncating it).
        epoch: forwarded to ``accm.info`` for the summary line.

    Besides the accumulated metrics, the regulariser value, pruned sizes,
    FLOP speedup and memory saving reported by ``net`` are logged.
    """
    if load:
        ckpt = torch.load(os.path.join(save_dir, 'model.tar'))
        net.load_state_dict(ckpt['state_dict'])

    if logger is None:
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger(config)
        logger.addHandler(
            logging.FileHandler(os.path.join(save_dir, 'test.log'), mode='w'))

    net.eval()
    net.reset_dep()
    accm.reset()
    # Nothing here calls backward(), so disable autograd to avoid building
    # (and holding on to) a graph for every evaluation batch.
    with torch.no_grad():
        for x, y in test_loader:
            x = x.cuda()
            y = y.cuda()
            outs = net(x)
            cent = cent_fn(outs, y)
            accm.update([cent.item(), accuracy(outs, y)])
    logger.info(accm.info(header='test', epoch=epoch))
    logger.info('reg {:.4f}'.format(net.get_reg_dep().item()))
    logger.info('pruned size {}'.format(str(net.get_pruned_size())))
    logger.info('pruned size (dep) {}'.format(str(net.get_pruned_size_dep())))
    logger.info('speedup in flops {:.4f}'.format(net.get_speedup_dep()))
    logger.info('memory saving {:.4f}\n'.format(net.get_memory_saving_dep()))
Example #3
0
def train():
    """Train the module-level ``net`` starting from pretrained weights.

    Creates ``save_dir`` if needed, dumps ``args`` to args.txt, loads
    ``args.pretrain_dir``/model.tar non-strictly, then runs
    ``args.num_epochs`` epochs minimising ``cent + args.gamma * reg``.
    Progress goes to the logger named ``config`` and to train.log. ``test`` is
    run every ``args.eval_freq`` epochs and once more at the end; the weights
    are saved every ``args.save_freq`` epochs and after the final test.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Record the run configuration next to the checkpoints.
    with open(os.path.join(save_dir, 'args.txt'), 'w') as f:
        for name, value in vars(args).items():
            f.write('{}: {}\n'.format(name, value))

    pretrained = torch.load(os.path.join(args.pretrain_dir, 'model.tar'))
    net.load_state_dict(pretrained['state_dict'], strict=False)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(config)
    log_file = logging.FileHandler(os.path.join(save_dir, 'train.log'),
                                   mode='w')
    logger.addHandler(log_file)
    logger.info(str(args) + '\n')

    ckpt_path = os.path.join(save_dir, 'model.tar')
    for epoch in range(1, args.num_epochs + 1):
        accm.reset()
        scheduler.step()
        lr_report = 'epoch {} starts with lr'.format(epoch) + ''.join(
            ' {:.3e}'.format(group['lr']) for group in optimizer.param_groups)
        logger.info(lr_report)

        net.train()
        if args.freeze_bn:
            freeze_batch_norm(net)
        for x, y in train_loader:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outs = net(x)
            cent = cent_fn(outs, y)
            reg = net.get_reg().cuda()
            loss = cent + args.gamma * reg
            loss.backward()
            optimizer.step()
            accm.update([cent.item(), accuracy(outs, y)])
        summary = accm.info(header='train', epoch=epoch)

        # On evaluation epochs the test report directly follows the train line.
        if epoch % args.eval_freq != 0:
            logger.info(summary + '\n')
        else:
            logger.info(summary)
            test(load=False, logger=logger, epoch=epoch)

        if epoch % args.save_freq == 0:
            torch.save({'state_dict': net.state_dict()}, ckpt_path)

    test(load=False)
    torch.save({'state_dict': net.state_dict()}, ckpt_path)
Example #4
0
    def test(self, epoch):
        """Evaluate ``self.model`` on ``self.test_loader`` and return the mean top-1.

        Loss, top-1 and top-5 are averaged with per-batch weights equal to the
        batch size. On local rank 0 progress is displayed every
        ``self.args.print_freq`` batches, and the averages plus the graph
        summaries are written to ``self.writer`` under step ``epoch + 1``.
        """
        time_meter = AverageMeter('Time', ':6.3f')
        loss_meter = AverageMeter('Loss', ':.4e')
        acc1_meter = AverageMeter('Acc@1', ':6.2f')
        acc5_meter = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(
            len(self.test_loader),
            [time_meter, loss_meter, acc1_meter, acc5_meter],
            prefix='Test: ')

        # switch to test mode
        self.model.eval()

        with torch.no_grad():
            tick = time.time()
            for step, (images, target) in enumerate(self.test_loader):
                images, target = images.cuda(), target.cuda()
                n = images.size(0)

                # forward pass; the second model output is not used here
                output, _ = self.model(images)
                loss = self.criterion(output, target)

                # record accuracy and loss, weighted by the batch size
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                loss_meter.update(loss.item(), n)
                acc1_meter.update(acc1[0], n)
                acc5_meter.update(acc5[0], n)

                # record elapsed time for this batch
                now = time.time()
                time_meter.update(now - tick)
                tick = time.time()

                if step % self.args.print_freq == 0 and self.args.local_rank == 0:
                    progress.display(step)

            if self.args.local_rank == 0:
                tb_step = epoch + 1
                print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
                    top1=acc1_meter, top5=acc5_meter))
                for tag, meter in (('Test/Avg_Loss', loss_meter),
                                   ('Test/Avg_Top1', acc1_meter),
                                   ('Test/Avg_Top5', acc5_meter)):
                    self.writer.add_scalar(tag, meter.avg, tb_step)
                self.summary_graph_adj(self.writer, tb_step)
                self.summary_graph_histogram(self.writer, tb_step)

        return acc1_meter.avg
Example #5
0
def test(load=True, logger=None, epoch=None):
    """Evaluate the module-level ``net`` on ``test_loader`` and log the metrics.

    Args:
        load: when true, restore ``net`` from ``save_dir``/model.tar first.
        logger: logger to write to; when ``None`` a logger named ``config`` is
            configured that also writes to ``save_dir``/test.log (truncating it).
        epoch: forwarded to ``accm.info`` for the summary line.
    """
    if load:
        ckpt = torch.load(os.path.join(save_dir, 'model.tar'))
        net.load_state_dict(ckpt['state_dict'])

    if logger is None:
        logging.basicConfig(level=logging.INFO)
        logger = logging.getLogger(config)
        logger.addHandler(
            logging.FileHandler(os.path.join(save_dir, 'test.log'), mode='w'))

    net.eval()
    accm.reset()
    # Nothing here calls backward(), so disable autograd to avoid building
    # (and holding on to) a graph for every evaluation batch.
    with torch.no_grad():
        for x, y in test_loader:
            x = x.cuda()
            y = y.cuda()
            outs = net(x)
            cent = cent_fn(outs, y)
            accm.update([cent.item(), accuracy(outs, y)])
    logger.info(accm.info(header='test', epoch=epoch) + '\n')
Example #6
0
    def _infer(self, max_iters, loader):
        """Run ``self.model`` over ``loader`` and return sample-weighted means.

        Returns a ``(loss, acc1, acc5)`` tuple in which each value is the sum
        of the per-batch value times the batch size, divided by the total
        number of samples seen. ``max_iters`` is not used by this method.
        """
        # assert max_iters == len(loader)
        self.model.eval()
        seen = 0
        loss_total, acc1_total, acc5_total = 0., 0., 0.
        with torch.no_grad():
            for batch_in, batch_tar in loader:
                batch_in = batch_in.cuda()
                batch_tar = batch_tar.cuda()
                n_batch = batch_tar.shape[0]
                logits = self.model(batch_in)
                batch_loss = F.cross_entropy(logits.data, batch_tar)
                acc1, acc5 = accuracy(logits.data, batch_tar)
                # Weight every batch by its size so a short last batch is
                # not over-represented in the mean.
                loss_total += batch_loss.item() * n_batch
                acc1_total += acc1 * n_batch
                acc5_total += acc5 * n_batch
                seen += n_batch

        if self.dist_training:
            pass  # todo: dist

        mean_loss = loss_total / seen
        mean_acc1 = acc1_total / seen
        mean_acc5 = acc5_total / seen
        return mean_loss, mean_acc1, mean_acc5
Example #7
0
    def train_epoch(self, epoch):
        """Run one training pass over ``self.train_loader``.

        The loss is ``self.criterion`` on the main logits, plus an optional L2
        penalty on the activated 'graph_weights' parameters (when
        ``self.args.graph_wd > 0``) and an optional auxiliary-head loss (when
        ``self.args.auxiliary`` is set). Gradients are clipped when
        ``self.args.grad_clip > 0``. ``self.moving_loss`` is updated as an
        exponential moving average with factor ``self.mu``. On local rank 0,
        every ``self.args.print_freq`` batches, progress is displayed and the
        running averages are written to ``self.writer``.
        """
        num_batches = len(self.train_loader)
        time_meter = AverageMeter('Time', ':6.3f')
        load_meter = AverageMeter('Data', ':6.3f')
        loss_meter = AverageMeter('Loss', ':.4e')
        acc1_meter = AverageMeter('Acc@1', ':6.2f')
        acc5_meter = AverageMeter('Acc@5', ':6.2f')
        progress = ProgressMeter(
            num_batches,
            [time_meter, load_meter, loss_meter, acc1_meter, acc5_meter],
            prefix="Epoch: [{}]".format(epoch))

        # switch to train mode
        self.model.train()
        tick = time.time()

        for step, (images, target) in enumerate(self.train_loader):
            # time spent waiting for the loader
            load_meter.update(time.time() - tick)

            images, target = images.cuda(), target.cuda()
            n = images.size(0)

            # forward pass
            self.optimizer.zero_grad()
            logits, logits_aux = self.model(images)
            loss = self.criterion(logits, target)
            if self.args.graph_wd > 0:
                # L2 penalty over the activated, trainable graph weights.
                graph_l2 = 0
                for name, param in self.model.named_parameters():
                    if 'graph_weights' in name and param.requires_grad:
                        graph_l2 += (self.model.edge_act(param)**2).sum()
                loss += 0.5 * graph_l2 * self.args.graph_wd
            if self.args.auxiliary:
                aux_loss = self.criterion(logits_aux, target)
                loss += self.args.auxiliary_weight * aux_loss
            loss.backward()
            if self.args.grad_clip > 0:
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         self.args.grad_clip)
            self.optimizer.step()

            # record accuracy and loss, weighted by the batch size
            acc1, acc5 = accuracy(logits, target, topk=(1, 5))
            loss_value = loss.item()
            loss_meter.update(loss_value, n)
            acc1_meter.update(acc1[0], n)
            acc5_meter.update(acc5[0], n)
            # The very first batch of the run seeds the moving average.
            if epoch == self.args.start_epoch and step == 0:
                self.moving_loss = loss_value
            else:
                self.moving_loss = (1. - self.mu) * self.moving_loss \
                    + self.mu * loss_value

            # record elapsed time for this batch
            time_meter.update(time.time() - tick)
            tick = time.time()

            if step % self.args.print_freq == 0 and self.args.local_rank == 0:
                progress.display(step)
                niter = epoch * num_batches + step
                for tag, value in (
                    ('Train/Sec_per_batch', time_meter.avg),
                    ('Train/Avg_Loss', loss_meter.avg),
                    ('Train/Avg_Top1', acc1_meter.avg),
                    ('Train/Avg_Top5', acc5_meter.avg),
                    ('Train/Moving_Loss', self.moving_loss),
                ):
                    self.writer.add_scalar(tag, value, niter)
Example #8
0
                            np.expand_dims(samples[i][0],
                                           0).transpose(0, 3, 1, 2)),
                        mx.nd.array(
                            np.expand_dims(samples[i][1],
                                           0).transpose(0, 3, 1, 2))
                    ],
                    label=[mx.nd.array([np.expand_dims(samples[i][2], 0)])])
                mod.forward_backward(dbatch)
                mod.update()

        # Save checkpoint and result
        mod.save_checkpoint(prefix=checkpoint_path,
                            epoch=k,
                            save_optimizer_states=True)
        np.save(checkpoint_path + '_predict_{}.npy'.format(k), label)

        # Evaluation
        score, density = misc.F1_score(label, gt)
        acc = misc.accuracy(label, gt)
        logging.info(
            "Epoch : %d, F1-score : %.4f, accuracy: %.4f, Density : %.4f" %
            (k, score, acc, density))

        if flag_n == False:
            args.t1 += 0.05
            logging.info("update t1:{}".format(args.t1))

        if flag_p == False:
            args.t2 -= 0.01
            logging.info("update t2:{}".format(args.t2))
Example #9
0
    def _train_with_aug(self,
                        max_iters,
                        loader,
                        max_ep,
                        op_cfg,
                        sc_cfg,
                        sync_mid,
                        lsmooth,
                        save_mode='best',
                        prefix='pre'):
        """Train ``self.model`` for ``max_ep`` epochs with periodic testing.

        Args:
            max_iters: number of iterations treated as one epoch; used for the
                global iteration counter, log formatting and the scheduler.
            loader: iterable yielding ``(inp, tar)`` or ``(inp, tar, extra)``
                tuples; a third element is ignored.
            max_ep: number of epochs to run.
            op_cfg: optimizer config forwarded to ``self.create_op_sc``.
            sc_cfg: scheduler config forwarded to ``self.create_op_sc``.
            sync_mid: in the multi-rank case, when true every rank reloads the
                checkpoint written by the median-accuracy rank after training.
            lsmooth: when true use ``self.criterion`` as the loss, otherwise
                plain ``F.cross_entropy``.
            save_mode: 'best' saves a checkpoint whenever a test matches or
                beats the best top-1 so far; 'last' saves at every test.
            prefix: tag used in checkpoint file names, log lines and
                tensorboard scalar names.

        Returns:
            A dict with the model state dict ('model'), the optimizer state
            dict ('op') and 'last_iter' set to ``max_ep * max_iters``.
        """
        # assert max_iters == len(loader)
        self.model.train()

        max_it = max_iters
        max_global_it = max_ep * max_it
        # NOTE(review): this is 0 when max_it < 10; whether AverageMeter
        # accepts a zero length is not visible here -- confirm.
        train_log_freq = max_it // 10
        # Index 0 is used for the first 75% of epochs (32x fewer tests),
        # index 1 for the remaining ones; see ``is_late`` below.
        test_freqs = [self.test_freq * 32, self.test_freq]

        # This AverageMeter is constructed from an int, presumably a window
        # length -- TODO confirm against its definition.
        speed = AverageMeter(max_it)
        tr_loss, tr_acc1, tr_acc5 = AverageMeter(train_log_freq), AverageMeter(
            train_log_freq), AverageMeter(train_log_freq)

        op, sc = self.create_op_sc(self.model,
                                   op_cfg,
                                   sc_cfg,
                                   iters_per_epoch=max_it)
        op: Optimizer
        sc: LRScheduler
        best_acc1 = 0
        start_train_t = time.time()
        crit = self.criterion if lsmooth else F.cross_entropy
        for ep in range(max_ep):
            # Right-align the 1-based epoch number to the width of max_ep.
            ep_str = f'%{len(str(max_ep))}d' % (ep + 1)
            is_late = int(ep >= 0.75 * max_ep)
            test_freq = test_freqs[is_late]
            if ep % 32 == 0:
                self.lg.info(f'==> at {self.exp_root}')

            last_t = time.time()
            for it, tup in enumerate(loader):
                if len(tup) == 3:
                    inp, tar, _ = tup
                else:
                    inp, tar = tup
                it_str = f'%{len(str(max_it))}d' % (it + 1)
                global_it = ep * max_it + it
                data_t = time.time()

                # One-off check at the second global iteration: each rank
                # prints in turn, with a barrier after every turn.
                if global_it == 1:
                    for i in range(self.dist.world_size):
                        if self.dist.rank == i:
                            print(f'rk[{i:2d}] dist test')
                        self.dist.barrier()

                inp, tar = inp.cuda(), tar.cuda()
                cuda_t = time.time()

                logits = self.model(inp)
                loss = crit(logits, tar)
                tr_loss.update(loss.item())
                op.zero_grad()
                loss.backward()
                if self.dist_training:
                    pass
                if self.model_grad_clip is not None:
                    total_norm = torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.model_grad_clip)
                else:
                    # Placeholder used when clipping is disabled.
                    # NOTE(review): it makes ``clipped_lr`` below negative and
                    # not meaningful in that case (only affects the log line).
                    total_norm = -233
                # Norm over all gradients flattened into one vector.
                # NOTE(review): raises if any parameter has ``grad is None``.
                clipped_norm = torch.cat([
                    p.grad.data.view(-1) for p in self.model.parameters()
                ]).abs_().norm()

                sc.step()  # sc.step() before op.step()
                lr = sc.get_lr()[0]
                clipped_lr = lr * (clipped_norm / total_norm)

                op.step()
                acc1, acc5 = accuracy(logits, tar)
                op_t = time.time()

                # Meter weight is the batch size relative to 128.
                total_r = tar.shape[0] / 128
                tr_acc1.update(val=acc1, num=total_r)
                # tr_acc5 is tracked but not reported anywhere in this method.
                tr_acc5.update(val=acc5, num=total_r)

                # Test every ``test_freq`` global iterations and on the very
                # last iteration of the run.
                if global_it % test_freq == 0 or global_it == max_global_it - 1:
                    test_loss, test_acc1, test_acc5 = self.test()
                    test_t = time.time()
                    # Back to train mode; presumably self.test() switches the
                    # model to eval mode -- TODO confirm.
                    self.model.train()
                    is_best = test_acc1 >= best_acc1
                    best_acc1 = max(test_acc1, best_acc1)

                    # Only logged when the test coincides with the last
                    # iteration of an epoch.
                    if self.dist.is_master() and it + 1 == max_it:
                        remain_time, finish_time = speed.time_preds(
                            max_global_it - global_it - 1)
                        self.lg.info(
                            f'ep[{ep_str}/{max_ep}], it[{it_str}/{max_it}]:'
                            f' tr-err1[{100-tr_acc1.last:5.2f}] ({100-tr_acc1.avg:5.2f}),'
                            f' tr-loss[{tr_loss.last:.4f}] ({tr_loss.avg:.4f}),'
                            f' te-err1[{100-test_acc1:5.2f}],'
                            f' te-loss[{test_loss:.4f}],\n'
                            f' data[{data_t-last_t:.3f}],'
                            f' cuda[{cuda_t-data_t:.3f}],'
                            f' bp[{op_t-cuda_t:.3f}],'
                            f' te[{test_t-op_t:.3f}]'
                            f' rem-t[{remain_time}] ({finish_time})'
                            f' lr[{lr:.4g}] ({clipped_lr:.4g})')

                    state = {
                        'model': self.model.state_dict(),
                        'op': op.state_dict(),
                        'last_iter': global_it,
                    }

                    # Each rank writes its own file; the same path is
                    # overwritten on every save.
                    model_ckpt_path = os.path.join(
                        self.ckpt_root,
                        f'rk{self.dist.rank}_{prefix}_{save_mode}.pth.tar')
                    if save_mode == 'best' and is_best:
                        self.lg.info(
                            f'==> saving best model ckpt (err{100-test_acc1:.3f}) at {os.path.abspath(model_ckpt_path)}...'
                        )
                        torch.save(state, model_ckpt_path)
                    elif save_mode == 'last':
                        torch.save(state, model_ckpt_path)

                speed.update(time.time() - last_t)
                last_t = time.time()

        if self.dist.world_size > 1:
            # Each rank tests once more; sync_vals is assumed to gather the
            # per-rank values into one tensor indexed by rank -- TODO confirm.
            test_loss, test_acc1, test_acc5 = self.test()
            acc1_ts: torch.Tensor = sync_vals(self.dist, test_acc1, None)
            # Rank sitting in the middle of the sorted final accuracies.
            mid_rank = acc1_ts.argsort()[self.dist.world_size // 2].item()
            mid_ckpt_path = os.path.join(
                self.ckpt_root,
                f'midrk{mid_rank}_{prefix}_enderr{100-acc1_ts[mid_rank].item():.2f}.pth.tar'
            )
            if self.dist.rank == mid_rank:
                torch.save(
                    {
                        'model': self.model.state_dict(),
                        'op': op.state_dict(),
                    }, mid_ckpt_path)
            # Wait until the middle rank has written its checkpoint.
            self.dist.barrier()

            if sync_mid:
                mid_ckpt = torch.load(mid_ckpt_path, map_location='cpu')
                self.model.load_state_dict(mid_ckpt['model'])
                op.load_state_dict(mid_ckpt['op'])

            best_errs: torch.Tensor = sync_vals(self.dist, 100 - best_acc1,
                                                None)
            best_err: float = best_errs.mean().item()
            self.lg.info(
                f'==> {prefix}-training finished, mid rank={mid_rank},'
                f' total time cost: {(time.time()-start_train_t)/60:.2f} min,'
                f' test err @1: mean={best_err:.3f}')
        else:
            best_err = 100 - best_acc1
            self.lg.info(
                f'==> {prefix}-training finished,'
                f' total time cost: {(time.time()-start_train_t)/60:.2f} min,'
                f' test err @1: {100-best_acc1:.3f}')

        # The same value is written at steps 0 and max_ep on both tensorboard
        # loggers; the list comprehensions are used only for their side effects.
        [
            self.meta_tb_lg.add_scalar(f'{prefix}_best_err', best_err, t)
            for t in [0, max_ep]
        ]
        [
            self.g_tb_lg.add_scalar(f'{prefix}_best_err', best_err, t)
            for t in [0, max_ep]
        ]
        return {
            'model': self.model.state_dict(),
            'op': op.state_dict(),
            'last_iter': max_global_it
        }