Example #1
0
def train(epoch, optim, net, criteria, lr_schdr):
    """Train `net` for one full pass over the training data loader.

    Args:
        epoch: forwarded to `set_meters` when the meters are created.
        optim: optimizer; zeroed and stepped once per batch.
        net: model; called on an image batch and expected to return a pair
            that is unpacked as ``(aux_output, main_output)``.
        criteria: loss callable applied to each output together with the
            labels.
        lr_schdr: learning-rate scheduler, stepped once per batch.

    Returns:
        The tuple ``(lr_schdr, time_meter, loss_meter)`` after every batch
        of the loader has been processed.
    """
    ## dataset
    dl = get_data_loader(cfg.im_root,
                         cfg.train_im_anns,
                         cfg.ims_per_gpu,
                         cfg.scales,
                         cfg.cropsize,
                         mode='train')

    ## meters
    time_meter, loss_meter = set_meters(epoch)

    ## train loop
    for im, lb in tqdm(dl):
        im = im.cuda()
        lb = lb.cuda()

        # drop the singleton dimension 1 of the label batch
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        aux_out, main_out = net(im)
        aux_loss = criteria(aux_out, lb)
        main_loss = criteria(main_out, lb)
        # the auxiliary loss only contributes with a weight of 0.1
        loss = main_loss + 0.1 * aux_loss

        # NOTE: apex amp loss scaling is disabled in this variant; the loss is
        # reduced to a scalar and back-propagated directly.
        loss = loss.mean()
        loss.backward()
        optim.step()

        time_meter.update()
        loss_meter.update(loss.item())
        lr_schdr.step()

    # Return only after the whole loader has been consumed; returning from
    # inside the loop would stop the epoch after its first batch.
    return lr_schdr, time_meter, loss_meter
Example #2
0
def eval_model(net, ims_per_gpu, im_root, im_anns, n_classes, cropsize):
    """Evaluate `net` on the test split with the sliding-window evaluator.

    Args:
        net: model to evaluate; switched to eval mode here.
        ims_per_gpu: batch size handed to `get_data_loader`.
        im_root: dataset root handed to `get_data_loader`.
        im_anns: annotation file handed to `get_data_loader`.
        n_classes: number of classes passed to the evaluator.
        cropsize: crop size used to build `WindowEval`.

    Returns:
        ``(heads, mious)``: two parallel lists holding the evaluation name
        (``'window eval'``) and the value returned by the evaluator.
    """
    dl = get_data_loader(im_root,
                         im_anns,
                         ims_per_gpu,
                         None,
                         None,
                         mode='test',
                         distributed=False)

    net.eval()

    heads, mious = [], []
    logger = logging.getLogger()

    # NOTE: the single-scale, single-scale-crop, ms-flip and ms-flip-crop
    # evaluations (MscEvalV0 / MscEvalCrop) are disabled in this variant;
    # only the window evaluation below is run.
    windowEval = WindowEval(cropsize=cropsize,
                            cropstride=2. / 3,
                            lb_ignore=255)
    mIOU = windowEval(net, dl, n_classes)
    heads.append('window eval')
    mious.append(mIOU)
    # Label the log line after the evaluation that actually ran (it used to
    # say "ms crop", left over from the disabled multi-scale crop protocol).
    logger.info('window eval mIOU is: %s\n', mIOU)
    return heads, mious
Example #3
0
def eval_model(net, ims_per_gpu, im_root, im_anns, it=cfg.epoch):
    """Run the single-scale evaluation of `net` on the validation split.

    Args:
        net: model to evaluate; switched to eval mode here.
        ims_per_gpu: batch size handed to `get_data_loader`.
        im_root: dataset root handed to `get_data_loader`.
        im_anns: annotation file handed to `get_data_loader`.
        it: current epoch index. Kept for backward compatibility: the
            intermediate-epoch path and the final-epoch path used to be
            written separately but ran exactly the same single-scale
            evaluation, so the value no longer changes the result.

    Returns:
        ``(heads, mious, mIOU)``: the one-element lists ``['single_scale']``
        and ``[mIOU]``, followed by the single-scale value itself.
    """
    dl = get_data_loader(im_root, im_anns, ims_per_gpu, None, None, mode='val')
    net.eval()

    logger = logging.getLogger()

    # NOTE: the crop and multi-scale/flip protocols (MscEvalCrop, ms_flip)
    # that were once meant for the final epoch are disabled, so a single
    # evaluation path is enough.
    single_scale = MscEvalV0((1., ), False)
    mIOU = single_scale(net, dl, 19)
    heads = ['single_scale']
    mious = [mIOU]
    logger.info('single mIOU is: %s\n', mIOU)
    return heads, mious, mIOU
Example #4
0
def eval_model(net, ims_per_gpu, im_root, im_anns):
    """Evaluate `net` on the validation split with four protocols.

    The protocols run in this order: single scale, single-scale crop,
    multi-scale with flip, multi-scale crop with flip. Each evaluator is
    built right before it is used.

    Returns:
        ``(heads, mious)``: parallel lists with the protocol names and the
        value each evaluator returned.
    """
    distributed = dist.is_initialized()
    loader = get_data_loader(im_root,
                             im_anns,
                             ims_per_gpu,
                             None,
                             None,
                             mode='val',
                             distributed=distributed)
    net.eval()
    logger = logging.getLogger()

    # (head name, log format, factory building the evaluator on demand)
    protocols = (
        (
            'single_scale',
            'single mIOU is: %s\n',
            lambda: MscEvalV0((1., ), False),
        ),
        (
            'single_scale_crop',
            'single scale crop mIOU is: %s\n',
            lambda: MscEvalCrop(
                cropsize=1024,
                cropstride=2. / 3,
                flip=False,
                scales=(1., ),
                lb_ignore=255,
            ),
        ),
        (
            'ms_flip',
            'ms flip mIOU is: %s\n',
            lambda: MscEvalV0((0.5, 0.75, 1, 1.25, 1.5, 1.75), True),
        ),
        (
            'ms_flip_crop',
            'ms crop mIOU is: %s\n',
            lambda: MscEvalCrop(
                cropsize=1024,
                cropstride=2. / 3,
                flip=True,
                scales=(0.5, 0.75, 1.0, 1.25, 1.5, 1.75),
                lb_ignore=255,
            ),
        ),
    )

    heads, mious = [], []
    for head, log_fmt, build_evaluator in protocols:
        score = build_evaluator()(net, loader, 19)
        heads.append(head)
        mious.append(score)
        logger.info(log_fmt, score)
    return heads, mious
Example #5
0
def eval_model(net, ims_per_gpu, im_root, im_anns, iteration):
    """Run the single-scale evaluation of `net` on the validation split.

    Args:
        net: model to evaluate; switched to eval mode here.
        ims_per_gpu: batch size handed to `get_data_loader`.
        im_root: dataset root handed to `get_data_loader`.
        im_anns: annotation file handed to `get_data_loader`.
        iteration: current training iteration, forwarded to the evaluator.

    Returns:
        ``(heads, mious)``: the one-element lists ``['single_scale']`` and
        the value returned by the evaluator.
    """
    is_dist = dist.is_initialized()
    dl = get_data_loader(im_root, im_anns, ims_per_gpu, None,
            None, mode='val', distributed=is_dist)
    net.eval()

    heads, mious = [], []
    logger = logging.getLogger()

    # dist.get_rank() raises when no process group has been initialised, so
    # only query it in distributed runs; a single process counts as rank 0.
    # NOTE(review): the last evaluator argument is presumably a "main
    # process" flag - confirm against this project's MscEvalV0.__call__.
    is_main = (not is_dist) or dist.get_rank() == 0

    single_scale = MscEvalV0((1., ), False)
    mIOU = single_scale(net, dl, 19, iteration, is_main)
    heads.append('single_scale')
    mious.append(mIOU)
    logger.info('single mIOU is: %s\n', mIOU)
    return heads, mious
Example #6
0
def train():
    """Train the model over the data loader, then save and evaluate it.

    Progress is logged every 100 iterations, a checkpoint is written every
    1000 iterations (starting with iteration 0), and the final weights are
    written by rank 0 before the evaluation runs.
    """
    logger = logging.getLogger()
    distributed = dist.is_initialized()

    ## dataset
    loader = get_data_loader(cfg.im_root,
                             cfg.train_im_anns,
                             cfg.ims_per_gpu,
                             cfg.scales,
                             cfg.cropsize,
                             cfg.max_iter,
                             mode='train',
                             distributed=distributed)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        net, optim = amp.initialize(
            net, optim, opt_level='O1' if cfg.use_fp16 else 'O0')

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim,
        power=0.9,
        max_iter=cfg.max_iter,
        warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1,
        warmup='exp',
        last_epoch=-1,
    )

    ## train loop
    for step, (images, labels) in enumerate(loader):
        images = images.cuda()
        labels = torch.squeeze(labels.cuda(), 1)

        optim.zero_grad()
        main_logits, *aux_logits = net(images)
        main_loss = criteria_pre(main_logits, labels)
        aux_losses = [
            loss_fn(output, labels)
            for loss_fn, output in zip(criteria_aux, aux_logits)
        ]
        total_loss = main_loss + sum(aux_losses)
        if not has_apex:
            total_loss.backward()
        else:
            with amp.scale_loss(total_loss, optim) as scaled_loss:
                scaled_loss.backward()
        optim.step()
        torch.cuda.synchronize()
        lr_schdr.step()

        ## update the meters
        time_meter.update()
        loss_meter.update(total_loss.item())
        loss_pre_meter.update(main_loss.item())
        for meter, aux_loss in zip(loss_aux_meters, aux_losses):
            meter.update(aux_loss.item())

        ## print training log message
        if (step + 1) % 100 == 0:
            lrs = lr_schdr.get_lr()
            mean_lr = sum(lrs) / len(lrs)
            print_log_msg(step, cfg.max_iter, mean_lr, time_meter,
                          loss_meter, loss_pre_meter, loss_aux_meters)

        ## periodic checkpoint
        if step % 1000 == 0:
            save_checkpoint('bisenet_citys_{}.pth'.format(step),
                            net.module.state_dict())

    ## dump the final model and evaluate the result
    final_path = osp.join(cfg.respth, 'model_final.pth')
    logger.info('\nsave models to {}'.format(final_path))
    weights = net.module.state_dict()
    if dist.get_rank() == 0:
        torch.save(weights, final_path)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(net, 2, cfg.im_root, cfg.val_im_anns)
    logger.info(tabulate([mious], headers=heads, tablefmt='orgtbl'))
def train():
    """Train the model with optional resume and periodic checkpoints.

    When ``args.loadCheckpointLocation`` is set, the model, optimizer,
    scheduler and start iteration are restored through `load_ckp`. Every
    ``args.saveOnEveryIt`` iterations a checkpoint is written into
    ``args.saveCheckpointDir``; when that directory is ``None`` both the
    intermediate and the final checkpoints are skipped. The final model is
    evaluated at the end in either case.
    """
    logger = logging.getLogger()
    is_dist = dist.is_initialized()

    ## dataset
    dl = get_data_loader(cfg.im_root,
                         cfg.train_im_anns,
                         cfg.ims_per_gpu,
                         cfg.scales,
                         cfg.cropsize,
                         cfg.max_iter,
                         mode='train',
                         distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        opt_level = 'O1' if cfg.use_fp16 else 'O0'
        net, optim = amp.initialize(net, optim, opt_level=opt_level)

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim,
        power=0.9,
        max_iter=cfg.max_iter,
        warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1,
        warmup='exp',
        last_epoch=-1,
    )

    ## load a checkpoint, if one was given, to resume training
    if args.loadCheckpointLocation is not None:
        net, optim, lr_schdr, start_iteration = load_ckp(
            args.loadCheckpointLocation, net, optim, lr_schdr)
    else:
        start_iteration = 0

    save_dir = args.saveCheckpointDir

    def _snapshot(iteration):
        # Bundle everything needed to resume training from `iteration`.
        return {
            'iteration': iteration,
            'state_dict': net.state_dict(),
            'optimizer': optim.state_dict(),
            'lr_schdr': lr_schdr.state_dict(),
        }

    ## train loop
    # NOTE(review): the loop still runs over everything `dl` yields; if the
    # loader always produces cfg.max_iter batches, a resumed run goes past
    # cfg.max_iter - confirm against get_data_loader.
    for current_it, (im, lb) in enumerate(dl):
        # on a resumed run `it` continues from the restored iteration,
        # otherwise the offset is 0
        it = current_it + start_iteration
        im = im.cuda()
        lb = lb.cuda()

        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        logits, *logits_aux = net(im)
        loss_pre = criteria_pre(logits, lb)
        loss_aux = [
            crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)
        ]
        loss = loss_pre + sum(loss_aux)
        if has_apex:
            with amp.scale_loss(loss, optim) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optim.step()
        torch.cuda.synchronize()
        lr_schdr.step()

        time_meter.update()
        loss_meter.update(loss.item())
        loss_pre_meter.update(loss_pre.item())
        for mter, lss in zip(loss_aux_meters, loss_aux):
            mter.update(lss.item())

        ## print training log message
        if (it + 1) % 100 == 0:
            lr = lr_schdr.get_lr()
            lr = sum(lr) / len(lr)
            print_log_msg(it, cfg.max_iter, lr, time_meter, loss_meter,
                          loss_pre_meter, loss_aux_meters)

        ## save an intermediate checkpoint every args.saveOnEveryIt iterations
        if (it + 1) % args.saveOnEveryIt == 0 and save_dir is not None:
            # zero-pad the iteration number to the width of cfg.max_iter so
            # that checkpoint files sort in training order
            iteration_no_str = str(it + 1).zfill(len(str(cfg.max_iter)))
            ckt_name = 'checkpoint_it_' + iteration_no_str + '.pt'
            save_pth = osp.join(save_dir, ckt_name)
            logger.info(
                '\nsaving intermidiate checkpoint to {}'.format(save_pth))
            save_ckp(_snapshot(it + 1), save_pth)

    ## dump the final model and evaluate the result
    if save_dir is None:
        # osp.join(None, ...) would raise TypeError after the whole training
        # run; treat a missing directory the same way the intermediate
        # checkpoints do and skip the save.
        logger.warning(
            '\nno checkpoint directory given, final model is not saved')
    else:
        save_pth = osp.join(save_dir, 'model_final.pt')
        logger.info('\nsave Final models to {}'.format(save_pth))
        save_ckp(_snapshot(cfg.max_iter), save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(net, 2, cfg.im_root, cfg.val_im_anns)
    logger.info(tabulate([
        mious,
    ], headers=heads, tablefmt='orgtbl'))
    return
Example #8
0
def train():
    """Train for ``cfg.n_epochs`` epochs with a validation pass after each.

    After every epoch the mean validation loss is computed; whenever it
    improves on the best value seen so far the weights are stored as
    ``best_validation.pth`` in ``cfg.respth``. The final weights are stored
    as ``model_final.pth`` and evaluated on ``cfg.test_im_anns``.
    """
    logger = logging.getLogger()

    is_dist = False

    ## dataset
    dl = get_data_loader(
            cfg.im_root, cfg.train_im_anns,
            cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
            cfg.max_iter, mode='train', distributed=is_dist)

    valid = get_data_loader(
            cfg.im_root, cfg.val_im_anns,
            cfg.ims_per_gpu, cfg.scales, cfg.cropsize,
            cfg.max_iter, mode='val', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()
    print(net)
    print(f'n_parameters: {sum(p.numel() for p in net.parameters())}')
    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        opt_level = 'O1' if cfg.use_fp16 else 'O0'
        net, optim = amp.initialize(net, optim, opt_level=opt_level)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(optim, power=0.9,
        max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1, warmup='exp', last_epoch=-1,)

    best_validation = np.inf

    for i in range(cfg.n_epochs):
        ## train loop
        # the mode does not change inside the loop, so switch once per epoch
        net.train()
        for im, lb in Bar(dl):
            im = im.cuda()
            lb = lb.cuda()

            lb = torch.squeeze(lb, 1)

            optim.zero_grad()
            logits, *logits_aux = net(im)
            loss_pre = criteria_pre(logits, lb)

            loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
            loss = loss_pre + sum(loss_aux)
            if has_apex:
                with amp.scale_loss(loss, optim) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optim.step()
            torch.cuda.synchronize()
            lr_schdr.step()

            time_meter.update()
            loss_meter.update(loss.item())
            loss_pre_meter.update(loss_pre.item())
            for mter, lss in zip(loss_aux_meters, loss_aux):
                mter.update(lss.item())

            # drop the batch references before the next batch is loaded
            del im
            del lb

        ## print training log message
        # NOTE(review): the epoch index is passed where print_log_msg is
        # given cfg.max_iter as the total - confirm this is intended.
        lr = lr_schdr.get_lr()
        lr = sum(lr) / len(lr)
        print_log_msg(
            i, cfg.max_iter, lr, time_meter, loss_meter,
            loss_pre_meter, loss_aux_meters)

        ## validation loop
        net.eval()
        batch_losses = []
        with torch.no_grad():
            for im, lb in Bar(valid):
                im = im.cuda()
                lb = lb.cuda()

                lb = torch.squeeze(lb, 1)

                logits, *logits_aux = net(im)
                loss_pre = criteria_pre(logits, lb)
                loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
                loss = loss_pre + sum(loss_aux)
                batch_losses.append(loss.item())

                del im
                del lb

        ## print validation log message
        if not batch_losses:
            # an empty validation loader would otherwise end the run with a
            # ZeroDivisionError when the mean is computed
            logger.warning(
                'validation loader produced no batches, '
                'skipping best-model selection for epoch %s', i)
            continue

        validation_loss = sum(batch_losses) / len(batch_losses)
        print(f'Validation loss: {validation_loss}')

        if best_validation > validation_loss:
            print('new best performance, storing model')
            best_validation = validation_loss
            state = net.state_dict()
            torch.save(state,  osp.join(cfg.respth, 'best_validation.pth'))

    ## dump the final model and evaluate the result
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    logger.info('\nsave models to {}'.format(save_pth))
    state = net.state_dict()

    torch.save(state, save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(net, 2, cfg.im_root, cfg.test_im_anns)
    logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))

    return
Example #9
0
def train():
    """Train the model and report metrics to wandb from rank 0.

    Rank 0 logs the meters every iteration, prints a short status line
    every 100 iterations and saves the weights every 2000 iterations.
    Every rank runs `eval_model` every 2000 iterations.
    """
    logger = logging.getLogger()
    distributed = dist.is_initialized()

    ## dataset
    loader = get_data_loader(cfg.im_root,
                             cfg.train_im_anns,
                             cfg.ims_per_gpu,
                             cfg.scales,
                             cfg.cropsize,
                             cfg.max_iter,
                             mode='train',
                             distributed=distributed)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## experiment tracking happens on the rank-0 process only
    is_main = dist.get_rank() == 0
    if is_main:
        exp_name = "cityscapes_repl"
        wandb.init(project="bisenet", name="cityscapes_repl")
        wandb.watch(net)

    ## optimizer
    optim = set_optimizer(net)

    ## fp16
    if has_apex:
        net, optim = amp.initialize(
            net, optim, opt_level='O1' if cfg.use_fp16 else 'O0')

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(
        optim,
        power=0.9,
        max_iter=cfg.max_iter,
        warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1,
        warmup='exp',
        last_epoch=-1,
    )

    ## train loop
    for it, (images, labels) in enumerate(loader):
        # eval_model leaves the net in eval mode, so re-enable training mode
        # at the start of every iteration
        net.train()
        images = images.cuda()
        labels = torch.squeeze(labels.cuda(), 1)

        optim.zero_grad()
        main_logits, *aux_logits = net(images)
        main_loss = criteria_pre(main_logits, labels)
        aux_losses = [
            loss_fn(output, labels)
            for loss_fn, output in zip(criteria_aux, aux_logits)
        ]
        total_loss = main_loss + sum(aux_losses)
        if not has_apex:
            total_loss.backward()
        else:
            with amp.scale_loss(total_loss, optim) as scaled_loss:
                scaled_loss.backward()
        optim.step()
        torch.cuda.synchronize()
        lr_schdr.step()

        time_meter.update()
        loss_meter.update(total_loss.item())
        loss_pre_meter.update(main_loss.item())
        for meter, aux_loss in zip(loss_aux_meters, aux_losses):
            meter.update(aux_loss.item())

        lrs = lr_schdr.get_lr()
        lr = sum(lrs) / len(lrs)
        is_eval_step = (it + 1) % 2000 == 0

        ## print training log message
        if is_main:
            # the meters are read in a fixed order: loss, time, loss_pre,
            # then the auxiliary losses
            loss_avg = loss_meter.get()[0]
            metrics = {
                "lr": lr,
                "time": time_meter.get()[0],
                "loss": loss_avg,
                "loss_pre": loss_pre_meter.get()[0],
            }
            for aux_meter in loss_aux_meters:
                metrics[f"loss_aux_{aux_meter.name}"] = aux_meter.get()[0]
            wandb.log(metrics, commit=False)

            if (it + 1) % 100 == 0:
                print(it, ' - ', lr, ' - ', loss_avg)

            if is_eval_step:
                # dump the model and evaluate the result
                save_pth = osp.join(cfg.respth, f"{exp_name}_{it}.pth")
                torch.save(net.module.state_dict(), save_pth)
                wandb.save(save_pth)

        if is_eval_step:
            logger.info('\nevaluating the model')
            heads, mious = eval_model(net, 2, cfg.im_root, cfg.val_im_anns, it)
            logger.info(tabulate([mious], headers=heads, tablefmt='orgtbl'))
            if is_main:
                wandb.log(dict(zip(heads, mious)), commit=False)

        if is_main:
            wandb.log({"t": it}, step=it)