def __init__(self, args):
        """Build the full training pipeline: data, model, loss, optimizer, LR schedule.

        Args:
            args: namespace of options (dataset, base_size, crop_size, batch_size,
                model, backbone, aux, aux_weight, device, resume, lr, momentum,
                weight_decay, epochs, train_split).
        """
        self.args = args

        # image transform: ImageNet mean/std normalization
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {'transform': input_transform, 'base_size': args.base_size, 'crop_size': args.crop_size}
        train_dataset = get_segmentation_dataset(args.dataset, split=args.train_split, mode='train', **data_kwargs)
        val_dataset = get_segmentation_dataset(args.dataset, split='val', mode='val', **data_kwargs)

        # drop_last keeps every training batch at full batch_size
        self.train_loader = data.DataLoader(dataset=train_dataset,
                                            batch_size=args.batch_size,
                                            drop_last=True,
                                            shuffle=True)

        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_size=1,
                                          drop_last=False,
                                          shuffle=False)

        # create network
        self.model = get_segmentation_model(model=args.model, dataset=args.dataset, backbone=args.backbone,
                                            aux=args.aux, norm_layer=nn.BatchNorm2d).to(args.device)

        # create criterion
        self.criterion = MixSoftmaxCrossEntropyLoss(args.aux, args.aux_weight, ignore_label=-1).to(args.device)

        # for multi-GPU
        # if torch.cuda.is_available():
        #     self.model = DataParallelModel(self.model).cuda()
        #     self.criterion = DataParallelCriterion(self.criterion).cuda()

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                # BUG FIX: `ext == '.pkl' or '.pth'` was always truthy because the
                # literal '.pth' is non-empty; use a membership test so unsupported
                # extensions are actually rejected.
                assert ext in ('.pkl', '.pth'), 'Sorry only .pth and .pkl files supported.'
                print('Resuming training, loading {}...'.format(args.resume))
                # map_location keeps CPU-saved checkpoints loadable anywhere
                self.model.load_state_dict(torch.load(args.resume, map_location=lambda storage, loc: storage))

        # optimizer
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        # lr scheduling: polynomial decay over the whole run
        self.lr_scheduler = LRScheduler(mode='poly', base_lr=args.lr, nepochs=args.epochs,
                                        iters_per_epoch=len(self.train_loader), power=0.9)

        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)

        self.best_pred = 0.0
Example #2
0
    def __init__(self, args):
        """Assemble the validation pipeline: dataset, loader, model and metric."""
        self.args = args
        self.device = torch.device(args.device)

        # Convert to tensor, then normalize with ImageNet statistics.
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])

        # Validation dataset, served one image per batch.
        val_dataset = get_segmentation_dataset(args.dataset,
                                               split='val',
                                               mode='testval',
                                               transform=input_transform)
        sampler = make_data_sampler(val_dataset, False, args.distributed)
        batch_sampler = make_batch_data_sampler(sampler, images_per_batch=1)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)

        # Pretrained segmentation network (no separately pretrained backbone).
        self.model = get_segmentation_model(model=args.model,
                                            dataset=args.dataset,
                                            backbone=args.backbone,
                                            pretrained=True,
                                            pretrained_base=False)
        if args.distributed:
            # In distributed mode, evaluate with the unwrapped inner module.
            self.model = self.model.module
        self.model.to(self.device)

        self.metric = SegmentationMetric(val_dataset.num_class)
    def __init__(self, args):
        """Prepare the validation loader, network and metric for evaluation."""
        self.args = args
        self.device = torch.device(args.device)

        # ImageNet-style normalization after tensor conversion.
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])

        # Validation data, one image per batch.
        val_dataset = get_segmentation_dataset(args.dataset, split='val', mode='testval', transform=input_transform)
        sampler = make_data_sampler(val_dataset, False, args.distributed)
        batch_sampler = make_batch_data_sampler(sampler, images_per_batch=1)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)

        # Pretrained network; SyncBatchNorm is only needed for multi-process runs.
        norm_layer = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(model=args.model,
                                            dataset=args.dataset,
                                            backbone=args.backbone,
                                            aux=args.aux,
                                            pretrained=True,
                                            pretrained_base=False,
                                            local_rank=args.local_rank,
                                            norm_layer=norm_layer).to(self.device)
        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)
        self.model.to(self.device)

        self.metric = SegmentationMetric(val_dataset.num_class)
Example #4
0
def demo(config):
    """Run inference with a pretrained model on one picture or the test split.

    Args:
        config: namespace with the fields used below — `outdir`, `model`,
            `save_folder`, `input_pic` and `dataset`.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # output folder
    if not os.path.exists(config.outdir):
        os.makedirs(config.outdir)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    # BUG FIX: the parameter is `config`, but the body mixed in references to an
    # `args` name not defined in this function; use `config` consistently.
    model = get_model(config.model, pretrained=True,
                      root=config.save_folder).to(device)
    print('Finished loading model!')

    if config.input_pic is not None:
        # Single-image mode: run the model on the given picture.
        image = Image.open(config.input_pic).convert('RGB')
        images = transform(image).unsqueeze(0).to(device)
        test(model, images, config.input_pic)
    else:
        # Dataset mode: iterate the test split one image at a time.
        test_dataset = get_segmentation_dataset(config.dataset,
                                                split='test',
                                                mode='test',
                                                transform=transform)
        test_sampler = make_data_sampler(test_dataset, True, False)
        test_batch_sampler = make_batch_data_sampler(test_sampler,
                                                     images_per_batch=1)
        test_loader = data.DataLoader(dataset=test_dataset,
                                      batch_sampler=test_batch_sampler,
                                      num_workers=4,
                                      pin_memory=True)
        for i, (image, target) in enumerate(test_loader):
            # `device` is already a torch.device; no need to re-wrap it.
            image = image.to(device)
            test(model, image, ''.join(target))
Example #5
0
    def __init__(self, config):
        """Build the evaluation pipeline from a nested configuration dict.

        Args:
            config: dict with sub-dicts 'run_config', 'optim_config',
                'data_config' and 'model_config'.
        """
        self.config = config
        self.run_config = config['run_config']
        self.optim_config = config['optim_config']
        self.data_config = config['data_config']
        self.model_config = config['model_config']

        self.device = torch.device(self.run_config["device"])

        # image transform (ImageNet mean/std)
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])

        # dataset and dataloader
        val_dataset = get_segmentation_dataset(
            self.data_config['dataset_name'],
            root=self.data_config['dataset_root'],
            split='test',
            mode='test',
            transform=input_transform)
        val_sampler = make_data_sampler(val_dataset, False,
                                        self.run_config['distributed'])
        val_batch_sampler = make_batch_data_sampler(val_sampler,
                                                    images_per_batch=10,
                                                    drop_last=False)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=val_batch_sampler,
                                          num_workers=4,
                                          pin_memory=True)

        # create network; SyncBatchNorm only matters when running distributed
        BatchNorm2d = nn.SyncBatchNorm if self.run_config[
            'distributed'] else nn.BatchNorm2d
        # BUG FIX: the bare name `run_config` was undefined in this scope
        # (NameError at runtime); it must be read through `self.run_config`.
        self.model = get_segmentation_model(
            model=self.model_config['model'],
            dataset=self.data_config['dataset_name'],
            backbone=self.model_config['backbone'],
            aux=self.optim_config['aux'],
            jpu=self.model_config['jpu'],
            norm_layer=BatchNorm2d,
            root=self.run_config['path']['eval_model_root'],
            pretrained=self.run_config['eval_model'],
            pretrained_base=False,
            local_rank=self.run_config['local_rank']).to(self.device)

        if self.run_config['distributed']:
            self.model = nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[self.run_config['local_rank']],
                output_device=self.run_config['local_rank'])
        elif len(self.run_config['gpu_ids']) > 1:
            # Plain DataParallel for single-process multi-GPU evaluation.
            assert torch.cuda.is_available()
            self.model = nn.DataParallel(self.model)

        self.model.to(self.device)

        self.metric = SegmentationMetric(val_dataset.num_class)
def eval(args):
    """Evaluate a pretrained segmentation model on the validation split.

    NOTE: this function shadows the builtin ``eval``; the name is kept so
    existing callers keep working.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Make sure the output folder exists.
    outdir = 'test_result'
    os.makedirs(outdir, exist_ok=True)

    # Tensor conversion followed by ImageNet normalization.
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])

    # Validation data, evaluated one image at a time.
    test_dataset = get_segmentation_dataset(args.dataset,
                                            split='val',
                                            mode='testval',
                                            transform=input_transform)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=1,
                                  shuffle=False)

    # Pretrained network for the chosen model/dataset/backbone combination.
    model = get_segmentation_model(model=args.model,
                                   dataset=args.dataset,
                                   backbone=args.backbone,
                                   aux=args.aux,
                                   pretrained=True,
                                   pretrained_base=False).to(device)
    print('Finished loading model!')

    metric = SegmentationMetric(test_dataset.num_class)

    model.eval()
    for i, (image, label) in enumerate(test_loader):
        image = image.to(device)

        with torch.no_grad():
            outputs = model(image)

            # Per-pixel class prediction from the primary output head.
            pred = torch.argmax(outputs[0], 1).cpu().data.numpy()

            # Running pixel accuracy and mean IoU over all samples so far.
            metric.update(pred, label.numpy())
            pixAcc, mIoU = metric.get()
            print('Sample %d, validation pixAcc: %.3f%%, mIoU: %.3f%%' %
                  (i + 1, pixAcc * 100, mIoU * 100))

            if args.save_result:
                mask = get_color_pallete(pred.squeeze(0), args.dataset)
                mask.save(os.path.join(outdir, 'seg_{}.png'.format(i)))
Example #7
0
    def __init__(self, args):
        """Set up the validation loader and network, optionally resuming weights.

        Args:
            args: namespace of options (dataset, base_size, crop_size, workers,
                distributed, model, backbone, aux, device, resume, mutilgpu,
                gpu_ids).
        """
        self.args = args
        self.device = torch.device(args.device)

        # image transform (ImageNet mean/std)
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])

        # dataset and dataloader
        data_kwargs = {
            'transform': input_transform,
            'base_size': args.base_size,
            'crop_size': args.crop_size
        }
        val_dataset = get_segmentation_dataset(args.dataset,
                                               split='val',
                                               mode='testval',
                                               **data_kwargs)
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(val_sampler,
                                                    images_per_batch=1)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=val_batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)

        # create network; SyncBatchNorm only matters for distributed runs
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(model=args.model,
                                            dataset=args.dataset,
                                            backbone=args.backbone,
                                            aux=args.aux,
                                            norm_layer=BatchNorm2d).to(
                                                self.device)

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                # BUG FIX: `ext == '.pkl' or '.pth'` was always truthy; use a
                # membership test so bad extensions are actually rejected.
                assert ext in ('.pkl', '.pth'), 'Sorry only .pth and .pkl files supported.'
                print('Resuming training, loading {}...'.format(args.resume))
                # map_location keeps CPU-saved checkpoints loadable anywhere
                self.model.load_state_dict(
                    torch.load(args.resume,
                               map_location=lambda storage, loc: storage))

        # NOTE(review): option name 'mutilgpu' (sic) is kept — callers set it.
        if args.mutilgpu:
            self.model = nn.DataParallel(self.model, device_ids=args.gpu_ids)

        self.metric = SegmentationMetric(val_dataset.num_class)
Example #8
0
    def __init__(self, args):
        """Build the full training pipeline with TensorBoard visualization.

        Sets up data loaders (iteration-based sampling for training), the
        network, loss, optimizer with separate LR groups, warmup-poly LR
        schedule, and optional checkpoint resume / distributed wrapping.
        """
        self.args = args
        self.device = torch.device(args.device)

        # Visualizer
        self.visualizer = TensorboardVisualizer(args, sys.argv)

        # image transform (ImageNet mean/std)
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {
            'transform': input_transform,
            'base_size': args.base_size,
            'crop_size': args.crop_size
        }
        train_dataset = get_segmentation_dataset(args.dataset,
                                                 split='train',
                                                 mode='train',
                                                 **data_kwargs)
        val_dataset = get_segmentation_dataset(args.dataset,
                                               split='val',
                                               mode='val',
                                               **data_kwargs)
        # total iteration budget drives the batch sampler and LR schedule
        args.iters_per_epoch = len(train_dataset) // (args.num_gpus *
                                                      args.batch_size)
        args.max_iters = args.epochs * args.iters_per_epoch

        train_sampler = make_data_sampler(train_dataset,
                                          shuffle=True,
                                          distributed=args.distributed)
        train_batch_sampler = make_batch_data_sampler(train_sampler,
                                                      args.batch_size,
                                                      args.max_iters)
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(val_sampler,
                                                    args.batch_size)

        self.train_loader = data.DataLoader(dataset=train_dataset,
                                            batch_sampler=train_batch_sampler,
                                            num_workers=args.workers,
                                            pin_memory=True)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=val_batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)

        # create network; SyncBatchNorm only matters for distributed runs
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(model=args.model,
                                            dataset=args.dataset,
                                            backbone=args.backbone,
                                            aux=args.aux,
                                            norm_layer=BatchNorm2d).to(
                                                self.device)  # jpu=args.jpu

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                # BUG FIX: `ext == '.pkl' or '.pth'` was always truthy; use a
                # membership test so bad extensions are actually rejected.
                assert ext in ('.pkl', '.pth'), 'Sorry only .pth and .pkl files supported.'
                print('Resuming training, loading {}...'.format(args.resume))
                self.model.load_state_dict(
                    torch.load(args.resume,
                               map_location=lambda storage, loc: storage))

        # create criterion
        self.criterion = get_segmentation_loss(args.model,
                                               use_ohem=args.use_ohem,
                                               aux=args.aux,
                                               aux_weight=args.aux_weight,
                                               ignore_index=-1).to(self.device)

        # optimizer, for model just includes pretrained, head and auxlayer;
        # head/aux layers train at 10x the backbone learning rate
        params_list = list()
        if hasattr(self.model, 'pretrained'):
            params_list.append({
                'params': self.model.pretrained.parameters(),
                'lr': args.lr
            })
        if hasattr(self.model, 'exclusive'):
            for module in self.model.exclusive:
                params_list.append({
                    'params':
                    getattr(self.model, module).parameters(),
                    'lr':
                    args.lr * 10
                })
        self.optimizer = torch.optim.SGD(params_list,
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        # lr scheduling: warmup then polynomial decay over max_iters
        self.lr_scheduler = WarmupPolyLR(self.optimizer,
                                         max_iters=args.max_iters,
                                         power=0.9,
                                         warmup_factor=args.warmup_factor,
                                         warmup_iters=args.warmup_iters,
                                         warmup_method=args.warmup_method)

        # wrap AFTER the optimizer is built so param groups reference the
        # underlying module's parameters
        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)

        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)

        self.best_pred = 0.0
Example #9
0
def train(args, model, enc=False):
    """Train ``model`` on ADE20K and validate after every epoch.

    Args:
        args: namespace of training options (datadir, base_size, crop_size,
            batch_size, num_workers, savedir, lr, resume, num_epochs, cuda,
            iouTrain, iouVal, NUM_CLASSES, steps_loss, epochs_save).
        model: network to train; its forward must accept ``only_encode``.
        enc: when True, train only the encoder branch and use the
            encoder-specific checkpoint/log file names.

    Returns:
        The trained model.
    """
    # image transform (ImageNet mean/std)
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])

    data_kwargs = {
        'dataset_root': args.datadir,
        'transform': input_transform,
        'base_size': args.base_size,
        'crop_size': args.crop_size,
        'encode': enc
    }
    train_dataset = get_segmentation_dataset('ade20k',
                                             split='train',
                                             mode='train',
                                             **data_kwargs)
    val_dataset = get_segmentation_dataset('ade20k',
                                           split='val',
                                           mode='val',
                                           **data_kwargs)

    train_sampler = make_data_sampler(train_dataset,
                                      shuffle=True,
                                      distributed=False)
    train_batch_sampler = make_batch_data_sampler(train_sampler,
                                                  args.batch_size)
    val_sampler = make_data_sampler(val_dataset,
                                    shuffle=False,
                                    distributed=False)
    val_batch_sampler = make_batch_data_sampler(val_sampler, args.batch_size)

    loader = data.DataLoader(dataset=train_dataset,
                             batch_sampler=train_batch_sampler,
                             num_workers=args.num_workers,
                             pin_memory=True)
    loader_val = data.DataLoader(dataset=val_dataset,
                                 batch_sampler=val_batch_sampler,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    criterion = CrossEntropyLoss2d()
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    # encoder-only training writes to separate log/model files
    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)
        ):  #dont add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(),
                     args.lr, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)

    start_epoch = 1
    best_acc = 0.0
    if args.resume:
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint)
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler     ## scheduler 1
    lambda1 = lambda epoch: pow(
        (1 - ((epoch - 1) / args.num_epochs)), 0.7)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer,
                                      lr_lambda=lambda1)  ## scheduler 2

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----", " LR",
              optimizer.param_groups[0]['lr'], "-----")

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if (doIouTrain):
            iouEvalTrain = iouEval(args.NUM_CLASSES)

        usedLr = optimizer.param_groups[0]['lr']

        model.train()
        total_train_step = len(train_dataset) // args.batch_size
        total_val_step = len(val_dataset) // args.batch_size
        for step, (images, labels, _) in enumerate(loader):
            start_time = time.time()

            # skip the trailing partial batch
            imgs_batch = images.shape[0]
            if imgs_batch != args.batch_size:
                break

            if args.cuda:
                inputs = images.cuda()
                targets = labels.cuda()

            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()
            # NOTE(review): stepping the scheduler once per batch (with an
            # explicit epoch) matches the original behavior; LambdaLR here
            # is a function of epoch only, so extra calls are idempotent.
            scheduler.step(epoch)  ## scheduler 2

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                targets = torch.unsqueeze(targets, 1)
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step}/{total_train_step})',
                    "// Remaining time: %.1f s" %
                    ((total_train_step - step) * sum(time_train) /
                     len(time_train)))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            print("EPOCH IoU on TRAIN set: ", iouTrain.item() * 100, "%")
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(args.NUM_CLASSES)

        for step, (images, labels, _) in enumerate(loader_val):
            start_time = time.time()

            imgs_batch = images.shape[0]
            if imgs_batch != args.batch_size:
                break
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
            with torch.no_grad():
                inputs = Variable(images)
                targets = Variable(labels)
                outputs = model(inputs, only_encode=enc)
                loss = criterion(outputs, targets)
            epoch_loss_val.append(loss.item())
            time_val.append(time.time() - start_time)

            #Add batch to calculate TP, FP and FN for iou estimation
            if (doIouVal):
                targets = torch.unsqueeze(targets, 1)
                iouEvalVal.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(
                    f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step}/{total_val_step})',
                    "// Remaining time: %.1f s" %
                    ((total_val_step - step) * sum(time_val) / len(time_val)))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        # scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1   # update lr if needed

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            print("EPOCH IoU on VAL set: ", iouVal.item() * 100, "%")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        # BUG FIX: when IoU is disabled, `current_acc` is a plain Python float
        # (negated loss), which has no `.item()`; `float()` handles both the
        # scalar-tensor and float cases.
        print('best acc:', best_acc, ' current acc:', float(current_acc))
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # BUG FIX: the periodic save was gated on `step` (a leftover batch
        # index from the validation loop); the comment above and the
        # epoch-numbered filename show it should be driven by `epoch`.
        if args.epochs_save > 0 and epoch > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            # BUG FIX: print the file actually written, not '(unknown)'.
            print(f'save: {filename} (epoch: {epoch})')
        if (is_best):
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))

        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))
    return (model)
    def __init__(self, args, logger):
        """Build the training pipeline with TensorBoard events and metric logging.

        Args:
            args: namespace of options (project_dir, task_dir, device, dataset,
                dataroot, base_size, crop_size, num_gpus, batch_size, epochs,
                distributed, workers, model, backbone, aux, jpu, resume,
                use_ohem, aux_weight, lr, lr_scale, momentum, weight_decay,
                local_rank).
            logger: logger instance used by the trainer.
        """
        self.args = args
        self.logger = logger
        # only rank 0 writes TensorBoard events
        if get_rank() == 0:
            TBWriter.init(
                os.path.join(args.project_dir, args.task_dir, "tbevents")
            )
        self.device = torch.device(args.device)

        self.meters = MetricLogger(delimiter="  ")
        # image transform (ImageNet mean/std)
        input_transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
                ),
            ]
        )
        # dataset and dataloader
        data_kwargs = {
            "transform": input_transform,
            "base_size": args.base_size,
            "crop_size": args.crop_size,
            "root": args.dataroot,
        }
        train_dataset = get_segmentation_dataset(
            args.dataset, split="train", mode="train", **data_kwargs
        )
        val_dataset = get_segmentation_dataset(
            args.dataset, split="val", mode="val", **data_kwargs
        )
        # total iteration budget drives the batch sampler and LR schedule
        args.iters_per_epoch = len(train_dataset) // (
            args.num_gpus * args.batch_size
        )
        args.max_iters = args.epochs * args.iters_per_epoch

        train_sampler = make_data_sampler(
            train_dataset, shuffle=True, distributed=args.distributed
        )
        train_batch_sampler = make_batch_data_sampler(
            train_sampler, args.batch_size, args.max_iters
        )
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(
            val_sampler, args.batch_size
        )

        self.train_loader = data.DataLoader(
            dataset=train_dataset,
            batch_sampler=train_batch_sampler,
            num_workers=args.workers,
            pin_memory=True,
        )
        self.val_loader = data.DataLoader(
            dataset=val_dataset,
            batch_sampler=val_batch_sampler,
            num_workers=args.workers,
            pin_memory=True,
        )

        # create network; SyncBatchNorm only matters for distributed runs
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(
            model=args.model,
            dataset=args.dataset,
            backbone=args.backbone,
            aux=args.aux,
            jpu=args.jpu,
            norm_layer=BatchNorm2d,
        ).to(self.device)

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                # BUG FIX: `ext == ".pkl" or ".pth"` was always truthy; use a
                # membership test so bad extensions are actually rejected.
                assert ext in (
                    ".pkl",
                    ".pth",
                ), "Sorry only .pth and .pkl files supported."
                print("Resuming training, loading {}...".format(args.resume))
                self.model.load_state_dict(
                    torch.load(
                        args.resume, map_location=lambda storage, loc: storage
                    )
                )

        # create criterion
        self.criterion = get_segmentation_loss(
            args.model,
            use_ohem=args.use_ohem,
            aux=args.aux,
            aux_weight=args.aux_weight,
            ignore_index=-1,
        ).to(self.device)

        # optimizer, for model just includes pretrained, head and auxlayer;
        # head/aux layers train at lr_scale times the backbone learning rate
        params_list = list()
        if hasattr(self.model, "pretrained"):
            params_list.append(
                {"params": self.model.pretrained.parameters(), "lr": args.lr}
            )
        if hasattr(self.model, "exclusive"):
            for module in self.model.exclusive:
                params_list.append(
                    {
                        "params": getattr(self.model, module).parameters(),
                        "lr": args.lr * args.lr_scale,
                    }
                )
        self.optimizer = torch.optim.SGD(
            params_list,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )

        # lr scheduling
        self.lr_scheduler = get_lr_scheduler(self.optimizer, args)
        # wrap AFTER the optimizer is built so param groups reference the
        # underlying module's parameters
        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
            )

        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)

        self.best_pred = 0.0