Example #1
0
    def __init__(self, args):
        self.args = args
        self.device = torch.device(args.device)

        # Visualizer
        self.visualizer = TensorboardVisualizer(args, sys.argv)

        # image transform
        input_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
        ])
        # dataset and dataloader
        data_kwargs = {
            'transform': input_transform,
            'base_size': args.base_size,
            'crop_size': args.crop_size
        }
        train_dataset = get_segmentation_dataset(args.dataset,
                                                 split='train',
                                                 mode='train',
                                                 **data_kwargs)
        val_dataset = get_segmentation_dataset(args.dataset,
                                               split='val',
                                               mode='val',
                                               **data_kwargs)
        args.iters_per_epoch = len(train_dataset) // (args.num_gpus *
                                                      args.batch_size)
        args.max_iters = args.epochs * args.iters_per_epoch

        train_sampler = make_data_sampler(train_dataset,
                                          shuffle=True,
                                          distributed=args.distributed)
        train_batch_sampler = make_batch_data_sampler(train_sampler,
                                                      args.batch_size,
                                                      args.max_iters)
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(val_sampler,
                                                    args.batch_size)

        self.train_loader = data.DataLoader(dataset=train_dataset,
                                            batch_sampler=train_batch_sampler,
                                            num_workers=args.workers,
                                            pin_memory=True)
        self.val_loader = data.DataLoader(dataset=val_dataset,
                                          batch_sampler=val_batch_sampler,
                                          num_workers=args.workers,
                                          pin_memory=True)

        # create network
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(model=args.model,
                                            dataset=args.dataset,
                                            backbone=args.backbone,
                                            aux=args.aux,
                                            norm_layer=BatchNorm2d).to(
                                                self.device)  # jpu=args.jpu

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                assert ext == '.pkl' or '.pth', 'Sorry only .pth and .pkl files supported.'
                print('Resuming training, loading {}...'.format(args.resume))
                self.model.load_state_dict(
                    torch.load(args.resume,
                               map_location=lambda storage, loc: storage))

        # create criterion
        self.criterion = get_segmentation_loss(args.model,
                                               use_ohem=args.use_ohem,
                                               aux=args.aux,
                                               aux_weight=args.aux_weight,
                                               ignore_index=-1).to(self.device)

        # optimizer, for model just includes pretrained, head and auxlayer
        params_list = list()
        if hasattr(self.model, 'pretrained'):
            params_list.append({
                'params': self.model.pretrained.parameters(),
                'lr': args.lr
            })
        if hasattr(self.model, 'exclusive'):
            for module in self.model.exclusive:
                params_list.append({
                    'params':
                    getattr(self.model, module).parameters(),
                    'lr':
                    args.lr * 10
                })
        self.optimizer = torch.optim.SGD(params_list,
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        # lr scheduling
        self.lr_scheduler = WarmupPolyLR(self.optimizer,
                                         max_iters=args.max_iters,
                                         power=0.9,
                                         warmup_factor=args.warmup_factor,
                                         warmup_iters=args.warmup_iters,
                                         warmup_method=args.warmup_method)

        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[args.local_rank],
                output_device=args.local_rank)

        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)

        self.best_pred = 0.0
    def __init__(self, args, logger):
        self.args = args
        self.logger = logger
        if get_rank() == 0:
            TBWriter.init(
                os.path.join(args.project_dir, args.task_dir, "tbevents")
            )
        self.device = torch.device(args.device)

        self.meters = MetricLogger(delimiter="  ")
        # image transform
        input_transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
                ),
            ]
        )
        # dataset and dataloader
        data_kwargs = {
            "transform": input_transform,
            "base_size": args.base_size,
            "crop_size": args.crop_size,
            "root": args.dataroot,
        }
        train_dataset = get_segmentation_dataset(
            args.dataset, split="train", mode="train", **data_kwargs
        )
        val_dataset = get_segmentation_dataset(
            args.dataset, split="val", mode="val", **data_kwargs
        )
        args.iters_per_epoch = len(train_dataset) // (
            args.num_gpus * args.batch_size
        )
        args.max_iters = args.epochs * args.iters_per_epoch

        train_sampler = make_data_sampler(
            train_dataset, shuffle=True, distributed=args.distributed
        )
        train_batch_sampler = make_batch_data_sampler(
            train_sampler, args.batch_size, args.max_iters
        )
        val_sampler = make_data_sampler(val_dataset, False, args.distributed)
        val_batch_sampler = make_batch_data_sampler(
            val_sampler, args.batch_size
        )

        self.train_loader = data.DataLoader(
            dataset=train_dataset,
            batch_sampler=train_batch_sampler,
            num_workers=args.workers,
            pin_memory=True,
        )
        self.val_loader = data.DataLoader(
            dataset=val_dataset,
            batch_sampler=val_batch_sampler,
            num_workers=args.workers,
            pin_memory=True,
        )

        # create network
        BatchNorm2d = nn.SyncBatchNorm if args.distributed else nn.BatchNorm2d
        self.model = get_segmentation_model(
            model=args.model,
            dataset=args.dataset,
            backbone=args.backbone,
            aux=args.aux,
            jpu=args.jpu,
            norm_layer=BatchNorm2d,
        ).to(self.device)

        # resume checkpoint if needed
        if args.resume:
            if os.path.isfile(args.resume):
                name, ext = os.path.splitext(args.resume)
                assert (
                    ext == ".pkl" or ".pth"
                ), "Sorry only .pth and .pkl files supported."
                print("Resuming training, loading {}...".format(args.resume))
                self.model.load_state_dict(
                    torch.load(
                        args.resume, map_location=lambda storage, loc: storage
                    )
                )

        # create criterion
        self.criterion = get_segmentation_loss(
            args.model,
            use_ohem=args.use_ohem,
            aux=args.aux,
            aux_weight=args.aux_weight,
            ignore_index=-1,
        ).to(self.device)

        # optimizer, for model just includes pretrained, head and auxlayer
        params_list = list()
        if hasattr(self.model, "pretrained"):
            params_list.append(
                {"params": self.model.pretrained.parameters(), "lr": args.lr}
            )
        if hasattr(self.model, "exclusive"):
            for module in self.model.exclusive:
                params_list.append(
                    {
                        "params": getattr(self.model, module).parameters(),
                        "lr": args.lr * args.lr_scale,
                    }
                )
        self.optimizer = torch.optim.SGD(
            params_list,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )

        # lr scheduling
        self.lr_scheduler = get_lr_scheduler(self.optimizer, args)
        if args.distributed:
            self.model = nn.parallel.DistributedDataParallel(
                self.model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
            )

        # evaluation metrics
        self.metric = SegmentationMetric(train_dataset.num_class)

        self.best_pred = 0.0