Code example #1
File: val_best.py  Project: bmhopkinson/CSN
class Trainer(object):
    def __init__(self, args):
        warnings.filterwarnings('ignore')
        assert torch.cuda.is_available()
        torch.backends.cudnn.benchmark = True
        model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(args.backbone, args.dataset, args.exp)
        if args.dataset == 'pascal':
            raise NotImplementedError
        elif args.dataset == 'cityscapes':
            kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
            dataset_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
            args.num_classes = num_classes
        elif args.dataset == 'marsh':
            kwargs = {'num_workers': args.workers, 'pin_memory': True, 'drop_last': True}
            dataset_loader, val_loader, test_loader, num_classes = dataloaders.make_data_loader(args, **kwargs)
            args.num_classes = num_classes
        else:
            raise ValueError('Unknown dataset: {}'.format(args.dataset))

        if args.backbone == 'autodeeplab':
            model = Retrain_Autodeeplab(args)
            model.load_state_dict(torch.load(r"./run/marsh/deeplab-autodeeplab/model_best.pth.tar")['state_dict'], strict=False)
        else:
            raise ValueError('Unknown backbone: {}'.format(args.backbone))

        if args.criterion == 'Ohem':
            args.thresh = 0.7
            args.crop_size = [args.crop_size, args.crop_size] if isinstance(args.crop_size, int) else args.crop_size
            args.n_min = int((args.batch_size / len(args.gpu) * args.crop_size[0] * args.crop_size[1]) // 16)
        criterion = build_criterion(args)

        model = nn.DataParallel(model).cuda()
        # create the optimizer after the DataParallel wrap so model.module exists
        optimizer = optim.SGD(model.module.parameters(), lr=args.base_lr,
                              momentum=0.9, weight_decay=0.0001)

        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()
        
        # Define Dataloader
        #kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = dataset_loader, val_loader, test_loader, num_classes

        self.criterion = criterion
        self.model, self.optimizer = model, optimizer
        
        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        #self.scheduler = scheduler
        self.scheduler = LR_Scheduler("poly", args.lr, args.epochs, len(self.train_loader))  # note: None removed from the second parameter
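
Usage note: a minimal sketch of how this Trainer might be driven, assuming obtain_retrain_autodeeplab_args() (used in the other examples below) supplies the argument namespace these fields come from:

    # Hypothetical driver -- obtain_retrain_autodeeplab_args() and the argument
    # names are assumed from the surrounding examples, not from this file.
    if __name__ == '__main__':
        args = obtain_retrain_autodeeplab_args()
        trainer = Trainer(args)
        # training/validation code would then use trainer.model, trainer.optimizer,
        # trainer.scheduler, and the train/val/test loaders set up above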
Code example #2
def main():
    warnings.filterwarnings('ignore')
    assert torch.cuda.is_available()
    torch.backends.cudnn.benchmark = True
    args = obtain_retrain_autodeeplab_args()
    args.data_dict = {}
    model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
        args.backbone, args.dataset, args.exp)
    if args.dataset == 'pascal':
        raise NotImplementedError
    elif args.dataset == 'cityscapes':
        kwargs = {
            'num_workers': args.workers,
            'pin_memory': True,
            'drop_last': True
        }
        dataset_loader, num_classes = make_data_loader(args, **kwargs)
        args.num_classes = num_classes
    elif args.dataset == '2d':
        args.data_dict, args.num_classes = make_data_loader(args)
    elif args.dataset == '3d':
        args.data_dict, args.num_classes = make_data_loader_3d_patch(args)

    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))

    if args.backbone == 'autodeeplab':
        model = Retrain_Autodeeplab(args)
    else:
        raise ValueError('Unknown backbone: {}'.format(args.backbone))

    if args.criterion == 'Ohem':
        args.thresh = 0.7
        args.crop_size = [args.crop_size, args.crop_size] if isinstance(
            args.crop_size, int) else args.crop_size
        args.n_min = int((args.batch_size / len(args.gpu) * args.crop_size[0] *
                          args.crop_size[1]) // 16)
    criterion = build_criterion(args)

    model = nn.DataParallel(model).cuda()
    model.train()
    if args.freeze_bn:
        for m in model.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()
                m.weight.requires_grad = False
                m.bias.requires_grad = False
    optimizer = optim.SGD(model.module.parameters(),
                          lr=args.base_lr,
                          momentum=0.9,
                          weight_decay=0.0001)

    max_iteration = args.data_dict['num_train'] * args.epochs
    scheduler = Iter_LR_Scheduler(args, max_iteration,
                                  args.data_dict['num_train'])
    start_epoch = 0

    if args.resume:
        if os.path.isfile(args.resume):
            print('=> loading checkpoint {0}'.format(args.resume))
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print('=> loaded checkpoint {0} (epoch {1})'.format(
                args.resume, checkpoint['epoch']))
        else:
            raise ValueError('=> no checkpoint found at {0}'.format(
                args.resume))

    for epoch in range(start_epoch, args.epochs):
        losses = AverageMeter()
        for i in range(args.data_dict['num_train']):
            cur_iter = epoch * args.data_dict['num_train'] + i
            scheduler(optimizer, cur_iter)
            inputs = torch.FloatTensor(args.data_dict['train_data'][i]).cuda()
            target = torch.FloatTensor(args.data_dict['train_mask'][i]).cuda()
            outputs = model(inputs)
            loss = criterion(outputs, target)
            if np.isnan(loss.item()) or np.isinf(loss.item()):
                pdb.set_trace()
            losses.update(loss.item(), args.batch_size)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            print('epoch: {0}\t'
                  'iter: {1}/{2}\t'
                  'lr: {3:.6f}\t'
                  'loss: {loss.val:.4f} ({loss.ema:.4f})'.format(
                      epoch + 1,
                      i + 1,
                      args.data_dict['num_train'],
                      scheduler.get_lr(optimizer),
                      loss=losses))

        # save every 50 epochs, and every epoch during the final 50
        if epoch >= args.epochs - 50 or epoch % 50 == 0:
            torch.save(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }, model_fname % (epoch + 1))

        if epoch % 2 == 0:
            # validation and test
            validation(epoch, model, args, criterion, args.num_classes)
            validation(epoch,
                       model,
                       args,
                       criterion,
                       args.num_classes,
                       test_tag=True)
        print('reset local total loss!')
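
The Ohem branch above keeps only the hardest pixels when computing the loss: n_min is the number of pixels retained per step, one sixteenth of the pixels in a per-GPU batch. A small worked sketch with illustrative numbers (not taken from the original configuration):

    # Illustrative values only.
    batch_size, num_gpus = 8, 2
    crop_h, crop_w = 512, 512
    # pixels in one GPU's share of the batch, of which OHEM keeps the hardest 1/16
    n_min = int((batch_size / num_gpus * crop_h * crop_w) // 16)
    print(n_min)  # 65536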
Code example #3
def main():
    args = obtain_retrain_autodeeplab_args()
    torch.cuda.set_device(args.local_rank)
    cfg = config_factory['medical_img']
    if not os.path.exists(cfg.respth):
        os.makedirs(cfg.respth)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()
    rand_seed = random.randint(0, args.manualSeed)
    prepare_seed(rand_seed)
    if args.local_rank == 0:
        log_string = 'seed-{}-time-{}'.format(rand_seed, time_for_file())
        train_logger = Logger(args, log_string)
        train_logger.log('Arguments : -------------------------------')
        for name, value in args._get_kwargs():
            train_logger.log('{:16} : {:}'.format(name, value))
        train_logger.log("Python  version : {}".format(
            sys.version.replace('\n', ' ')))
        train_logger.log("Pillow  version : {}".format(PIL.__version__))
        train_logger.log("PyTorch version : {}".format(torch.__version__))
        train_logger.log("cuDNN   version : {}".format(
            torch.backends.cudnn.version()))
        train_logger.log("random_seed : {}".format(rand_seed))
        if args.checkname is None:
            args.checkname = 'deeplab-' + str(args.backbone)
    # dataset
    kwargs = {
        'num_workers': args.workers,
        'pin_memory': True,
        'drop_last': True
    }
    if args.dataset == '2d':
        args.data_dict, args.num_classes = make_data_loader(args=args,
                                                            **kwargs)
    if args.dataset == '3d':
        args.data_dict, args.num_classes = make_data_loader_3d_patch(args=args,
                                                                     **kwargs)
    # model
    model = Retrain_Autodeeplab(args)
    model.train()
    model.cuda()
    model = nn.parallel.DistributedDataParallel(
        model,
        device_ids=[
            args.local_rank,
        ],
        output_device=args.local_rank,
        find_unused_parameters=True).cuda()
    # n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    # criterion = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()
    criterion = build_criterion(args)
    max_iteration = int(cfg.max_epoch * args.data_dict['num_train'])
    #     max_iteration = int(1500000 * 4 // cfg.gpus)
    it = 0
    # optimizer
    optimizer = Optimizer(model, cfg.lr_start, cfg.momentum, cfg.weight_decay,
                          cfg.warmup_steps, cfg.warmup_start_lr, max_iteration,
                          cfg.lr_power)
    if dist.get_rank() == 0:
        print(
            '======optimizer launch successfully , max_iteration {:}!======='.
            format(max_iteration))

    # train loop
    loss_avg = []
    start_time = glob_start_time = time.time()
    # for it in range(cfg.max_iter):
    if args.resume is not None:
        checkpoint = torch.load(args.resume, map_location='cpu')
        # .get() avoids a KeyError when a checkpoint lacks 'iter' or 'epoch'
        if checkpoint.get('iter') is not None:
            args.train_mode = 'iter'
            start_iter = checkpoint['iter']
            n_epoch = checkpoint['epoch']
        elif checkpoint.get('epoch') is not None:
            args.train_mode = 'epoch'
            start_epoch = checkpoint['epoch']
        model.module.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'], checkpoint.get('iter'))

    else:
        if args.train_mode == 'iter':
            start_iter = 0
            n_epoch = 0
        elif args.train_mode == 'epoch':
            start_epoch = 0

    if args.train_mode == 'iter':
        # note: train_loader and its sampler come from the full script;
        # they are not defined in this excerpt
        diter = iter(train_loader)
        for it in range(start_iter, cfg.max_iter):
            try:
                sample = next(diter)
            except StopIteration:
                n_epoch += 1
                sampler.set_epoch(n_epoch)
                diter = iter(train_loader)
                sample = next(diter)

            im, lb = sample['image'].cuda(), sample['label'].cuda()
            lb = torch.squeeze(lb, 1)

            optimizer.zero_grad()
            logits = model(im)
            loss = criterion(logits, lb)
            loss.backward()
            optimizer.step()

            loss_avg.append(loss.item())
            # print training log message

            if it % cfg.msg_iter == 0 and it != 0 and dist.get_rank() == 0:
                loss_avg = sum(loss_avg) / len(loss_avg)
                lr = optimizer.lr
                ed = time.time()
                t_intv, glob_t_intv = ed - start_time, ed - glob_start_time
                eta = int((max_iteration - it) * (glob_t_intv / it))
                eta = str(datetime.timedelta(seconds=eta))
                msg = ', '.join([
                    'iter: {it}/{max_iteration}',
                    'lr: {lr:4f}',
                    'loss: {loss:.4f}',
                    'eta: {eta}',
                    'time: {time:.4f}',
                ]).format(it=it,
                          max_iteration=max_iteration,
                          lr=lr,
                          loss=loss_avg,
                          time=t_intv,
                          eta=eta)
                # TODO: logger.info errors out when iter > 350000, so fall back to print
                if max_iteration > 350000:
                    print(msg)
                else:
                    logger.info(msg)
                loss_avg = []
            it += 1

            if (cfg.msg_iter is not None) and (it % cfg.msg_iter
                                               == 0) and (it != 0):
                if args.verbose:
                    logger.info('evaluating the model of iter:{}'.format(it))
                    model.eval()
                    evaluator = MscEval(cfg, args)
                    mIOU, loss = evaluator(model,
                                           criteria=criterion,
                                           multi_scale=False)
                    logger.info('mIOU is: {}, loss_eval is {}'.format(
                        mIOU, loss))

                model.cpu()
                save_name = 'iter_{}_naive_model.pth'.format(it)
                save_pth = osp.join(cfg.respth, save_name)
                state = model.module.state_dict() if hasattr(
                    model, 'module') else model.state_dict()

                checkpoint = {
                    'state_dict': state,
                    'epoch': n_epoch,
                    'iter': it,
                    'optimizer': optimizer.optim.state_dict()
                }
                if dist.get_rank() == 0:
                    torch.save(checkpoint, save_pth)
                logger.info('model of iter {} saved to: {}'.format(
                    it, save_pth))
                model.cuda()
                model.train()

    elif args.train_mode == 'epoch':
        for epoch in range(start_epoch, cfg.max_epoch):
            for i in range(args.data_dict['num_train']):
                im = torch.FloatTensor(args.data_dict['train_data'][i]).cuda()
                lb = torch.FloatTensor(args.data_dict['train_mask'][i]).cuda()
                # lb = torch.squeeze(lb, 1)

                optimizer.zero_grad()
                logits = model(im)
                loss = criterion(logits, lb)
                loss.backward()
                optimizer.step()

                loss_avg.append(loss.item())
                # print training log message

            # derive the global iteration count; `it` is never advanced in epoch mode
            it = (epoch + 1) * args.data_dict['num_train']
            if i % cfg.msg_iter == 0 and not (i == 0 and epoch
                                              == 0) and dist.get_rank() == 0:
                loss_avg = sum(loss_avg) / len(loss_avg)
                lr = optimizer.lr
                ed = time.time()
                t_intv, glob_t_intv = ed - start_time, ed - glob_start_time
                eta = int((max_iteration - it) * (glob_t_intv / it))
                eta = str(datetime.timedelta(seconds=eta))
                msg = ', '.join([
                    'iter: {it}/{max_iteration}',
                    'lr: {lr:4f}',
                    'loss: {loss:.4f}',
                    'eta: {eta}',
                    'time: {time:.4f}',
                ]).format(it=it,
                          max_iteration=max_iteration,
                          lr=lr,
                          loss=loss_avg,
                          time=t_intv,
                          eta=eta)
                logger.info(msg)
                loss_avg = []

            # save model and optimizer each epoch
            if args.verbose:
                logger.info('evaluating the model of iter:{}'.format(it))
                model.eval()
                evaluator = MscEval(cfg, args)
                mIOU, loss = evaluator(model,
                                       criteria=criterion,
                                       multi_scale=False)
                logger.info('mIOU is: {}, loss_eval is {}'.format(mIOU, loss))

            model.cpu()
            save_name = 'iter_{}_naive_model.pth'.format(it)
            save_pth = osp.join(cfg.respth, save_name)
            state = model.module.state_dict() if hasattr(
                model, 'module') else model.state_dict()

            checkpoint = {
                'state_dict': state,
                'epoch': epoch + 1,
                'iter': it,
                'optimizer': optimizer.optim.state_dict()
            }
            if dist.get_rank() == 0:
                torch.save(checkpoint, save_pth)
            logger.info('model of iter {} saved to: {}'.format(it, save_pth))
            model.cuda()
            model.train()

    else:
        raise NotImplementedError
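
The resume branch in example #3 reads 'iter' and 'epoch' keys that a given checkpoint may not contain, which is why .get() is safer than direct indexing. A minimal, self-contained sketch of the save/load round trip these scripts rely on (the tiny model and path are illustrative):

    import torch
    import torch.nn as nn

    model = nn.Linear(4, 2)  # stand-in for Retrain_Autodeeplab
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # save: the same keys the training loops above write
    torch.save({'epoch': 3, 'iter': 300,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()}, 'ckpt.pth')

    # load: .get() returns None instead of raising KeyError for a missing key
    ckpt = torch.load('ckpt.pth', map_location='cpu')
    if ckpt.get('iter') is not None:
        start_iter, n_epoch = ckpt['iter'], ckpt['epoch']
    model.load_state_dict(ckpt['state_dict'])
    optimizer.load_state_dict(ckpt['optimizer'])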
Code example #4
File: train_best.py  Project: bmhopkinson/CSN
    def __init__(self, args):
        warnings.filterwarnings('ignore')
        assert torch.cuda.is_available()
        torch.backends.cudnn.benchmark = True
        model_fname = 'data/deeplab_{0}_{1}_v3_{2}_epoch%d.pth'.format(
            args.backbone, args.dataset, args.exp)
        if args.dataset == 'pascal':
            raise NotImplementedError
        elif args.dataset == 'cityscapes':
            kwargs = {
                'num_workers': args.workers,
                'pin_memory': True,
                'drop_last': True
            }
            dataset_loader, num_classes = dataloaders.make_data_loader(
                args, **kwargs)
            args.num_classes = num_classes
        elif args.dataset == 'marsh':
            kwargs = {
                'num_workers': args.workers,
                'pin_memory': True,
                'drop_last': True
            }
            dataset_loader, val_loader, test_loader, num_classes = dataloaders.make_data_loader(
                args, **kwargs)
            args.num_classes = num_classes
        else:
            raise ValueError('Unknown dataset: {}'.format(args.dataset))

        if args.backbone == 'autodeeplab':
            model = Retrain_Autodeeplab(args)
        else:
            raise ValueError('Unknown backbone: {}'.format(args.backbone))

        if args.criterion == 'Ohem':
            args.thresh = 0.7
            args.crop_size = [args.crop_size, args.crop_size] if isinstance(
                args.crop_size, int) else args.crop_size
            args.n_min = int((args.batch_size / len(args.gpu) *
                              args.crop_size[0] * args.crop_size[1]) // 16)
        criterion = build_criterion(args)

        model = nn.DataParallel(model).cuda()
        model.train()
        if args.freeze_bn:
            for m in model.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    m.weight.requires_grad = False
                    m.bias.requires_grad = False
        optimizer = optim.SGD(model.module.parameters(),
                              lr=args.base_lr,
                              momentum=0.9,
                              weight_decay=0.0001)

        max_iteration = len(dataset_loader) * args.epochs
        scheduler = Iter_LR_Scheduler(args, max_iteration, len(dataset_loader))

        start_epoch = 0

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume:
            if os.path.isfile(args.resume):
                print('=> loading checkpoint {0}'.format(args.resume))
                checkpoint = torch.load(args.resume)
                start_epoch = checkpoint['epoch']
                model.load_state_dict(checkpoint['state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer'])
                print('=> loaded checkpoint {0} (epoch {1})'.format(
                    args.resume, checkpoint['epoch']))
                self.best_pred = checkpoint['best_pred']
            else:
                raise ValueError('=> no checkpoint found at {0}'.format(
                    args.resume))
        self.args = args
        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()
        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        #kwargs = {'num_workers': args.workers, 'pin_memory': True}
        self.train_loader, self.val_loader, self.test_loader, self.nclass = dataset_loader, val_loader, test_loader, num_classes

        self.criterion = criterion
        self.model, self.optimizer = model, optimizer

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)
        # Define lr scheduler
        #self.scheduler = scheduler
        self.scheduler = LR_Scheduler(
            "poly", args.lr, args.epochs,
            len(self.train_loader))  # note: None removed from the second parameter
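
Example #4 (like example #2) freezes BatchNorm when args.freeze_bn is set: the layers are put in eval mode so their running statistics stop updating, and their affine parameters are excluded from gradient updates. A standalone sketch of the pattern (the toy model is illustrative):

    import torch.nn as nn

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
    model.train()
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            m.eval()                        # stop updating running mean/var
            m.weight.requires_grad = False  # freeze affine scale
            m.bias.requires_grad = False    # freeze affine shift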