def main():
    """Train a PSPNet segmentation model, optionally in distributed mode.

    Steps, in order:
      1. Parse command-line arguments and create the logger and the
         summary writer (all three are published as module globals).
      2. Initialise the distributed backend when ``args.dist`` is set.
      3. Build the PSPNet model and an SGD optimizer in which the
         ``ppm``, ``cls`` and ``aux`` modules use 10x the base learning rate.
      4. Optionally load weights (``args.weight``) and/or resume from a
         checkpoint (``args.resume``).
      5. Build the training loader (and the validation loader when
         ``args.evaluate`` is set).
      6. Run the epoch loop: train, log scalars, validate on rank 0, and
         save a checkpoint on rank 0 every ``args.save_step`` epochs.

    Raises:
        AssertionError: if ``args.classes <= 1``, ``args.zoom_factor`` is
            not one of 1/2/4/8, or ``crop_h - 1`` / ``crop_w - 1`` is not a
            multiple of 8.
    """
    global args, logger, writer
    args = get_parser().parse_args()
    logger = get_logger()
    writer = SummaryWriter(args.save_path)

    if args.dist:
        dist_init(args.port, backend=args.backend)
    if len(args.gpu) == 1:
        # Only affects the value logged just below; args.syncbn is
        # reassigned from args.bn_group further down.
        args.syncbn = False
    logger.info(args)

    assert args.classes > 1
    assert args.zoom_factor in [1, 2, 4, 8]
    assert (args.crop_h - 1) % 8 == 0 and (args.crop_w - 1) % 8 == 0

    world_size = 1
    rank = 0
    if args.dist:
        rank = dist.get_rank()
        world_size = dist.get_world_size()
    if rank == 0:
        logger.info('dist:{}'.format(args.dist))
        logger.info("=> creating model ...")
        logger.info("Classes: {}".format(args.classes))

    # Synchronised batch norm is driven purely by bn_group.
    if args.bn_group > 1:
        args.syncbn = True
        bn_sync_stats = True
        bn_group_comm = simple_group_split(world_size, rank,
                                           world_size // args.bn_group)
    else:
        args.syncbn = False
        bn_sync_stats = False
        bn_group_comm = None

    model = PSPNet(layers=args.layers,
                   classes=args.classes,
                   zoom_factor=args.zoom_factor,
                   syncbn=args.syncbn,
                   group_size=args.bn_group,
                   group=bn_group_comm,
                   sync_stats=bn_sync_stats)
    if rank == 0:
        logger.info(model)

    # Newly introduced layers (ppm / cls / aux) train with lr x10; the
    # layer0..layer4 groups use the base lr. Group order is kept stable
    # (layer0..layer4, ppm, cls, aux) because optimizer state and any
    # index-based lr scheduling depend on it.
    base_lr_modules = [model.layer0, model.layer1, model.layer2,
                       model.layer3, model.layer4]
    boosted_lr_modules = [model.ppm, model.cls, model.aux]
    param_groups = [{'params': m.parameters()} for m in base_lr_modules]
    param_groups += [{'params': m.parameters(), 'lr': args.base_lr * 10}
                     for m in boosted_lr_modules]
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    model = model.cuda()
    cudnn.enabled = True
    cudnn.benchmark = True
    criterion = nn.NLLLoss(ignore_index=args.ignore_label)

    if args.weight:

        def map_func(storage, location):
            # Place every loaded storage on the current CUDA device.
            return storage.cuda()

        if os.path.isfile(args.weight):
            logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight, map_location=map_func)
            model.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))

    if args.resume:
        if os.path.isfile(args.resume):
            logger.info("=> loading checkpoint '{}'".format(args.resume))
            model, optimizer, args.start_epoch = restore_from(
                model, optimizer, args.resume)
            logger.info("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, args.start_epoch))
        else:
            logger.info("=> no checkpoint found at '{}'".format(args.resume))

    if args.dist:
        broadcast_params(model)

    # Normalisation statistics scaled to the 0..255 pixel range.
    value_scale = 255
    mean = [item * value_scale for item in [0.485, 0.456, 0.406]]
    std = [item * value_scale for item in [0.229, 0.224, 0.225]]

    train_transform = transforms.Compose([
        transforms.RandScale([args.scale_min, args.scale_max]),
        transforms.RandRotate([args.rotate_min, args.rotate_max],
                              padding=mean,
                              ignore_label=args.ignore_label),
        transforms.RandomGaussianBlur(),
        transforms.RandomHorizontalFlip(),
        transforms.Crop([args.crop_h, args.crop_w],
                        crop_type='rand',
                        padding=mean,
                        ignore_label=args.ignore_label),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)
    ])
    train_data = datasets.SegData(split='train',
                                  data_root=args.data_root,
                                  data_list=args.train_list,
                                  transform=train_transform)
    train_sampler = None
    if args.dist:
        train_sampler = DistributedSampler(train_data)
    # NOTE(review): if DistributedSampler is torch's implementation, it
    # expects set_epoch(epoch) each epoch to reshuffle -- confirm.
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        # Shuffle only when no sampler is supplied; test identity rather
        # than truthiness so an empty sampler is not mistaken for "none".
        shuffle=train_sampler is None,
        num_workers=args.workers,
        pin_memory=False,
        sampler=train_sampler)

    if args.evaluate:
        val_transform = transforms.Compose([
            transforms.Crop([args.crop_h, args.crop_w],
                            crop_type='center',
                            padding=mean,
                            ignore_label=args.ignore_label),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])
        val_data = datasets.SegData(split='val',
                                    data_root=args.data_root,
                                    data_list=args.val_list,
                                    transform=val_transform)
        val_sampler = None
        if args.dist:
            val_sampler = DistributedSampler(val_data)
        val_loader = torch.utils.data.DataLoader(
            val_data,
            batch_size=args.batch_size_val,
            shuffle=False,
            num_workers=args.workers,
            pin_memory=False,
            sampler=val_sampler)

    for epoch in range(args.start_epoch, args.epochs + 1):
        loss_train, mIoU_train, mAcc_train, allAcc_train = train(
            train_loader, model, criterion, optimizer, epoch,
            args.zoom_factor, args.batch_size, args.aux_weight)
        writer.add_scalar('loss_train', loss_train.cpu().numpy(), epoch)
        writer.add_scalar('mIoU_train', mIoU_train, epoch)
        writer.add_scalar('mAcc_train', mAcc_train, epoch)
        writer.add_scalar('allAcc_train', allAcc_train, epoch)
        # Parameter histograms are deliberately not written: doing so
        # costs a lot of time.

        if args.evaluate and rank == 0:
            loss_val, mIoU_val, mAcc_val, allAcc_val = validate(
                val_loader, model, criterion, args.classes, args.zoom_factor)
            writer.add_scalar('loss_val', loss_val.cpu().numpy(), epoch)
            writer.add_scalar('mIoU_val', mIoU_val, epoch)
            writer.add_scalar('mAcc_val', mAcc_val, epoch)
            writer.add_scalar('allAcc_val', allAcc_val, epoch)

        if epoch % args.save_step == 0 and rank == 0:
            filename = args.save_path + '/train_epoch_' + str(epoch) + '.pth'
            logger.info('Saving checkpoint to: ' + filename)
            # Copy the weights to the CPU for serialization WITHOUT calling
            # model.cpu(): Module.cpu() moves the live parameters in place,
            # which would leave the model on the CPU for all later epochs.
            cpu_state_dict = {
                key: tensor.cpu()
                for key, tensor in model.state_dict().items()
            }
            torch.save(
                {
                    'epoch': epoch,
                    'state_dict': cpu_state_dict,
                    'optimizer': optimizer.state_dict()
                }, filename)
def load_model(filepath):
    """Build a PSPNet on the CPU and load weights from ``filepath``.

    The file may contain either a bare state dict or a training checkpoint
    dict that stores the weights under the ``'state_dict'`` key (the layout
    written by ``main`` in this file).

    Args:
        filepath: Path to the saved weights / checkpoint file.

    Returns:
        The PSPNet instance with the loaded weights, on the CPU.
    """
    net = PSPNet(pretrained=False)
    net.cpu()
    # NOTE: torch.load unpickles the file; only load checkpoints from
    # trusted sources.
    loaded = torch.load(filepath, map_location='cpu')
    # Unwrap full training checkpoints so they load the same way as a
    # bare state dict.
    if isinstance(loaded, dict) and 'state_dict' in loaded:
        loaded = loaded['state_dict']
    net.load_state_dict(loaded)
    return net