def main():
    """Run distributed training of the PSPNet backbone together with a PPM head.

    Steps, in order:

    1. Parse the command line and initialise the distributed environment.
    2. Build ``PSPNet`` and ``PPM`` on the GPU and create one SGD optimizer
       that covers the backbone layers (base learning rate) and the two PPM
       heads ``cls_trans`` / ``cls_quat`` (10x base learning rate).
    3. Wrap both networks in ``DistModule`` and optionally restore weights
       (``args.weight``) or full training state (``args.resume``).
    4. Build the training (and optionally validation) data pipeline.
    5. For every epoch: train, log scalars, checkpoint every
       ``args.save_step`` epochs and optionally validate.

    Rebinds the module globals ``args``, ``logger`` and ``writer``.

    Scalars and checkpoints are written by rank 0 only.  The summary writer is
    closed in a ``finally`` clause so buffered events are flushed even when
    training aborts with an exception.
    """
    global args, logger, writer
    args = get_parser().parse_args()

    # NOTE(review): 'spawn' is forced before any worker process is created;
    # presumably required by the CUDA / distributed backend -- confirm.
    import multiprocessing as mp
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)

    rank, world_size = dist_init(args.port)
    logger = get_logger()
    writer = SummaryWriter(args.save_path)

    if rank == 0:
        logger.info(args)

    # A BN group of 1 needs no communicator; otherwise the world is split into
    # groups of ``args.bn_group`` ranks.
    if args.bn_group == 1:
        args.bn_group_comm = None
    else:
        assert world_size % args.bn_group == 0
        args.bn_group_comm = simple_group_split(
            world_size, rank, world_size // args.bn_group)

    if rank == 0:
        logger.info("=> creating model ...")
        logger.info("Classes: {}".format(args.classes))

    from pspnet import PSPNet
    model = PSPNet(backbone=args.backbone,
                   layers=args.layers,
                   classes=args.classes,
                   zoom_factor=args.zoom_factor,
                   syncbn=args.syncbn,
                   group_size=args.bn_group,
                   group=args.bn_group_comm).cuda()
    logger.info(model)
    model_ppm = PPM().cuda()

    # Backbone layers use the base learning rate; the newly introduced PPM
    # heads use lr x10.  Group order is kept stable because optimizer state
    # restored on resume is matched to groups by position.
    backbone_names = ('layer0', 'layer1', 'layer2', 'layer3',
                      'layer4_ICR', 'layer4_PFR', 'layer4_PRP')
    param_groups = [{'params': getattr(model, name).parameters()}
                    for name in backbone_names]
    param_groups += [{'params': head.parameters(), 'lr': args.base_lr * 10}
                     for head in (model_ppm.cls_trans, model_ppm.cls_quat)]
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    model = DistModule(model)
    model_ppm = DistModule(model_ppm)

    cudnn.enabled = True
    cudnn.benchmark = True
    criterion = nn.L1Loss().cuda()

    if args.weight:
        def map_func(storage, location):
            # Move every restored storage onto the current CUDA device.
            return storage.cuda()

        if os.path.isfile(args.weight):
            logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight, map_location=map_func)
            model.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))

    if args.resume:
        load_state(args.resume, model, model_ppm, optimizer)

    # Per-channel mean / std, scaled by ``value_scale``.
    value_scale = 255
    mean = [0.485, 0.456, 0.406]
    mean = [item * value_scale for item in mean]
    std = [0.229, 0.224, 0.225]
    std = [item * value_scale for item in std]

    train_transform = transforms.Compose([
        transforms.Resize(size=(256, 256)),
        transforms.Crop([args.crop_h, args.crop_w], crop_type='rand',
                        padding=mean, ignore_label=args.ignore_label),
        transforms.ColorJitter([0.4, 0.4, 0.4]),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)])
    train_data = datasets.SegData(split='train',
                                  data_root=args.data_root,
                                  data_list=args.train_list,
                                  transform=train_transform)
    # Sample ordering is delegated to the sampler, hence shuffle=False.
    train_sampler = DistributedSampler(train_data)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=train_sampler)

    if args.evaluate:
        val_transform = transforms.Compose([
            transforms.Resize(size=(256, 256)),
            transforms.Crop([args.crop_h, args.crop_w], crop_type='center',
                            padding=mean, ignore_label=args.ignore_label),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)])
        val_data = datasets.SegData(split='val',
                                    data_root=args.data_root,
                                    data_list=args.val_list,
                                    transform=val_transform)
        val_sampler = DistributedSampler(val_data)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=args.batch_size_val,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True,
                                                 sampler=val_sampler)

    try:
        for epoch in range(args.start_epoch, args.epochs + 1):
            t_loss_train, r_loss_train = train(
                train_loader, model, model_ppm, criterion, optimizer, epoch,
                args.zoom_factor, args.batch_size, args.aux_weight)
            if rank == 0:
                writer.add_scalar('t_loss_train', t_loss_train, epoch)
                writer.add_scalar('r_loss_train', r_loss_train, epoch)
                # Parameter histograms are intentionally not written: doing so
                # costs a lot of time.

            if epoch % args.save_step == 0 and rank == 0:
                filename = args.save_path + '/train_epoch_' + str(epoch) + '.pth'
                logger.info('Saving checkpoint to: ' + filename)
                filename_ppm = (args.save_path + '/train_epoch_' + str(epoch)
                                + '_ppm.pth')
                # The shared optimizer state is stored in both checkpoints.
                torch.save({'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()}, filename)
                torch.save({'epoch': epoch,
                            'state_dict': model_ppm.state_dict(),
                            'optimizer': optimizer.state_dict()}, filename_ppm)

            if args.evaluate:
                # Every rank takes part in validation, but only rank 0 logs
                # the result, mirroring how the training scalars are handled.
                t_loss_val, r_loss_val = validate(
                    val_loader, model, model_ppm, criterion)
                if rank == 0:
                    writer.add_scalar('t_loss_val', t_loss_val, epoch)
                    writer.add_scalar('r_loss_val', r_loss_val, epoch)
    finally:
        writer.close()
def main():
    """Run distributed training of PSPNet on the VOC12 dataset wrapper.

    Steps, in order:

    1. Parse the command line, initialise the distributed environment and
       check the basic argument constraints.
    2. Build ``PSPNet`` (``use_softmax=False``, ``use_aux=False``) on the GPU
       and an SGD optimizer in which ``layer0``..``layer4`` use the base
       learning rate and ``ppm`` / ``cls`` / ``result`` use 10x that rate.
    3. Wrap the network in ``DistModule`` and optionally restore weights
       (``args.weight``) or full training state (``args.resume``).
    4. Build the training data pipeline on ``voc12.data.VOC12ClsDataset``.
    5. For every epoch: train, log the loss and checkpoint every
       ``args.save_step`` epochs (rank 0 only).

    Rebinds the module globals ``args``, ``logger`` and ``writer``.  The
    summary writer is closed in a ``finally`` clause so buffered events are
    flushed when training finishes or aborts.
    """
    global args, logger, writer
    args = get_parser().parse_args()

    # NOTE(review): 'spawn' is forced before any worker process is created;
    # presumably required by the CUDA / distributed backend -- confirm.
    import multiprocessing as mp
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)

    rank, world_size = dist_init(args.port)
    logger = get_logger()
    writer = SummaryWriter(args.save_path)

    if rank == 0:
        logger.info(args)

    assert args.classes > 1
    assert args.zoom_factor in [1, 2, 4, 8]
    assert (args.crop_h - 1) % 8 == 0 and (args.crop_w - 1) % 8 == 0
    assert args.net_type in [0, 1, 2, 3]

    # A BN group of 1 needs no communicator; otherwise the world is split into
    # groups of ``args.bn_group`` ranks.
    if args.bn_group == 1:
        args.bn_group_comm = None
    else:
        assert world_size % args.bn_group == 0
        args.bn_group_comm = simple_group_split(
            world_size, rank, world_size // args.bn_group)

    if rank == 0:
        logger.info("=> creating model ...")
        logger.info("Classes: {}".format(args.classes))

    from pspnet import PSPNet
    model = PSPNet(backbone=args.backbone,
                   layers=args.layers,
                   classes=args.classes,
                   zoom_factor=args.zoom_factor,
                   syncbn=args.syncbn,
                   group_size=args.bn_group,
                   group=args.bn_group_comm,
                   use_softmax=False,
                   use_aux=False).cuda()
    logger.info(model)

    # Backbone layers use the base learning rate; the newly introduced layers
    # use lr x10.  Group order is kept stable because optimizer state restored
    # on resume is matched to groups by position.
    backbone_names = ('layer0', 'layer1', 'layer2', 'layer3', 'layer4')
    head_names = ('ppm', 'cls', 'result')
    param_groups = [{'params': getattr(model, name).parameters()}
                    for name in backbone_names]
    param_groups += [{'params': getattr(model, name).parameters(),
                      'lr': args.base_lr * 10}
                     for name in head_names]
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    model = DistModule(model)

    cudnn.enabled = True
    cudnn.benchmark = True
    criterion = nn.NLLLoss(ignore_index=args.ignore_label).cuda()

    if args.weight:
        def map_func(storage, location):
            # Move every restored storage onto the current CUDA device.
            return storage.cuda()

        if os.path.isfile(args.weight):
            logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight, map_location=map_func)
            model.load_state_dict(checkpoint['state_dict'])
            logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))

    if args.resume:
        load_state(args.resume, model, optimizer)

    # Normalisation is handled by the project's ``Normalize`` with its default
    # arguments; the previously computed (and unused) mean/std lists are gone.
    normalize = Normalize()

    train_data = voc12.data.VOC12ClsDataset(
        args.train_list,
        voc12_root=args.voc12_root,
        transform=transforms.Compose([
            imutils.RandomResizeLong(400, 512),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.3, contrast=0.3,
                                   saturation=0.3, hue=0.1),
            np.asarray,
            normalize,
            imutils.RandomCrop(args.crop_size),
            imutils.HWC_to_CHW,
            torch.from_numpy,
        ]))
    # Sample ordering is delegated to the sampler, hence shuffle=False.
    train_sampler = DistributedSampler(train_data)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=train_sampler)

    try:
        for epoch in range(args.start_epoch, args.epochs + 1):
            loss_train = train(train_loader, model, criterion, optimizer,
                               epoch, args.zoom_factor, args.batch_size,
                               args.aux_weight)
            if rank == 0:
                writer.add_scalar('loss_train', loss_train, epoch)
                # Parameter histograms are intentionally not written: doing so
                # costs a lot of time.

            if epoch % args.save_step == 0 and rank == 0:
                filename = args.save_path + '/train_epoch_' + str(epoch) + '.pth'
                logger.info('Saving checkpoint to: ' + filename)
                torch.save({'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()}, filename)
    finally:
        # The writer was never closed before; close it so pending events are
        # flushed to disk.
        writer.close()
def main():
    """Run distributed training of a PSPNet variant selected by ``args.net_type``.

    Steps, in order:

    1. Parse the command line, initialise the distributed environment and
       check the basic argument constraints.
    2. Build the network: ``pspnet.PSPNet`` for ``net_type == 0`` and
       ``pspnet_div4.PSPNet`` for ``net_type`` 1, 2 or 3.
    3. Create an SGD optimizer whose parameter groups depend on the net type
       (backbone layers at the base learning rate, head layers at 10x).
    4. Load the auxiliary ``V11RFCN`` network from ``checkpoint_e8.pth`` and
       move it to the GPU; it is handed to ``train`` every epoch.
    5. Wrap the main network in ``DistModule`` and optionally restore weights
       (``args.weight``, skipping every key containing ``'ppm'``) or full
       training state (``args.resume``).
    6. Build the data pipeline and run the epoch loop: train, log scalars,
       checkpoint every ``args.save_step`` epochs and optionally validate.

    Rebinds the module globals ``args``, ``logger`` and ``writer``.

    Scalars and checkpoints are written by rank 0 only.  The summary writer is
    closed in a ``finally`` clause so buffered events are flushed when
    training finishes or aborts.
    """
    global args, logger, writer
    args = get_parser().parse_args()

    # NOTE(review): 'spawn' is forced before any worker process is created;
    # presumably required by the CUDA / distributed backend -- confirm.
    import multiprocessing as mp
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)

    rank, world_size = dist_init(args.port)
    logger = get_logger()
    writer = SummaryWriter(args.save_path)

    if rank == 0:
        logger.info(args)

    assert args.classes > 1
    assert args.zoom_factor in [1, 2, 4, 8]
    assert (args.crop_h - 1) % 8 == 0 and (args.crop_w - 1) % 8 == 0
    assert args.net_type in [0, 1, 2, 3]

    # A BN group of 1 needs no communicator; otherwise the world is split into
    # groups of ``args.bn_group`` ranks.
    if args.bn_group == 1:
        args.bn_group_comm = None
    else:
        assert world_size % args.bn_group == 0
        args.bn_group_comm = simple_group_split(
            world_size, rank, world_size // args.bn_group)

    if rank == 0:
        logger.info("=> creating model ...")
        logger.info("Classes: {}".format(args.classes))

    if args.net_type == 0:
        from pspnet import PSPNet
        model = PSPNet(backbone=args.backbone,
                       layers=args.layers,
                       classes=args.classes,
                       zoom_factor=args.zoom_factor,
                       syncbn=args.syncbn,
                       group_size=args.bn_group,
                       group=args.bn_group_comm).cuda()
    elif args.net_type in [1, 2, 3]:
        from pspnet_div4 import PSPNet
        model = PSPNet(backbone=args.backbone,
                       layers=args.layers,
                       classes=args.classes,
                       zoom_factor=args.zoom_factor,
                       syncbn=args.syncbn,
                       group_size=args.bn_group,
                       group=args.bn_group_comm,
                       net_type=args.net_type).cuda()
    logger.info(model)

    # Sub-modules per net type: (base-lr layers, lr x10 layers).  The order
    # inside each tuple is the parameter-group order; it must stay stable
    # because optimizer state restored on resume is matched by position.
    backbone = ('layer0', 'layer1', 'layer2', 'layer3', 'layer4')
    backbone_p = backbone + ('layer4_p',)
    group_layout = {
        0: (backbone,
            ('ppm', 'conv6', 'conv1_1x1', 'cls', 'aux')),
        1: (backbone_p,
            ('ppm', 'ppm_p', 'cls', 'cls_p', 'aux')),
        2: (backbone_p,
            ('ppm', 'ppm_p', 'cls', 'cls_p', 'att', 'att_p', 'aux')),
        3: (backbone_p,
            ('ppm', 'ppm_p', 'cls', 'cls_p', 'att', 'aux')),
    }
    base_names, boosted_names = group_layout[args.net_type]
    param_groups = [{'params': getattr(model, name).parameters()}
                    for name in base_names]
    param_groups += [{'params': getattr(model, name).parameters(),
                      'lr': args.base_lr * 10}
                     for name in boosted_names]
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Auxiliary network: copy over only the checkpoint entries whose keys
    # exist in the freshly built module.  The checkpoint is deserialised onto
    # the CPU first; without ``map_location`` torch.load restores tensors onto
    # the device they were saved from, which in a multi-process run means
    # every rank allocates on that one device before ``.cuda()`` is reached.
    fcw = V11RFCN()
    fcw_model = torch.load(
        'checkpoint_e8.pth',
        map_location=lambda storage, location: storage)['state_dict']
    fcw_dict = fcw.state_dict()
    pretrained_fcw = {k: v for k, v in fcw_model.items() if k in fcw_dict}
    fcw_dict.update(pretrained_fcw)
    fcw.load_state_dict(fcw_dict)
    fcw = fcw.cuda()

    model = DistModule(model)

    cudnn.enabled = True
    cudnn.benchmark = True
    criterion = nn.NLLLoss(ignore_index=args.ignore_label).cuda()

    if args.weight:
        def map_func(storage, location):
            # Move every restored storage onto the current CUDA device.
            return storage.cuda()

        if os.path.isfile(args.weight):
            logger.info("=> loading weight '{}'".format(args.weight))
            checkpoint = torch.load(
                args.weight, map_location=map_func)['state_dict']
            # Entries whose key contains 'ppm' are not restored; the model
            # keeps its own values for them.
            checkpoint = {k: v for k, v in checkpoint.items()
                          if 'ppm' not in k}
            model_dict = model.state_dict()
            model_dict.update(checkpoint)
            model.load_state_dict(model_dict)
            logger.info("=> loaded weight '{}'".format(args.weight))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))

    if args.resume:
        load_state(args.resume, model, optimizer)

    # Per-channel mean / std, scaled by ``value_scale``.
    value_scale = 255
    mean = [0.485, 0.456, 0.406]
    mean = [item * value_scale for item in mean]
    std = [0.229, 0.224, 0.225]
    std = [item * value_scale for item in std]

    train_transform = transforms.Compose([
        transforms.RandScale([args.scale_min, args.scale_max]),
        transforms.RandomGaussianBlur(),
        transforms.RandomHorizontalFlip(),
        transforms.Crop([args.crop_h, args.crop_w], crop_type='rand',
                        padding=mean, ignore_label=args.ignore_label),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std)])
    train_data = datasets.SegData(split='train',
                                  data_root=args.data_root,
                                  data_list=args.train_list,
                                  transform=train_transform)
    # Sample ordering is delegated to the sampler, hence shuffle=False.
    train_sampler = DistributedSampler(train_data)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=train_sampler)

    if args.evaluate:
        val_transform = transforms.Compose([
            transforms.Crop([args.crop_h, args.crop_w], crop_type='center',
                            padding=mean, ignore_label=args.ignore_label),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)])
        val_data = datasets.SegData(split='val',
                                    data_root=args.data_root,
                                    data_list=args.val_list,
                                    transform=val_transform)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=args.batch_size_val,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True)

    try:
        for epoch in range(args.start_epoch, args.epochs + 1):
            loss_train, mIoU_train, mAcc_train, allAcc_train = train(
                train_loader, model, criterion, optimizer, epoch,
                args.zoom_factor, args.batch_size, args.aux_weight, fcw)
            if rank == 0:
                writer.add_scalar('loss_train', loss_train, epoch)
                writer.add_scalar('mIoU_train', mIoU_train, epoch)
                writer.add_scalar('mAcc_train', mAcc_train, epoch)
                writer.add_scalar('allAcc_train', allAcc_train, epoch)
                # Parameter histograms are intentionally not written: doing so
                # costs a lot of time.

            if epoch % args.save_step == 0 and rank == 0:
                filename = args.save_path + '/train_epoch_' + str(epoch) + '.pth'
                logger.info('Saving checkpoint to: ' + filename)
                torch.save({'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()}, filename)

            if args.evaluate:
                # Every rank runs validation, but only rank 0 logs the result,
                # mirroring how the training scalars are handled.
                loss_val, mIoU_val, mAcc_val, allAcc_val = validate(
                    val_loader, model, criterion, args.classes,
                    args.zoom_factor)
                if rank == 0:
                    writer.add_scalar('loss_val', loss_val, epoch)
                    writer.add_scalar('mIoU_val', mIoU_val, epoch)
                    writer.add_scalar('mAcc_val', mAcc_val, epoch)
                    writer.add_scalar('allAcc_val', allAcc_val, epoch)
    finally:
        # The writer was never closed before; close it so pending events are
        # flushed to disk.
        writer.close()
def main():
    """Run distributed training of the two-branch (RGB + flow) ``PSP2S`` model.

    Steps, in order:

    1. Parse the command line, initialise the distributed environment and
       check the basic argument constraints.
    2. Build ``PSP2S`` on the GPU (only ``net_type == 0`` is implemented) and
       an SGD optimizer over both branches; within each branch the ``ppm`` and
       ``cls_ins`` sub-modules use 10x the base learning rate.
    3. Optionally load per-branch weights from ``args.weight`` (RGB) and
       ``args.weight_flow`` (flow), stripping the ``'module.'`` key prefix.
    4. Wrap the network in ``DistModule`` and optionally restore the full
       training state (``args.resume``).
    5. Select the training dataset from ``args.dataset_name`` and run the
       epoch loop: train, log scalars, checkpoint every ``args.save_step``
       epochs and optionally validate.

    Rebinds the module globals ``args``, ``logger`` and ``writer``.

    Scalars and checkpoints are written by rank 0 only.  The summary writer is
    closed in a ``finally`` clause so buffered events are flushed when
    training finishes or aborts.

    Raises:
        NotImplementedError: if ``args.net_type`` is not 0.
        ValueError: if ``args.dataset_name`` is not one of ``coco``, ``voc``,
            ``youtube`` or ``youtube_and_davis``.
    """
    global args, logger, writer
    args = get_parser().parse_args()

    # NOTE(review): 'spawn' is forced before any worker process is created;
    # presumably required by the CUDA / distributed backend -- confirm.
    import multiprocessing as mp
    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn', force=True)

    rank, world_size = dist_init(args.port)
    logger = get_logger()
    writer = SummaryWriter(args.save_path)

    if rank == 0:
        logger.info(args)

    assert args.classes > 1
    assert args.zoom_factor in [1, 2, 4, 8]
    assert (args.crop_h - 1) % 8 == 0 and (args.crop_w - 1) % 8 == 0
    assert args.net_type in [0, 1, 2, 3]

    # A BN group of 1 needs no communicator; otherwise the world is split into
    # groups of ``args.bn_group`` ranks.
    if args.bn_group == 1:
        args.bn_group_comm = None
    else:
        assert world_size % args.bn_group == 0
        args.bn_group_comm = simple_group_split(
            world_size, rank, world_size // args.bn_group)

    if rank == 0:
        logger.info("=> creating two branch model ...")
        logger.info("Classes: {}".format(args.classes))

    if args.net_type == 0:
        from pspnet2S import PSP2S
        model = PSP2S(backbone=args.backbone,
                      layers=args.layers,
                      classes=args.classes,
                      zoom_factor=args.zoom_factor,
                      syncbn=args.syncbn,
                      group_size=args.bn_group,
                      group=args.bn_group_comm).cuda()
    else:
        # The argument check admits net types 1-3, but no model exists for
        # them here; fail with a clear message instead of an unbound-variable
        # error further down.
        raise NotImplementedError(
            "two branch training only supports net_type 0, got {}".format(
                args.net_type))
    logger.info(model)

    # Both branches share the same layout apart from the name of their first
    # stem layer.  ``ppm`` and ``cls_ins`` use lr x10.  Groups are emitted RGB
    # branch first, then flow branch; the order must stay stable because
    # optimizer state restored on resume is matched by position.
    shared_base_names = ('layer0_ins', 'layer0_conv1', 'layer1', 'layer2',
                         'layer3', 'layer4', 'conv6', 'conv1_1x1')
    boosted_names = ('ppm', 'cls_ins')
    param_groups = []
    for branch, stem_name in ((model.rgb_branch, 'layer0_img'),
                              (model.flow_branch, 'layer0_flo')):
        for name in (stem_name,) + shared_base_names:
            param_groups.append({'params': getattr(branch, name).parameters()})
        for name in boosted_names:
            param_groups.append({'params': getattr(branch, name).parameters(),
                                 'lr': args.base_lr * 10})
    optimizer = torch.optim.SGD(param_groups,
                                lr=args.base_lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    cudnn.enabled = True
    cudnn.benchmark = True
    criterion = nn.NLLLoss(ignore_index=args.ignore_label).cuda()

    if args.weight:
        def map_func(storage, location):
            # Move every restored storage onto the current CUDA device.
            return storage.cuda()

        if os.path.isfile(args.weight):
            # Load RGB branch parameters.
            logger.info("=> loading rgb weight '{}'".format(args.weight))
            checkpoint = torch.load(args.weight, map_location=map_func)
            new_dict = remove_prefix(checkpoint['state_dict'], 'module.')
            model.rgb_branch.load_state_dict(new_dict, strict=True)
            logger.info("=> loaded rgb weight '{}'".format(args.weight))

            # Load flow branch parameters.
            logger.info("=> loading flow weight '{}'".format(args.weight_flow))
            checkpoint_flow = torch.load(args.weight_flow,
                                         map_location=map_func)
            new_dict_flow = remove_prefix(checkpoint_flow['state_dict'],
                                          'module.')
            model.flow_branch.load_state_dict(new_dict_flow, strict=True)
            logger.info("=> loaded flow weight '{}'".format(args.weight_flow))
        else:
            logger.info("=> no weight found at '{}'".format(args.weight))

    # Wrapping happens after the per-branch weights were loaded into the bare
    # module.
    model = DistModule(model)

    if args.resume:
        load_state(args.resume, model, optimizer)

    # Per-channel mean / std, scaled by ``value_scale``.
    value_scale = 255
    mean = [0.485, 0.456, 0.406]
    mean = [item * value_scale for item in mean]
    std = [0.229, 0.224, 0.225]
    std = [item * value_scale for item in std]

    if args.dataset_name == 'coco':
        logger.info("=> load coco patch datasets....")
        train_data = datasets.PatchDataSet(
            dataset_path='/mnt/lustre/share/sunpeng/voc_coco_label/coco/',
            data_list='train2017',
            norm=[mean, std])
    elif args.dataset_name == 'voc':
        logger.info("=> load voc patch datasets....")
        train_data = datasets.PatchDataSet(
            dataset_path='/mnt/lustre/share/sunpeng/voc_coco_label/voc/',
            data_list='train_val_ins',
            norm=[mean, std])
    elif args.dataset_name == 'youtube':
        logger.info("=> load youtube shuffle three pairs patch datasets....")
        train_data = datasets.YoutubePatchDataSet(
            dataset_path='',
            gt_path='',
            data_list='/mnt/lustre/sunpeng/Research/video-seg-workshop/models/deeplabv3_models/final_2s_seg_youtube_online_two_finetune_first_frame/add_first_frame_final_0823.txt',
            norm=[mean, std])
    elif args.dataset_name == 'youtube_and_davis':
        logger.info("=> load youtube and davis shuffle three pairs patch datasets....")
        train_data = datasets.YoutubePatchDataSet(
            dataset_path='',
            gt_path='',
            data_list='/mnt/lustre/share/sunpeng/video-seg-workshop/davis/DAVIS/train_shuffle_davis.txt',
            norm=[mean, std])
    else:
        # A real exception instead of ``assert 0 == 1``: asserts are stripped
        # under ``python -O``, which would let execution continue with
        # ``train_data`` undefined.
        message = ("=> unknown dataset name '{}', please input coco, voc, "
                   "youtube or youtube_and_davis.".format(args.dataset_name))
        logger.info(message)
        raise ValueError(message)

    # Sample ordering is delegated to the sampler, hence shuffle=False.
    train_sampler = DistributedSampler(train_data)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=False,
                                               num_workers=args.workers,
                                               pin_memory=False,
                                               sampler=train_sampler)

    if args.evaluate:
        val_transform = transforms.Compose([
            transforms.Crop([args.crop_h, args.crop_w], crop_type='center',
                            padding=mean, ignore_label=args.ignore_label),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)])
        val_data = datasets.SegData(split='val',
                                    data_root=args.data_root,
                                    data_list=args.val_list,
                                    transform=val_transform)
        val_loader = torch.utils.data.DataLoader(val_data,
                                                 batch_size=args.batch_size_val,
                                                 shuffle=False,
                                                 num_workers=args.workers,
                                                 pin_memory=True)

    try:
        for epoch in range(args.start_epoch, args.epochs + 1):
            loss_train, mIoU_train, mAcc_train, allAcc_train = train(
                train_loader, model, criterion, optimizer, epoch,
                args.zoom_factor, args.batch_size, args.ins_weight)
            if rank == 0:
                writer.add_scalar('loss_train', loss_train, epoch)
                writer.add_scalar('mIoU_train', mIoU_train, epoch)
                writer.add_scalar('mAcc_train', mAcc_train, epoch)
                writer.add_scalar('allAcc_train', allAcc_train, epoch)
                # Parameter histograms are intentionally not written: doing so
                # costs a lot of time.

            if epoch % args.save_step == 0 and rank == 0:
                filename = args.save_path + '/train_epoch_' + str(epoch) + '.pth'
                logger.info('Saving checkpoint to: ' + filename)
                torch.save({'epoch': epoch,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()}, filename)

            if args.evaluate:
                # Every rank runs validation, but only rank 0 logs the result,
                # mirroring how the training scalars are handled.
                loss_val, mIoU_val, mAcc_val, allAcc_val = validate(
                    val_loader, model, criterion, args.classes,
                    args.zoom_factor)
                if rank == 0:
                    writer.add_scalar('loss_val', loss_val, epoch)
                    writer.add_scalar('mIoU_val', mIoU_val, epoch)
                    writer.add_scalar('mAcc_val', mAcc_val, epoch)
                    writer.add_scalar('allAcc_val', allAcc_val, epoch)
    finally:
        # The writer was never closed before; close it so pending events are
        # flushed to disk.
        writer.close()