def dataloader(bs, gpus):
    """Build a DataLoader over the 'cityscapes_semseg_val' roidb.

    The roidb is wrapped in a ``RoiDataLoader`` with 19 classes and
    ``training=True``, sampled through a ``MinibatchSampler`` and collated
    with ``collate_minibatch_semseg``.

    Args:
        bs: Batch size handed to the DataLoader.
        gpus: Used as the DataLoader's ``num_workers``.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    entries, aspect_ratios, aspect_order = combined_roidb_for_training_semseg(
        'cityscapes_semseg_val')
    # Sampler is created before the dataset, as in the original ordering.
    batch_sampler = MinibatchSampler(aspect_ratios, aspect_order)
    roi_dataset = RoiDataLoader(entries, 19, training=True)
    return torch.utils.data.DataLoader(
        roi_dataset,
        batch_size=bs,
        sampler=batch_sampler,
        num_workers=gpus,
        collate_fn=collate_minibatch_semseg)
def dataloader(bs, gpus):
    """Build a DataLoader over the 'cityscapes_semseg_train' roidb.

    The roidb is wrapped in a ``RoiDataLoader`` with 19 classes and
    ``training=True``, sampled through a ``MinibatchSampler`` and collated
    with ``collate_minibatch_semseg``.

    Args:
        bs: Batch size handed to the DataLoader.
        gpus: Used as the DataLoader's ``num_workers``.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg(
        'cityscapes_semseg_train')
    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, 19, training=True)
    # The random-tensor return that used to follow this statement could never
    # execute, so it has been dropped.
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=bs,
        sampler=sampler,
        num_workers=gpus,
        collate_fn=collate_minibatch_semseg)
def main():
    """Train the model named by ``cfg.MODEL.TYPE`` on the selected dataset.

    Steps, in order: parse the command line, select GPUs, pick the dataset
    and class count, merge the config file and overrides, rescale the batch
    size and base learning rate, build the roidb and DataLoader, construct
    the model (optionally loading pretrained weights), build the optimizer,
    optionally restore a checkpoint, then run the epoch loop with periodic
    checkpointing.

    Raises:
        SystemExit: if no CUDA device is available.
        ValueError: if CUDA was not requested, ``args.dataset`` is unknown,
            or ``cfg.SOLVER.TYPE`` is not a supported solver.
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    # Export the device mask before the first torch.cuda query; setting it
    # after CUDA has been initialised may have no effect.
    use_cuda = args.cuda or cfg.NUM_GPUS > 0
    if use_cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(ids) for ids in args.device_ids)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")
    if not use_cuda:
        raise ValueError("Need Cuda device to run !")
    torch.backends.cudnn.benchmark = True
    cfg.CUDA = True

    # args.dataset -> (cfg.TRAIN.DATASETS, cfg.MODEL.NUM_CLASSES)
    dataset_table = {
        "coco2017": (('coco_2017_train',), 81),
        "keypoints_coco2017": (('keypoints_coco_2017_train',), 2),
        "cityscapes": (('cityscapes_semseg_train',), 19),
        "cityscape_train_on_val": (('cityscape_train_on_val',), 19),
        "cityscapes_coarse": (('cityscapes_coarse',), 19),
        "cityscapes_all": (('cityscapes_all',), 19),
        "cityscapes_trainval": (('cityscapes_trainval',), 19),
        "cityscapes_fineturn": (('cityscapes_fineturn',), 19),
    }
    if args.dataset not in dataset_table:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))
    cfg.TRAIN.DATASETS, cfg.MODEL.NUM_CLASSES = dataset_table[args.dataset]

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning based on batch size change linearly
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format(
        old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    timers = defaultdict(Timer)

    ### Dataset ###
    semseg_or_disp = cfg.SEM.SEM_ON or cfg.DISP.DISP_ON
    timers['roidb'].tic()
    # The ratio list/index are not needed here: the loader shuffles instead
    # of using a MinibatchSampler.
    if semseg_or_disp:
        roidb, _, _ = combined_roidb_for_training_semseg(cfg.TRAIN.DATASETS)
    else:
        roidb, _, _ = combined_roidb_for_training(
            cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    train_size = len(roidb)
    logger.info('{:d} roidb entries'.format(train_size))
    logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time)

    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=None,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch_semseg_all if semseg_or_disp else collate_minibatch,
        drop_last=False,
        shuffle=True,
        pin_memory=True)

    assert_and_infer_cfg()

    ### Model ###
    # NOTE(review): eval() on a config string executes arbitrary code if the
    # config is untrusted; consider an explicit name -> class registry.
    maskRCNN = eval(cfg.MODEL.TYPE)()

    if len(cfg.SEM.PSPNET_PRETRAINED_WEIGHTS) > 1:
        print("loading pspnet weights")
        pretrained = torch.load(cfg.SEM.PSPNET_PRETRAINED_WEIGHTS,
                                map_location=lambda storage, loc: storage)
        pretrained = pretrained['model']
        if cfg.SEM.SPN_ON:
            maskRCNN.pspnet.load_state_dict(pretrained, strict=True)
        elif 'deeplab' in cfg.SEM.DECODER_TYPE:
            # Keep only non-decoder entries and strip the 'encoder.' prefix so
            # the keys line up with maskRCNN.encoder.
            encoder = {
                k.replace('encoder.', ''): v
                for k, v in pretrained.items()
                if 'decoder' not in k
            }
            maskRCNN.encoder.load_state_dict(encoder, strict=True)
            del encoder
        else:
            maskRCNN.load_state_dict(pretrained, strict=True)
        del pretrained
        print("weights load success")

    if cfg.SEM.SPN_ON:
        # Freeze the pspnet sub-network.
        maskRCNN.pspnet.eval()
        for p in maskRCNN.pspnet.parameters():
            p.requires_grad = False

    # load nets into gpu
    maskRCNN = UserScatteredDataParallel(maskRCNN)
    # For sync bn
    patch_replication_callback(maskRCNN)
    if cfg.CUDA:
        maskRCNN.to('cuda')

    ### Optimizer ###
    bias_params = []
    nonbias_params = []
    for key, value in maskRCNN.named_parameters():
        if not value.requires_grad:
            continue
        if 'bias' in key:
            bias_params.append(value)
        else:
            nonbias_params.append(value)
    params = [
        {'params': nonbias_params,
         'lr': cfg.SOLVER.BASE_LR,
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY},
        {'params': bias_params,
         'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
         'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0},
    ]

    lr_scheduler = None  # only set for SGD + ReduceLROnPlateau
    if cfg.SOLVER.TYPE == "SGD":
        optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM)
        if cfg.SOLVER.LR_POLICY == 'ReduceLROnPlateau':
            lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer, 'min', patience=10)
            print("Using ReduceLROnPlateau as Lr reduce policy!")
        else:
            print("Using STEP as Lr reduce policy!")
    elif cfg.SOLVER.TYPE == "Adam":
        optimizer = torch.optim.Adam(params)
    elif "poly" in cfg.SOLVER.TYPE:
        optimizer = create_optimizers(maskRCNN, args)
        print("Using Poly as Lr reduce policy!")
    else:
        # Fail here instead of with a NameError on `optimizer` further down.
        raise ValueError(
            "Unsupported cfg.SOLVER.TYPE: {}".format(cfg.SOLVER.TYPE))

    args.max_iters = (train_size // args.batch_size) * args.num_epochs

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage)
        net_utils.load_ckpt(maskRCNN, checkpoint['model'])
        if args.resume:
            assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \
                "iters_per_epoch should match for resume"
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer'])
            if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1):
                # Resume from end of an epoch
                args.start_epoch = checkpoint['epoch'] + 1
                args.start_iter = 0
            else:
                # Resume from the middle of an epoch.
                # NOTE: dataloader is not synced with previous state
                args.start_epoch = checkpoint['epoch']
                args.start_iter = checkpoint['step'] + 1
        del checkpoint
        torch.cuda.empty_cache()

    if args.load_detectron:  # TODO resume for detectron weights (load sgd momentum values)
        logging.info("loading Detectron weights %s", args.load_detectron)
        load_detectron_weight(maskRCNN, args.load_detectron)

    if cfg.SOLVER.TYPE == 'step_poly':
        lr = cfg.SOLVER.BASE_LR / (cfg.SOLVER.GAMMA ** len(args.lr_decay_epochs))
    else:
        # lr of non-bias parameters, for command line outputs.
        lr = optimizer.param_groups[0]['lr']

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)

    tblogger = None  # stays None unless saving and tensorboard are both on
    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)

        if args.use_tfboard:
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    maskRCNN.train()

    training_stats = TrainingStats(args, args.disp_interval, tblogger)

    # Floor division: a trailing partial batch is not counted.
    iters_per_epoch = train_size // args.batch_size
    args.iters_per_epoch = iters_per_epoch

    try:
        logger.info('Training starts !')
        args.step = args.start_iter
        global_step = iters_per_epoch * args.start_epoch + args.step
        loss = None  # last batch loss; None until a training step has run
        for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs):
            # ---- Start of epoch ----

            # adjust learning rate
            if (args.lr_decay_epochs
                    and args.epoch == args.lr_decay_epochs[0]
                    and args.start_iter == 0
                    and cfg.SOLVER.LR_POLICY == 'steps_with_decay'):
                args.lr_decay_epochs.pop(0)
                net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA)
                lr *= cfg.SOLVER.GAMMA

            # zip() stops at iters_per_epoch, so any extra batch the loader
            # would yield is never consumed.
            for args.step, input_data in zip(range(args.start_iter, iters_per_epoch), dataloader):
                training_stats.IterTic()
                net_outputs = maskRCNN(input_data)
                training_stats.UpdateIterStats(net_outputs)
                loss = net_outputs['total_loss']
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if cfg.SOLVER.TYPE == 'poly':
                    lr = adjust_learning_rate(optimizer, global_step, args)
                if cfg.SOLVER.TYPE == 'step_poly':
                    lr = step_adjust_learning_rate(optimizer, lr, global_step, args)
                training_stats.IterToc()

                if args.step % args.disp_interval == 0:
                    log_training_stats(training_stats, global_step, lr)

                global_step += 1

            # ---- End of epoch ----
            if lr_scheduler is not None:
                # Skip the scheduler when no batch ran this run (loss unset).
                if loss is not None:
                    lr_scheduler.step(loss)
                lr = optimizer.param_groups[0]['lr']
            # save checkpoint
            if (args.epoch + 1) % args.ckpt_num_per_epoch == 0:
                net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)
            # reset starting iter number after first epoch
            args.start_iter = 0

        # ---- Training ends ----
        # save final model, unless the periodic save above just wrote it
        if (args.epoch + 1) % args.ckpt_num_per_epoch:
            net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)

    except (RuntimeError, KeyboardInterrupt):
        logger.info('Save ckpt on exception ...')
        net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        if tblogger is not None:
            tblogger.close()
def _solver_param_groups(module):
    """Split ``module``'s trainable parameters into optimizer param groups.

    Parameters whose name contains 'bias' go into a second group whose
    learning rate is ``BASE_LR * (BIAS_DOUBLE_LR + 1)`` and whose weight decay
    is applied only when ``cfg.SOLVER.BIAS_WEIGHT_DECAY`` is set.

    Returns:
        A two-element list ``[non_bias_group, bias_group]``.
    """
    bias_params = []
    nonbias_params = []
    for name, param in module.named_parameters():
        if not param.requires_grad:
            continue
        if 'bias' in name:
            bias_params.append(param)
        else:
            nonbias_params.append(param)
    return [{
        'params': nonbias_params,
        'lr': cfg.SOLVER.BASE_LR,
        'weight_decay': cfg.SOLVER.WEIGHT_DECAY
    }, {
        'params': bias_params,
        'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1),
        'weight_decay':
        cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0
    }]


def main():
    """Train a ``DispSeg`` model with separate optimizers for its two sub-nets.

    Steps, in order: parse the command line, select GPUs, pick the dataset
    and class count, merge the config file and overrides, rescale the batch
    size and base learning rate, build the roidb and DataLoader, construct
    ``DispSeg`` with one optimizer for ``dispSeg.pspnet`` and one for
    ``dispSeg.glassGCN``, optionally restore a checkpoint, then run the epoch
    loop, saving a checkpoint after every epoch.

    Raises:
        SystemExit: if no CUDA device is available.
        ValueError: if CUDA was not requested, ``args.dataset`` is unknown,
            or ``cfg.SOLVER.TYPE`` is neither "SGD" nor "Adam".
    """
    args = parse_args()
    print('Called with args:')
    print(args)

    # Export the device mask before the first torch.cuda query; setting it
    # after CUDA has been initialised may have no effect.
    use_cuda = args.cuda or cfg.NUM_GPUS > 0
    if use_cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
            str(ids) for ids in args.device_ids)

    if not torch.cuda.is_available():
        sys.exit("Need a CUDA device to run the code.")
    if not use_cuda:
        raise ValueError("Need Cuda device to run !")
    torch.backends.cudnn.benchmark = True
    cfg.CUDA = True

    # args.dataset -> (cfg.TRAIN.DATASETS, cfg.MODEL.NUM_CLASSES)
    dataset_table = {
        "coco2017": (('coco_2017_train', ), 81),
        "keypoints_coco2017": (('keypoints_coco_2017_train', ), 2),
        "cityscapes": (('cityscapes_semseg_train', ), 19),
    }
    if args.dataset not in dataset_table:
        raise ValueError("Unexpected args.dataset: {}".format(args.dataset))
    cfg.TRAIN.DATASETS, cfg.MODEL.NUM_CLASSES = dataset_table[args.dataset]

    cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    ### Adaptively adjust some configs ###
    original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH
    if args.batch_size is None:
        args.batch_size = original_batch_size
    cfg.NUM_GPUS = torch.cuda.device_count()
    assert (args.batch_size % cfg.NUM_GPUS) == 0, \
        'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS)
    cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS
    print('Batch size change from {} (in config file) to {}'.format(
        original_batch_size, args.batch_size))
    print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' %
          (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH))

    if args.num_workers is not None:
        cfg.DATA_LOADER.NUM_THREADS = args.num_workers
    print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS)

    ### Adjust learning based on batch size change linearly
    old_base_lr = cfg.SOLVER.BASE_LR
    cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size
    print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.
          format(old_base_lr, cfg.SOLVER.BASE_LR))

    ### Overwrite some solver settings from command line arguments
    if args.optimizer is not None:
        cfg.SOLVER.TYPE = args.optimizer
    if args.lr is not None:
        cfg.SOLVER.BASE_LR = args.lr
    if args.lr_decay_gamma is not None:
        cfg.SOLVER.GAMMA = args.lr_decay_gamma

    timers = defaultdict(Timer)

    ### Dataset ###
    semseg_or_disp = cfg.SEM.SEM_ON or cfg.DISP.DISP_ON
    timers['roidb'].tic()
    if semseg_or_disp:
        roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg(
            cfg.TRAIN.DATASETS)
    else:
        roidb, ratio_list, ratio_index = combined_roidb_for_training(
            cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES)
    timers['roidb'].toc()
    train_size = len(roidb)
    logger.info('{:d} roidb entries'.format(train_size))
    logger.info('Takes %.2f sec(s) to construct roidb',
                timers['roidb'].average_time)

    sampler = MinibatchSampler(ratio_list, ratio_index)
    dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=sampler,
        num_workers=cfg.DATA_LOADER.NUM_THREADS,
        collate_fn=collate_minibatch_semseg
        if semseg_or_disp else collate_minibatch)

    assert_and_infer_cfg()

    ### Model ###
    dispSeg = DispSeg()
    if cfg.CUDA:
        dispSeg.to('cuda')

    ### Optimizers: one per sub-network ###
    pspnet_params = _solver_param_groups(dispSeg.pspnet)
    segdisp3d_params = _solver_param_groups(dispSeg.glassGCN)

    if cfg.SOLVER.TYPE == "SGD":
        optimizerP = torch.optim.SGD(pspnet_params,
                                     momentum=cfg.SOLVER.MOMENTUM)
        optimizerS = torch.optim.SGD(segdisp3d_params,
                                     momentum=cfg.SOLVER.MOMENTUM)
    elif cfg.SOLVER.TYPE == "Adam":
        optimizerP = torch.optim.Adam(pspnet_params)
        optimizerS = torch.optim.Adam(segdisp3d_params)
    else:
        # Fail here instead of with a NameError on `optimizerP` further down.
        raise ValueError(
            "Unsupported cfg.SOLVER.TYPE: {}".format(cfg.SOLVER.TYPE))

    ### Load checkpoint
    if args.load_ckpt:
        load_name = args.load_ckpt
        logging.info("loading checkpoint %s", load_name)
        checkpoint = torch.load(load_name,
                                map_location=lambda storage, loc: storage)
        # The previous code passed undefined names (`pspnet`, `segdisp3d`,
        # `optimizer`) here. Checkpoints are written from `dispSeg` and
        # `optimizerS` below, so those are the objects restored.
        # NOTE(review): confirm net_utils.load_ckpt accepts the full DispSeg
        # module and that checkpoint['model'] keys are relative to it.
        net_utils.load_ckpt(dispSeg, checkpoint['model'])
        if args.resume:
            assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \
                "iters_per_epoch should match for resume"
            # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1.
            # However it's fixed on master.
            # optimizer.load_state_dict(checkpoint['optimizer'])
            # Only optimizerS is ever saved, so optimizerP restarts fresh.
            misc_utils.load_optimizer_state_dict(optimizerS,
                                                 checkpoint['optimizer'])
            if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1):
                # Resume from end of an epoch
                args.start_epoch = checkpoint['epoch'] + 1
                args.start_iter = 0
            else:
                # Resume from the middle of an epoch.
                # NOTE: dataloader is not synced with previous state
                args.start_epoch = checkpoint['epoch']
                args.start_iter = checkpoint['step'] + 1
        del checkpoint
        torch.cuda.empty_cache()

    # lr of non-bias parameters, for command line outputs.
    lr = optimizerP.param_groups[0]['lr']

    dispSeg = mynn.DataParallel(dispSeg,
                                cpu_keywords=['im_info', 'roidb'],
                                minibatch=True)

    ### Training Setups ###
    args.run_name = misc_utils.get_run_name()
    output_dir = misc_utils.get_output_dir(args, args.run_name)
    args.cfg_filename = os.path.basename(args.cfg_file)

    tblogger = None  # stays None unless saving and tensorboard are both on
    if not args.no_save:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        blob = {'cfg': yaml.dump(cfg), 'args': args}
        with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f:
            pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL)

        if args.use_tfboard:
            # Set the Tensorboard logger
            tblogger = SummaryWriter(output_dir)

    ### Training Loop ###
    dispSeg.train()

    training_stats = TrainingStats(args, args.disp_interval, tblogger)

    # Floor division: a trailing partial batch is not counted.
    iters_per_epoch = train_size // args.batch_size
    args.iters_per_epoch = iters_per_epoch

    try:
        logger.info('Training starts !')
        args.step = args.start_iter
        global_step = iters_per_epoch * args.start_epoch + args.step
        for args.epoch in range(args.start_epoch,
                                args.start_epoch + args.num_epochs):
            # ---- Start of epoch ----

            # adjust learning rate
            if (args.lr_decay_epochs
                    and args.epoch == args.lr_decay_epochs[0]
                    and args.start_iter == 0):
                args.lr_decay_epochs.pop(0)
                net_utils.decay_learning_rate(optimizerP, lr, cfg.SOLVER.GAMMA)
                net_utils.decay_learning_rate(optimizerS, lr, cfg.SOLVER.GAMMA)
                lr *= cfg.SOLVER.GAMMA

            for args.step, input_data in zip(
                    range(args.start_iter, iters_per_epoch), dataloader):

                if cfg.DISP.DISP_ON:
                    # Concatenate each 'data' tensor with its 'data_R'
                    # counterpart along dim 0.
                    input_data['data'] = [
                        torch.cat((x, y), dim=0)
                        for x, y in zip(input_data['data'],
                                        input_data['data_R'])
                    ]
                    if cfg.SEM.DECODER_TYPE.endswith('3Ddeepsup'):
                        input_data['disp_scans'] = torch.arange(
                            0, cfg.DISP.MAX_DISPLACEMENT).float().view(
                                1, cfg.DISP.MAX_DISPLACEMENT, 1,
                                1).repeat(args.batch_size, 1, 1, 1)
                        input_data['semseg_scans'] = torch.arange(
                            0, cfg.MODEL.NUM_CLASSES).long().view(
                                1, cfg.MODEL.NUM_CLASSES, 1,
                                1).repeat(args.batch_size, 1, 1, 1)
                    del input_data['data_R']

                for key in input_data:
                    if key != 'roidb':  # roidb is a list of ndarrays with inconsistent length
                        input_data[key] = [
                            Variable(x, requires_grad=False).to('cuda')
                            for x in input_data[key]
                        ]

                training_stats.IterTic()
                net_outputs = dispSeg(**input_data)
                training_stats.UpdateIterStats(net_outputs)
                loss = net_outputs['total_loss']
                optimizerP.zero_grad()
                optimizerS.zero_grad()
                loss.backward()
                optimizerP.step()
                optimizerS.step()
                training_stats.IterToc()

                if args.step % args.disp_interval == 0:
                    log_training_stats(training_stats, global_step, lr)

                global_step += 1

            # ---- End of epoch ----
            # save checkpoint
            net_utils.save_ckpt(output_dir, args, dispSeg, optimizerS)
            # reset starting iter number after first epoch
            args.start_iter = 0

        # ---- Training ends ----

    except (RuntimeError, KeyboardInterrupt):
        logger.info('Save ckpt on exception ...')
        net_utils.save_ckpt(output_dir, args, dispSeg, optimizerS)
        logger.info('Save ckpt done.')
        stack_trace = traceback.format_exc()
        print(stack_trace)

    finally:
        if tblogger is not None:
            tblogger.close()
if not os.path.exists(args.save_file): os.makedirs(args.save_file) os.makedirs(args.save_file + '_labelId') name_list = load_file_list( 'lib/datasets/data/cityscapes/annotations/val.txt') model = create_model(args) model.net.cuda() if args.dataset == 'cityscapes_train': cfg.TRAIN.DATASETS = ('cityscapes_semseg_train') elif args.dataset == 'cityscapes_val': cfg.TRAIN.DATASETS = ('cityscapes_semseg_val') cfg.TRAIN.IMS_PER_BATCH = 1 if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON: roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg( cfg.TRAIN.DATASETS) val_roidb, val_ratio_list, val_ratio_index = combined_roidb_for_training_semseg( cfg.VALIDATION.VAL_LIST) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) train_size = len(roidb) print("total samples to be evaluated:", train_size) val_size = len(val_roidb) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=False) dataloader = torch.utils.data.DataLoader( dataset, batch_size=1,