def _init_modules(self): if cfg.RESNETS.IMAGENET_PRETRAINED_WEIGHTS != '': # or cfg.MODEL.USE_SE_LOSS: logger.info("Loading pretrained weights from %s", cfg.RESNETS.IMAGENET_PRETRAINED_WEIGHTS) resnet_utils.load_pretrained_imagenet_weights(self) # Check if shared weights are equaled if cfg.MODEL.MASK_ON and getattr(self.Mask_Head, 'SHARE_RES5', False): assert self.Mask_Head.res5.state_dict() == self.Box_Head.res5.state_dict() if cfg.MODEL.KEYPOINTS_ON and getattr(self.Keypoint_Head, 'SHARE_RES5', False): assert self.Keypoint_Head.res5.state_dict() == self.Box_Head.res5.state_dict() # load detectron pretrained weights for resnet if cfg.RESNETS.COCO_PRETRAINED_WEIGHTS != '': logger.info("loading detectron pretrained weights from %s", cfg.RESNETS.COCO_PRETRAINED_WEIGHTS) load_detectron_weight(self, cfg.RESNETS.COCO_PRETRAINED_WEIGHTS, ('cls_score', 'bbox_pred')) if cfg.VGG16.COCO_PRETRAINED_WEIGHTS != '': logger.info("loading pretrained weights from %s", cfg.VGG16.COCO_PRETRAINED_WEIGHTS) checkpoint = torch.load(cfg.VGG16.COCO_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage) # not using the last softmax layers del checkpoint['model']['Box_Outs.cls_score.weight'] del checkpoint['model']['Box_Outs.cls_score.bias'] del checkpoint['model']['Box_Outs.bbox_pred.weight'] del checkpoint['model']['Box_Outs.bbox_pred.bias'] net_utils.load_ckpt(self, checkpoint['model']) if cfg.RESNETS.TO_BE_FINETUNED_WEIGHTS != '': logger.info("loading trained and to be finetuned weights from %s", cfg.RESNETS.TO_BE_FINETUNED_WEIGHTS) checkpoint = torch.load(cfg.RESNETS.TO_BE_FINETUNED_WEIGHTS, map_location=lambda storage, loc: storage) net_utils.load_ckpt(self, checkpoint['model']) if cfg.TRAIN.FREEZE_CONV_BODY: for p in self.Conv_Body.parameters(): p.requires_grad = False
def initialize_model_from_cfg(args, gpu_id=0): """Initialize a model from the global cfg. Loads test-time weights and set to evaluation mode. """ model = model_builder.Generalized_RCNN() model.eval() if args.cuda: model.cuda() if args.load_ckpt: load_name = args.load_ckpt logger.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(model, checkpoint['model']) if args.load_detectron: logger.info("loading detectron weights %s", args.load_detectron) load_detectron_weight(model, args.load_detectron) model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) return model
def __init__(self): args = parse_args() # Historical code, because we have trained on the Baidu Apoloscape dataset. dataset = WAD_CVPR2018(args.dataset_dir) cfg.MODEL.NUM_CLASSES = len( dataset.eval_class) + 1 # with a background class print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(osp.join(this_dir, args.cfg_file)) if args.nms_soft: cfg.TEST.SOFT_NMS.ENABLED = True else: cfg.TEST.NMS = args.nms cfg.RESNETS.IMAGENET_PRETRAINED = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() maskRCNN.cuda() if args.load_ckpt: load_name = osp.join(this_dir, args.load_ckpt) print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) maskRCNN.eval() self.model = maskRCNN self.dataset = dataset
def initialize_model_from_cfg(args, gpu_id=0): ''' Initialize a model from the global cfg. Loads test-time weights and set to evaluation mode. ''' model = model_builder.Generalized_RCNN() model.eval() # pdb.set_trace() if args.cuda: model.cuda() if args.load_ckpt: load_name = args.load_ckpt logger.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) #pdb.set_trace() # checkpoint['model'].keys() net_utils.load_ckpt(model, checkpoint['model']) if args.load_detectron: logger.info("loading detectron weights %s", args.load_detectron) load_detectron_weight(model, args.load_detectron) # for key, vaule in model.named_parameters(): # LJ output required_grad parameter # pdb.set_trace() # print(key) model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) return model
def main(): """main function""" if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") args = parse_args() print('Called with args:') print(args) dataset = WAD_CVPR2018(args.dataset_dir) cfg.MODEL.NUM_CLASSES = len( dataset.eval_class) + 1 # with a background class print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) cfg.RESNETS.IMAGENET_PRETRAINED = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() imglist = misc_utils.get_imagelist_from_dir(dataset.test_image_dir) num_images = len(imglist) output_dir = os.path.join(('/').join(args.load_ckpt.split('/')[:-2]), 'Images') if not os.path.exists(output_dir): os.makedirs(output_dir) for i in tqdm(xrange(num_images)): im = cv2.imread(imglist[i]) assert im is not None timers = defaultdict(Timer) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) args.current_im_name = im_name cls_boxes, cls_segms, prediction_row = im_detect_all(args, maskRCNN, im, dataset, timers=timers) thefile = open(os.path.join(output_dir, im_name + '.txt'), 'w') for item in prediction_row: thefile.write("%s\n" % item)
def _init_modules(self): # VGG16 imagenet pretrained model is initialized in VGG16.py if cfg.RESNETS.IMAGENET_PRETRAINED_WEIGHTS != '': logger.info("Loading pretrained weights from %s", cfg.RESNETS.IMAGENET_PRETRAINED_WEIGHTS) resnet_utils.load_pretrained_imagenet_weights(self) if cfg.RESNETS.VRD_PRETRAINED_WEIGHTS != '': self.load_detector_weights(cfg.RESNETS.VRD_PRETRAINED_WEIGHTS) if cfg.VGG16.VRD_PRETRAINED_WEIGHTS != '': self.load_detector_weights(cfg.VGG16.VRD_PRETRAINED_WEIGHTS) if cfg.RESNETS.VG_PRETRAINED_WEIGHTS != '': self.load_detector_weights(cfg.RESNETS.VG_PRETRAINED_WEIGHTS) if cfg.VGG16.VG_PRETRAINED_WEIGHTS != '': self.load_detector_weights(cfg.VGG16.VG_PRETRAINED_WEIGHTS) if cfg.TRAIN.FREEZE_CONV_BODY: for p in self.Conv_Body.parameters(): p.requires_grad = False if cfg.RESNETS.VRD_PRD_PRETRAINED_WEIGHTS != '' or cfg.VGG16.VRD_PRD_PRETRAINED_WEIGHTS != '' or \ cfg.RESNETS.VG_PRD_PRETRAINED_WEIGHTS != '' or cfg.VGG16.VG_PRD_PRETRAINED_WEIGHTS != '': if cfg.RESNETS.VRD_PRD_PRETRAINED_WEIGHTS != '': logger.info("loading prd pretrained weights from %s", cfg.RESNETS.VRD_PRD_PRETRAINED_WEIGHTS) checkpoint = torch.load( cfg.RESNETS.VRD_PRD_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage) if cfg.VGG16.VRD_PRD_PRETRAINED_WEIGHTS != '': logger.info("loading prd pretrained weights from %s", cfg.VGG16.VRD_PRD_PRETRAINED_WEIGHTS) checkpoint = torch.load( cfg.VGG16.VRD_PRD_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage) if cfg.RESNETS.VG_PRD_PRETRAINED_WEIGHTS != '': logger.info("loading prd pretrained weights from %s", cfg.RESNETS.VG_PRD_PRETRAINED_WEIGHTS) checkpoint = torch.load( cfg.RESNETS.VG_PRD_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage) if cfg.VGG16.VG_PRD_PRETRAINED_WEIGHTS != '': logger.info("loading prd pretrained weights from %s", cfg.VGG16.VG_PRD_PRETRAINED_WEIGHTS) checkpoint = torch.load( cfg.VGG16.VG_PRD_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage) # not using the last softmax layers del checkpoint['model']['Box_Outs.cls_score.weight'] del checkpoint['model']['Box_Outs.cls_score.bias'] del checkpoint['model']['Box_Outs.bbox_pred.weight'] del checkpoint['model']['Box_Outs.bbox_pred.bias'] net_utils.load_ckpt(self.Prd_RCNN, checkpoint['model']) if cfg.TRAIN.FREEZE_PRD_CONV_BODY: for p in self.Prd_RCNN.Conv_Body.parameters(): p.requires_grad = False if cfg.TRAIN.FREEZE_PRD_BOX_HEAD: for p in self.Prd_RCNN.Box_Head.parameters(): p.requires_grad = False
def load_model(self, weight_file, use_half, device_id): # import pytorch # self._log.info("load_model does not use device, or use_half in MRCNN") # print("Device in load model: ", device) try: import torch except: raise RuntimeError("could not load pytorch!") # import model try: from modeling.model_builder import Generalized_RCNN as ubMRCNN # from ubmrcnn import ubMRCNN except: raise RuntimeError( "could not load ubMRCNN model. did you remember" + " to setup everything?") # if "cuda" not in device and "cpu" not in device: # raise ValueError("invalid device name [{}]. Must str with name \ # \"cpu\" or \"cuda:X\" where X=device number") self._log = logging.getLogger(self.idname()) # self.device = torch.device(device) self.model = Generalized_RCNN() #checkpoint = torch.load(weight_file, map_location=lambda storage, loc: storage) locations = {} for x in range(6): locations["cuda:%d" % (x)] = "cpu" checkpoint = torch.load(weight_file, map_location=locations) # self.device = device_id # if device_id==None: # self.device = torch.device("cpu") # else: # self.device = torch.device("cuda:%d"%(device_id)) net_utils.load_ckpt(self.model, checkpoint['model']) # self.model = mynn.DataParallel(self.model, cpu_keywords=['im_info', 'roidb'], # minibatch=True, device_ids=[0], # output_device=0) # only support single GPU self.model = mynn.DataSingular(self.model, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_id=[cfg.MODEL.DEVICE]) self.model.eval()
def initialize_model_from_cfg(args, gpu_id=0): model = model_builder_rel.Generalized_RCNN() model.eval() model.cuda() load_name = args.load_ckpt logger.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(model, checkpoint['model']) model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) return model
def load_detector_weights(self, weight_name): logger.info("loading pretrained weights from %s", weight_name) checkpoint = torch.load(weight_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(self, checkpoint['model']) # freeze everything above the rel module for p in self.Conv_Body.parameters(): p.requires_grad = False for p in self.RPN.parameters(): p.requires_grad = False if not cfg.MODEL.UNFREEZE_DET: for p in self.Box_Head.parameters(): p.requires_grad = False for p in self.Box_Outs.parameters(): p.requires_grad = False
def load_detector_weights(self, weight_name): logger.info("loading pretrained weights from %s", weight_name) checkpoint = torch.load(weight_name, map_location=lambda storage, loc: storage) if not cfg.VGG16.INCLUDE_CLASSIFIER or cfg.TRAIN.USE_GT_BOXES: del checkpoint['model']['Box_Outs.cls_score.weight'] del checkpoint['model']['Box_Outs.cls_score.bias'] del checkpoint['model']['Box_Outs.bbox_pred.weight'] del checkpoint['model']['Box_Outs.bbox_pred.bias'] net_utils.load_ckpt(self, checkpoint['model']) # freeze everything above the rel module if in stage 2 if cfg.MODEL.MEMORY_MODULE_STAGE == 2: for p in self.Conv_Body.parameters(): p.requires_grad = False for p in self.Box_Head.parameters(): p.requires_grad = False
def initialize_model_from_cfg(args, roidb=None, gpu_id=0): """Initialize a model from the global cfg. Loads test-time weights and set to evaluation mode. """ model = model_builder.Generalized_RCNN() model.eval() cfg.immutable(False) cfg.TEST.CLASS_SPLIT = {'source': roidb[0]['source'], 'target': roidb[0]['target']} cfg.immutable(True) if 'word_embeddings' in roidb[0]: model.Box_Outs.set_word_embedding(torch.tensor(roidb[0]['word_embeddings'])) if cfg.MODEL.IGNORE_CLASSES: if cfg.MODEL.IGNORE_CLASSES == 'all': roidb[0]['all'] = roidb[0]['source'] + roidb[0]['target'] model._ignore_classes = roidb[0][cfg.MODEL.IGNORE_CLASSES] model.Box_Outs._ignore_classes = roidb[0][cfg.MODEL.IGNORE_CLASSES] if True: tmp = {} for rel in roidb[0]['relationships']: tmp[(rel['subject_id'], rel['object_id'])] = \ tmp.get((rel['subject_id'], rel['object_id']), []) + [rel['rel_id']] if cfg.MODEL.RELATION_COOCCUR: for k in tmp: tmp[k] = [1] if cfg.MODEL.NUM_RELATIONS > 0: model.Rel_Outs.relationship_dict = tmp if args.cuda: model.cuda() if args.load_ckpt: load_name = args.load_ckpt logger.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(model, checkpoint['model']) if args.load_detectron: logger.info("loading detectron weights %s", args.load_detectron) load_detectron_weight(model, args.load_detectron) model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) return model
def reload_ckpt(load_ckpt): #checkpoint = torch.load(load_ckpt, map_location=lambda storage, loc: storage) #net_utils.load_ckpt(model, checkpoint['model']) model = model_builder.Generalized_RCNN() model.eval() model.cuda() load_name = load_ckpt checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(model, checkpoint['model']) model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) print("load checkpoint: {}".format(load_ckpt)) return model
def initialize_model_from_cfg(args, gpu_id=0): """Initialize a model from the global cfg. Loads test-time weights and set to evaluation mode. """ model = GAN() model.eval() model.cuda() if args.load_ckpt: load_name = args.load_ckpt checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(model, checkpoint['model']) model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) return model
def initialize_model_from_cfg(args, gpu_id=0): """Initialize a model from the global cfg. Loads test-time weights and set to evaluation mode. """ model = eval(args.model).loot_model(args) model.eval() if args.cuda: model.cuda() if args.load_ckpt: load_name = args.load_ckpt logger.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(model, checkpoint['model']) if args.load_detectron: logger.info("loading detectron weights %s", args.load_detectron) load_detectron_weight(model, args.load_detectron) return model
def main(): """main function""" if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") args = parse_args() print('Called with args:') print(args) assert args.image_dir or args.images assert bool(args.image_dir) ^ bool(args.images) if args.dataset.startswith("coco"): dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = len(dataset.classes) elif args.dataset.startswith("keypoints_coco"): dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError('Unexpected dataset name: {}'.format(args.dataset)) print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) assert bool(args.load_ckpt) ^ bool(args.load_detectron), \ 'Exactly one of --load_ckpt and --load_detectron should be specified.' cfg.MODEL.LOAD_IMAGENET_PRETRAINED_WEIGHTS = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() if args.cuda: maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.load_detectron: print("loading detectron weights %s" % args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() if args.image_dir: imglist = misc_utils.get_imagelist_from_dir(args.image_dir) else: imglist = args.images num_images = len(imglist) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) for i in xrange(num_images): print('img', i) im = cv2.imread(imglist[i]) assert im is not None timers = defaultdict(Timer) cls_boxes, cls_segms, cls_keyps = im_detect_all(maskRCNN, im, timers=timers) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) vis_utils.vis_one_image( im[:, :, ::-1], # BGR -> RGB for visualization im_name, args.output_dir, cls_boxes, cls_segms, cls_keyps, dataset=dataset, box_alpha=0.3, show_class=True, thresh=0.7, kp_thresh=2) if args.merge_pdfs and num_images > 1: merge_out_path = '{}/results.pdf'.format(args.output_dir) if os.path.exists(merge_out_path): os.remove(merge_out_path) command = "pdfunite {}/*.pdf {}".format(args.output_dir, merge_out_path) subprocess.call(command, shell=True)
def main(): """main function""" if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") args = parse_args() print('Called with args:') print(args) assert args.image_dir or args.images assert bool(args.image_dir) ^ bool(args.images) if args.dataset.startswith("coco"): dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = len(dataset.classes) elif args.dataset.startswith("keypoints_coco"): dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError('Unexpected dataset name: {}'.format(args.dataset)) print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) assert bool(args.load_ckpt) ^ bool(args.load_detectron), \ 'Exactly one of --load_ckpt and --load_detectron should be specified.' cfg.MODEL.LOAD_IMAGENET_PRETRAINED_WEIGHTS = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() if args.cuda: maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.load_detectron: print("loading detectron weights %s" % args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() if args.image_dir: imglist = misc_utils.get_imagelist_from_dir(args.image_dir) else: imglist = args.images num_images = len(imglist) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) for i in xrange(num_images): print('img', i) im = cv2.imread(imglist[i]) assert im is not None timers = defaultdict(Timer) cls_boxes, cls_segms, cls_keyps = im_detect_all(maskRCNN, im, timers=timers) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) vis_utils.vis_one_image( im[:, :, ::-1], # BGR -> RGB for visualization im_name, args.output_dir, cls_boxes, cls_segms, cls_keyps, dataset=dataset, box_alpha=0.3, show_class=True, thresh=0.7, kp_thresh=2 ) if args.merge_pdfs and num_images > 1: merge_out_path = '{}/results.pdf'.format(args.output_dir) if os.path.exists(merge_out_path): os.remove(merge_out_path) command = "pdfunite {}/*.pdf {}".format(args.output_dir, merge_out_path) subprocess.call(command, shell=True)
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'. format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, drop_last=True, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [{ 'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 assert checkpoint['train_size'] == train_size # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, run_name) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] > args.start_step: decay_steps_ind = i if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) logger.info('Training starts !') loss_avg = 0 try: timers['train_loop'].tic() for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError( 'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: lr = cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = lr_new decay_steps_ind += 1 try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) outputs = maskRCNN(**input_data) rois_label = outputs['rois_label'] cls_score = outputs['cls_score'] bbox_pred = outputs['bbox_pred'] loss_rpn_cls = outputs['loss_rpn_cls'].mean() loss_rpn_bbox = outputs['loss_rpn_bbox'].mean() loss_rcnn_cls = outputs['loss_rcnn_cls'].mean() loss_rcnn_bbox = outputs['loss_rcnn_bbox'].mean() loss = loss_rpn_cls + loss_rpn_bbox + loss_rcnn_cls + loss_rcnn_bbox if cfg.MODEL.MASK_ON: loss_rcnn_mask = outputs['loss_rcnn_mask'].mean() loss += loss_rcnn_mask if cfg.MODEL.KEYPOINTS_ON: loss_rcnn_keypoints = outputs['loss_rcnn_keypoints'].mean() loss += loss_rcnn_keypoints loss_avg += loss.data.cpu().numpy()[0] optimizer.zero_grad() loss.backward() optimizer.step() if (step + 1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) if ((step % args.disp_interval == 0 and (step - args.start_step >= args.disp_interval)) or step == cfg.SOLVER.MAX_ITER - 1): diff = timers['train_loop'].toc(average=False) loss_avg /= args.disp_interval loss_rpn_cls = loss_rpn_cls.data[0] loss_rpn_bbox = loss_rpn_bbox.data[0] loss_rcnn_cls = loss_rcnn_cls.data[0] loss_rcnn_bbox = loss_rcnn_bbox.data[0] fg_cnt = torch.sum(rois_label.data.ne(0)) bg_cnt = rois_label.data.numel() - fg_cnt print("[ %s ][ step %d ]" % (run_name, step)) print("\t\tloss: %.4f, lr: %.2e" % (loss_avg, lr)) print("\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, diff)) print( "\t\trpn_cls: %.4f, rpn_bbox: %.4f, rcnn_cls: %.4f, rcnn_bbox %.4f" % (loss_rpn_cls, loss_rpn_bbox, loss_rcnn_cls, loss_rcnn_bbox)) print_prefix = "\t\t" if cfg.MODEL.MASK_ON: loss_rcnn_mask = loss_rcnn_mask.data[0] print("%srcnn_mask %.4f" % (print_prefix, loss_rcnn_mask)) print_prefix = ", " if cfg.MODEL.KEYPOINTS_ON: loss_rcnn_keypoints = loss_rcnn_keypoints.data[0] print("%srcnn_keypoints %.4f" % (print_prefix, loss_rcnn_keypoints)) if args.use_tfboard and not args.no_save: info = { 'lr': lr, 'loss': loss_avg, 'loss_rpn_cls': loss_rpn_cls, 'loss_rpn_box': loss_rpn_bbox, 'loss_rcnn_cls': loss_rcnn_cls, 'loss_rcnn_box': loss_rcnn_bbox, } if cfg.MODEL.MASK_ON: info['loss_rcnn_mask'] = loss_rcnn_mask if cfg.MODEL.KEYPOINTS_ON: info['loss_rcnn_keypoints'] = loss_rcnn_keypoints for tag, value in info.items(): tblogger.add_scalar(tag, value, step) loss_avg = 0 timers['train_loop'].tic() # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt) as e: print('Save on exception:', e) save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) stack_trace = traceback.format_exc() print(stack_trace) finally: # ---- Training ends ---- if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler(sampler=MinibatchSampler( ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = GetRCNNModel() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name + '.weight') gn_param_nameset.add(name + '.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [{ 'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, { 'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN }] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print( 'train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) if cfg.TRAIN_SYNC_BN: # Shu:For synchorinized BN patch_replication_callback(maskRCNN) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError( 'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if cfg.MODEL.LR_VIEW_ON or cfg.MODEL.GIF_ON or cfg.MODEL.LRASY_MAHA_ON: if key != 'roidb' and key != 'data': # roidb is a list of ndarrays with inconsistent length input_data[key] = list( map(Variable, input_data[key])) if key == 'data': input_data[key] = [ torch.squeeze(item) for item in input_data[key] ] input_data[key] = list( map(Variable, input_data[key])) else: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list( map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step + 1) % CHECKPOINT_PERIOD == 0: net_utils.train_save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint net_utils.train_save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') net_utils.train_save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): saveNetStructure=False """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: #set gpu device os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(ids) for ids in args.device_ids]) torch.backends.cudnn.benchmark=True cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cityscapes": cfg.TRAIN.DATASETS = ('cityscapes_semseg_train', ) cfg.MODEL.NUM_CLASSES = 19 elif args.dataset == "cityscape_train_on_val": cfg.TRAIN.DATASETS = ('cityscape_train_on_val', ) cfg.MODEL.NUM_CLASSES = 19 elif args.dataset == "cityscapes_coarse": cfg.TRAIN.DATASETS = ('cityscapes_coarse', ) cfg.MODEL.NUM_CLASSES = 19 elif args.dataset == "cityscapes_all": cfg.TRAIN.DATASETS = ('cityscapes_all', ) cfg.MODEL.NUM_CLASSES = 19 elif args.dataset == "cityscapes_trainval": cfg.TRAIN.DATASETS = ('cityscapes_trainval', ) cfg.MODEL.NUM_CLASSES = 19 elif args.dataset == "cityscapes_fineturn": cfg.TRAIN.DATASETS = ('cityscapes_fineturn', ) cfg.MODEL.NUM_CLASSES = 19 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format( old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON: roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg( cfg.TRAIN.DATASETS) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) #sampler = MinibatchSampler(ratio_list, ratio_index) sampler = None dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch_semseg_all if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON else collate_minibatch, drop_last=False, shuffle=True, pin_memory=True) assert_and_infer_cfg() #for data in dataloader: # image = data['data'][0][0].numpy() # print (image.shape) # image=image.transpose(1,2,0)+cfg.PIXEL_MEANS # cv2.imwrite('image.png', image[:,:,::-1]) # cv2.imwrite('label.png',10*data['semseg_label_0'][0][0].numpy()) # return maskRCNN = eval(cfg.MODEL.TYPE)() if len(cfg.SEM.PSPNET_PRETRAINED_WEIGHTS)>1: print("loading pspnet weights") state_dict={} pretrained=torch.load(cfg.SEM.PSPNET_PRETRAINED_WEIGHTS, map_location=lambda storage, loc: storage) pretrained = pretrained['model'] if cfg.SEM.SPN_ON: maskRCNN.pspnet.load_state_dict(pretrained,strict=True) elif 'deeplab' in cfg.SEM.DECODER_TYPE: encoder = dict() for k, v in pretrained.items(): if 'decoder' in k: continue encoder[k.replace('encoder.','')] = v maskRCNN.encoder.load_state_dict(encoder,strict=True) del encoder else: maskRCNN.load_state_dict(pretrained,strict=True) del pretrained print("weights load success") if cfg.SEM.SPN_ON: maskRCNN.pspnet.eval() for p in maskRCNN.pspnet.parameters(): p.requires_grad = False # load nets into gpu maskRCNN = UserScatteredDataParallel(maskRCNN) # For sync bn patch_replication_callback(maskRCNN) if cfg.CUDA: maskRCNN.to('cuda') ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) params = [ {'params': nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0} ] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) print("Using STEP as Lr reduce policy!") if cfg.SOLVER.TYPE == 'SGD' and cfg.SOLVER.LR_POLICY == 'ReduceLROnPlateau': optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min',patience=10) print("Using ReduceLROnPlateau as Lr reduce policy!") elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) elif "poly" in cfg.SOLVER.TYPE: optimizer = create_optimizers(maskRCNN,args) print("Using Poly as Lr reduce policy!") args.max_iters = (int(train_size / args.batch_size)) * args.num_epochs ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) if cfg.SOLVER.TYPE=='step_poly': lr = cfg.SOLVER.BASE_LR / (cfg.SOLVER.GAMMA**len(args.lr_decay_epochs)) else: lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. ### Training Setups ### args.run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: #from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) iters_per_epoch = int(train_size / args.batch_size) # drop last args.iters_per_epoch = iters_per_epoch ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch try: logger.info('Training starts !') args.step = args.start_iter global_step = iters_per_epoch * args.start_epoch + args.step for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- # adjust learning rate if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[0] and args.start_iter == 0 and cfg.SOLVER.LR_POLICY=='steps_with_decay' : args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for args.step, input_data in zip(range(args.start_iter, iters_per_epoch), dataloader): #if cfg.DISP.DISP_ON: # input_data['data'] = list(map(lambda x,y: torch.cat((x,y), dim=0), # input_data['data'], input_data['data_R'])) # if cfg.SEM.DECODER_TYPE.endswith('3D'): # input_data['disp_scans'] = torch.arange(1, # cfg.DISP.MAX_DISPLACEMENT+1).float().view(1,cfg.DISP.MAX_DISPLACEMENT).repeat(args.batch_size,1) # del input_data['data_R'] #for key in input_data: # if key != 'roidb': # roidb is a list of ndarrays with inconsistent length # input_data[key] = list(map(lambda x: Variable(x, requires_grad=False).to('cuda'), input_data[key])) training_stats.IterTic() net_outputs = maskRCNN(input_data) training_stats.UpdateIterStats(net_outputs) #loss = net_outputs['losses']['loss_semseg'] #acc = net_outputs['metrics']['accuracy_pixel'] #print (loss.item(), acc) #for key in net_outputs.keys(): # print(key) loss = net_outputs['total_loss'] #print("loss.shape:",loss) optimizer.zero_grad() loss.backward() optimizer.step() if cfg.SOLVER.TYPE=='poly': lr = adjust_learning_rate(optimizer, global_step, args) if cfg.SOLVER.TYPE=='step_poly': lr = step_adjust_learning_rate(optimizer, lr, global_step, args) training_stats.IterToc() if args.step % args.disp_interval == 0: disp_image='' semseg_image='' #tblogger.add_image('disp_image',disp_image,global_step) #tblogger.add_image('semseg_image',semseg_image,global_step) log_training_stats(training_stats, global_step, lr) global_step += 1 # ---- End of epoch ---- # save checkpoint if cfg.SOLVER.TYPE == 'SGD' and cfg.SOLVER.LR_POLICY == 'ReduceLROnPlateau': lr_scheduler.step(loss) lr = optimizer.param_groups[0]['lr'] if (args.epoch+1) % args.ckpt_num_per_epoch ==0: net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) # reset starting iter number after first epoch args.start_iter = 0 # ---- Training ends ---- #if iters_per_epoch % args.disp_interval != 0: # log last stats at the end # log_training_stats(training_stats, global_step, lr) # save final model if (args.epoch+1) % args.ckpt_num_per_epoch: net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): logger.info('Save ckpt on exception ...') net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True ) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name+'.weight') gn_param_nameset.add(name+'.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in maskRCNN.named_parameters(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN} ] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print('train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. optimizer.load_state_dict(checkpoint['optimizer']) # misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step+1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format( old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma args.mGPUs = (cfg.NUM_GPUS > 1) timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) assert_and_infer_cfg() ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) params = [ {'params': nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0} ] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, run_name) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() iters_per_epoch = int(train_size / args.batch_size) # drop last ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch step = 0 try: logger.info('Training starts !') for epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- loss_avg = 0 timers['train_loop'].tic() # adjust learning rate if args.lr_decay_epochs and epoch == args.lr_decay_epochs[0] and args.start_iter == 0: args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for step, input_data in zip(range(args.start_iter, iters_per_epoch), dataloader): for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) outputs = maskRCNN(**input_data) rois_label = outputs['rois_label'] cls_score = outputs['cls_score'] bbox_pred = outputs['bbox_pred'] loss_rpn_cls = outputs['loss_rpn_cls'].mean() loss_rpn_bbox = outputs['loss_rpn_bbox'].mean() loss_rcnn_cls = outputs['loss_rcnn_cls'].mean() loss_rcnn_bbox = outputs['loss_rcnn_bbox'].mean() loss = loss_rpn_cls + loss_rpn_bbox + loss_rcnn_cls + loss_rcnn_bbox if cfg.MODEL.MASK_ON: loss_rcnn_mask = outputs['loss_rcnn_mask'].mean() loss += loss_rcnn_mask if cfg.MODEL.KEYPOINTS_ON: loss_rcnn_keypoints = outputs['loss_rcnn_keypoints'].mean() loss += loss_rcnn_keypoints loss_avg += loss.data.cpu().numpy()[0] optimizer.zero_grad() loss.backward() optimizer.step() if (step+1) % ckpt_interval_per_epoch == 0: net_utils.save_ckpt(output_dir, args, epoch, step, maskRCNN, optimizer, iters_per_epoch) if (step+1) % args.disp_interval == 0: if (step + 1 - args.start_iter) >= args.disp_interval: # for the case of resume diff = timers['train_loop'].toc(average=False) loss_avg /= args.disp_interval loss_rpn_cls = loss_rpn_cls.data[0] loss_rpn_bbox = loss_rpn_bbox.data[0] loss_rcnn_cls = loss_rcnn_cls.data[0] loss_rcnn_bbox = loss_rcnn_bbox.data[0] fg_cnt = torch.sum(rois_label.data.ne(0)) bg_cnt = rois_label.data.numel() - fg_cnt print("[%s][epoch %2d][iter %4d / %4d]" % (run_name, epoch, step, iters_per_epoch)) print("\t\tloss: %.4f, lr: %.2e" % (loss_avg, lr)) print("\t\tfg/bg=(%d/%d), time cost: %f" % (fg_cnt, bg_cnt, diff)) print("\t\trpn_cls: %.4f, rpn_bbox: %.4f, rcnn_cls: %.4f, rcnn_bbox %.4f" % (loss_rpn_cls, loss_rpn_bbox, loss_rcnn_cls, loss_rcnn_bbox)) print_prefix = "\t\t" if cfg.MODEL.MASK_ON: loss_rcnn_mask = loss_rcnn_mask.data[0] print("%srcnn_mask %.4f" % (print_prefix, loss_rcnn_mask)) print_prefix = ", " if cfg.MODEL.KEYPOINTS_ON: loss_rcnn_keypoints = loss_rcnn_keypoints.data[0] print("%srcnn_keypoints %.4f" % (print_prefix, loss_rcnn_keypoints)) if args.use_tfboard: info = { 'loss': loss_avg, 'loss_rpn_cls': loss_rpn_cls, 'loss_rpn_box': loss_rpn_bbox, 'loss_rcnn_cls': loss_rcnn_cls, 'loss_rcnn_box': loss_rcnn_bbox, } if cfg.MODEL.MASK_ON: info['loss_rcnn_mask'] = loss_rcnn_mask if cfg.MODEL.KEYPOINTS_ON: info['loss_rcnn_keypoints'] = loss_rcnn_keypoints for tag, value in info.items(): tblogger.add_scalar(tag, value, iters_per_epoch * epoch + step) loss_avg = 0 timers['train_loop'].tic() # ---- End of epoch ---- # save checkpoint net_utils.save_ckpt(output_dir, args, epoch, step, maskRCNN, optimizer, iters_per_epoch) # reset timer timers['train_loop'].reset() # reset starting iter number after first epoch args.start_iter = 0 except (RuntimeError, KeyboardInterrupt) as e: print('Save on exception:', e) net_utils.save_ckpt(output_dir, args, epoch, step, maskRCNN, optimizer, iters_per_epoch) stack_trace = traceback.format_exc() print(stack_trace) finally: # ---- Training ends ---- if args.use_tfboard: tblogger.close()
def main(): """main function""" if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") args = parse_args() print('Called with args:') print(args) assert args.image_dir or args.images assert bool(args.image_dir) ^ bool(args.images) if args.dataset == "pascal_parts_heads": dataset = datasets.get_head_dataset() cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "scuthead_a": dataset = datasets.get_head_dataset() cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError('Unexpected dataset name: {}'.format(args.dataset)) print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) assert bool(args.load_ckpt) ^ bool(args.load_detectron), \ 'Exactly one of --load_ckpt and --load_detectron should be specified.' cfg.MODEL.LOAD_IMAGENET_PRETRAINED_WEIGHTS = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() if args.cuda: maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.load_detectron: print("loading detectron weights %s" % args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() if args.image_dir: imglist = misc_utils.get_imagelist_from_dir(args.image_dir) else: imglist = args.images num_images = len(imglist) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) for i in xrange(num_images): im = cv2.imread(imglist[i]) assert im is not None timers = defaultdict(Timer) cls_boxes, cls_segms, cls_keyps = im_detect_all(maskRCNN, im, timers=timers) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) outputfile = os.path.join(args.output_dir, im_name) head_boxes = cls_boxes[1] print('img :', i, ' num_heads :', len(head_boxes), ' img_path :', imglist[i]) np.save(outputfile, head_boxes)
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") cfg.DATASET = args.dataset if args.dataset == "vrd": cfg.TRAIN.DATASETS = ('vrd_train',) cfg.MODEL.NUM_CLASSES = 101 cfg.MODEL.NUM_PRD_CLASSES = 70 # exclude background elif args.dataset == "vg": cfg.TRAIN.DATASETS = ('vg_train',) cfg.MODEL.NUM_CLASSES = 151 cfg.MODEL.NUM_PRD_CLASSES = 50 # exclude background elif args.dataset == "vg80k": cfg.TRAIN.DATASETS = ('vg80k_train',) cfg.MODEL.NUM_CLASSES = 53305 # includes background cfg.MODEL.NUM_PRD_CLASSES = 29086 # excludes background elif args.dataset == "gvqa20k": cfg.TRAIN.DATASETS = ('gvqa20k_train',) cfg.MODEL.NUM_CLASSES = 1704 # includes background cfg.MODEL.NUM_PRD_CLASSES = 310 # exclude background elif args.dataset == "gvqa10k": cfg.TRAIN.DATASETS = ('gvqa10k_train',) cfg.MODEL.NUM_CLASSES = 1704 # includes background cfg.MODEL.NUM_PRD_CLASSES = 310 # exclude background elif args.dataset == "gvqa": cfg.TRAIN.DATASETS = ('gvqa_train',) cfg.MODEL.NUM_CLASSES = 1704 # includes background cfg.MODEL.NUM_PRD_CLASSES = 310 # exclude background else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True ) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### # record backbone params, i.e., conv_body and box_head params gn_params = [] backbone_bias_params = [] backbone_bias_param_names = [] prd_branch_bias_params = [] prd_branch_bias_param_names = [] backbone_nonbias_params = [] backbone_nonbias_param_names = [] prd_branch_nonbias_params = [] prd_branch_nonbias_param_names = [] classifier_params = [] classifier_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'gn' in key: gn_params.append(value) elif 'Conv_Body' in key or 'Box_Head' in key or 'Box_Outs' in key or 'RPN' in key: if 'bias' in key: backbone_bias_params.append(value) backbone_bias_param_names.append(key) else: backbone_nonbias_params.append(value) backbone_nonbias_param_names.append(key) #elif 'classifier' in key: # classifier_params.append(value) # classifier_param_names.append(value) else: if 'bias' in key: prd_branch_bias_params.append(value) prd_branch_bias_param_names.append(key) else: prd_branch_nonbias_params.append(value) prd_branch_nonbias_param_names.append(key) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': backbone_nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': backbone_bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': prd_branch_nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': prd_branch_bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN}#, #{'params': classifier_params, # 'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0005} ] # print('Initializing model optimizer.') # if cfg.SOLVER.TYPE == "SGD": # optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) # elif cfg.SOLVER.TYPE == "Adam": # optimizer = torch.optim.Adam(params) print('Initializing model and classifier optimizers.') #classifier_optim_param = {'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0005} #params.append({'params': maskRCNN.classifier.parameters(), # 'lr': classifier_optim_param['lr']), # 'momentum': classifier_optim_param['momentum'], # 'weight_decay': classifier_optim_param['weight_decay']}) #params.append({'params': maskRCNN.prd_classifier.parameters(), # 'lr': classifier_optim_param['lr'], # 'momentum': classifier_optim_param['momentum'], # 'weight_decay': classifier_optim_param['weight_decay']}) if cfg.MODEL.MEMORY_MODULE_STAGE == 1: step_size = 10 elif cfg.MODEL.MEMORY_MODULE_STAGE == 2: step_size = 20 else: raise NotImplementedError scheduler_params = {'step_size': step_size, 'gamma': 0.1} optimizer, optimizer_scheduler = init_optimizers(params, scheduler_params) criterion_optimizer, criterion_optimizer_scheduler = None, None if cfg.MODEL.MEMORY_MODULE_STAGE == 2: print('Initializing criterion optimizer.') feat_loss_optim_param = {'lr': 0.01, 'momentum': 0.9, 'weight_decay': 0.0005} optim_params = feat_loss_optim_param optim_params = [{'params': maskRCNN.feature_loss_sbj_obj.parameters(), 'lr': optim_params['lr'], 'momentum': optim_params['momentum'], 'weight_decay': optim_params['weight_decay']}, {'params': maskRCNN.feature_loss_prd.parameters(), 'lr': optim_params['lr'], 'momentum': optim_params['momentum'], 'weight_decay': optim_params['weight_decay']} ] # Initialize criterion optimizer and scheduler criterion_optimizer, criterion_optimizer_scheduler = init_optimizers(optim_params, scheduler_params) if cfg.MODEL.MEMORY_MODULE_STAGE == 2: weights_path = 'Outputs/e2e_relcnn_VGG16_8_epochs_gvqa_y_loss_only_1_gpu/gvqa/Feb07-10-55-03_login104-09_step_with_prd_cls_v3/ckpt/model_step1439.pth' weights = torch.load(weights_path) #print('weights', weights['model'].keys()) #print(maskRCNN.state_dict().keys()) maskRCNN.load_state_dict(weights['model'], strict=False) #print(list(maskRCNN.parameters())) #print(maskRCNN.state_dict().keys()) #print(maskRCNN.state_dict()['prd_classifier.fc_hallucinator.weight'] == weights['model']['prd_classifier.fc.weight']) #print(torch.all(torch.eq(maskRCNN.state_dict()['prd_classifier.fc_hallucinator.weight'], weights['model']['prd_classifier.fc.weight']))) #print(torch.all(torch.eq(maskRCNN.state_dict()['Box_Head.heads.0.weight'], weights['model']['Box_Head.heads.0.weight']))) #exit() ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print('train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[2]['lr'] # lr of non-backbone parameters, for commmand line outputs. backbone_lr = optimizer.param_groups[0]['lr'] # lr of backbone parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step_with_prd_cls_v' + str(cfg.MODEL.SUBTYPE) output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() # CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # CHECKPOINT_PERIOD = cfg.SOLVER.MAX_ITER / cfg.TRAIN.SNAPSHOT_FREQ CHECKPOINT_PERIOD = 200000 # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # optimizer_scheduler.step() # if criterion_optimizer: # criterion_optimizer_scheduler.step() # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate_rel(optimizer, lr, lr_new) lr = optimizer.param_groups[2]['lr'] backbone_lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate_rel(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[2]['lr'] backbone_lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate_rel(optimizer, lr, lr_new) if criterion_optimizer: net_utils.update_learning_rate_rel(criterion_optimizer, lr, lr_new) lr = optimizer.param_groups[2]['lr'] backbone_lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() if criterion_optimizer: criterion_optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) #print('input_data', [torch.isnan(x) for x in input_data.values()]) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) #with autograd.detect_anomaly(): net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() if criterion_optimizer: criterion_optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr, backbone_lr) if (step+1) % CHECKPOINT_PERIOD == 0: print('Saving Checkpoint..') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except Exception as e: del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """main function""" if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") args = parse_args() print('Called with args:') print(args) assert args.image_dir or args.images assert bool(args.image_dir) ^ bool(args.images) dataset = datasets.get_hospital_dataset() cfg.MODEL.NUM_CLASSES = 20 # with bg num_class = cfg.MODEL.NUM_CLASSES sents = dataset.sents th_cls = dataset.th_cls cls2eng = dataset.cls2eng eng2type = dataset.eng2type print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) assert bool(args.load_ckpt) ^ bool(args.load_detectron), \ 'Exactly one of --load_ckpt and --load_detectron should be specified.' cfg.MODEL.LOAD_IMAGENET_PRETRAINED_WEIGHTS = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() if args.cuda: maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.load_detectron: print("loading detectron weights %s" % args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() if args.image_dir: imglist = misc_utils.get_imagelist_from_dir(args.image_dir) else: imglist = args.images num_images = len(imglist) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) for i in xrange(num_images): # for each image print('img', i) im = cv2.imread(imglist[i]) assert im is not None # segmentation # d = segment(im) # pdb.set_trace() timers = defaultdict(Timer) # detection cls_boxes, cls_segms, cls_keyps = im_detect_all(maskRCNN, im, timers=timers) # first we collect boxes from all classes dets_total = np.empty([0, 6], dtype=np.float32) for cls in range(1, num_class): # for each cls dets = cls_boxes[cls] if dets.shape[0] == 0: continue dets_extend = np.pad( dets, ((0, 0), (0, 1)), # add 0 rows above, below and left, but 1 row right mode='constant', constant_values=cls) # append cls to dets dets_total = np.vstack((dets_total, dets_extend)) # then use a loose NMS to make each region has only one symptom keep = box_utils.nms(dets_total, 0.7) nms_dets = dets_total[keep, :] # iterate through remained boxes report, healthy = '', True have_sym_of_cls = [False for _ in range(num_class)] n = nms_dets.shape[0] final_results = [] # return to the web for idx in range(n): # for each region th, cls = nms_dets[idx, -2], int(nms_dets[idx, -1]) if th > th_cls[cls]: # diagnosed to have the sym report += sents[cls][1] have_sym_of_cls[cls] = True healthy = False ename = cls2eng[int(cls)] _type = eng2type[ename] final_results.append({ 'name': ename, 'type': _type, 'box': list(nms_dets[idx, 0:4]) }) for cls in range(1, num_class): # for each cls if not have_sym_of_cls[cls]: # if have no sym of this cls report += sents[cls][0] if healthy: report = sents[0][0] print(report) pdb.set_trace() # healthy = True # flag indicating healthy or not # for cls in range(1, num_class): # for each cls # dets = cls_boxes[cls] # if dets.shape[0] == 0: # report += sents[cls][0] # continue # n = dets.shape[0] # flag = False # indicates if the sym exists # for k in range(n): # for each region # if dets[k, -1] > th_cls[cls]: # above threshold for this cls, means have this cls of symptom # report += sents[cls][1] # flag = True # healthy = False # if not flag: # don't have this symptom # report += sents[cls][0] # # if healthy: # use the report for healthy people # report = sents[0][0] im_name, _ = os.path.splitext(os.path.basename(imglist[i])) # vis_utils.vis_one_image( # im[:, :, ::-1], # BGR -> RGB for visualization # im_name, # args.output_dir, # cls_boxes, # cls_segms, # cls_keyps, # dataset=dataset, # box_alpha=0.3, # show_class=True, # thresh=0.05, # kp_thresh=2 # ) if args.merge_pdfs and num_images > 1: merge_out_path = '{}/results.pdf'.format(args.output_dir) if os.path.exists(merge_out_path): os.remove(merge_out_path) command = "pdfunite {}/*.pdf {}".format(args.output_dir, merge_out_path) subprocess.call(command, shell=True)
def main(): args = parse_args() print('Called with args:') print(args) cfg = set_configs(args) timers = defaultdict(Timer) ### -------------------------------------------------------------------------------- ### Dataset Training ### ### -------------------------------------------------------------------------------- timers['roidb_training'].tic() roidb_training, ratio_list_training, ratio_index_training, category_to_id_map, prd_category_to_id_map = combined_roidb_for_training( cfg.TRAIN.DATASETS) timers['roidb_training'].toc() roidb_size_training = len(roidb_training) logger.info('{:d} training roidb entries'.format(roidb_size_training)) logger.info('Takes %.2f sec(s) to construct training roidb', timers['roidb_training'].average_time) batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH dataset_training = RoiDataLoader(roidb_training, cfg.MODEL.NUM_CLASSES, training=True, dataset=cfg.TRAIN.DATASETS) dataloader_training = torch.utils.data.DataLoader( dataset_training, batch_size=batch_size, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch, shuffle=True, drop_last=True) dataiterator_training = iter(dataloader_training) ### -------------------------------------------------------------------------------- ### Dataset Validation ### ### -------------------------------------------------------------------------------- timers['roidb_val'].tic() roidb_val, ratio_list_val, ratio_index_val, _, _ = combined_roidb_for_training( cfg.VAL.DATASETS) timers['roidb_val'].toc() roidb_size_val = len(roidb_val) logger.info('{:d} val roidb entries'.format(roidb_size_val)) logger.info('Takes %.2f sec(s) to construct val roidb', timers['roidb_val'].average_time) dataset_val = RoiDataLoader(roidb_val, cfg.MODEL.NUM_CLASSES, training=False, dataset=cfg.VAL.DATASETS) dataloader_val = torch.utils.data.DataLoader( dataset_val, batch_size=batch_size, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch, drop_last=True) ### -------------------------------------------------------------------------------- ### Dataset Test ### ### -------------------------------------------------------------------------------- timers['roidb_test'].tic() roidb_test, ratio_list_test, ratio_index_test, _, _ = combined_roidb_for_training( cfg.TEST.DATASETS) timers['roidb_test'].toc() roidb_size_test = len(roidb_test) logger.info('{:d} test roidb entries'.format(roidb_size_test)) logger.info('Takes %.2f sec(s) to construct test roidb', timers['roidb_test'].average_time) dataset_test = RoiDataLoader(roidb_test, cfg.MODEL.NUM_CLASSES, training=False, dataset=cfg.TEST.DATASETS) dataloader_test = torch.utils.data.DataLoader( dataset_test, batch_size=batch_size, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch, drop_last=True) ### -------------------------------------------------------------------------------- ### Dataset Unseen ### ### -------------------------------------------------------------------------------- if args.dataset == 'vhico': timers['roidb_unseen'].tic() roidb_unseen, ratio_list_unseen, ratio_index_unseen, _, _ = combined_roidb_for_training( cfg.UNSEEN.DATASETS) timers['roidb_unseen'].toc() roidb_size_unseen = len(roidb_unseen) logger.info('{:d} test unseen roidb entries'.format(roidb_size_unseen)) logger.info('Takes %.2f sec(s) to construct test roidb', timers['roidb_unseen'].average_time) dataset_unseen = RoiDataLoader(roidb_unseen, cfg.MODEL.NUM_CLASSES, training=False, dataset=cfg.UNSEEN.DATASETS) dataloader_unseen = torch.utils.data.DataLoader( dataset_unseen, batch_size=batch_size, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch, drop_last=True) ### -------------------------------------------------------------------------------- ### Model ### ### -------------------------------------------------------------------------------- maskRCNN = Generalized_RCNN(category_to_id_map=category_to_id_map, prd_category_to_id_map=prd_category_to_id_map, args=args) if cfg.CUDA: maskRCNN.cuda() ### -------------------------------------------------------------------------------- ### Optimizer ### # record backbone params, i.e., conv_body and box_head params ### -------------------------------------------------------------------------------- gn_params = [] backbone_bias_params = [] backbone_bias_param_names = [] prd_branch_bias_params = [] prd_branch_bias_param_names = [] backbone_nonbias_params = [] backbone_nonbias_param_names = [] prd_branch_nonbias_params = [] prd_branch_nonbias_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'gn' in key: gn_params.append(value) elif 'Conv_Body' in key or 'Box_Head' in key or 'Box_Outs' in key or 'RPN' in key: if 'bias' in key: backbone_bias_params.append(value) backbone_bias_param_names.append(key) else: backbone_nonbias_params.append(value) backbone_nonbias_param_names.append(key) else: if 'bias' in key: prd_branch_bias_params.append(value) prd_branch_bias_param_names.append(key) else: prd_branch_nonbias_params.append(value) prd_branch_nonbias_param_names.append(key) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [{ 'params': backbone_nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': backbone_bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, { 'params': prd_branch_nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': prd_branch_bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, { 'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN }] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### -------------------------------------------------------------------------------- ### Load checkpoint ### -------------------------------------------------------------------------------- if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) print( '--------------------------------------------------------------------------------' ) print('loading checkpoint %s' % load_name) print( '--------------------------------------------------------------------------------' ) if args.resume: print('resume') args.start_step = checkpoint['step'] + 1 misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() else: print('args.load_ckpt', args.load_ckpt) lr = optimizer.param_groups[2][ 'lr'] # lr of non-backbone parameters, for commmand line outputs. backbone_lr = optimizer.param_groups[0][ 'lr'] # lr of backbone parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### -------------------------------------------------------------------------------- ### Training Setups ### ### -------------------------------------------------------------------------------- args.run_name = args.out_dir output_dir = misc_utils.get_output_dir(args, args.out_dir) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter tblogger = SummaryWriter(output_dir) ### -------------------------------------------------------------------------------- ### Training Loop ### ### -------------------------------------------------------------------------------- maskRCNN.train() # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None, True) val_stats = ValStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None, False) test_stats = TestStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None, False) best_total_loss = np.inf best_eval_result = 0 ### -------------------------------------------------------------------------------- ### EVAL ### ### -------------------------------------------------------------------------------- if cfg.EVAL_SUBSET == 'unseen': print('testing unseen ...') is_best, best_eval_result = run_eval(args, cfg, maskRCNN, dataloader_unseen, step=0, output_dir=output_dir, test_stats=test_stats, best_eval_result=best_eval_result, eval_subset=cfg.EVAL_SUBSET) return elif cfg.EVAL_SUBSET == 'test': print('testing ...') is_best, best_eval_result = run_eval(args, cfg, maskRCNN, dataloader_test, step=0, output_dir=output_dir, test_stats=test_stats, best_eval_result=best_eval_result, eval_subset=cfg.EVAL_SUBSET) return ### -------------------------------------------------------------------------------- ### TRAIN ### ### -------------------------------------------------------------------------------- try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError( 'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate_rel(optimizer, lr, lr_new) lr = optimizer.param_groups[2]['lr'] backbone_lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate_rel(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[2]['lr'] backbone_lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate_rel(optimizer, lr, lr_new) lr = optimizer.param_groups[2]['lr'] backbone_lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 ######################################################################################################################### ## train ######################################################################################################################### training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator_training) except StopIteration: print('recurrence data loader') dataiterator_training = iter(dataloader_training) input_data = next(dataiterator_training) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs['gt_label'], inner_iter) loss = net_outputs['gt_label']['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr, backbone_lr) if (step + 1) % cfg.SAVE_MODEL_ITER == 0: save_ckpt(output_dir, args, step, batch_size, maskRCNN, optimizer, False, best_total_loss) # ---- Training ends ---- save_ckpt(output_dir, args, step, batch_size, maskRCNN, optimizer, False, best_total_loss) except (RuntimeError, KeyboardInterrupt): del dataiterator_training logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, batch_size, maskRCNN, optimizer, False, best_total_loss) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def test_net_on_dataset_multigpu(args): dataset = WAD_CVPR2018(args.dataset_dir) cfg.MODEL.NUM_CLASSES = len( dataset.eval_class) + 1 # with a background class print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) cfg.RESNETS.IMAGENET_PRETRAINED = False # Don't need to load imagenet pretrained weights if args.nms_soft: cfg.TEST.SOFT_NMS.ENABLED = True else: cfg.TEST.NMS = args.nms assert_and_infer_cfg() maskRCNN = Generalized_RCNN() maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() #imglist = misc_utils.get_imagelist_from_dir(dataset.test_image_dir) img_produced = os.listdir(args.output_list_dir) imglist_all = misc_utils.get_imagelist_from_dir(dataset.test_image_dir) imglist = [ x for x in imglist_all if x.split('/')[-1][:-4] + '.txt' not in img_produced ] #for i in tqdm(xrange(args.range[0], args.range[1])): start_idx = len(imglist_all) - args.range[0] end_idx = len(imglist_all) - args.range[0] + args.range[1] for i in tqdm(xrange(start_idx, end_idx)): im = cv2.imread(imglist_all[i]) assert im is not None timers = defaultdict(Timer) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) args.current_im_name = im_name cls_boxes, cls_segms, prediction_row = im_detect_all(args, maskRCNN, im, dataset, timers=timers) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) print(im_name) if args.vis: vis_utils.vis_one_image_cvpr2018_wad( im[:, :, ::-1], # BGR -> RGB for visualization im_name, args.output_vis_dir, cls_boxes, cls_segms, None, dataset=dataset, box_alpha=0.3, show_class=True, thresh=0.5, kp_thresh=2) thefile = open(os.path.join(args.output_list_dir, im_name + '.txt'), 'w') for item in prediction_row: thefile.write("%s\n" % item)
def test_net_on_dataset(args): dataset = WAD_CVPR2018(args.dataset_dir) cfg.MODEL.NUM_CLASSES = len(dataset.eval_class) + 1 # with a background class print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(osp.join(this_dir,args.cfg_file)) if args.nms_soft: cfg.TEST.SOFT_NMS.ENABLED = True else: cfg.TEST.NMS = args.nms cfg.RESNETS.IMAGENET_PRETRAINED = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() maskRCNN.cuda() if args.load_ckpt: load_name = osp.join(this_dir, args.load_ckpt) print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() if args.nms_soft: output_dir = os.path.join(('/').join(args.load_ckpt.split('/')[:-2]), 'Images_' + str(cfg.TEST.SCALE)+'_SOFT_NMS') elif args.nms: output_dir = os.path.join(('/').join(args.load_ckpt.split('/')[:-2]), 'Images_' + str(cfg.TEST.SCALE)+'_NMS_%.2f'%args.nms) else: output_dir = os.path.join(('/').join(args.load_ckpt.split('/')[:-2]), 'Images_' + str(cfg.TEST.SCALE)) if cfg.TEST.BBOX_AUG.ENABLED: output_dir += '_TEST_AUG' #if args.cls_boxes_confident_threshold < 0.5: output_dir += '_cls_boxes_confident_threshold_%.1f' % args.cls_boxes_confident_threshold if not os.path.exists(output_dir): os.makedirs(output_dir) args.output_img_dir = os.path.join(output_dir, 'Image_Masks') if not os.path.exists(args.output_img_dir): os.makedirs(args.output_img_dir) output_list_dir = '/home/stevenwudi/PycharmProjects/Udacity/CarND-Vehicle-Detection/output_images/car_detection_maskrcnn' if not os.path.exists(output_list_dir): os.makedirs(output_list_dir) output_vis_dir = output_list_dir imglist = ['/home/stevenwudi/PycharmProjects/Udacity/CarND-Vehicle-Detection/test_images/test3.jpg', '/home/stevenwudi/PycharmProjects/Udacity/CarND-Vehicle-Detection/test_images/test4.jpg', '/home/stevenwudi/PycharmProjects/Udacity/CarND-Vehicle-Detection/test_images/test5.jpg'] if not args.range: start = 0 end = len(imglist) else: start = args.range[0] end = args.range[1] for i in tqdm(xrange(start, end)): im = cv2.imread(imglist[i]) assert im is not None timers = defaultdict(Timer) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) args.current_im_name = im_name cls_boxes, cls_segms, prediction_row = im_detect_all(args, maskRCNN, im, dataset, timers=timers) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) print(im_name) if args.vis: vis_utils.vis_one_image_cvpr2018_wad( im[:, :, ::-1], # BGR -> RGB for visualization im_name, output_vis_dir, cls_boxes, cls_segms, None, dataset=dataset, box_alpha=0.3, show_class=True, thresh=0.99, kp_thresh=2 ) thefile = open(os.path.join(output_list_dir, im_name + '.txt'), 'w') for item in prediction_row: thefile.write("%s\n" % item)
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True ) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name+'.weight') gn_param_nameset.add(name+'.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN} ] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print('train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step+1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "common": cfg.TRAIN.DATASETS = ('common_train', ) cfg.MODEL.NUM_CLASSES = 81 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'. format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) assert_and_infer_cfg() ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) params = [{ 'params': nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) iters_per_epoch = int(train_size / args.batch_size) # drop last args.iters_per_epoch = iters_per_epoch ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch try: logger.info('Training starts !') args.step = args.start_iter global_step = iters_per_epoch * args.start_epoch + args.step for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- # adjust learning rate if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[ 0] and args.start_iter == 0: args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for args.step, input_data in zip( range(args.start_iter, iters_per_epoch), dataloader): for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) training_stats.IterTic() net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs) loss = net_outputs['total_loss'] optimizer.zero_grad() loss.backward() optimizer.step() training_stats.IterToc() if (args.step + 1) % ckpt_interval_per_epoch == 0: net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) if args.step % args.disp_interval == 0: log_training_stats(training_stats, global_step, lr) global_step += 1 # ---- End of epoch ---- # save checkpoint net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) # reset starting iter number after first epoch args.start_iter = 0 # ---- Training ends ---- if iters_per_epoch % args.disp_interval != 0: # log last stats at the end log_training_stats(training_stats, global_step, lr) except (RuntimeError, KeyboardInterrupt): logger.info('Save ckpt on exception ...') net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
from torchvision import transforms import torch.nn.functional as F # load config dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = len(dataset.classes) cfg_from_file('configs/baselines/e2e_mask_rcnn_R-50-C4_1x.yaml') cfg.MODEL.LOAD_IMAGENET_PRETRAINED_WEIGHTS = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() # load model maskRCNN = Generalized_RCNN() checkpoint = torch.load( '/home/work/liupeng11/code/Detectron.pytorch/models/e2e_mask_rcnn_R-50-C4_1x.pth', map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) maskRCNN.eval() maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # load image img_path = "/home/work/liupeng11/code/Detectron.pytorch/demo/sample_images/img1.jpg" im = cv2.imread(img_path) # detect bouding boxes and segments from core.test import im_detect_bbox, im_detect_mask, box_results_with_nms_and_limit, segm_results scores, boxes, im_scale, blob_conv = im_detect_bbox(maskRCNN, im, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, None)
def main(): saveNetStructure = False """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: #set gpu device os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(ids) for ids in args.device_ids]) torch.backends.cudnn.benchmark = True cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cityscapes": cfg.TRAIN.DATASETS = ('cityscapes_semseg_train', ) cfg.MODEL.NUM_CLASSES = 19 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'. format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON: roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg( cfg.TRAIN.DATASETS) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch_semseg if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON else collate_minibatch) assert_and_infer_cfg() #for args.step, input_data in zip(range(100), dataloader): # data_L = input_data['data'] # data_R = input_data['data_R'] # label = input_data['disp_label_0'] # cv2.imwrite('ims_L.png', data_L[0].numpy()[0].transpose(1,2,0)[:,:,::-1]+cfg.PIXEL_MEANS) # cv2.imwrite('ims_R.png', data_R[0].numpy()[0].transpose(1,2,0)[:,:,::-1]+cfg.PIXEL_MEANS) # cv2.imwrite('label.png', label[0].numpy()[0]) # return ### Model ### dispSeg = DispSeg() if cfg.CUDA: dispSeg.to('cuda') pspnet_bias_params = [] pspnet_nonbias_params = [] for key, value in dict(dispSeg.pspnet.named_parameters()).items(): if value.requires_grad: if 'bias' in key: pspnet_bias_params.append(value) else: pspnet_nonbias_params.append(value) pspnet_params = [{ 'params': pspnet_nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': pspnet_bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] glassGCN_bias_params = [] glassGCN_nonbias_params = [] for key, value in dict(dispSeg.glassGCN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: glassGCN_bias_params.append(value) else: glassGCN_nonbias_params.append(value) segdisp3d_params = [{ 'params': glassGCN_nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': glassGCN_bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] if cfg.SOLVER.TYPE == "SGD": optimizerP = torch.optim.SGD(pspnet_params, momentum=cfg.SOLVER.MOMENTUM) optimizerS = torch.optim.SGD(segdisp3d_params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizerP = torch.optim.Adam(pspnet_params) optimizerS = torch.optim.Adam(segdisp3d_params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(pspnet, checkpoint['model']) net_utils.load_ckpt(segdisp3d, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() lr = optimizerP.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. dispSeg = mynn.DataParallel(dispSeg, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: #from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### dispSeg.train() training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) iters_per_epoch = int(train_size / args.batch_size) # drop last args.iters_per_epoch = iters_per_epoch ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch try: logger.info('Training starts !') args.step = args.start_iter global_step = iters_per_epoch * args.start_epoch + args.step for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- # adjust learning rate if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[ 0] and args.start_iter == 0: args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizerP, lr, cfg.SOLVER.GAMMA) net_utils.decay_learning_rate(optimizerS, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for args.step, input_data in zip( range(args.start_iter, iters_per_epoch), dataloader): if cfg.DISP.DISP_ON: input_data['data'] = list( map(lambda x, y: torch.cat((x, y), dim=0), input_data['data'], input_data['data_R'])) if cfg.SEM.DECODER_TYPE.endswith('3Ddeepsup'): input_data['disp_scans'] = torch.arange( 0, cfg.DISP.MAX_DISPLACEMENT).float().view( 1, cfg.DISP.MAX_DISPLACEMENT, 1, 1).repeat(args.batch_size, 1, 1, 1) input_data['semseg_scans'] = torch.arange( 0, cfg.MODEL.NUM_CLASSES).long().view( 1, cfg.MODEL.NUM_CLASSES, 1, 1).repeat(args.batch_size, 1, 1, 1) del input_data['data_R'] for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list( map( lambda x: Variable(x, requires_grad=False).to( 'cuda'), input_data[key])) training_stats.IterTic() net_outputs = dispSeg(**input_data) training_stats.UpdateIterStats(net_outputs) #loss = net_outputs['losses']['loss_semseg'] #acc = net_outputs['metrics']['accuracy_pixel'] #print (loss.item(), acc) #for key in net_outputs.keys(): # print(key) loss = net_outputs['total_loss'] #print("loss.shape:",loss) optimizerP.zero_grad() optimizerS.zero_grad() loss.backward() optimizerP.step() optimizerS.step() training_stats.IterToc() if args.step % args.disp_interval == 0: #disp_image=net_outputs['disp_image'] #semseg_image=net_outputs['semseg_image'] #tblogger.add_image('disp_image',disp_image,global_step) #tblogger.add_image('semseg_image',semseg_image,global_step) log_training_stats(training_stats, global_step, lr) global_step += 1 # ---- End of epoch ---- # save checkpoint net_utils.save_ckpt(output_dir, args, dispSeg, optimizerS) # reset starting iter number after first epoch args.start_iter = 0 # ---- Training ends ---- #if iters_per_epoch % args.disp_interval != 0: # log last stats at the end # log_training_stats(training_stats, global_step, lr) except (RuntimeError, KeyboardInterrupt): logger.info('Save ckpt on exception ...') net_utils.save_ckpt(output_dir, args, dispSeg, optimizerS) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """main function""" os.environ['CUDA_VISIBLE_DEVICES'] = '1' if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") args = parse_args() print('Called with args:') print(args) assert args.image_dir or args.images assert bool(args.image_dir) ^ bool(args.images) if args.dataset.startswith("coco"): dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = len(dataset.classes) elif args.dataset.startswith("keypoints_coco"): dataset = datasets.get_coco_dataset() cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError('Unexpected dataset name: {}'.format(args.dataset)) print('load cfg from file: {}'.format(args.cfg_file)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) assert bool(args.load_ckpt) ^ bool(args.load_detectron), \ 'Exactly one of --load_ckpt and --load_detectron should be specified.' cfg.MODEL.LOAD_IMAGENET_PRETRAINED_WEIGHTS = False # Don't need to load imagenet pretrained weights assert_and_infer_cfg() maskRCNN = Generalized_RCNN() if args.cuda: maskRCNN.cuda() if args.load_ckpt: load_name = args.load_ckpt print("loading checkpoint %s" % (load_name)) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.load_detectron: print("loading detectron weights %s" % args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True, device_ids=[0]) # only support single GPU maskRCNN.eval() params = list(maskRCNN.parameters()) k = 0 for i in params: l = 1 for j in i.size(): l *= j k = k + l print('zonghe:' + str(k)) if args.image_dir: imglist = misc_utils.get_imagelist_from_dir(args.image_dir) else: imglist = args.images num_images = len(imglist) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) for i in xrange(num_images): print('img', i) im = cv2.imread(imglist[i]) assert im is not None timers = defaultdict(Timer) start = time.time() cls_boxes, cls_segms, cls_keyps = im_detect_all(maskRCNN, im, timers=timers) class_result_boxes = [] for index, class_boxes in enumerate(cls_boxes): if len(class_boxes) != 0: class_boxes = class_boxes.tolist() results_oneclass = threeD_detect(imglist[i], class_boxes, index) class_result_boxes.append(results_oneclass) save_image = im color_class = { 'Car': [0, 255, 255], 'Cyclist': [255, 0, 0], 'Pedestrian': [0, 0, 255] } for result_boxes in class_result_boxes: for box in result_boxes: cv2.rectangle(save_image, (box[0], box[1]), (box[2], box[3]), color_class[box[-1]], 2) height = round(box[-2][0], 2) width = round(box[-2][1], 2) length = round(box[-2][2], 2) threeD_info = str(height) + ' ' + str(width) + ' ' + str( length) cv2.putText(save_image, threeD_info, (box[0], box[1] - 20), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 2) _, imagename = os.path.split(imglist[i]) imagename2 = imagename.split('.')[0] cv2.imwrite('../output1/%s.png' % imagename2, save_image) end = time.time() print(end - start) im_name, _ = os.path.splitext(os.path.basename(imglist[i])) vis_utils.vis_one_image( im[:, :, ::-1], # BGR -> RGB for visualization im_name, args.output_dir, cls_boxes, cls_segms, cls_keyps, dataset=dataset, box_alpha=0.3, show_class=True, thresh=0.7, kp_thresh=2) if args.merge_pdfs and num_images > 1: merge_out_path = '{}/results.pdf'.format(args.output_dir) if os.path.exists(merge_out_path): os.remove(merge_out_path) command = "pdfunite {}/*.pdf {}".format(args.output_dir, merge_out_path) subprocess.call(command, shell=True)