def add_model_training_inputs(model): """Load the training dataset and attach the training inputs to the model.""" logger = logging.getLogger(__name__) logger.info('Loading dataset: {}'.format(cfg.TRAIN.DATASETS)) roidb = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES ) logger.info('{:d} roidb entries'.format(len(roidb))) model_builder.add_training_inputs(model, roidb=roidb)
def main(opts): logger = logging.getLogger(__name__) roidb = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) logger.info('{:d} roidb entries'.format(len(roidb))) roi_data_loader = RoIDataLoader( roidb, num_loaders=opts.num_loaders, minibatch_queue_size=opts.minibatch_queue_size, blobs_queue_capacity=opts.blobs_queue_capacity) blob_names = roi_data_loader.get_output_names() net = core.Net('dequeue_net') net.type = 'dag' all_blobs = [] for gpu_id in range(cfg.NUM_GPUS): with core.NameScope('gpu_{}'.format(gpu_id)): with core.DeviceScope(muji.OnGPU(gpu_id)): for blob_name in blob_names: blob = core.ScopedName(blob_name) all_blobs.append(blob) workspace.CreateBlob(blob) logger.info('Creating blob: {}'.format(blob)) net.DequeueBlobs( roi_data_loader._blobs_queue_name, blob_names) logger.info("Protobuf:\n" + str(net.Proto())) if opts.profiler: import cProfile cProfile.runctx( 'loader_loop(roi_data_loader)', globals(), locals(), sort='cumulative') else: loader_loop(roi_data_loader) roi_data_loader.register_sigint_handler() roi_data_loader.start(prefill=True) total_time = 0 for i in range(opts.num_batches): start_t = time.time() for _ in range(opts.x_factor): workspace.RunNetOnce(net) total_time += (time.time() - start_t) / opts.x_factor logger.info('{:d}/{:d}: Averge dequeue time: {:.3f}s [{:d}/{:d}]'. format(i + 1, opts.num_batches, total_time / (i + 1), roi_data_loader._minibatch_queue.qsize(), opts.minibatch_queue_size)) # Sleep to simulate the time taken by running a little network time.sleep(opts.sleep_time) # To inspect: # blobs = workspace.FetchBlobs(all_blobs) # from IPython import embed; embed() logger.info('Shutting down data loader (EnqueueBlob errors are ok)...') roi_data_loader.shutdown()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format( old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, drop_last=True, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0} ] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility assert checkpoint['train_size'] == train_size # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) training_stats.IterTic() net_outputs = maskRCNN(**input_data) training_stats.IterToc() training_stats.UpdateIterStats(net_outputs) training_stats.LogIterStats(step, lr) loss = net_outputs['total_loss'] optimizer.zero_grad() loss.backward() optimizer.step() if (step+1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt) as e: has_exception = True stack_trace = traceback.format_exc() print(stack_trace) finally: if locals().get('has_exception'): print('Save on exception') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True ) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name+'.weight') gn_param_nameset.add(name+'.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN} ] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print('train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step+1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "custom_dataset" and args.num_classes is None: raise ValueError( "Need number of classes in your custom dataset to run!") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "voc2007": cfg.TRAIN.DATASETS = ('voc_2007_train', ) cfg.MODEL.NUM_CLASSES = 21 elif args.dataset == "voc2012": cfg.TRAIN.DATASETS = ('voc_2012_train', ) cfg.MODEL.NUM_CLASSES = 21 elif args.dataset == "custom_dataset": cfg.TRAIN.DATASETS = ('custom_data_train', ) cfg.MODEL.NUM_CLASSES = args.num_classes else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'. format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) assert_and_infer_cfg() ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) params = [{ 'params': nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) iters_per_epoch = int(train_size / args.batch_size) # drop last args.iters_per_epoch = iters_per_epoch ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch try: logger.info('Training starts !') args.step = args.start_iter global_step = iters_per_epoch * args.start_epoch + args.step for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- # adjust learning rate if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[ 0] and args.start_iter == 0: args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for args.step, input_data in zip( range(args.start_iter, iters_per_epoch), dataloader): for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) training_stats.IterTic() net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs) loss = net_outputs['total_loss'] optimizer.zero_grad() loss.backward() optimizer.step() training_stats.IterToc() if (args.step + 1) % ckpt_interval_per_epoch == 0: net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) if args.step % args.disp_interval == 0: log_training_stats(training_stats, global_step, lr) global_step += 1 # ---- End of epoch ---- # save checkpoint net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) # reset starting iter number after first epoch args.start_iter = 0 # ---- Training ends ---- if iters_per_epoch % args.disp_interval != 0: # log last stats at the end log_training_stats(training_stats, global_step, lr) except (RuntimeError, KeyboardInterrupt): logger.info('Save ckpt on exception ...') net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "rsna2018_pos_neg_k1": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_k1','RSNA_2018_neg_train_k1', 'RSNA_2018_pos_train_k1', 'RSNA_2018_pos_train_k1') cfg.TEST.DATASETS = ('RSNA_2018_pos_val_k1','RSNA_2018_neg_val_k1') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_k1": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_k1', ) cfg.TEST.DATASETS = ('RSNA_2018_pos_val_k1',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_neg_k2": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_k2','RSNA_2018_neg_train_k2', 'RSNA_2018_pos_train_k2', 'RSNA_2018_pos_train_k2') cfg.TEST.DATASETS = ('RSNA_2018_pos_val_k2','RSNA_2018_neg_val_k2') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_k2": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_k2', ) cfg.TEST.DATASETS = ('RSNA_2018_pos_val_k2',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_neg_k3": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_k3','RSNA_2018_neg_train_k3', 'RSNA_2018_pos_train_k3', 'RSNA_2018_pos_train_k3') cfg.TEST.DATASETS = ('RSNA_2018_pos_val_k3','RSNA_2018_neg_val_k3') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_k3": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_k3', ) cfg.TEST.DATASETS = ('RSNA_2018_pos_val_k3',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_neg": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train','RSNA_2018_neg_train', 'RSNA_2018_pos_train', 'RSNA_2018_pos_train') cfg.TEST.DATASETS = ('RSNA_2018_pos_val','RSNA_2018_neg_val') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train', ) cfg.TEST.DATASETS = ('RSNA_2018_pos_val',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_neg_clahe": cfg.TRAIN.DATASETS =('RSNA_2018_pos_train_clahe','RSNA_2018_neg_train_clahe', 'RSNA_2018_pos_train_clahe', 'RSNA_2018_pos_train_clahe') cfg.TEST.DATASETS =('RSNA_2018_pos_val_clahe_clahe','RSNA_2018_neg_val_clahe') cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "rsna2018_pos_clahe": cfg.TRAIN.DATASETS = ('RSNA_2018_pos_train_clahe', ) cfg.TEST.DATASETS = ('RSNA_2018_pos_val_clahe',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_val',) cfg.TEST.DATASETS = ('coco_2017_val') cfg.MODEL.NUM_CLASSES = 81 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() #print() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True ) #batchSampler = BatchSampler(torch_sampler.Sampler, batch_size=args.batch_size, drop_last=True) dataset = RoiDataLoader( roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name+'.weight') gn_param_nameset.add(name+'.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in maskRCNN.named_parameters(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN} ] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print('train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. optimizer.load_state_dict(checkpoint['optimizer']) # misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) best_score = 0 try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr) if (step+1) % CHECKPOINT_PERIOD == 0: perforamce = eval_net(model=maskRCNN, submit_threshold=0.5, dataset_names=cfg.TEST.DATASETS, output_dir=output_dir, gt_file='/home1/data/Kaggle_RSNA_Pneumonia/annotations/k_folder_1/k_folder1_val.csv') #gt_file='/home/wccui/RSNA/data/csv/val_pos_459_gt.csv') #gt_file='/home/wccui/detectron_pytorch_RSNA_v10_9/data/csv/val_pos_459_gt.csv') maskRCNN.train() print("perforamce ",perforamce) MAP, precision, recall, f1 = summary_per(perforamce) is_best = MAP > best_score prev_best_score = best_score best_score = max(MAP, best_score) if is_best: print ('Best snapshot! {} -> {}'.format(prev_best_score, best_score)) save_best_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer,best_score) tblogger.add_scalar('val_MAP', MAP, step+1) tblogger.add_scalar('val_precision', precision, step + 1) tblogger.add_scalar('val_recall', recall, step + 1) tblogger.add_scalar('val_f1', f1, step + 1) save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): netStructure = SummaryWriter( log_dir= '/home/zhanwj/Desktop/pyTorch/Detectron.pytorch/network_structure/dispSeg/' ) saveNetStructure = False """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: #set gpu device os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( [str(ids) for ids in args.device_ids]) torch.backends.cudnn.benchmark = True cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "cityscapes": cfg.TRAIN.DATASETS = ('cityscapes_semseg_train', ) cfg.MODEL.NUM_CLASSES = 19 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Batch size change from {} (in config file) to {}'.format( original_batch_size, args.batch_size)) print('NUM_GPUs: %d, TRAIN.IMS_PER_BATCH: %d' % (cfg.NUM_GPUS, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'. format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON: roidb, ratio_list, ratio_index = combined_roidb_for_training_semseg( cfg.TRAIN.DATASETS) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch_semseg if cfg.SEM.SEM_ON or cfg.DISP.DISP_ON else collate_minibatch) assert_and_infer_cfg() #for args.step, input_data in zip(range(100), dataloader): # data_L = input_data['data'] # data_R = input_data['data_R'] # label = input_data['disp_label_0'] # cv2.imwrite('ims_L.png', data_L[0].numpy()[0].transpose(1,2,0)[:,:,::-1]+cfg.PIXEL_MEANS) # cv2.imwrite('ims_R.png', data_R[0].numpy()[0].transpose(1,2,0)[:,:,::-1]+cfg.PIXEL_MEANS) # cv2.imwrite('label.png', label[0].numpy()[0]) # return ### Model ### maskRCNN = eval(cfg.MODEL.TYPE)() """ if cfg.SEM.SEM_ON: if not cfg.DISP.DISP_ON: maskRCNN = Generalized_SEMSEG() else: maskRCNN = Generalized_3DSD() else: maskRCNN = Generalized_RCNN() """ if cfg.CUDA: maskRCNN.to('cuda') #print(maskRCNN) ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) params = [{ 'params': nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: #from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) iters_per_epoch = int(train_size / args.batch_size) # drop last args.iters_per_epoch = iters_per_epoch ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch try: logger.info('Training starts !') args.step = args.start_iter global_step = iters_per_epoch * args.start_epoch + args.step for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- # adjust learning rate if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[ 0] and args.start_iter == 0: args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for args.step, input_data in zip( range(args.start_iter, iters_per_epoch), dataloader): if cfg.DISP.DISP_ON: input_data['data'] = list( map(lambda x, y: torch.cat((x, y), dim=0), input_data['data'], input_data['data_R'])) if cfg.SEM.DECODER_TYPE.endswith('3D'): input_data['disp_scans'] = torch.arange( 1, cfg.DISP.MAX_DISPLACEMENT + 1).float().view( 1, cfg.DISP.MAX_DISPLACEMENT).repeat( args.batch_size, 1) del input_data['data_R'] for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list( map( lambda x: Variable(x, requires_grad=False).to( 'cuda'), input_data[key])) training_stats.IterTic() net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs) #loss = net_outputs['losses']['loss_semseg'] #acc = net_outputs['metrics']['accuracy_pixel'] #print (loss.item(), acc) #for key in net_outputs.keys(): # print(key) loss = net_outputs['total_loss'] #print("loss.shape:",loss) optimizer.zero_grad() loss.backward() optimizer.step() training_stats.IterToc() if args.step % args.disp_interval == 0: disp_image = net_outputs['disp_image'] #tblogger.add_image('disp_image',disp_image,global_step) #tblogger.add_image('semseg_image',semseg_image,global_step) log_training_stats(training_stats, global_step, lr) global_step += 1 # ---- End of epoch ---- # save checkpoint net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) # reset starting iter number after first epoch args.start_iter = 0 # ---- Training ends ---- #if iters_per_epoch % args.disp_interval != 0: # log last stats at the end # log_training_stats(training_stats, global_step, lr) except (RuntimeError, KeyboardInterrupt): logger.info('Save ckpt on exception ...') net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") merge_cfg_from_file(args.cfg_file) # Some manual adjustment for the ApolloScape dataset parameters here cfg.OUTPUT_DIR = args.output_dir cfg.TRAIN.DATASETS = 'TLESS' cfg.MODEL.NUM_CLASSES = 30 # We have only one class for all CAD models cfg.MODEL.NUMBER_CARS = 1 cfg.TRAIN.MIN_AREA = 196 # 14*14 cfg.TRAIN.USE_FLIPPED = False # Currently I don't know how to handle the flipped case cfg.TRAIN.IMS_PER_BATCH = 1 cfg.NUM_GPUS = torch.cuda.device_count() effective_batch_size = cfg.TRAIN.IMS_PER_BATCH * cfg.NUM_GPUS * args.iter_size ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size assert (args.batch_size % cfg.NUM_GPUS) == 0, 'batch_size: %d, NUM_GPUS: %d' % ( args.batch_size, cfg.NUM_GPUS) print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print( 'Adjust BASE_LR linearly according to batch_size change:\n BASE_LR: {} --> {}' .format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list( map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print( 'Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print( 'Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format( cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() if cfg.MODEL.LOSS_3D_2D_ON: roidb, ratio_list, ratio_index, ds = combined_roidb_for_training( cfg.TRAIN.DATASETS, args.dataset_dir) else: roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, args.dataset_dir) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True, valid_keys=[ 'has_visible_keypoints', 'boxes', 'seg_areas', 'gt_classes', 'gt_overlaps', 'box_to_gt_ind_map', 'segms', 'is_crowd', 'car_cat_classes', 'poses', 'quaternions', 'im_info' ]) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, drop_last=True, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### if cfg.MODEL.LOSS_3D_2D_ON: maskRCNN = Generalized_RCNN(ds.Car3D) else: maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [{ 'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model'], ignore_list=args.ckpt_ignore_head) if args.resume: args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print( 'train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: # TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha # warmup_factor_trans = cfg.SOLVER.WARM_UP_FACTOR_TRANS * (1 - alpha) + alpha # warmup_factor_trans *= cfg.TRANS_HEAD.LOSS_BETA warmup_factor_trans = 1.0 else: raise KeyError( 'Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) net_outputs = maskRCNN(**input_data) net_outputs['losses'][ 'loss_car_cls'] *= cfg.CAR_CLS.CAR_CLS_LOSS_BETA net_outputs['losses']['loss_rot'] *= cfg.CAR_CLS.ROT_LOSS_BETA if cfg.MODEL.TRANS_HEAD_ON: net_outputs['losses'][ 'loss_trans'] *= cfg.TRANS_HEAD.TRANS_LOSS_BETA training_stats.UpdateIterStats_car_3d(net_outputs) # start training # loss_car_cls: 2.233790, loss_rot: 0.296853, loss_trans: ~100 loss = net_outputs['losses']['loss_car_cls'] + net_outputs[ 'losses']['loss_rot'] if cfg.MODEL.TRANS_HEAD_ON: loss += net_outputs['losses']['loss_trans'] if cfg.MODEL.LOSS_3D_2D_ON: loss += net_outputs['losses']['UV_projection_loss'] if not cfg.TRAIN.FREEZE_CONV_BODY and not cfg.TRAIN.FREEZE_RPN and not cfg.TRAIN.FREEZE_FPN: loss += net_outputs['total_loss_conv'] loss.backward() optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr, warmup_factor_trans) if (step + 1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
import _init_paths # pylint: disable=unused-import import torch from datasets.roidb import combined_roidb_for_training from roi_data.loader import RoiDataLoader, MinibatchSampler, BatchSampler, collate_minibatch from core.config import cfg, cfg_from_file, cfg_from_list, assert_and_infer_cfg if __name__ == '__main__': datasets = ('vcoco_trainval', ) batch_size = 2 num_classes = 81 cfg.RPN.RPN_ON = True # assert_and_infer_cfg() roidb, ratio_list, ratio_index = combined_roidb_for_training(datasets, ()) # print(roidb[0]) print(roidb[0].keys()) ''' batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=batch_size, drop_last=True ) dataset = RoiDataLoader( roidb, num_classes, training=True ) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=1,
def add_model_inputs(model): logger.info('Loading dataset: {}'.format(cfg.TRAIN.DATASET)) roidb = combined_roidb_for_training( cfg.TRAIN.DATASET, cfg.TRAIN.PROPOSAL_FILE) logger.info('{:d} roidb entries'.format(len(roidb))) model_builder.add_inputs(model, roidb=roidb)
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "coco2014": cfg.TRAIN.DATASETS = ('coco_2014_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == 'vcoco_trainval': cfg.TRAIN.DATASETS = ('vcoco_trainval', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == 'vcoco_train': cfg.TRAIN.DATASETS = ('vcoco_train', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == 'vcoco_val': cfg.TRAIN.DATASETS = ('vcoco_val', ) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == 'keypoints_coco2014': cfg.TRAIN.DATASETS = ('keypoints_coco_2014_train', ) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train', ) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) if args.vcoco_kp_on: cfg.VCOCO.KEYPOINTS_ON = True cfg.NETWORK_NAME = args.net_name # network name print('Network name:', args.net_name) cfg.MODEL.CONV_BODY = args.conv_body # backbone network name print('Conv_body name:', args.conv_body) cfg.TRAIN.FG_THRESH = args.fg_thresh print('Train fg thresh:', args.fg_thresh) cfg.RESNETS.FREEZE_AT = args.freeze_at print('Freeze at: ', args.freeze_at) cfg.VCOCO.MLP_HEAD_DIM = args.mlp_head_dim print('MLP head dim: ', args.mlp_head_dim) cfg.SOLVER.MAX_ITER = args.max_iter print('MAX iter: ', args.max_iter) cfg.TRAIN.SNAPSHOT_ITERS = args.snapshot print('Snapshot Iters: ', args.snapshot) if args.solver_steps is not None: cfg.SOLVER.STEPS = args.solver_steps print('Solver_steps: ', cfg.SOLVER.STEPS) cfg.VCOCO.TRIPLETS_NUM_PER_IM = args.triplets_num_per_im print('triplets_num_per_im: ', cfg.VCOCO.TRIPLETS_NUM_PER_IM) cfg.VCOCO.HEATMAP_KERNEL_SIZE = args.heatmap_kernel_size print('heatmap_kernel_size: ', cfg.VCOCO.HEATMAP_KERNEL_SIZE) cfg.VCOCO.PART_CROP_SIZE = args.part_crop_size print('part_crop_size: ', cfg.VCOCO.PART_CROP_SIZE) print('use use_kps17 for part Align: ', args.use_kps17) if args.use_kps17: cfg.VCOCO.USE_KPS17 = True else: cfg.VCOCO.USE_KPS17 = False print('MULTILEVEL_ROIS: ', cfg.FPN.MULTILEVEL_ROIS) if args.vcoco_use_spatial: cfg.VCOCO.USE_SPATIAL = True if args.vcoco_use_union_feat: cfg.VCOCO.USE_UNION_FEAT = True if args.use_precomp_box: cfg.VCOCO.USE_PRECOMP_BOX = True cfg.DEBUG_TEST_WITH_GT = True if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH # 16 original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) print(' FG_THRESH: ', cfg.TRAIN.FG_THRESH) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list( map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) cfg.SOLVER.VAL_ITER = int(cfg.SOLVER.VAL_ITER * step_scale + 0.5) cfg.TRAIN.SNAPSHOT_ITERS = int(cfg.TRAIN.SNAPSHOT_ITERS * step_scale + 0.5) print( 'Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print( 'Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format( cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) # ipdb.set_trace() ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() roidb, ratio_list, ratio_index = combined_roidb_for_training( cfg.TRAIN.DATASETS, cfg.TRAIN.PROPOSAL_FILES) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size # ToDo: shuffle? batchSampler = BatchSampler(sampler=MinibatchSampler( ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True) dataset = RoiDataLoader(roidb, cfg.MODEL.NUM_CLASSES, training=True) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) # dataiterator = iter(dataloader) ### Model ### from modeling.model_builder import Generalized_RCNN maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_hoi_params = [] bias_hoi_param_names = [] bias_faster_params = [] bias_faster_param_names = [] nobias_hoi_params = [] nobias_hoi_param_names = [] nobias_faster_params = [] nobias_faster_param_names = [] # bias_params = [] # bias_param_names = [] # nonbias_params = [] # nonbias_param_names = [] #base_model = torch.load('Outputs/baseline/baseline_512_32_nogt_1o3/ckpt/model_step47999.pth') nograd_param_names = [] for key, value in maskRCNN.named_parameters(): #if key in base_model['model'].keys(): # value.requires_grad = False #print('the key xxx:', key) # Fix RPN module same as the paper # ToDo: or key.startswith('Box') # if 'affinity' not in key: # value.requires_grad = False print(key, value.size(), value.requires_grad) if value.requires_grad: if 'bias' in key: if 'HOI_Head' in key: bias_hoi_params.append(value) bias_hoi_param_names.append(key) else: bias_faster_params.append(value) bias_faster_param_names.append(key) else: if 'HOI_Head' in key: nobias_hoi_params.append(value) nobias_hoi_param_names.append(key) else: nobias_faster_params.append(value) nobias_faster_param_names.append(key) else: nograd_param_names.append(key) #del base_model #ipdb.set_trace() # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ { 'params': nobias_hoi_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': nobias_faster_params, 'lr': 0 * cfg.SOLVER.FASTER_RCNN_WEIGHT, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY }, { 'params': bias_hoi_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, { 'params': bias_faster_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1) * cfg.SOLVER.FASTER_RCNN_WEIGHT, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0 }, ] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) if args.krcnn_from_faster: net_utils.load_krcnn_from_faster(maskRCNN, checkpoint['model']) else: net_utils.load_ckpt(maskRCNN, checkpoint['model']) print('Original model loaded....') if args.resume: print('Resume, loaded step\n\n\n: ', checkpoint['step']) args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print( 'train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. optimizer.load_state_dict(checkpoint['optimizer']) # misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0][ 'lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' #output_dir = misc_utils.get_output_dir(args, args.run_name) output_dir = os.path.join('Outputs', args.expDir, args.expID) os.makedirs(output_dir, exist_ok=True) args.cfg_filename = os.path.basename(args.cfg_file) tblogger = None if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: from tensorboardX import SummaryWriter # Set the Tensorboard logger tblogger = SummaryWriter(output_dir) ### Training Loop ### train_val(maskRCNN, args, optimizer, lr, dataloader, train_size, output_dir, tblogger)