def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "fis_cell": cfg.TRAIN.DATASETS = ('fis_cell_train_val',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) if args.close_fpn: args.cfg_file = "configs/few_shot/e2e_mask_rcnn_R-50-C4_1x_{}.yaml".format(args.group) cfg.OUTPUT_DIR = 'Outputs_wo_fpn' else: args.cfg_file = "configs/few_shot/e2e_mask_rcnn_R-50-FPN_1x_{}.yaml".format(args.group) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) cfg.RNG_SEED = args.random_seed if cfg.RNG_SEED is None: torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = True else: print('Make the experiment results deterministic.') random.seed(cfg.RNG_SEED) np.random.seed(cfg.RNG_SEED) torch.manual_seed(cfg.RNG_SEED) torch.cuda.manual_seed_all(cfg.RNG_SEED) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False cfg.SEEN = args.seen if args.close_co_atten: cfg.CO_ATTEN = False if not args.close_fpn: cfg.OUTPUT_DIR = 'Outputs_wo_co_atten' if args.close_relation_rcnn: cfg.RELATION_RCNN = False if not args.close_fpn: cfg.FAST_RCNN.ROI_BOX_HEAD = 'fast_rcnn_heads.roi_2mlp_head' cfg.MRCNN.ROI_MASK_HEAD = 'mask_rcnn_heads.mask_rcnn_fcn_head_v1up4convs' else: cfg.FAST_RCNN.ROI_BOX_HEAD = 'torchResNet.ResNet_roi_conv5_head' cfg.MRCNN.ROI_MASK_HEAD = 'mask_rcnn_heads.mask_rcnn_fcn_head_v0upshare' if not args.close_co_atten and not args.close_fpn: cfg.OUTPUT_DIR = 'Outputs_wo_relation_rcnn' if args.output_dir is not None: cfg.OUTPUT_DIR = args.output_dir if args.deform_conv: cfg.MODEL.USE_DEFORM = True ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS effective_batch_size = args.iter_size * args.batch_size print('effective_batch_size = batch_size * iter_size = %d * %d' % (args.batch_size, args.iter_size)) print('Adaptive config changes:') print(' effective_batch_size: %d --> %d' % (original_batch_size, effective_batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) ### Adjust learning based on batch size change linearly # For iter_size > 1, gradients are `accumulated`, so lr is scaled based # on batch_size instead of effective_batch_size old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch_size change:\n' ' BASE_LR: {} --> {}'.format(old_base_lr, cfg.SOLVER.BASE_LR)) ### Adjust solver steps step_scale = original_batch_size / effective_batch_size old_solver_steps = cfg.SOLVER.STEPS old_max_iter = cfg.SOLVER.MAX_ITER cfg.SOLVER.STEPS = list(map(lambda x: int(x * step_scale + 0.5), cfg.SOLVER.STEPS)) cfg.SOLVER.MAX_ITER = int(cfg.SOLVER.MAX_ITER * step_scale + 0.5) print('Adjust SOLVER.STEPS and SOLVER.MAX_ITER linearly based on effective_batch_size change:\n' ' SOLVER.STEPS: {} --> {}\n' ' SOLVER.MAX_ITER: {} --> {}'.format(old_solver_steps, cfg.SOLVER.STEPS, old_max_iter, cfg.SOLVER.MAX_ITER)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma assert_and_infer_cfg() timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() imdb, roidb, ratio_list, ratio_index, query, cat_list = combined_roidb( cfg.TRAIN.DATASETS, True) timers['roidb'].toc() roidb_size = len(roidb) logger.info('{:d} roidb entries'.format(roidb_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) # Effective training sample size for one epoch train_size = roidb_size // args.batch_size * args.batch_size batchSampler = BatchSampler( sampler=MinibatchSampler(ratio_list, ratio_index), batch_size=args.batch_size, drop_last=True ) dataset = RoiDataLoader( roidb, ratio_list, ratio_index, query, cfg.MODEL.NUM_CLASSES, training=True, cat_list=cat_list, shot=args.shot) dataloader = torch.utils.data.DataLoader( dataset, batch_sampler=batchSampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) dataiterator = iter(dataloader) ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### gn_param_nameset = set() for name, module in maskRCNN.named_modules(): if isinstance(module, nn.GroupNorm): gn_param_nameset.add(name+'.weight') gn_param_nameset.add(name+'.bias') gn_params = [] gn_param_names = [] bias_params = [] bias_param_names = [] nonbias_params = [] nonbias_param_names = [] nograd_param_names = [] for key, value in maskRCNN.named_parameters(): if value.requires_grad: if 'bias' in key: bias_params.append(value) bias_param_names.append(key) elif key in gn_param_nameset: gn_params.append(value) gn_param_names.append(key) else: nonbias_params.append(value) nonbias_param_names.append(key) else: nograd_param_names.append(key) assert (gn_param_nameset - set(nograd_param_names) - set(bias_param_names)) == set(gn_param_names) # Learning rate of 0 is a dummy value to be set properly at the start of training params = [ {'params': nonbias_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': 0 * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0}, {'params': gn_params, 'lr': 0, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY_GN} ] # names of paramerters for each paramter param_names = [nonbias_param_names, bias_param_names, gn_param_names] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: args.shot = checkpoint['shot'] args.start_step = checkpoint['step'] + 1 if 'train_size' in checkpoint: # For backward compatibility if checkpoint['train_size'] != train_size: print('train_size value: %d different from the one in checkpoint: %d' % (train_size, checkpoint['train_size'])) # reorder the params in optimizer checkpoint's params_groups if needed # misc_utils.ensure_optimizer_ckpt_params_order(param_names, checkpoint) # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. optimizer.load_state_dict(checkpoint['optimizer']) # misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() + '_step' output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: #from tensorboardX import SummaryWriter # Set the Tensorboard logger #tblogger = SummaryWriter(output_dir) from loggers.logger import Logger tblogger = Logger(output_dir) ### Training Loop ### maskRCNN.train() CHECKPOINT_PERIOD = int(cfg.TRAIN.SNAPSHOT_ITERS / cfg.NUM_GPUS) # Set index for decay steps decay_steps_ind = None for i in range(1, len(cfg.SOLVER.STEPS)): if cfg.SOLVER.STEPS[i] >= args.start_step: decay_steps_ind = i break if decay_steps_ind is None: decay_steps_ind = len(cfg.SOLVER.STEPS) training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) try: logger.info('Training starts !') step = args.start_step for step in range(args.start_step, cfg.SOLVER.MAX_ITER): # Warm up if step < cfg.SOLVER.WARM_UP_ITERS: #for p in maskRCNN.module.Conv_Body.parameters(): # p.requires_grad = False method = cfg.SOLVER.WARM_UP_METHOD if method == 'constant': warmup_factor = cfg.SOLVER.WARM_UP_FACTOR elif method == 'linear': alpha = step / cfg.SOLVER.WARM_UP_ITERS warmup_factor = cfg.SOLVER.WARM_UP_FACTOR * (1 - alpha) + alpha else: raise KeyError('Unknown SOLVER.WARM_UP_METHOD: {}'.format(method)) lr_new = cfg.SOLVER.BASE_LR * warmup_factor net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new elif step == cfg.SOLVER.WARM_UP_ITERS: #for p in maskRCNN.module.Conv_Body.parameters(): # p.requires_grad = True net_utils.update_learning_rate(optimizer, lr, cfg.SOLVER.BASE_LR) lr = optimizer.param_groups[0]['lr'] assert lr == cfg.SOLVER.BASE_LR # Learning rate decay if decay_steps_ind < len(cfg.SOLVER.STEPS) and \ step == cfg.SOLVER.STEPS[decay_steps_ind]: logger.info('Decay the learning on step %d', step) lr_new = lr * cfg.SOLVER.GAMMA net_utils.update_learning_rate(optimizer, lr, lr_new) lr = optimizer.param_groups[0]['lr'] assert lr == lr_new decay_steps_ind += 1 training_stats.IterTic() optimizer.zero_grad() for inner_iter in range(args.iter_size): try: input_data = next(dataiterator) except StopIteration: dataiterator = iter(dataloader) input_data = next(dataiterator) for key in input_data: if key != 'roidb' and key != 'query': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) if key == 'query': input_data[key] = [list(map(Variable, q)) for q in input_data[key]] with torch.autograd.detect_anomaly(): net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs, inner_iter) loss = net_outputs['total_loss'] loss.backward() torch.nn.utils.clip_grad_value_(maskRCNN.module.parameters(), clip_value=0.4) optimizer.step() training_stats.IterToc() training_stats.LogIterStats(step, lr, input_data, args.shot) if (step+1) % CHECKPOINT_PERIOD == 0: save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) if (step+1) % args.disp_interval == 0: log_training_stats(training_stats, step, lr) # ---- Training ends ---- # Save last checkpoint save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) except (RuntimeError, KeyboardInterrupt): del dataiterator logger.info('Save ckpt on exception ...') save_ckpt(output_dir, args, step, train_size, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()
def main(): """Main function""" args = parse_args() print('Called with args:') print(args) if not torch.cuda.is_available(): sys.exit("Need a CUDA device to run the code.") if args.cuda or cfg.NUM_GPUS > 0: cfg.CUDA = True else: raise ValueError("Need Cuda device to run !") if args.dataset == "fis_cell": cfg.TRAIN.DATASETS = ('fis_cell_train_val',) cfg.MODEL.NUM_CLASSES = 2 elif args.dataset == "coco2017": cfg.TRAIN.DATASETS = ('coco_2017_train',) cfg.MODEL.NUM_CLASSES = 81 elif args.dataset == "keypoints_coco2017": cfg.TRAIN.DATASETS = ('keypoints_coco_2017_train',) cfg.MODEL.NUM_CLASSES = 2 else: raise ValueError("Unexpected args.dataset: {}".format(args.dataset)) args.cfg_file = "configs/few_shot/e2e_mask_rcnn_R-50-FPN_1x_{}.yaml".format(args.group) cfg_from_file(args.cfg_file) if args.set_cfgs is not None: cfg_from_list(args.set_cfgs) cfg.SEEN = args.seen ### Adaptively adjust some configs ### original_batch_size = cfg.NUM_GPUS * cfg.TRAIN.IMS_PER_BATCH original_ims_per_batch = cfg.TRAIN.IMS_PER_BATCH original_num_gpus = cfg.NUM_GPUS if args.batch_size is None: args.batch_size = original_batch_size cfg.NUM_GPUS = torch.cuda.device_count() assert (args.batch_size % cfg.NUM_GPUS) == 0, \ 'batch_size: %d, NUM_GPUS: %d' % (args.batch_size, cfg.NUM_GPUS) cfg.TRAIN.IMS_PER_BATCH = args.batch_size // cfg.NUM_GPUS print('Adaptive config changes:') print(' batch_size: %d --> %d' % (original_batch_size, args.batch_size)) print(' NUM_GPUS: %d --> %d' % (original_num_gpus, cfg.NUM_GPUS)) print(' IMS_PER_BATCH: %d --> %d' % (original_ims_per_batch, cfg.TRAIN.IMS_PER_BATCH)) if args.num_workers is not None: cfg.DATA_LOADER.NUM_THREADS = args.num_workers print('Number of data loading threads: %d' % cfg.DATA_LOADER.NUM_THREADS) ### Adjust learning based on batch size change linearly old_base_lr = cfg.SOLVER.BASE_LR cfg.SOLVER.BASE_LR *= args.batch_size / original_batch_size print('Adjust BASE_LR linearly according to batch size change: {} --> {}'.format( old_base_lr, cfg.SOLVER.BASE_LR)) # Scale FPN rpn_proposals collect size (post_nms_topN) in `collect` function # of `collect_and_distribute_fpn_rpn_proposals.py` # # post_nms_topN = int(cfg[cfg_key].RPN_POST_NMS_TOP_N * cfg.FPN.RPN_COLLECT_SCALE + 0.5) if cfg.FPN.FPN_ON and cfg.MODEL.FASTER_RCNN: cfg.FPN.RPN_COLLECT_SCALE = cfg.TRAIN.IMS_PER_BATCH / original_ims_per_batch print('Scale FPN rpn_proposals collect size directly propotional to the change of IMS_PER_BATCH:\n' ' cfg.FPN.RPN_COLLECT_SCALE: {}'.format(cfg.FPN.RPN_COLLECT_SCALE)) ### Overwrite some solver settings from command line arguments if args.optimizer is not None: cfg.SOLVER.TYPE = args.optimizer if args.lr is not None: cfg.SOLVER.BASE_LR = args.lr if args.lr_decay_gamma is not None: cfg.SOLVER.GAMMA = args.lr_decay_gamma timers = defaultdict(Timer) ### Dataset ### timers['roidb'].tic() imdb, roidb, ratio_list, ratio_index, query, cat_list = combined_roidb( cfg.TRAIN.DATASETS, True) timers['roidb'].toc() train_size = len(roidb) logger.info('{:d} roidb entries'.format(train_size)) logger.info('Takes %.2f sec(s) to construct roidb', timers['roidb'].average_time) sampler = MinibatchSampler(ratio_list, ratio_index) dataset = RoiDataLoader( roidb, ratio_list, ratio_index, query, cfg.MODEL.NUM_CLASSES, training=True, cat_list=cat_list, shot=args.shot) dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.batch_size, sampler=sampler, num_workers=cfg.DATA_LOADER.NUM_THREADS, collate_fn=collate_minibatch) assert_and_infer_cfg() ### Model ### maskRCNN = Generalized_RCNN() if cfg.CUDA: maskRCNN.cuda() ### Optimizer ### bias_params = [] nonbias_params = [] for key, value in dict(maskRCNN.named_parameters()).items(): if value.requires_grad: if 'bias' in key: bias_params.append(value) else: nonbias_params.append(value) params = [ {'params': nonbias_params, 'lr': cfg.SOLVER.BASE_LR, 'weight_decay': cfg.SOLVER.WEIGHT_DECAY}, {'params': bias_params, 'lr': cfg.SOLVER.BASE_LR * (cfg.SOLVER.BIAS_DOUBLE_LR + 1), 'weight_decay': cfg.SOLVER.WEIGHT_DECAY if cfg.SOLVER.BIAS_WEIGHT_DECAY else 0} ] if cfg.SOLVER.TYPE == "SGD": optimizer = torch.optim.SGD(params, momentum=cfg.SOLVER.MOMENTUM) elif cfg.SOLVER.TYPE == "Adam": optimizer = torch.optim.Adam(params) ### Load checkpoint if args.load_ckpt: load_name = args.load_ckpt logging.info("loading checkpoint %s", load_name) checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) net_utils.load_ckpt(maskRCNN, checkpoint['model']) if args.resume: assert checkpoint['iters_per_epoch'] == train_size // args.batch_size, \ "iters_per_epoch should match for resume" # There is a bug in optimizer.load_state_dict on Pytorch 0.3.1. # However it's fixed on master. # optimizer.load_state_dict(checkpoint['optimizer']) misc_utils.load_optimizer_state_dict(optimizer, checkpoint['optimizer']) args.shot = checkpoint['shot'] if checkpoint['step'] == (checkpoint['iters_per_epoch'] - 1): # Resume from end of an epoch args.start_epoch = checkpoint['epoch'] + 1 args.start_iter = 0 else: # Resume from the middle of an epoch. # NOTE: dataloader is not synced with previous state args.start_epoch = checkpoint['epoch'] args.start_iter = checkpoint['step'] + 1 del checkpoint torch.cuda.empty_cache() if args.load_detectron: #TODO resume for detectron weights (load sgd momentum values) logging.info("loading Detectron weights %s", args.load_detectron) load_detectron_weight(maskRCNN, args.load_detectron) lr = optimizer.param_groups[0]['lr'] # lr of non-bias parameters, for commmand line outputs. maskRCNN = mynn.DataParallel(maskRCNN, cpu_keywords=['im_info', 'roidb'], minibatch=True) ### Training Setups ### args.run_name = misc_utils.get_run_name() output_dir = misc_utils.get_output_dir(args, args.run_name) args.cfg_filename = os.path.basename(args.cfg_file) if not args.no_save: if not os.path.exists(output_dir): os.makedirs(output_dir) blob = {'cfg': yaml.dump(cfg), 'args': args} with open(os.path.join(output_dir, 'config_and_args.pkl'), 'wb') as f: pickle.dump(blob, f, pickle.HIGHEST_PROTOCOL) if args.use_tfboard: #from tensorboardX import SummaryWriter # Set the Tensorboard logger #tblogger = SummaryWriter(output_dir) from loggers.logger import Logger tblogger = Logger(output_dir) ### Training Loop ### maskRCNN.train() training_stats = TrainingStats( args, args.disp_interval, tblogger if args.use_tfboard and not args.no_save else None) iters_per_epoch = int(train_size / args.batch_size) # drop last args.iters_per_epoch = iters_per_epoch ckpt_interval_per_epoch = iters_per_epoch // args.ckpt_num_per_epoch try: logger.info('Training starts !') args.step = args.start_iter global_step = iters_per_epoch * args.start_epoch + args.step for args.epoch in range(args.start_epoch, args.start_epoch + args.num_epochs): # ---- Start of epoch ---- # adjust learning rate if args.lr_decay_epochs and args.epoch == args.lr_decay_epochs[0] and args.start_iter == 0: args.lr_decay_epochs.pop(0) net_utils.decay_learning_rate(optimizer, lr, cfg.SOLVER.GAMMA) lr *= cfg.SOLVER.GAMMA for args.step, input_data in zip(range(args.start_iter, iters_per_epoch), dataloader): for key in input_data: if key != 'roidb' and key != 'query': # roidb is a list of ndarrays with inconsistent length input_data[key] = list(map(Variable, input_data[key])) if key == 'query': input_data[key] = [list(map(Variable, q)) for q in input_data[key]] training_stats.IterTic() net_outputs = maskRCNN(**input_data) training_stats.UpdateIterStats(net_outputs) loss = net_outputs['total_loss'] optimizer.zero_grad() loss.backward() optimizer.step() training_stats.IterToc() #if (args.step+1) % ckpt_interval_per_epoch == 0: if global_step % 500 == 0: net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) if args.step % args.disp_interval == 0: log_training_stats(training_stats, global_step, lr, input_data, args.shot) global_step += 1 # ---- End of epoch ---- # save checkpoint #net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) # reset starting iter number after first epoch args.start_iter = 0 # ---- Training ends ---- net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) if iters_per_epoch % args.disp_interval != 0: # log last stats at the end log_training_stats(training_stats, global_step, lr, input_data, args.shot) except (RuntimeError, KeyboardInterrupt): logger.info('Save ckpt on exception ...') net_utils.save_ckpt(output_dir, args, maskRCNN, optimizer) logger.info('Save ckpt done.') stack_trace = traceback.format_exc() print(stack_trace) finally: if args.use_tfboard and not args.no_save: tblogger.close()