def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) prefix = os.path.join(final_output_path, prefix) # load symbol shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) sym_instance = eval(config.symbol + '.' + config.symbol)() sym = sym_instance.get_symbol(config, is_train=True) feat_sym = sym.get_internals()['rpn_cls_score_output'] # setup multi-gpu batch_size = len(ctx) input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) # load dataset and prepare imdb for training image_sets = [iset for iset in config.dataset.image_set.split('+')] roidbs = [load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, flip=config.TRAIN.FLIP) for image_set in image_sets] roidb = merge_roidb(roidbs) roidb = filter_roidb(roidb, config) # load training data train_data = AnchorLoader(feat_sym, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING) # infer max shape max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) print 'providing maximum shape', max_data_shape, max_label_shape data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) pprint.pprint(data_shape_dict) sym_instance.infer_shape(data_shape_dict) # load and initialize params if config.TRAIN.RESUME: print('continue training from ', begin_epoch) arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) else: arg_params, aux_params = load_param(pretrained, epoch, convert=True) sym_instance.init_weight(config, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # create solver fixed_param_prefix = config.network.FIXED_PARAMS data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] mod = MutableModule(sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) if config.TRAIN.RESUME: mod._preload_opt_states = '%s-%04d.states'%(prefix, begin_epoch) # decide training params # metric rpn_eval_metric = metric.RPNAccMetric() rpn_cls_metric = metric.RPNLogLossMetric() rpn_bbox_metric = metric.RPNL1LossMetric() eval_metric = metric.RCNNAccMetric(config) cls_metric = metric.RCNNLogLossMetric(config) bbox_metric = metric.RCNNL1LossMetric(config) eval_metrics = mx.metric.CompositeEvalMetric() # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]: eval_metrics.add(child_metric) # callback batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent) means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) epoch_end_callback = [mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), callback.do_checkpoint(prefix, means, stds)] # decide learning rate base_lr = lr lr_factor = config.TRAIN.lr_factor lr_epoch = [float(epoch) for epoch in lr_step.split(',')] lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) # optimizer optimizer_params = {'momentum': config.TRAIN.momentum, 'wd': config.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None} if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) # train mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, optimizer='sgd', optimizer_params=optimizer_params, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_rpn(cfg, dataset, image_set, root_path, dataset_path, frequent, kvstore, flip, shuffle, resume, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, train_shared, lr, lr_step, logger=None, output_path=None): # set up logger if not logger: logging.basicConfig() logger = logging.getLogger() logger.setLevel(logging.INFO) # set up config cfg.TRAIN.BATCH_IMAGES = cfg.TRAIN.ALTERNATE.RPN_BATCH_IMAGES # load symbol sym_instance = eval(cfg.symbol + '.' + cfg.symbol)() sym = sym_instance.get_symbol_rpn(cfg, is_train=True) feat_sym = sym.get_internals()['rpn_cls_score_output'] # setup multi-gpu batch_size = len(ctx) input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size # print cfg pprint.pprint(cfg) logger.info('training rpn cfg:{}\n'.format(pprint.pformat(cfg))) # load dataset and prepare imdb for training image_sets = [iset for iset in image_set.split('+')] roidbs = [load_gt_roidb(dataset, image_set, root_path, dataset_path, result_path=output_path, flip=flip) for image_set in image_sets] roidb = merge_roidb(roidbs) roidb = filter_roidb(roidb, cfg) # load training data train_data = AnchorLoader(feat_sym, roidb, cfg, batch_size=input_batch_size, shuffle=shuffle, ctx=ctx, feat_stride=cfg.network.RPN_FEAT_STRIDE, anchor_scales=cfg.network.ANCHOR_SCALES, anchor_ratios=cfg.network.ANCHOR_RATIOS, aspect_grouping=cfg.TRAIN.ASPECT_GROUPING) # infer max shape max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in cfg.SCALES]), max([v[1] for v in cfg.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) print('providing maximum shape', max_data_shape, max_label_shape) # infer shape data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) sym_instance.infer_shape(data_shape_dict) # load and initialize params if resume: print('continue training from ', begin_epoch) arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) else: arg_params, aux_params = load_param(pretrained, epoch, convert=True) sym_instance.init_weight_rpn(cfg, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # create solver data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] if train_shared: fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED else: fixed_param_prefix = cfg.network.FIXED_PARAMS mod = MutableModule(sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in xrange(batch_size)], max_label_shapes=[max_label_shape for _ in xrange(batch_size)], fixed_param_prefix=fixed_param_prefix) # decide training params # metric eval_metric = metric.RPNAccMetric() cls_metric = metric.RPNLogLossMetric() bbox_metric = metric.RPNL1LossMetric() eval_metrics = mx.metric.CompositeEvalMetric() for child_metric in [eval_metric, cls_metric, bbox_metric]: eval_metrics.add(child_metric) # callback batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=frequent) # epoch_end_callback = mx.callback.do_checkpoint(prefix) epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True) # decide learning rate base_lr = lr lr_factor = cfg.TRAIN.lr_factor lr_epoch = [int(epoch) for epoch in lr_step.split(',')] lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, cfg.TRAIN.warmup, cfg.TRAIN.warmup_lr, cfg.TRAIN.warmup_step) # optimizer optimizer_params = {'momentum': cfg.TRAIN.momentum, 'wd': cfg.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None} if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) # train mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, kvstore=kvstore, optimizer='sgd', optimizer_params=optimizer_params, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) prefix = os.path.join(final_output_path, prefix) # load symbol shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) sym_instance = eval(config.symbol + '.' + config.symbol)() sym = sym_instance.get_retina_symbol(config, is_train=True) feat_sym = [] feat_sym_p3 = sym.get_internals()['cls_score/p3_output'] feat_sym_p4 = sym.get_internals()['cls_score/p4_output'] feat_sym_p5 = sym.get_internals()['cls_score/p5_output'] feat_sym_p6 = sym.get_internals()['cls_score/p6_output'] feat_sym.append(feat_sym_p3) feat_sym.append(feat_sym_p4) feat_sym.append(feat_sym_p5) feat_sym.append(feat_sym_p6) ####### feat_stride = [] feat_stride.append(config.network.p3_RPN_FEAT_STRIDE) feat_stride.append(config.network.p4_RPN_FEAT_STRIDE) feat_stride.append(config.network.p5_RPN_FEAT_STRIDE) feat_stride.append(config.network.p6_RPN_FEAT_STRIDE) anchor_scales = [] anchor_scales.append(config.network.p3_ANCHOR_SCALES) anchor_scales.append(config.network.p4_ANCHOR_SCALES) anchor_scales.append(config.network.p5_ANCHOR_SCALES) anchor_scales.append(config.network.p6_ANCHOR_SCALES) anchor_ratios = [] anchor_ratios.append(config.network.p3_ANCHOR_RATIOS) anchor_ratios.append(config.network.p4_ANCHOR_RATIOS) anchor_ratios.append(config.network.p5_ANCHOR_RATIOS) anchor_ratios.append(config.network.p6_ANCHOR_RATIOS) ############# # setup multi-gpu batch_size = len(ctx) input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) # load dataset and prepare imdb for training image_sets = [iset for iset in config.dataset.image_set.split('+')] roidbs = [ load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, flip=config.TRAIN.FLIP) for image_set in image_sets ] roidb = merge_roidb(roidbs) roidb = filter_roidb(roidb, config) # load training data train_data = AnchorLoader(feat_sym, feat_stride, anchor_scales, anchor_ratios, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, aspect_grouping=config.TRAIN.ASPECT_GROUPING) # infer max shape max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) print 'providing maximum shape', max_data_shape, max_label_shape # infer max shape data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) pprint.pprint(data_shape_dict) sym_instance.infer_shape(data_shape_dict) # load and initialize params if config.TRAIN.RESUME: print('continue training from ', begin_epoch) arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) else: arg_params, aux_params = load_param(pretrained, epoch, convert=True) sym_instance.init_weight(config, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # create solver fixed_param_prefix = config.network.FIXED_PARAMS data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] mod = MutableModule( sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) if config.TRAIN.RESUME: mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch) # decide training params # metric Retina_eval_metric = metric.RetinaAccMetric() Retina_toal_eval_metric = metric.RetinaToalAccMetric() Retina_cls_metric = metric.RetinaFocalLossMetric() Retina_bbox_metric = metric.RetinaL1LossMetric() eval_metrics = mx.metric.CompositeEvalMetric() # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric for child_metric in [ Retina_eval_metric, Retina_toal_eval_metric, Retina_cls_metric, Retina_bbox_metric ]: eval_metrics.add(child_metric) # callback batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent) means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) epoch_end_callback = [ mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), callback.do_checkpoint(prefix, means, stds) ] # decide learning rate base_lr = lr lr_factor = config.TRAIN.lr_factor lr_epoch = [float(epoch) for epoch in lr_step.split(',')] lr_epoch_diff = [ epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch ] lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [ int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff ] print lr_step.split(',') print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) # optimizer optimizer_params = { 'momentum': config.TRAIN.momentum, 'wd': config.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None } if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) # train print "#########################################train#######################################" mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, optimizer='sgd', optimizer_params=optimizer_params, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_net(args, ctx, pretrained_dir, pretrained_resnet, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) prefix = os.path.join(final_output_path, prefix) # load symbol shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) sym_instance = eval(config.symbol + '.' + config.symbol)() sym = sym_instance.get_symbol(config, is_train=True) feat_sym = sym.get_internals()['rpn_cls_score_output'] # setup multi-gpu batch_size = len(ctx) input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) git_commit_id = commands.getoutput('git rev-parse HEAD') print("Git commit id:", git_commit_id) logger.info('Git commit id: {}'.format(git_commit_id)) # load dataset and prepare imdb for training image_sets = [iset for iset in config.dataset.image_set.split('+')] roidbs = [ load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, flip=config.TRAIN.FLIP) for image_set in image_sets ] roidb = merge_roidb(roidbs) roidb = filter_roidb(roidb, config) # load training data train_data = AnchorLoader(feat_sym, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, normalize_target=config.network.NORMALIZE_RPN, bbox_mean=config.network.ANCHOR_MEANS, bbox_std=config.network.ANCHOR_STDS) # infer max shape max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) print('providing maximum shape', max_data_shape, max_label_shape) data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) pprint.pprint(data_shape_dict) sym_instance.infer_shape(data_shape_dict) # create solver fixed_param_prefix = config.network.FIXED_PARAMS data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] mod = MutableModule( sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) # load and initialize params params_loaded = False if config.TRAIN.RESUME: arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch) print('continue training from ', begin_epoch) logger.info('continue training from ', begin_epoch) params_loaded = True elif config.TRAIN.AUTO_RESUME: for cur_epoch in range(end_epoch - 1, begin_epoch, -1): params_filename = '{}-{:04d}.params'.format(prefix, cur_epoch) states_filename = '{}-{:04d}.states'.format(prefix, cur_epoch) if os.path.exists(params_filename) and os.path.exists( states_filename): begin_epoch = cur_epoch arg_params, aux_params = load_param(prefix, cur_epoch, convert=True) mod._preload_opt_states = states_filename print('auto continue training from {}, {}'.format( params_filename, states_filename)) logger.info('auto continue training from {}, {}'.format( params_filename, states_filename)) params_loaded = True break if not params_loaded: arg_params, aux_params = load_param(os.path.join( pretrained_dir, pretrained_resnet), epoch, convert=True) sym_instance.init_weight(config, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # decide training params # metric eval_metric = metric.RCNNAccMetric(config) cls_metric = metric.RCNNLogLossMetric(config) bbox_metric = metric.RCNNL1LossMetric(config) eval_metrics = mx.metric.CompositeEvalMetric() for child_metric in [eval_metric, cls_metric, bbox_metric]: eval_metrics.add(child_metric) if config.TRAIN.JOINT_TRAINING or (not config.TRAIN.LEARN_NMS): rpn_eval_metric = metric.RPNAccMetric() rpn_cls_metric = metric.RPNLogLossMetric() rpn_bbox_metric = metric.RPNL1LossMetric() for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric]: eval_metrics.add(child_metric) if config.TRAIN.LEARN_NMS: eval_metrics.add(metric.NMSLossMetric(config, 'pos')) eval_metrics.add(metric.NMSLossMetric(config, 'neg')) eval_metrics.add(metric.NMSAccMetric(config)) # callback batch_end_callback = [ callback.Speedometer(train_data.batch_size, frequent=args.frequent) ] if config.USE_PHILLY: total_iter = (end_epoch - begin_epoch) * len(roidb) / input_batch_size progress_frequent = min(args.frequent * 10, 100) batch_end_callback.append( callback.PhillyProgressCallback(total_iter, progress_frequent)) means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) epoch_end_callback = [ mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), callback.do_checkpoint(prefix, means, stds) ] # decide learning rate base_lr = lr lr_factor = config.TRAIN.lr_factor lr_epoch = [float(epoch) for epoch in lr_step.split(',')] lr_epoch_diff = [ epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch ] lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [ int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff ] print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) # optimizer optimizer_params = { 'momentum': config.TRAIN.momentum, 'wd': config.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None } if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) # train mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, optimizer='sgd', optimizer_params=optimizer_params, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) prefix = os.path.join(final_output_path, prefix) # load symbol shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) sym_instance = eval(config.symbol + '.' + config.symbol)() sym = sym_instance.get_symbol(config, is_train=True) feat_sym = sym.get_internals()['rpn_cls_score_output'] # setup multi-gpu batch_size = len(ctx) input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) # load dataset and prepare imdb for training image_sets = [iset for iset in config.dataset.image_set.split('+')] roidbs = [ load_gt_roidb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, flip=config.TRAIN.FLIP) for image_set in image_sets ] roidb = merge_roidb(roidbs) roidb = filter_roidb(roidb, config) # load training data train_data = AnchorLoader(feat_sym, roidb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING) # infer max shape max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) print('providing maximum shape', max_data_shape, max_label_shape) data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) pprint.pprint(data_shape_dict) sym_instance.infer_shape(data_shape_dict) # load and initialize params #if config.TRAIN.RESUME: # print('continue training from ', begin_epoch) # arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) #else: # arg_params, aux_params = load_param(pretrained, epoch, convert=True) # sym_instance.init_weight(config, arg_params, aux_params) print('transfer learning...') # Choose the initialization weights (COCO or UADETRAC or pretrained) #arg_params, aux_params = load_param('/raid10/home_ext/Deformable-ConvNets/output/rfcn_dcn_Shuo_UADTRAC/resnet_v1_101_voc0712_rfcn_dcn_Shuo_UADETRAC/trainlist_full/rfcn_UADTRAC', 5, convert=True) #arg_params, aux_params = load_param('/raid10/home_ext/Deformable-ConvNets/model/rfcn_dcn_coco', 0, convert=True) arg_params, aux_params = load_param( '/raid10/home_ext/Deformable-ConvNets/output/rfcn_dcn_Shuo_AICity/resnet_v1_101_voc0712_rfcn_dcn_Shuo_AICityVOC1080_FreezeCOCO_rpnOnly_all/1080_all/rfcn_AICityVOC1080_FreezeCOCO_rpnOnly_all', 4, convert=True) sym_instance.init_weight_Shuo(config, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # create solver fixed_param_prefix = config.network.FIXED_PARAMS data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] # mod = MutableModule(sym, data_names=data_names, label_names=label_names, # logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], # max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix) #freeze parameters using fixed_param_names:list of str para_file = open( '/raid10/home_ext/Deformable-ConvNets/rfcn/symbols/arg_params.txt') para_list = [line.split('<')[0] for line in para_file.readlines()] para_list.remove('rfcn_cls_weight') para_list.remove('rfcn_cls_bias') para_list.remove('rfcn_cls_offset_t_weight') para_list.remove('rfcn_cls_offset_t_bias') para_list.remove('res5a_branch2b_offset_weight') para_list.remove('res5a_branch2b_offset_bias') para_list.remove('res5b_branch2b_offset_weight') para_list.remove('res5b_branch2b_offset_bias') para_list.remove('res5c_branch2b_offset_weight') para_list.remove('res5c_branch2b_offset_bias') para_list.remove('conv_new_1_weight') para_list.remove('conv_new_1_bias') para_list.remove('rfcn_bbox_weight') para_list.remove('rfcn_bbox_bias') para_list.remove('rfcn_bbox_offset_t_weight') para_list.remove('rfcn_bbox_offset_t_bias') mod = MutableModule_Shuo( sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)], max_label_shapes=[max_label_shape for _ in range(batch_size)], fixed_param_prefix=fixed_param_prefix, fixed_param_names=para_list) if config.TRAIN.RESUME: mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch) # decide training params # metric rpn_eval_metric = metric.RPNAccMetric() rpn_cls_metric = metric.RPNLogLossMetric() rpn_bbox_metric = metric.RPNL1LossMetric() eval_metric = metric.RCNNAccMetric(config) cls_metric = metric.RCNNLogLossMetric(config) bbox_metric = metric.RCNNL1LossMetric(config) eval_metrics = mx.metric.CompositeEvalMetric() # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric for child_metric in [ rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric ]: eval_metrics.add(child_metric) # callback batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent) means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) epoch_end_callback = [ mx.callback.module_checkpoint(mod, prefix, period=1, save_optimizer_states=True), callback.do_checkpoint(prefix, means, stds) ] # decide learning rate base_lr = lr lr_factor = config.TRAIN.lr_factor lr_epoch = [float(epoch) for epoch in lr_step.split(',')] lr_epoch_diff = [ epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch ] lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [ int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff ] print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters) lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) # optimizer optimizer_params = { 'momentum': config.TRAIN.momentum, 'wd': config.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None } if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) # train mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, optimizer='sgd', optimizer_params=optimizer_params, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step): mx.random.seed(3) np.random.seed(3) logger, final_output_path = create_logger(config.output_path, args.cfg, config.dataset.image_set) prefix = os.path.join(final_output_path, prefix) # load symbol shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), final_output_path) sym_instance = eval(config.symbol)() sym = sym_instance.get_symbol(config, is_train=True) feat_sym = sym.get_internals()['rpn_cls_score_output'] # setup multi-gpu batch_size = len(ctx) input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size # print config pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) # load dataset and prepare imdb for training image_sets = [iset for iset in config.dataset.image_set.split('+')] sdsdbs = [load_gt_sdsdb(config.dataset.dataset, image_set, config.dataset.root_path, config.dataset.dataset_path, mask_size=config.MASK_SIZE, binary_thresh=config.BINARY_THRESH, result_path=final_output_path, flip=config.TRAIN.FLIP) for image_set in image_sets] sdsdb = merge_roidb(sdsdbs) sdsdb = filter_roidb(sdsdb, config) # load training data train_data = AnchorLoader(feat_sym, sdsdb, config, batch_size=input_batch_size, shuffle=config.TRAIN.SHUFFLE, ctx=ctx, feat_stride=config.network.RPN_FEAT_STRIDE, anchor_scales=config.network.ANCHOR_SCALES, anchor_ratios=config.network.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, allowed_border=config.TRAIN.RPN_ALLOWED_BORDER) # infer max shape max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3, max([v[0] for v in config.SCALES]), max(v[1] for v in config.SCALES)))] max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5))) max_data_shape.append(('gt_masks', (config.TRAIN.BATCH_IMAGES, 100, max([v[0] for v in config.SCALES]), max(v[1] for v in config.SCALES)))) print 'providing maximum shape', max_data_shape, max_label_shape # infer shape print train_data.provide_data_single print train_data.provide_label_single data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single) print 'data shape:' pprint.pprint(data_shape_dict) sym_instance.infer_shape(data_shape_dict) # load and initialize params if config.TRAIN.RESUME: print 'continue training from ', begin_epoch arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) else: arg_params, aux_params = load_param(pretrained, epoch, convert=True) sym_instance.init_weight(config, arg_params, aux_params) # check parameter shapes sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict) # create solver fixed_param_prefix = config.network.FIXED_PARAMS data_names = [k[0] for k in train_data.provide_data_single] label_names = [k[0] for k in train_data.provide_label_single] mod = MutableModule(sym, data_names=data_names, label_names=label_names, logger=logger, context=ctx, max_data_shapes=[max_data_shape for _ in xrange(batch_size)], max_label_shapes=[max_label_shape for _ in xrange(batch_size)], fixed_param_prefix=fixed_param_prefix) # decide training metric # RPN, classification accuracy/loss, regression loss rpn_acc = metric.RPNAccMetric() rpn_cls_loss = metric.RPNLogLossMetric() rpn_bbox_loss = metric.RPNL1LossMetric() fcis_acc = metric.FCISAccMetric(config) fcis_acc_fg = metric.FCISAccFGMetric(config) fcis_cls_loss = metric.FCISLogLossMetric(config) fcis_bbox_loss = metric.FCISL1LossMetric(config) fcis_mask_loss = metric.FCISMaskLossMetric(config) eval_metrics = mx.metric.CompositeEvalMetric() for child_metric in [rpn_acc, rpn_cls_loss, rpn_bbox_loss, fcis_acc, fcis_acc_fg, fcis_cls_loss, fcis_bbox_loss, fcis_mask_loss]: eval_metrics.add(child_metric) # print eval_metrics batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent) means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) stds = np.tile(np.array(config.TRAIN.BBOX_STDS), 2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES) epoch_end_callback = callback.do_checkpoint(prefix, means, stds) #print epoch, begin_epoch, end_epoch, lr_step base_lr = lr lr_factor = 0.1 lr_epoch = [float(epoch) for epoch in lr_step.split(',')] lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) lr_iters = [int(epoch * len(sdsdb) / batch_size) for epoch in lr_epoch_diff] print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr, config.TRAIN.warmup_step) # optimizer optimizer_params = {'momentum': config.TRAIN.momentum, 'wd': config.TRAIN.wd, 'learning_rate': lr, 'lr_scheduler': lr_scheduler, 'rescale_grad': 1.0, 'clip_gradient': None} if not isinstance(train_data, PrefetchingIter): train_data = PrefetchingIter(train_data) # del sdsdb # a = mx.viz.plot_network(sym) # a.render('../example', view=True) # print 'prepare sds finished' mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, batch_end_callback=batch_end_callback, kvstore=config.default.kvstore, optimizer='sgd', optimizer_params=optimizer_params, arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)