def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train a segmentation network end-to-end.

    Args:
        args: parsed command-line arguments (needs .cfg and .frequent).
        ctx: list of MXNet device contexts; one training batch per device.
        pretrained: prefix of the pretrained model checkpoint.
        epoch: epoch number of the pretrained checkpoint to load.
        prefix: checkpoint-name prefix (joined onto the output path).
        begin_epoch, end_epoch: training epoch range.
        lr: base learning rate.
        lr_step: comma-separated epochs at which the lr is decayed.
    """
    # Create logger / output dirs; tensorboard path is used only by the
    # (currently disabled) tensorboard callbacks below.
    logger, final_output_path, _, tensorboard_path = create_env(
        config.output_path, args.cfg, config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # Instantiate the symbol class named in the config (e.g. "resnet_v1_101_...").
    print "config.symbol", config.symbol
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    if config.TRAIN.use_dynamic:
        # Dynamic mode: symbol is (re)generated later from concrete data shapes.
        sym_gen = sym_instance.sym_gen(config, is_train=True)
    else:
        sym = sym_instance.get_symbol(config, is_train=True)

    # infer max shape
    # When cropping is enabled the only scale is the crop size itself.
    scales = [(config.TRAIN.crop_size[0], config.TRAIN.crop_size[1])
              ] if config.TRAIN.enable_crop else config.SCALES
    label_stride = config.network.LABEL_STRIDE
    network_ratio = config.network.ratio
    if config.network.use_context:
        if config.network.use_crop_context:
            # NOTE(review): 736x736 appears to be a fixed context-window size
            # for the crop-context path — confirm against the data loader.
            max_data_shape = [
                ('data', (config.TRAIN.BATCH_IMAGES, 3,
                          config.TRAIN.crop_size[0], config.TRAIN.crop_size[1])),
                ('origin_data', (config.TRAIN.BATCH_IMAGES, 3, 736, 736)),
                ('rois', (config.TRAIN.BATCH_IMAGES, 5))
            ]
        else:
            max_data_shape = [
                ('data', (config.TRAIN.BATCH_IMAGES, 3,
                          config.TRAIN.crop_size[0], config.TRAIN.crop_size[1])),
                ('origin_data', (config.TRAIN.BATCH_IMAGES, 3,
                                 int(config.SCALES[0][0] * network_ratio),
                                 int(config.SCALES[0][1] * network_ratio))),
                ('rois', (config.TRAIN.BATCH_IMAGES, 5))
            ]
    else:
        if config.TRAIN.enable_crop:
            max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                        config.TRAIN.crop_size[0],
                                        config.TRAIN.crop_size[1]))]
        else:
            # Pad each scale up to a multiple of the label stride.
            max_data_shape = [
                ('data', (config.TRAIN.BATCH_IMAGES, 3,
                          max([make_divisible(v[0], label_stride) for v in scales]),
                          max([make_divisible(v[1], label_stride) for v in scales])))
            ]

    if config.network.use_mult_label:
        # Label maps are downsampled by LABEL_STRIDE relative to the input.
        if config.network.use_crop_context:
            max_label_shape = [
                ('label', (config.TRAIN.BATCH_IMAGES, 1,
                           make_divisible(config.TRAIN.crop_size[0], label_stride) // config.network.LABEL_STRIDE,
                           make_divisible(config.TRAIN.crop_size[1], label_stride) // config.network.LABEL_STRIDE)),
                ('origin_label', (config.TRAIN.BATCH_IMAGES, 1, 736, 736))
            ]
        else:
            max_label_shape = [
                ('label', (config.TRAIN.BATCH_IMAGES, 1,
                           make_divisible(config.TRAIN.crop_size[0], label_stride) // config.network.LABEL_STRIDE,
                           make_divisible(config.TRAIN.crop_size[1], label_stride) // config.network.LABEL_STRIDE)),
                ('origin_label', (config.TRAIN.BATCH_IMAGES, 1,
                                  int(config.SCALES[0][0] * network_ratio),
                                  int(config.SCALES[0][1] * network_ratio)))
            ]
    elif config.network.use_metric:
        scale_list = config.network.scale_list
        # Suffix letters disambiguate metric-label names unless the scale list
        # is the default [1, 2, 4], in which case no suffix is used.
        scale_name = ['a', 'b', 'c']
        if config.network.scale_list == [1, 2, 4]:
            scale_name = ['', '', '']
        if config.TRAIN.enable_crop:
            if config.TRAIN.use_mult_metric:
                # One metric label per scale; 9 channels each (presumably the
                # 8-neighbourhood + centre — TODO confirm against the loader).
                max_label_shape = [
                    ('label', (config.TRAIN.BATCH_IMAGES, 1,
                               config.TRAIN.crop_size[0] // label_stride,
                               config.TRAIN.crop_size[1] // label_stride)),
                    ('metric_label_' + str(scale_list[0]) + scale_name[0],
                     (config.TRAIN.BATCH_IMAGES, 9, 1,
                      config.TRAIN.crop_size[0] // label_stride,
                      config.TRAIN.crop_size[1] // label_stride)),
                    ('metric_label_' + str(scale_list[1]) + scale_name[1],
                     (config.TRAIN.BATCH_IMAGES, 9, 1,
                      config.TRAIN.crop_size[0] // label_stride,
                      config.TRAIN.crop_size[1] // label_stride)),
                    ('metric_label_' + str(scale_list[2]) + scale_name[2],
                     (config.TRAIN.BATCH_IMAGES, 9, 1,
                      config.TRAIN.crop_size[0] // label_stride,
                      config.TRAIN.crop_size[1] // label_stride))
                ]
            else:
                max_label_shape = [
                    ('label', (config.TRAIN.BATCH_IMAGES, 1,
                               config.TRAIN.crop_size[0] // label_stride,
                               config.TRAIN.crop_size[1] // label_stride)),
                    ('metric_label', (config.TRAIN.BATCH_IMAGES, 9, 1,
                                      config.TRAIN.crop_size[0] // label_stride,
                                      config.TRAIN.crop_size[1] // label_stride))
                ]
        else:
            if config.TRAIN.use_mult_metric:
                max_label_shape = [
                    ('label', (config.TRAIN.BATCH_IMAGES, 1,
                               max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                               max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE)),
                    ('metric_label_' + str(scale_list[0]) + scale_name[0],
                     (config.TRAIN.BATCH_IMAGES, 9, 1,
                      max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                      max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE)),
                    ('metric_label_' + str(scale_list[1]) + scale_name[1],
                     (config.TRAIN.BATCH_IMAGES, 9, 1,
                      max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                      max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE)),
                    ('metric_label_' + str(scale_list[2]) + scale_name[2],
                     (config.TRAIN.BATCH_IMAGES, 9, 1,
                      max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                      max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE))
                ]
            else:
                max_label_shape = [
                    ('label', (config.TRAIN.BATCH_IMAGES, 1,
                               max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                               max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE)),
                    ('metric_label', (config.TRAIN.BATCH_IMAGES, 9, 1,
                                      max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                                      max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE))
                ]
    else:
        if config.TRAIN.enable_crop:
            max_label_shape = [('label', (config.TRAIN.BATCH_IMAGES, 1,
                                          config.TRAIN.crop_size[0] // label_stride,
                                          config.TRAIN.crop_size[1] // label_stride))]
        else:
            max_label_shape = [
                ('label', (config.TRAIN.BATCH_IMAGES, 1,
                           max([make_divisible(v[0], label_stride) for v in scales]) // config.network.LABEL_STRIDE,
                           max([make_divisible(v[1], label_stride) for v in scales]) // config.network.LABEL_STRIDE))
            ]
    print "max_label_shapes", max_label_shape

    if config.TRAIN.use_dynamic:
        # Build the symbol for the maximum data shape first; it is rebuilt
        # below once the loader reports its actual provide_data shapes.
        sym = sym_gen([max_data_shape])

    # setup multi-gpu
    input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx)
    NUM_GPUS = len(ctx)

    # load dataset and prepare imdb for training
    # '+' joins multiple image sets into one training set.
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    segdbs = [
        load_gt_segdb(config.dataset.dataset, image_set,
                      config.dataset.root_path, config.dataset.dataset_path,
                      result_path=final_output_path, flip=True)
        for image_set in image_sets
    ]
    segdb = merge_segdb(segdbs)

    # load training data
    train_data = TrainDataLoader(sym, segdb, config,
                                 batch_size=input_batch_size,
                                 shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
                                 use_context=config.network.use_context,
                                 use_mult_label=config.network.use_mult_label,
                                 use_metric=config.network.use_metric)

    # loading val data (only when periodic evaluation is requested)
    if config.TRAIN.eval_data_frequency > 0:
        val_image_set = config.dataset.test_image_set
        val_root_path = config.dataset.root_path
        val_dataset = config.dataset.dataset
        val_dataset_path = config.dataset.dataset_path
        val_imdb = eval(val_dataset)(val_image_set, val_root_path,
                                     val_dataset_path,
                                     result_path=final_output_path)
        val_segdb = val_imdb.gt_segdb()
        val_data = TrainDataLoader(
            sym, val_segdb, config, batch_size=input_batch_size,
            shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
            use_context=config.network.use_context,
            use_mult_label=config.network.use_mult_label,
            use_metric=config.network.use_metric)
    else:
        val_data = None

    # print sym.list_arguments()
    print 'providing maximum shape', max_data_shape, max_label_shape
    # Let the loader round/extend the max shapes to what it will really emit.
    max_data_shape, max_label_shape = train_data.infer_shape(
        max_data_shape, max_label_shape)
    print 'providing maximum shape', max_data_shape, max_label_shape

    # infer shape
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    if config.TRAIN.use_dynamic:
        # Rebuild the symbol against the loader's actual per-device shapes.
        sym = sym_gen([train_data.provide_data_single])
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # Guard against duplicated argument names in the generated symbol.
    nset = set()
    for nm in sym.list_arguments():
        if nm in nset:
            raise ValueError('Duplicate names detected, %s' % str(nm))
        nset.add(nm)

    # load and initialize params
    if config.TRAIN.RESUME:
        print 'continue training from ', begin_epoch
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
        sym_instance.check_parameter_shapes(arg_params, aux_params,
                                            data_shape_dict, is_train=True)
        # Also restore optimizer state so resumed training continues smoothly.
        preload_opt_states = load_preload_opt_states(prefix, begin_epoch)
        # preload_opt_states = None
    else:
        print pretrained
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        preload_opt_states = None
        if not config.TRAIN.FINTUNE:
            # Fresh heads get initialized; returns names of frozen params.
            fixed_param_names = sym_instance.init_weights(
                config, arg_params, aux_params)
        sym_instance.check_parameter_shapes(arg_params, aux_params,
                                            data_shape_dict, is_train=True)

    # check parameter shapes
    # sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, logger=logger,
        context=ctx,
        max_data_shapes=[max_data_shape for _ in xrange(NUM_GPUS)],
        max_label_shapes=[max_label_shape for _ in xrange(NUM_GPUS)],
        fixed_param_prefix=fixed_param_prefix)

    # metric — the set of metrics mirrors the loss-branch configuration above.
    imagecrossentropylossmetric = metric.ImageCrossEntropyLossMetric()
    localmetric = metric.LocalImageCrossEntropyLossMetric()
    globalmetric = metric.GlobalImageCrossEntropyLossMetric()
    pixcelAccMetric = metric.PixcelAccMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    if config.network.use_mult_label:
        metric_list = [
            imagecrossentropylossmetric, localmetric, globalmetric,
            pixcelAccMetric
        ]
    elif config.network.use_metric:
        if config.TRAIN.use_crl_ses:
            metric_list = [
                imagecrossentropylossmetric,
                metric.SigmoidPixcelAccMetric(1),
                metric.SigmoidPixcelAccMetric(2),
                metric.SigmoidPixcelAccMetric(3),
                metric.CenterLossMetric(4),
                metric.CenterLossMetric(5),
                metric.CenterLossMetric(6),
                pixcelAccMetric
            ]
        elif config.network.use_sigmoid_metric:
            if config.TRAIN.use_mult_metric:
                metric_list = [
                    imagecrossentropylossmetric,
                    metric.SigmoidPixcelAccMetric(1),
                    metric.SigmoidPixcelAccMetric(2),
                    metric.SigmoidPixcelAccMetric(3),
                    pixcelAccMetric
                ]
            else:
                metric_list = [
                    imagecrossentropylossmetric,
                    metric.SigmoidPixcelAccMetric(),
                    pixcelAccMetric
                ]
        else:
            if config.TRAIN.use_mult_metric:
                metric_list = [
                    imagecrossentropylossmetric,
                    metric.MetricLossMetric(1),
                    metric.MetricLossMetric(2),
                    metric.MetricLossMetric(3),
                    pixcelAccMetric
                ]
            else:
                metric_list = [
                    imagecrossentropylossmetric,
                    metric.MetricLossMetric(1),
                    pixcelAccMetric
                ]
    elif config.network.mult_loss:
        metric_list = [
            imagecrossentropylossmetric,
            metric.MImageCrossEntropyLossMetric(1),
            metric.MImageCrossEntropyLossMetric(2),
            pixcelAccMetric
        ]
    elif config.TRAIN.use_center:
        if config.TRAIN.use_one_center:
            metric_list = [
                imagecrossentropylossmetric, pixcelAccMetric,
                metric.CenterLossMetric(1)
            ]
        else:
            metric_list = [
                imagecrossentropylossmetric, pixcelAccMetric,
                metric.CenterLossMetric(1),
                metric.CenterLossMetric(2),
                metric.CenterLossMetric(3)
            ]
    else:
        metric_list = [imagecrossentropylossmetric, pixcelAccMetric]
    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    for child_metric in metric_list:
        eval_metrics.add(child_metric)

    # callback
    # NOTE(review): the tensorboard-callback branch is disabled (`if False:`);
    # only the plain Speedometer/checkpoint path below is live.
    if False:
        batch_end_callback = [
            callback.Speedometer(train_data.batch_size,
                                 frequent=args.frequent),
            callback.TensorboardCallback(tensorboard_path,
                                         prefix="train/batch")
        ]
        epoch_end_callback = mx.callback.module_checkpoint(
            mod, prefix, period=1, save_optimizer_states=True)
        shared_tensorboard = batch_end_callback[1]
        epoch_end_metric_callback = callback.TensorboardCallback(
            tensorboard_path,
            shared_tensorboard=shared_tensorboard,
            prefix="train/epoch")
        eval_end_callback = callback.TensorboardCallback(
            tensorboard_path,
            shared_tensorboard=shared_tensorboard,
            prefix="val/epoch")
        lr_callback = callback.LrCallback(
            tensorboard_path,
            shared_tensorboard=shared_tensorboard,
            prefix='train/batch')
    else:
        batch_end_callback = [
            callback.Speedometer(train_data.batch_size,
                                 frequent=args.frequent)
        ]
        epoch_end_callback = mx.callback.module_checkpoint(
            mod, prefix, period=1, save_optimizer_states=True)
        epoch_end_metric_callback = None
        eval_end_callback = None

    # decide learning rate: drop lr by lr_factor at each epoch in lr_step,
    # skipping steps that already happened before begin_epoch.
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(segdb) / input_batch_size) for epoch in lr_epoch_diff
    ]
    print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters

    if config.TRAIN.lr_type == "MultiStage":
        lr_scheduler = LinearWarmupMultiStageScheduler(
            lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr,
            config.TRAIN.warmup_step, args.frequent, stop_lr=lr * 0.01)
    elif config.TRAIN.lr_type == "MultiFactor":
        lr_scheduler = LinearWarmupMultiFactorScheduler(
            lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr,
            config.TRAIN.warmup_step, args.frequent)
    # NOTE(review): no fallback branch — an unknown lr_type leaves
    # lr_scheduler undefined and raises NameError below.

    if config.TRAIN.optimizer == "sgd":
        optimizer_params = {
            'momentum': config.TRAIN.momentum,
            'wd': config.TRAIN.wd,
            'learning_rate': lr,
            'lr_scheduler': lr_scheduler,
            'rescale_grad': 1.0,
            'clip_gradient': None
        }
        optimizer = SGD(**optimizer_params)
    elif config.TRAIN.optimizer == "adam":
        optimizer_params = {
            'learning_rate': lr,
            'lr_scheduler': lr_scheduler,
            'rescale_grad': 1.0,
            'clip_gradient': None
        }
        optimizer = Adam(**optimizer_params)
        print "optimizer adam"

    # Freeze (or scale the lr of) parameters matching the configured regex.
    freeze_layer_pattern = config.TRAIN.FIXED_PARAMS_PATTERN
    if freeze_layer_pattern.strip():
        args_lr_mult = {}
        re_prog = re.compile(freeze_layer_pattern)
        if freeze_layer_pattern:
            fixed_param_names = [
                name for name in sym.list_arguments() if re_prog.match(name)
            ]
        print "============================"
        print "fixed_params_names:"
        print(fixed_param_names)
        for name in fixed_param_names:
            args_lr_mult[name] = config.TRAIN.FIXED_PARAMS_PATTERN_LR_MULT
        print "============================"
    else:
        args_lr_mult = {}
    optimizer.set_lr_mult(args_lr_mult)

    # data_shape_dict = dict(train_data.provide_data_single +
    #                        train_data.provide_label_single)
    # if config.TRAIN.use_dynamic:
    #     sym = sym_gen([train_data.provide_data_single])
    # pprint.pprint(data_shape_dict)
    # sym_instance.infer_shape(data_shape_dict)

    # Wrap loaders in a prefetching iterator so data loading overlaps compute.
    if not isinstance(train_data, PrefetchingIter) and config.TRAIN.use_thread:
        train_data = PrefetchingIter(train_data)
    if val_data:
        if not isinstance(val_data, PrefetchingIter):
            val_data = PrefetchingIter(val_data)

    if Debug:
        monitor = mx.monitor.Monitor(1)
    else:
        monitor = None

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            eval_end_callback=eval_end_callback,
            epoch_end_metric_callback=epoch_end_metric_callback,
            optimizer=optimizer,
            eval_data=val_data,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            allow_missing=begin_epoch == 0,
            allow_extra=True,
            monitor=monitor,
            preload_opt_states=preload_opt_states,
            eval_data_frequency=config.TRAIN.eval_data_frequency)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train a Faster-RCNN-style detector end-to-end.

    Args:
        args: parsed command-line arguments (needs .cfg and .frequent).
        ctx: list of MXNet device contexts; one batch per device.
        pretrained: prefix of the pretrained model checkpoint.
        epoch: epoch number of the pretrained checkpoint to load.
        prefix: checkpoint-name prefix (joined onto the output path).
        begin_epoch, end_epoch: training epoch range.
        lr: base learning rate.
        lr_step: comma-separated epochs at which the lr is decayed.
    """
    # Create the logger and its matching output path.
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # load symbol — snapshot the symbol source next to the outputs for
    # reproducibility.
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)
    # Feature symbol: fetch rpn_cls_score_output from the network; the anchor
    # loader uses it to infer the RPN feature-map size.
    feat_sym = sym.get_internals()['rpn_cls_score_output']

    # setup multi-gpu: each device trains one batch of BATCH_IMAGES images.
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # load dataset and prepare imdb for training
    # '+' separates multiple image sets, e.g. 2007_trainval+2012_trainval.
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    # Load the ground-truth roidb for each image set from the dataset root and
    # path; TRAIN.FLIP augments the data with horizontally flipped copies.
    roidbs = [load_gt_roidb(config.dataset.dataset, image_set,
                            config.dataset.root_path,
                            config.dataset.dataset_path,
                            flip=config.TRAIN.FLIP)
              for image_set in image_sets]
    # Merge the per-set roidbs into one.
    roidb = merge_roidb(roidbs)
    # Filter rois according to the rules in the config.
    roidb = filter_roidb(roidb, config)

    # load training data
    # AnchorLoader produces classification/regression anchor targets: it
    # matches anchors (built from the given scales, ratios and feature stride)
    # against the roidb to pick positive/negative samples.
    train_data = AnchorLoader(feat_sym, roidb, config,
                              batch_size=input_batch_size,
                              shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
                              feat_stride=config.network.RPN_FEAT_STRIDE,
                              anchor_scales=config.network.ANCHOR_SCALES,
                              anchor_ratios=config.network.ANCHOR_RATIOS,
                              aspect_grouping=config.TRAIN.ASPECT_GROUPING)

    # infer max shape
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    # Up to 100 ground-truth boxes per image (x1, y1, x2, y2, class).
    max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5)))
    print('providing maximum shape', max_data_shape, max_label_shape)

    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # load and initialize params
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        # Randomly initialize the layers not covered by the pretrained model.
        sym_instance.init_weight(config, arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params,
                                        data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    mod = MutableModule(sym, data_names=data_names, label_names=label_names,
                        logger=logger, context=ctx,
                        max_data_shapes=[max_data_shape for _ in range(batch_size)],
                        max_label_shapes=[max_label_shape for _ in range(batch_size)],
                        fixed_param_prefix=fixed_param_prefix)

    if config.TRAIN.RESUME:
        # Restore the optimizer state alongside the parameters.
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)

    # decide training params
    # metric
    rpn_eval_metric = metric.RPNAccMetric()
    rpn_cls_metric = metric.RPNLogLossMetric()
    rpn_bbox_metric = metric.RPNL1LossMetric()
    eval_metric = metric.RCNNAccMetric(config)
    cls_metric = metric.RCNNLogLossMetric(config)
    bbox_metric = metric.RCNNL1LossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=args.frequent)
    # Bbox-target normalization constants, tiled per class (2 "classes" when
    # regression is class-agnostic).
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    epoch_end_callback = [mx.callback.module_checkpoint(
                              mod, prefix, period=1,
                              save_optimizer_states=True),
                          callback.do_checkpoint(prefix, means, stds)]

    # decide learning rate: decay by lr_factor at each epoch in lr_step,
    # skipping steps already passed before begin_epoch.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch
                     if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size)
                for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # optimizer
    optimizer_params = {'momentum': config.TRAIN.momentum,
                        'wd': config.TRAIN.wd,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0,
                        'clip_gradient': None}

    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    # train
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params,
            begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train an RCNN detector; supports a JSONList dataset with ad-hoc logging.

    Args:
        args: parsed command-line arguments (needs .cfg and .frequent).
        ctx: list of MXNet device contexts.
        pretrained: prefix of the pretrained model checkpoint.
        epoch: epoch number of the pretrained checkpoint to load.
        prefix: checkpoint-name prefix.
        begin_epoch, end_epoch: training epoch range.
        lr: base learning rate.
        lr_step: comma-separated epochs at which the lr is decayed.
    """
    if config.dataset.dataset != 'JSONList':
        logger, final_output_path = create_logger(config.output_path,
                                                  args.cfg,
                                                  config.dataset.image_set)
        prefix = os.path.join(final_output_path, prefix)
        shutil.copy2(args.cfg, prefix + '.yaml')
    else:
        # JSONList has no create_logger support: build a timestamped prefix
        # and configure a plain file logger by hand.
        import datetime
        import logging
        final_output_path = config.output_path
        prefix = prefix + '_' + datetime.datetime.now().strftime(
            "%Y-%m-%d_%H_%M_%S")
        prefix = os.path.join(final_output_path, prefix)
        shutil.copy2(args.cfg, prefix + '.yaml')
        log_file = prefix + '.log'
        head = '%(asctime)-15s %(message)s'
        logging.basicConfig(filename=log_file, format=head)
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
        logger.info('prefix: %s' % prefix)
        print('prefix: %s' % prefix)

    # load symbol
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)

    # setup multi-gpu
    batch_size = config.TRAIN.IMAGES_PER_GPU * len(ctx)

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # load dataset and prepare imdb for training; '+' joins image sets.
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    roidbs = [load_gt_roidb(config.dataset.dataset, image_set,
                            config.dataset.root_path,
                            config.dataset.dataset_path,
                            flip=config.TRAIN.FLIP)
              for image_set in image_sets]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, config)

    # load training data
    if config.network.MULTI_RPN:
        # Fixed typo: was `assert Fasle`, which raised NameError instead of
        # the intended AssertionError for this unfinished code path.
        assert False, 'still developing'
        num_layers = len(config.network.MULTI_RPN_STRIDES)
        rpn_syms = [sym.get_internals()['rpn%d_cls_score_output' % l]
                    for l in range(num_layers)]
        train_data = PyramidAnchorLoader(
            rpn_syms, roidb, config, batch_size=batch_size,
            shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
            feat_strides=config.network.MULTI_RPN_STRIDES,
            anchor_scales=config.network.ANCHOR_SCALES,
            anchor_ratios=config.network.ANCHOR_RATIOS,
            aspect_grouping=config.TRAIN.ASPECT_GROUPING,
            allowed_border=np.inf)
    else:
        # The anchor loader infers the RPN feature-map size from this symbol.
        feat_sym = sym.get_internals()['rpn_cls_score_output']
        train_data = AnchorLoader(
            feat_sym, roidb, config, batch_size=batch_size,
            shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
            feat_stride=config.network.RPN_FEAT_STRIDE,
            anchor_scales=config.network.ANCHOR_SCALES,
            anchor_ratios=config.network.ANCHOR_RATIOS,
            aspect_grouping=config.TRAIN.ASPECT_GROUPING)

    # infer max shape
    data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # load and initialize params
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        sym_instance.init_weight(config, arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params,
                                        data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    mod = MutableModule(sym, train_data.data_names, train_data.label_names,
                        context=ctx, logger=logger,
                        fixed_param_prefix=fixed_param_prefix)
    if config.TRAIN.RESUME:
        # Restore the optimizer state alongside the parameters.
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)

    # decide training params
    # metric
    rpn_eval_metric = metric.RPNAccMetric()
    rpn_cls_metric = metric.RPNLogLossMetric()
    rpn_bbox_metric = metric.RPNL1LossMetric()
    eval_metric = metric.RCNNAccMetric(config)
    cls_metric = metric.RCNNLogLossMetric(config)
    bbox_metric = metric.RCNNL1LossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)
    if config.network.PREDICT_KEYPOINTS:
        kps_cls_acc = metric.KeypointAccMetric(config)
        kps_cls_loss = metric.KeypointLogLossMetric(config)
        kps_pos_loss = metric.KeypointL1LossMetric(config)
        eval_metrics.add(kps_cls_acc)
        eval_metrics.add(kps_cls_loss)
        eval_metrics.add(kps_pos_loss)

    # callback
    batch_end_callback = callback.Speedometer(batch_size,
                                              frequent=args.frequent)
    # Bbox-target normalization constants, tiled per class (2 "classes" when
    # regression is class-agnostic).
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    epoch_end_callback = [mx.callback.do_checkpoint(prefix)]

    # decide learning rate: decay by lr_factor at each epoch in lr_step,
    # skipping steps already passed before begin_epoch.
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch
                     if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size)
                for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # optimizer
    optimizer_params = {'momentum': config.TRAIN.momentum,
                        'wd': config.TRAIN.wd,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0,
                        'clip_gradient': None}

    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    # train
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.TRAIN.kvstore,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params,
            begin_epoch=begin_epoch, num_epoch=end_epoch)
    # Give the prefetcher workers time to flush, then stop them explicitly.
    time.sleep(10)
    train_data.iters[0].terminate()
def train_rcnn(cfg, dataset, image_set, root_path, dataset_path,
               frequent, kvstore, flip, shuffle, resume,
               ctx, pretrained, epoch, prefix, begin_epoch, end_epoch,
               train_shared, lr, lr_step, proposal,
               logger=None, output_path=None):
    """Train the RCNN stage alone on precomputed proposals.

    Args:
        cfg: experiment configuration object.
        dataset, image_set, root_path, dataset_path: dataset identifiers
            ('+' in image_set joins multiple sets).
        frequent: logging frequency (batches).
        kvstore: kvstore type passed to mod.fit.
        flip, shuffle, resume: data-augmentation / loader / resume flags.
        ctx: list of MXNet device contexts.
        pretrained, epoch: pretrained checkpoint prefix and epoch.
        prefix: checkpoint-name prefix.
        begin_epoch, end_epoch: training epoch range.
        train_shared: if True, also freeze the params shared with the RPN.
        lr, lr_step: base learning rate and comma-separated decay epochs.
        proposal: name of the proposal method to load.
        logger: optional logger; a basic one is configured when omitted.
        output_path: optional result path for the roidb loader.
    """
    # Fixed seeds for reproducible sampling.
    mx.random.seed(0)
    np.random.seed(0)

    # set up logger
    if not logger:
        logging.basicConfig()
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)

    # load symbol
    sym_instance = eval(cfg.symbol + '.' + cfg.symbol)()
    sym = sym_instance.get_symbol_rcnn(cfg, is_train=True)

    # setup multi-gpu
    batch_size = len(ctx)
    input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size

    # print cfg
    pprint.pprint(cfg)
    logger.info('training rcnn cfg:{}\n'.format(pprint.pformat(cfg)))

    rpn_path = cfg.dataset.proposal_cache
    # load dataset and prepare imdb for training
    image_sets = [iset for iset in image_set.split('+')]
    roidbs = [load_proposal_roidb(dataset, image_set, root_path, dataset_path,
                                  proposal=proposal, append_gt=True,
                                  flip=flip, result_path=output_path,
                                  rpn_path=rpn_path,
                                  top_roi=cfg.TRAIN.TOP_ROIS)
              for image_set in image_sets]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, cfg)
    means, stds = add_bbox_regression_targets(roidb, cfg)

    # load training data
    train_data = ROIIter(roidb, cfg, batch_size=input_batch_size,
                         shuffle=shuffle, ctx=ctx,
                         aspect_grouping=cfg.TRAIN.ASPECT_GROUPING)

    # infer max shape: pad the largest scale up to the image stride.
    # NOTE(review): when the size is already a multiple of the stride this
    # adds one extra full stride — presumably intentional headroom; confirm.
    max_height = max([v[0] for v in cfg.SCALES])
    max_width = max([v[1] for v in cfg.SCALES])
    paded_max_height = max_height + cfg.network.IMAGE_STRIDE - \
        max_height % cfg.network.IMAGE_STRIDE
    paded_max_width = max_width + cfg.network.IMAGE_STRIDE - \
        max_width % cfg.network.IMAGE_STRIDE
    max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3,
                                paded_max_height, paded_max_width))]

    # infer shape
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    sym_instance.infer_shape(data_shape_dict)

    # print shape
    pprint.pprint(sym_instance.arg_shape_dict)
    logging.info(pprint.pformat(sym_instance.arg_shape_dict))

    # BATCH_ROIS == -1 means "use all TOP_ROIS proposals per image".
    max_batch_roi = cfg.TRAIN.TOP_ROIS if cfg.TRAIN.BATCH_ROIS == -1 \
        else cfg.TRAIN.BATCH_ROIS
    num_class = 2 if cfg.CLASS_AGNOSTIC else cfg.dataset.NUM_CLASSES
    max_label_shape = [('label', (cfg.TRAIN.BATCH_IMAGES, max_batch_roi)),
                       ('bbox_target', (cfg.TRAIN.BATCH_IMAGES,
                                        max_batch_roi, num_class * 4)),
                       ('bbox_weight', (cfg.TRAIN.BATCH_IMAGES,
                                        max_batch_roi, num_class * 4))]
    if cfg.network.USE_NONGT_INDEX:
        max_label_shape.append(('nongt_index', (2000,)))

    if cfg.network.ROIDispatch:
        # Rois are split across four dispatch branches. Use floor division:
        # `/ 4` yields a float under Python 3, which is invalid as a shape dim.
        rois_per_branch = max_batch_roi // 4
        for branch in range(4):
            max_data_shape.append(('rois_%d' % branch,
                                   (cfg.TRAIN.BATCH_IMAGES,
                                    rois_per_branch, 5)))
    else:
        max_data_shape.append(('rois',
                               (cfg.TEST.PROPOSAL_POST_NMS_TOP_N + 30, 5)))

    # load and initialize params
    if resume:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        sym_instance.init_weight_rcnn(cfg, arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params,
                                        data_shape_dict)

    # prepare training
    # create solver
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    if train_shared:
        # Freeze params shared with the RPN stage as well.
        fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED
    else:
        fixed_param_prefix = cfg.network.FIXED_PARAMS
    # Both branches build an identical module (kept as one call).
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names,
        logger=logger, context=ctx,
        max_data_shapes=[max_data_shape for _ in range(batch_size)],
        max_label_shapes=[max_label_shape for _ in range(batch_size)],
        fixed_param_prefix=fixed_param_prefix)
    if cfg.TRAIN.RESUME:
        # Restore the optimizer state alongside the parameters.
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)

    # decide training params
    # metric
    eval_metric = metric.RCNNAccMetric(cfg)
    cls_metric = metric.RCNNLogLossMetric(cfg)
    bbox_metric = metric.RCNNL1LossMetric(cfg)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)
    if cfg.TRAIN.LEARN_NMS:
        eval_metrics.add(metric.NMSLossMetric(cfg, 'pos'))
        eval_metrics.add(metric.NMSLossMetric(cfg, 'neg'))
        eval_metrics.add(metric.NMSAccMetric(cfg))

    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=frequent)
    epoch_end_callback = [mx.callback.module_checkpoint(
                              mod, prefix, period=1,
                              save_optimizer_states=True),
                          callback.do_checkpoint(prefix, means, stds)]

    # decide learning rate: decay by lr_factor at each epoch in lr_step,
    # skipping steps already passed before begin_epoch.
    base_lr = lr
    lr_factor = cfg.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch
                     if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size)
                for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              cfg.TRAIN.warmup,
                                              cfg.TRAIN.warmup_lr,
                                              cfg.TRAIN.warmup_step)

    # optimizer
    optimizer_params = {'momentum': cfg.TRAIN.momentum,
                        'wd': cfg.TRAIN.wd,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0,
                        'clip_gradient': None}

    # train
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=kvstore,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params,
            begin_epoch=begin_epoch, num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train a pose/flow estimation network on image-pair databases.

    Args:
        args: parsed command-line arguments (expects .cfg, .vis, .temp, .frequent)
        ctx: list of MXNet device contexts (one per GPU)
        pretrained: pretrained model prefix, or the literal string 'xavier'
            to initialize from scratch with a Xavier/Zero mixed initializer
        epoch: epoch number of the pretrained checkpoint to load
        prefix: model checkpoint name prefix (joined under the output path)
        begin_epoch: epoch to start training from
        end_epoch: epoch to stop training at
        lr: base learning rate
        lr_step: comma-separated epoch numbers at which to decay the lr
    """
    new_args_name = args.cfg
    if args.vis:
        config.TRAIN.VISUALIZE = True
    logger, final_output_path = create_logger(config.output_path, new_args_name,
                                              config.dataset.image_set, args.temp)
    prefix = os.path.join(final_output_path, prefix)
    logger.info('called with args {}'.format(args))
    print(config.train_iter.SE3_PM_LOSS)
    if config.train_iter.SE3_PM_LOSS:
        print("SE3_PM_LOSS == True")
    else:
        print("SE3_PM_LOSS == False")
    if not config.network.STANDARD_FLOW_REP:
        print_and_log("[h, w] representation for flow is dep", logger)

    # load dataset and prepare imdb for training
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    datasets = [dset for dset in config.dataset.dataset.split('+')]
    print("config.dataset.class_name: {}".format(config.dataset.class_name))
    print("image_sets: {}".format(image_sets))
    if datasets[0].startswith('ModelNet'):
        # ModelNet class names look like paths; only the last path component
        # is appended to the image-set name
        pairdbs = [
            load_gt_pairdb(config, datasets[i],
                           image_sets[i] + class_name.split('/')[-1],
                           config.dataset.root_path, config.dataset.dataset_path,
                           class_name=class_name, result_path=final_output_path)
            for class_name in config.dataset.class_name
            for i in range(len(image_sets))
        ]
    else:
        pairdbs = [
            load_gt_pairdb(config, datasets[i], image_sets[i] + class_name,
                           config.dataset.root_path, config.dataset.dataset_path,
                           class_name=class_name, result_path=final_output_path)
            for class_name in config.dataset.class_name
            for i in range(len(image_sets))
        ]
    pairdb = merge_pairdb(pairdbs)

    if not args.temp:
        # snapshot the symbol definition next to the training outputs for
        # reproducibility (shutil.copy2, consistent with the other train
        # functions in this file; replaces a former `os.popen('cp ...')`)
        src_file = os.path.join(curr_path, 'symbols', config.symbol + '.py')
        dst_file = os.path.join(
            final_output_path,
            '{}_{}.py'.format(config.symbol, time.strftime('%Y-%m-%d-%H-%M')))
        shutil.copy2(src_file, dst_file)

    # NOTE(review): eval on a config-supplied symbol name — safe only because
    # config files are trusted input
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)

    # setup multi-gpu
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_PAIRS * batch_size
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # load training data
    train_data = TrainDataLoader(sym, pairdb, config, batch_size=input_batch_size,
                                 shuffle=config.TRAIN.SHUFFLE, ctx=ctx)
    train_data.get_batch_parallel()

    # maximum input shapes used to pre-allocate the mutable module's buffers
    max_scale = [
        max([v[0] for v in config.SCALES]),
        max(v[1] for v in config.SCALES)
    ]
    max_data_shape = [('image_observed', (config.TRAIN.BATCH_PAIRS, 3, max_scale[0], max_scale[1])),
                      ('image_rendered', (config.TRAIN.BATCH_PAIRS, 3, max_scale[0], max_scale[1])),
                      ('depth_gt_observed', (config.TRAIN.BATCH_PAIRS, 1, max_scale[0], max_scale[1])),
                      ('src_pose', (config.TRAIN.BATCH_PAIRS, 3, 4)),
                      ('tgt_pose', (config.TRAIN.BATCH_PAIRS, 3, 4))]
    if config.network.INPUT_DEPTH:
        max_data_shape.append(('depth_observed', (config.TRAIN.BATCH_PAIRS, 1, max_scale[0], max_scale[1])))
        max_data_shape.append(('depth_rendered', (config.TRAIN.BATCH_PAIRS, 1, max_scale[0], max_scale[1])))
    if config.network.INPUT_MASK:
        max_data_shape.append(('mask_observed', (config.TRAIN.BATCH_PAIRS, 1, max_scale[0], max_scale[1])))
        max_data_shape.append(('mask_rendered', (config.TRAIN.BATCH_PAIRS, 1, max_scale[0], max_scale[1])))

    # rotation is 3 Euler angles or a 4-element quaternion
    rot_param = 3 if config.network.ROT_TYPE == "EULER" else 4
    max_label_shape = [('rot', (config.TRAIN.BATCH_PAIRS, rot_param)),
                       ('trans', (config.TRAIN.BATCH_PAIRS, 3))]
    if config.network.PRED_FLOW:
        max_label_shape.append(('flow', (config.TRAIN.BATCH_PAIRS, 2, max_scale[0], max_scale[1])))
        max_label_shape.append(('flow_weights', (config.TRAIN.BATCH_PAIRS, 2, max_scale[0], max_scale[1])))
    if config.train_iter.SE3_PM_LOSS:
        max_label_shape.append(('point_cloud_model', (config.TRAIN.BATCH_PAIRS, 3, config.train_iter.NUM_3D_SAMPLE)))
        max_label_shape.append(('point_cloud_weights', (config.TRAIN.BATCH_PAIRS, 3, config.train_iter.NUM_3D_SAMPLE)))
        max_label_shape.append(('point_cloud_observed', (config.TRAIN.BATCH_PAIRS, 3, config.train_iter.NUM_3D_SAMPLE)))
    if config.network.PRED_MASK:
        max_label_shape.append(('mask_gt_observed', (config.TRAIN.BATCH_PAIRS, 1, max_scale[0], max_scale[1])))
    # max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape, max_label_shape)
    print_and_log('providing maximum shape, {}, {}'.format(max_data_shape, max_label_shape), logger)

    # infer max shape
    '''
    max_label_shape = [('label', (config.TRAIN.BATCH_IMAGES, 1,
                                  max([v[0] for v in max_scale]),
                                  max([v[1] for v in max_scale])))]
    max_data_shape, max_label_shape = train_data.infer_shape(
        max_data_shape, max_label_shape)
    print('providing maximum shape', max_data_shape, max_label_shape)
    '''

    # infer shape
    data_shape_dict = dict(train_data.provide_data_single + train_data.provide_label_single)
    print_and_log('\ndata_shape_dict: {}\n'.format(data_shape_dict), logger)
    sym_instance.infer_shape(data_shape_dict)
    print('************(wg): infering shape **************')
    internals = sym.get_internals()
    _, out_shapes, _ = internals.infer_shape(**data_shape_dict)
    print(sym.list_outputs())
    shape_dict = dict(zip(internals.list_outputs(), out_shapes))
    pprint.pprint(shape_dict)

    # load and initialize params
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    elif pretrained == 'xavier':
        # params are created by the initializer passed to mod.fit below
        print('xavier')
        # arg_params = {}
        # aux_params = {}
        # sym_instance.init_weights(config, arg_params, aux_params)
    else:
        print(pretrained)
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        print('arg_params: ', arg_params.keys())
        print('aux_params: ', aux_params.keys())
        if not config.network.skip_initialize:
            sym_instance.init_weights(config, arg_params, aux_params)

    # check parameter shapes (no params to check in the 'xavier' case)
    if pretrained != 'xavier':
        sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, logger=logger,
        context=ctx,
        max_data_shapes=[max_data_shape for _ in range(batch_size)],
        max_label_shapes=[max_label_shape for _ in range(batch_size)],
        fixed_param_prefix=fixed_param_prefix, config=config)

    # decide training params
    # metrics
    eval_metrics = mx.metric.CompositeEvalMetric()
    metric_list = []
    iter_idx = 0
    if config.network.PRED_FLOW:
        metric_list.append(metric.Flow_L2LossMetric(config, iter_idx))
        metric_list.append(metric.Flow_CurLossMetric(config, iter_idx))
    if config.train_iter.SE3_DIST_LOSS:
        metric_list.append(metric.Rot_L2LossMetric(config, iter_idx))
        metric_list.append(metric.Trans_L2LossMetric(config, iter_idx))
    if config.train_iter.SE3_PM_LOSS:
        metric_list.append(metric.PointMatchingLossMetric(config, iter_idx))
    if config.network.PRED_MASK:
        metric_list.append(metric.MaskLossMetric(config, iter_idx))
    # Visualize Training Batches
    if config.TRAIN.VISUALIZE:
        metric_list.append(metric.SimpleVisualize(config))
        # metric_list.append(metric.MaskVisualize(config, save_dir = final_output_path))
        metric_list.append(metric.MinibatchVisualize(config))  # flow visualization
    for child_metric in metric_list:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size, frequent=args.frequent)
    epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1,
                                                       save_optimizer_states=True)

    # decide learning rate: decay already-passed steps, schedule the rest
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(pairdb) / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor, config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr, config.TRAIN.warmup_step)
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    # train
    if config.TRAIN.optimizer == 'adam':
        optimizer_params = {'learning_rate': lr}
        if pretrained == 'xavier':
            # zero-init the pose heads, Xavier everything else
            init = mx.init.Mixed(['rot_weight|trans_weight', '.*'], [
                mx.init.Zero(),
                mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
            ])
            mod.fit(train_data, eval_metric=eval_metrics,
                    epoch_end_callback=epoch_end_callback,
                    batch_end_callback=batch_end_callback,
                    kvstore=config.default.kvstore, optimizer='adam',
                    optimizer_params=optimizer_params, begin_epoch=begin_epoch,
                    num_epoch=end_epoch, prefix=prefix, initializer=init,
                    force_init=True)
        else:
            mod.fit(train_data, eval_metric=eval_metrics,
                    epoch_end_callback=epoch_end_callback,
                    batch_end_callback=batch_end_callback,
                    kvstore=config.default.kvstore, optimizer='adam',
                    arg_params=arg_params, aux_params=aux_params,
                    begin_epoch=begin_epoch, num_epoch=end_epoch, prefix=prefix)
    elif config.TRAIN.optimizer == 'sgd':
        # optimizer
        optimizer_params = {'momentum': config.TRAIN.momentum,
                            'wd': config.TRAIN.wd,
                            'learning_rate': lr,
                            'lr_scheduler': lr_scheduler,
                            'rescale_grad': 1.0,
                            'clip_gradient': None}
        if pretrained == 'xavier':
            init = mx.init.Mixed(['rot_weight|trans_weight', '.*'], [
                mx.init.Zero(),
                mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
            ])
            mod.fit(train_data, eval_metric=eval_metrics,
                    epoch_end_callback=epoch_end_callback,
                    batch_end_callback=batch_end_callback,
                    kvstore=config.default.kvstore, optimizer='sgd',
                    optimizer_params=optimizer_params, begin_epoch=begin_epoch,
                    num_epoch=end_epoch, prefix=prefix, initializer=init,
                    force_init=True)
        else:
            mod.fit(train_data, eval_metric=eval_metrics,
                    epoch_end_callback=epoch_end_callback,
                    batch_end_callback=batch_end_callback,
                    kvstore=config.default.kvstore, optimizer='sgd',
                    optimizer_params=optimizer_params,
                    arg_params=arg_params, aux_params=aux_params,
                    begin_epoch=begin_epoch, num_epoch=end_epoch, prefix=prefix)
def train_feature_distance_net(args, ctx, pretrained, pretrained_flow, epoch,
                               prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train the feature-distance (generator) network with a feature L2 loss.

    Args:
        args: parsed command-line arguments (expects .cfg, .frequent)
        ctx: list of MXNet device contexts (one per GPU)
        pretrained: pretrained model prefix to initialize from
        pretrained_flow: flow-model prefix (currently unused; loading is
            commented out below)
        epoch: epoch number of the pretrained checkpoint to load
        prefix: checkpoint name prefix (joined under the output path)
        begin_epoch: epoch to start training from
        end_epoch: epoch to stop training at
        lr: base learning rate (NOTE(review): unused — the RMSprop
            learning rate is hard-coded to 0.00005 below)
        lr_step: lr decay steps (NOTE(review): unused in this function)
    """
    # ==============prepare logger==============
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)
    # ==============load symbol==============
    # snapshot the symbol definition next to the outputs for reproducibility
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    # G_type selects the generator architecture variant
    if config.TRAIN.G_type == 0:
        sym = sym_instance.get_train_feature_distance_symbol(config)
    elif config.TRAIN.G_type == 1:
        sym = sym_instance.get_train_feature_distance_symbol_res(config)
    # ==============setup multi-gpu==============
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size
    # ==============print config==============
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))
    # ==============load dataset and prepare imdb for training==============
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    roidbs = [
        load_gt_roidb(config.dataset.dataset, image_set,
                      config.dataset.root_path, config.dataset.dataset_path,
                      flip=config.TRAIN.FLIP) for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, config)
    train_iter = ImagenetVIDIter(roidb, input_batch_size, config,
                                 config.TRAIN.SHUFFLE, ctx)
    # infer max shape: largest configured scale for both current and
    # reference frames
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES]))),
                      ('data_ref', (config.TRAIN.BATCH_IMAGES, 3,
                                    max([v[0] for v in config.SCALES]),
                                    max([v[1] for v in config.SCALES])))]
    print 'providing maximum shape', max_data_shape
    data_shape_dict = dict(train_iter.provide_data_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)
    # ==============init params==============
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        # arg_params_flow, aux_params_flow = load_param(pretrained_flow, epoch, convert=True)
        # arg_params.update(arg_params_flow)
        # aux_params.update(aux_params_flow)
        sym_instance.init_weight(config, arg_params, aux_params)
    # ==============check parameter shapes==============
    # sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)
    # ==============create solver==============
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = train_iter.data_name
    label_names = train_iter.label_name
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, logger=logger,
        context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)],
        max_label_shapes=None, fixed_param_prefix=fixed_param_prefix)
    if config.TRAIN.RESUME:
        # also restore optimizer state when resuming
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)
    # ==============optimizer==============
    optimizer_params = {
        'learning_rate': 0.00005,
    }
    if not isinstance(train_iter, PrefetchingIter):
        train_iter = PrefetchingIter(train_iter)
    batch_end_callback = callback.Speedometer(train_iter.batch_size,
                                              frequent=args.frequent)
    epoch_end_callback = [
        mx.callback.module_checkpoint(mod, prefix, period=1,
                                      save_optimizer_states=True),
        callback.do_checkpoint(prefix)
    ]
    feature_L2_loss = metric.FeatureL2LossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    eval_metrics.add(feature_L2_loss)
    # allow_missing=True: params absent from the checkpoint are created by
    # the Normal(0.02) initializer
    mod.fit(train_iter,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='RMSprop',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            initializer=mx.init.Normal(0.02),
            allow_missing=True)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Train a RetinaNet-style detector over a P4-P7 feature pyramid.

    Args:
        args: parsed command-line arguments (expects .cfg, .frequent)
        ctx: list of MXNet device contexts (one per GPU)
        pretrained: pretrained model prefix to initialize from
        epoch: epoch number of the pretrained checkpoint to load
        prefix: checkpoint name prefix (joined under the output path)
        begin_epoch: epoch to start training from
        end_epoch: epoch to stop training at
        lr: base learning rate
        lr_step: comma-separated epoch numbers at which to decay the lr
    """
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)
    # load symbol (copy the symbol source next to outputs for reproducibility)
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_retina_symbol(config, is_train=True)
    # per-level box-prediction outputs used by AnchorLoader to size anchors
    feat_sym = []
    feat_sym_p4 = sym.get_internals()['box_pred/p4_output']
    feat_sym_p5 = sym.get_internals()['box_pred/p5_output']
    feat_sym_p6 = sym.get_internals()['box_pred/p6_output']
    feat_sym_p7 = sym.get_internals()['box_pred/p7_output']
    feat_sym.append(feat_sym_p4)
    feat_sym.append(feat_sym_p5)
    feat_sym.append(feat_sym_p6)
    feat_sym.append(feat_sym_p7)
    #######
    # per-level strides / anchor scales / anchor ratios, P4..P7 order
    feat_stride = []
    feat_stride.append(config.network.p4_RPN_FEAT_STRIDE)
    feat_stride.append(config.network.p5_RPN_FEAT_STRIDE)
    feat_stride.append(config.network.p6_RPN_FEAT_STRIDE)
    feat_stride.append(config.network.p7_RPN_FEAT_STRIDE)
    anchor_scales = []
    anchor_scales.append(config.network.p4_ANCHOR_SCALES)
    anchor_scales.append(config.network.p5_ANCHOR_SCALES)
    anchor_scales.append(config.network.p6_ANCHOR_SCALES)
    anchor_scales.append(config.network.p7_ANCHOR_SCALES)
    anchor_ratios = []
    anchor_ratios.append(config.network.p4_ANCHOR_RATIOS)
    anchor_ratios.append(config.network.p5_ANCHOR_RATIOS)
    anchor_ratios.append(config.network.p6_ANCHOR_RATIOS)
    anchor_ratios.append(config.network.p7_ANCHOR_RATIOS)
    #############
    # setup multi-gpu
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size
    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))
    # load dataset and prepare imdb for training
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    roidbs = [
        load_gt_roidb(config.dataset.dataset, image_set,
                      config.dataset.root_path, config.dataset.dataset_path,
                      flip=config.TRAIN.FLIP) for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, config)
    # load training data
    train_data = AnchorLoader(feat_sym, feat_stride, anchor_scales,
                              anchor_ratios, roidb, config,
                              batch_size=input_batch_size,
                              shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
                              aspect_grouping=config.TRAIN.ASPECT_GROUPING)
    # infer max shape
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    # up to 100 ground-truth boxes of 5 values (x1, y1, x2, y2, cls) per image
    # -- presumably; verify against the loader
    max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5)))
    print 'providing maximum shape', max_data_shape, max_label_shape
    # infer max shape
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)
    # load and initialize params
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        sym_instance.init_weight(config, arg_params, aux_params)
    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)
    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, logger=logger,
        context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)],
        max_label_shapes=[max_label_shape for _ in range(batch_size)],
        fixed_param_prefix=fixed_param_prefix)
    if config.TRAIN.RESUME:
        # also restore optimizer state when resuming
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)
    # decide training params
    # metric
    Retina_toal_eval_metric = metric.RetinaToalAccMetric()
    Retina_cls_metric = metric.RetinaFocalLossMetric()
    Retina_bbox_metric = metric.RetinaL1LossMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    for child_metric in [
            Retina_toal_eval_metric, Retina_cls_metric, Retina_bbox_metric
    ]:
        eval_metrics.add(child_metric)
    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=args.frequent)
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    epoch_end_callback = [
        mx.callback.module_checkpoint(mod, prefix, period=1,
                                      save_optimizer_states=True),
        callback.do_checkpoint(prefix, means, stds)
    ]
    # decide learning rate: decay already-passed steps, schedule the rest
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff
    ]
    print lr_step.split(',')
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    # NOTE(review): lr_scheduler is built but not placed in optimizer_params,
    # so the adam run below does not use it — confirm whether intentional
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)
    # optimizer
    optimizer_params = {
        'learning_rate': lr,
        'wd': 0.0001,
    }
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)
    # train
    initializer = mx.init.MSRAPrelu(factor_type='out', slope=0)
    # adam = mx.optimizer.AdaDelta(rho=0.09, epsilon=1e-14)
    #optimizer_params=optimizer_params,
    print "-----------------------train--------------------------------"
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='adam',
            optimizer_params=optimizer_params,
            initializer=initializer,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, lr, lr_step):
    """Main train function for segmentation

    Args:
        args: parameter parser
        ctx: GPU context
        pretrained: pretrained file path
        epoch: pretrained checkpoint epoch
        prefix: model save name prefix
        begin_epoch: which epoch start to train
        end_epoch: end epoch of training phase
        lr: learning rate
        lr_step: list of epoch number to do learning rate decay
    """
    ##########################################
    # Step 1. Create logger and set up the save prefix
    ##########################################
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)
    ##########################################
    # Step 2. Copy the symbols and load the symbol to build network
    ##########################################
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)
    # #sym = eval('get_' + args.network + '_train')(num_classes=config.dataset.NUM_CLASSES)
    ##########################################
    # Step 3. Setup multi-gpu and batch size
    ##########################################
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size
    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))
    ############################################
    # Step 4. load dataset and prepare imdb for training
    ############################################
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    segdbs = [
        load_gt_segdb(config.dataset.dataset, image_set,
                      config.dataset.root_path, config.dataset.dataset_path,
                      result_path=final_output_path, flip=config.TRAIN.FLIP)
        for image_set in image_sets
    ]
    segdb = merge_segdb(segdbs)
    ############################################
    # Step 5. Set dataloader and set the data shape
    ############################################
    train_data = TrainDataLoader(sym, segdb, config,
                                 batch_size=input_batch_size,
                                 crop_height=config.TRAIN.CROP_HEIGHT,
                                 crop_width=config.TRAIN.CROP_WIDTH,
                                 shuffle=config.TRAIN.SHUFFLE, ctx=ctx)
    # infer max shape: training crops are fixed-size, so the crop dims bound
    # both data and label shapes
    max_scale = [(config.TRAIN.CROP_HEIGHT, config.TRAIN.CROP_WIDTH)]
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in max_scale]),
                                max([v[1] for v in max_scale])))]
    max_label_shape = [('label', (config.TRAIN.BATCH_IMAGES, 1,
                                  max([v[0] for v in max_scale]),
                                  max([v[1] for v in max_scale])))]
    # max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape, max_label_shape)
    print('providing maximum shape', max_data_shape, max_label_shape)
    # infer shape
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)
    ##############################################
    # Step 6. load and initialize params
    ##############################################
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        print(pretrained)
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        sym_instance.init_weights(config, arg_params, aux_params)
    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)
    ##############################################
    # Step 6 Create solver and set metrics
    ##############################################
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, logger=logger,
        context=ctx,
        max_data_shapes=[max_data_shape for _ in xrange(batch_size)],
        max_label_shapes=[max_label_shape for _ in xrange(batch_size)],
        fixed_param_prefix=fixed_param_prefix)
    # decide training params
    # metric
    fcn_loss_metric = metric.FCNLogLossMetric(config.default.frequent * batch_size)
    eval_metrics = mx.metric.CompositeEvalMetric()
    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    for child_metric in [fcn_loss_metric]:
        eval_metrics.add(child_metric)
    ##############################################
    # Step 7. Set callback for training process
    ##############################################
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=args.frequent)
    epoch_end_callback = mx.callback.module_checkpoint(
        mod, prefix, period=1, save_optimizer_states=True)
    ##############################################
    # Step 8. Decide learning rate and optimizers
    ##############################################
    # decay already-passed lr steps, schedule the remaining ones in iterations
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(segdb) / batch_size) for epoch in lr_epoch_diff
    ]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)
    # optimizer
    optimizer_params = {
        'momentum': config.TRAIN.momentum,
        'wd': config.TRAIN.wd,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': None
    }
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)
    ##############################################
    # Step 9 Start to train
    ##############################################
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
def train_rcnn(cfg, dataset, image_set, root_path, dataset_path, frequent,
               kvstore, flip, shuffle, resume, ctx, pretrained, epoch, prefix,
               begin_epoch, end_epoch, train_shared, lr, lr_step, proposal,
               logger=None, output_path=None):
    """Train the R-FCN head on precomputed proposals.

    Args:
        cfg: experiment configuration object
        dataset: dataset name(s), '+'-separated
        image_set: image set name(s), '+'-separated
        root_path: dataset root path
        dataset_path: dataset directory path
        frequent: speedometer logging frequency (batches)
        kvstore: kvstore type for mod.fit
        flip: whether to add flipped images to the roidb
        shuffle: whether to shuffle the training roidb
        resume: resume from the `prefix` checkpoint at `begin_epoch`
        ctx: list of MXNet device contexts (one per GPU)
        pretrained: pretrained model prefix to initialize from
        epoch: epoch number of the pretrained checkpoint
        prefix: checkpoint name prefix
        begin_epoch: epoch to start training from
        end_epoch: epoch to stop training at
        train_shared: if True, freeze the shared-backbone params
            (FIXED_PARAMS_SHARED) instead of the default FIXED_PARAMS
        lr: base learning rate
        lr_step: comma-separated epoch numbers at which to decay the lr
        proposal: name of the proposal source to load roidbs from
        logger: optional logger; a default INFO logger is created if None
        output_path: result path passed to the roidb loader
    """
    # set up logger
    # first make sure a logging output object is available
    if not logger:
        logging.basicConfig()
        logger = logging.getLogger()
        logger.setLevel(logging.INFO)
    # load symbol
    # instantiate the detection-framework MXNet symbol class from the config
    sym_instance = eval(cfg.symbol + '.' + cfg.symbol)()
    # get the R-FCN symbol with is_train=True so the training-mode graph is built
    sym = sym_instance.get_symbol_rfcn(cfg, is_train=True)
    # setup multi-gpu
    batch_size = len(ctx)
    input_batch_size = cfg.TRAIN.BATCH_IMAGES * batch_size
    # print cfg
    pprint.pprint(cfg)
    logger.info('training rcnn cfg:{}\n'.format(pprint.pformat(cfg)))
    # load dataset and prepare imdb for training
    image_sets = [iset for iset in image_set.split('+')]
    roidbs = [
        load_proposal_roidb(dataset, image_set, root_path, dataset_path,
                            proposal=proposal, append_gt=True, flip=flip,
                            result_path=output_path)
        for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb, cfg)
    # bbox regression targets are normalized in place; means/stds are needed
    # again at checkpoint time to un-normalize the saved weights
    means, stds = add_bbox_regression_targets(roidb, cfg)
    # load training data
    train_data = ROIIter(roidb, cfg, batch_size=input_batch_size,
                         shuffle=shuffle, ctx=ctx,
                         aspect_grouping=cfg.TRAIN.ASPECT_GROUPING)
    # infer max shape
    max_data_shape = [('data', (cfg.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in cfg.SCALES]),
                                max([v[1] for v in cfg.SCALES])))]
    # infer shape
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    sym_instance.infer_shape(data_shape_dict)
    # load and initialize params
    if resume:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        sym_instance.init_weight_rfcn(cfg, arg_params, aux_params)
    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)
    # prepare training
    # create solver
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]
    if train_shared:
        fixed_param_prefix = cfg.network.FIXED_PARAMS_SHARED
    else:
        fixed_param_prefix = cfg.network.FIXED_PARAMS
    mod = MutableModule(
        sym, data_names=data_names, label_names=label_names, logger=logger,
        context=ctx, max_data_shapes=[max_data_shape for _ in range(batch_size)],
        fixed_param_prefix=fixed_param_prefix)
    if cfg.TRAIN.RESUME:
        # also restore optimizer state when resuming
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)
    # decide training params
    # metric
    eval_metric = metric.RCNNAccMetric(cfg)
    cls_metric = metric.RCNNLogLossMetric(cfg)
    bbox_metric = metric.RCNNL1LossMetric(cfg)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)
    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=frequent)
    epoch_end_callback = [
        mx.callback.module_checkpoint(mod, prefix, period=1,
                                      save_optimizer_states=True),
        callback.do_checkpoint(prefix, means, stds)
    ]
    # decide learning rate: decay already-passed steps, schedule the rest
    base_lr = lr
    lr_factor = cfg.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff
    ]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              cfg.TRAIN.warmup,
                                              cfg.TRAIN.warmup_lr,
                                              cfg.TRAIN.warmup_step)
    # optimizer
    optimizer_params = {
        'momentum': cfg.TRAIN.momentum,
        'wd': cfg.TRAIN.wd,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': None
    }
    # train
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=kvstore,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch,
              lr, lr_step):
    """Train a semantic-segmentation network with MXNet, logging to tensorboard.

    Builds the training symbol named by ``config.symbol``, loads the ground
    truth segmentation databases for every '+'-separated image set, trains with
    SGD under a warmup LR schedule, and evaluates on the test image set.

    Args:
        args: parsed CLI args; must provide ``args.cfg`` and ``args.frequent``.
        ctx: list of mx.context (one per GPU) to train on.
        pretrained: prefix of the pretrained checkpoint to initialize from.
        epoch: epoch number of the pretrained checkpoint.
        prefix: checkpoint prefix (joined onto the output path below).
        begin_epoch: epoch to resume/start from.
        end_epoch: epoch to stop at.
        lr: base learning rate.
        lr_step: comma-separated epochs at which the LR is scaled by 0.1.
    """
    logger, final_output_path, _, tensorboard_path = create_env(
        config.output_path, args.cfg, config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))
    print "config.symbol", config.symbol

    # NOTE(review): eval() on a config-supplied name — fine for trusted
    # configs, but do not point config.symbol at untrusted input.
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)

    # setup multi-gpu: effective batch is per-GPU batch times GPU count
    input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx)
    NUM_GPUS = len(ctx)

    # load dataset and prepare imdb for training; '+' joins multiple sets
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    segdbs = [
        load_gt_segdb(config.dataset.dataset,
                      image_set,
                      config.dataset.root_path,
                      config.dataset.dataset_path,
                      result_path=final_output_path)
        for image_set in image_sets
    ]
    segdb = merge_segdb(segdbs)

    # load training data
    train_data = TrainDataLoader(sym,
                                 segdb,
                                 config,
                                 batch_size=input_batch_size,
                                 shuffle=config.TRAIN.SHUFFLE,
                                 ctx=ctx)

    # loading val data (the dataset class is looked up by name, like the symbol)
    val_image_set = config.dataset.test_image_set
    val_root_path = config.dataset.root_path
    val_dataset = config.dataset.dataset
    val_dataset_path = config.dataset.dataset_path
    val_imdb = eval(val_dataset)(val_image_set,
                                 val_root_path,
                                 val_dataset_path,
                                 result_path=final_output_path)
    val_segdb = val_imdb.gt_segdb()

    val_data = TrainDataLoader(sym,
                               val_segdb,
                               config,
                               batch_size=input_batch_size,
                               shuffle=config.TRAIN.SHUFFLE,
                               ctx=ctx)

    # infer max shape: data at crop size, label downsampled by LABEL_STRIDE
    max_scale = [(config.TRAIN.crop_size[0], config.TRAIN.crop_size[1])]
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in max_scale]),
                                max([v[1] for v in max_scale])))]
    max_label_shape = [
        ('label',
         (config.TRAIN.BATCH_IMAGES, 1,
          max([v[0] for v in max_scale]) // config.network.LABEL_STRIDE,
          max([v[1] for v in max_scale]) // config.network.LABEL_STRIDE))
    ]
    max_data_shape, max_label_shape = train_data.infer_shape(
        max_data_shape, max_label_shape)
    print 'providing maximum shape', max_data_shape, max_label_shape

    # infer shape of every symbol from one batch's data/label shapes
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # load and initialize params: resume from our own checkpoint (including
    # optimizer state), or start from the pretrained model and re-init heads
    if config.TRAIN.RESUME:
        print 'continue training from ', begin_epoch
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
        preload_opt_states = load_preload_opt_states(prefix, begin_epoch)
    else:
        print pretrained
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        preload_opt_states = None
        sym_instance.init_weights(config, arg_params, aux_params)

    # check parameter shapes against the inferred symbol shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params,
                                        data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]

    mod = MutableModule(
        sym,
        data_names=data_names,
        label_names=label_names,
        logger=logger,
        context=ctx,
        max_data_shapes=[max_data_shape for _ in xrange(NUM_GPUS)],
        max_label_shapes=[max_label_shape for _ in xrange(NUM_GPUS)],
        fixed_param_prefix=fixed_param_prefix)

    # metric
    imagecrossentropylossmetric = metric.ImageCrossEntropyLossMetric()
    pixcelAccMetric = metric.PixcelAccMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    # rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric
    for child_metric in [imagecrossentropylossmetric, pixcelAccMetric]:
        eval_metrics.add(child_metric)

    # callback: speedometer + tensorboard writer; the tensorboard callback is
    # shared so train/val/epoch writers all log into one event file
    batch_end_callback = [
        callback.Speedometer(train_data.batch_size, frequent=args.frequent),
        callback.TensorboardCallback(tensorboard_path, prefix="train/batch")
    ]
    epoch_end_callback = mx.callback.module_checkpoint(
        mod, prefix, period=1, save_optimizer_states=True)
    shared_tensorboard = batch_end_callback[1]
    epoch_end_metric_callback = callback.TensorboardCallback(
        tensorboard_path,
        shared_tensorboard=shared_tensorboard,
        prefix="train/epoch")
    eval_end_callback = callback.TensorboardCallback(
        tensorboard_path,
        shared_tensorboard=shared_tensorboard,
        prefix="val/epoch")
    # NOTE(review): lr_callback is constructed but never passed to mod.fit
    # below — confirm whether it should be appended to batch_end_callback.
    lr_callback = callback.LrCallback(tensorboard_path,
                                      shared_tensorboard=shared_tensorboard,
                                      prefix='train/batch')

    # decide learning rate: scale base LR down for steps already passed when
    # resuming, and convert remaining step epochs to iteration counts
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(segdb) / input_batch_size) for epoch in lr_epoch_diff
    ]
    print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters

    # NOTE(review): no else branch — an unrecognized config.TRAIN.lr_type
    # leaves lr_scheduler unbound and raises NameError below; confirm the
    # config only ever uses "MultiStage" or "MultiFactor".
    if config.TRAIN.lr_type == "MultiStage":
        lr_scheduler = LinearWarmupMultiStageScheduler(
            lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr,
            config.TRAIN.warmup_step, args.frequent, stop_lr=lr * 0.01)
    elif config.TRAIN.lr_type == "MultiFactor":
        lr_scheduler = LinearWarmupMultiFactorScheduler(
            lr_iters, lr_factor, config.TRAIN.warmup, config.TRAIN.warmup_lr,
            config.TRAIN.warmup_step, args.frequent)

    # optimizer
    optimizer_params = {
        'momentum': config.TRAIN.momentum,
        'wd': config.TRAIN.wd,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': None
    }
    optimizer = SGD(**optimizer_params)

    # freeze (or damp) parameters whose names match the configured pattern by
    # giving them a reduced per-parameter LR multiplier
    freeze_layer_pattern = config.TRAIN.FIXED_PARAMS_PATTERN
    if freeze_layer_pattern.strip():
        args_lr_mult = {}
        re_prog = re.compile(freeze_layer_pattern)
        fixed_param_names = [
            name for name in sym.list_arguments() if re_prog.match(name)
        ]
        print "fixed_params_names:"
        print(fixed_param_names)
        for name in fixed_param_names:
            args_lr_mult[name] = config.TRAIN.FIXED_PARAMS_PATTERN_LR_MULT
    else:
        args_lr_mult = {}
    optimizer.set_lr_mult(args_lr_mult)

    # wrap both iterators for background prefetching
    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)
    if not isinstance(val_data, PrefetchingIter):
        val_data = PrefetchingIter(val_data)

    # Debug is presumably a module-level flag — TODO confirm where it is set
    if Debug:
        monitor = mx.monitor.Monitor(1)
    else:
        monitor = None

    # initializer for any parameters missing from the checkpoint
    # (allow_missing is only enabled on a fresh run, see fit() below)
    initializer = mx.initializer.Xavier(magnitude=1, rnd_type="gaussian")

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            eval_end_callback=eval_end_callback,
            epoch_end_metric_callback=epoch_end_metric_callback,
            optimizer=optimizer,
            eval_data=val_data,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            allow_missing=begin_epoch == 0,
            allow_extra=True,
            monitor=monitor,
            preload_opt_states=preload_opt_states,
            eval_data_frequency=config.TRAIN.eval_data_frequency,
            initializer=initializer)
def train_net(config, output_path, logger=logging):
    """Train an FCIS (ResNet-101) instance-segmentation network.

    All training hyper-parameters come from ``config``; checkpoints named
    ``config.TRAIN.model_prefix`` are written under ``output_path``. Supports
    the LabelMe dataset loader as a special case, otherwise uses the generic
    ``load_gt_sdsdb`` loader.

    Args:
        config: experiment configuration (easydict-style) with TRAIN/network/
            dataset sections.
        output_path: directory for checkpoints and result files.
        logger: logger passed to the module (defaults to the logging module).
    """
    # train_net(cfg_path, ctx, config.network.pretrained, config.network.pretrained_epoch,
    #           config.TRAIN.model_prefix, config.TRAIN.begin_epoch, config.TRAIN.end_epoch,
    #           config.TRAIN.lr, config.TRAIN.lr_step)
    # train parameters
    pretrained_model = config.network.pretrained
    epoch = config.network.pretrained_epoch
    prefix = config.TRAIN.model_prefix
    begin_epoch = config.TRAIN.begin_epoch
    end_epoch = config.TRAIN.end_epoch
    lr = config.TRAIN.lr
    lr_step = config.TRAIN.lr_step
    prefix = os.path.join(output_path, prefix)

    # network parameters
    BATCH_IMAGES = config.TRAIN.BATCH_IMAGES
    SCALES = config.SCALES

    # gpu stuff: one mx.gpu context per comma-separated id
    ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')]
    # final_output_path = output_path

    # load symbol (this variant hard-codes the FCIS ResNet-101 symbol class)
    # shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'), output_path)
    # sym_instance = eval(config.symbol)()
    network = resnet_v1_101_fcis()
    sym = network.get_symbol(config, is_train=True)
    # RPN score output is needed separately so AnchorLoader can infer the
    # anchor-label shapes from its feature-map size
    feat_sym = sym.get_internals()['rpn_cls_score_output']

    # setup multi-gpu: batch_size counts GPUs, input_batch_size counts images
    batch_size = len(ctx)
    input_batch_size = BATCH_IMAGES * batch_size

    # load dataset and prepare imdb for training; '+' joins multiple sets
    cfg_ds = config.dataset
    ds_name = cfg_ds.dataset
    image_sets = [iset for iset in cfg_ds.image_set.split('+')]
    if ds_name.lower() == "labelme":
        # deferred import: only needed for the LabelMe loader
        from utils.load_data import load_labelme_gt_sdsdb
        sdsdbs = [
            load_labelme_gt_sdsdb(image_set,
                                  cfg_ds.dataset_path,
                                  cfg_ds.root_path,
                                  flip=config.TRAIN.FLIP,
                                  mask_size=config.MASK_SIZE,
                                  binary_thresh=config.BINARY_THRESH,
                                  classes=cfg_ds.CLASSES)
            for image_set in image_sets
        ]
    else:
        sdsdbs = [
            load_gt_sdsdb(ds_name,
                          image_set,
                          cfg_ds.root_path,
                          cfg_ds.dataset_path,
                          mask_size=config.MASK_SIZE,
                          binary_thresh=config.BINARY_THRESH,
                          result_path=output_path,
                          flip=config.TRAIN.FLIP)
            for image_set in image_sets
        ]
    sdsdb = merge_roidb(sdsdbs)
    sdsdb = filter_roidb(sdsdb, config)

    # load training data
    train_data = AnchorLoader(feat_sym,
                              sdsdb,
                              config,
                              batch_size=input_batch_size,
                              shuffle=config.TRAIN.SHUFFLE,
                              ctx=ctx,
                              feat_stride=config.network.RPN_FEAT_STRIDE,
                              anchor_scales=config.network.ANCHOR_SCALES,
                              anchor_ratios=config.network.ANCHOR_RATIOS,
                              aspect_grouping=config.TRAIN.ASPECT_GROUPING,
                              allowed_border=config.TRAIN.RPN_ALLOWED_BORDER)

    # infer max shape: image data at the largest scale, plus up to 100
    # ground-truth boxes (x1,y1,x2,y2,cls) and full-size masks per image
    max_data_shape = [('data', (BATCH_IMAGES, 3,
                                max([v[0] for v in SCALES]),
                                max(v[1] for v in SCALES)))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (BATCH_IMAGES, 100, 5)))
    max_data_shape.append(('gt_masks', (BATCH_IMAGES, 100,
                                        max([v[0] for v in SCALES]),
                                        max(v[1] for v in SCALES))))
    print 'providing maximum shape', max_data_shape, max_label_shape

    # infer shape of the full symbol from one batch's shapes
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    print 'data shape:'
    pprint.pprint(data_shape_dict)
    network.infer_shape(data_shape_dict)

    # load and initialize params: resume our checkpoint, or start from the
    # pretrained backbone and initialize the new heads
    if config.TRAIN.RESUME:
        print 'continue training from ', begin_epoch
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained_model, epoch,
                                            convert=True)
        network.init_weight(config, arg_params, aux_params)

    # check parameter shapes
    network.check_parameter_shapes(arg_params, aux_params, data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]

    mod = MutableModule(
        sym,
        data_names=data_names,
        label_names=label_names,
        logger=logger,
        context=ctx,
        max_data_shapes=[max_data_shape for _ in xrange(batch_size)],
        max_label_shapes=[max_label_shape for _ in xrange(batch_size)],
        fixed_param_prefix=fixed_param_prefix)

    # decide training metric
    # RPN, classification accuracy/loss, regression loss
    rpn_acc = metric.RPNAccMetric()
    rpn_cls_loss = metric.RPNLogLossMetric()
    rpn_bbox_loss = metric.RPNL1LossMetric()
    fcis_acc = metric.FCISAccMetric(config)
    fcis_acc_fg = metric.FCISAccFGMetric(config)
    fcis_cls_loss = metric.FCISLogLossMetric(config)
    fcis_bbox_loss = metric.FCISL1LossMetric(config)
    fcis_mask_loss = metric.FCISMaskLossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [
            rpn_acc, rpn_cls_loss, rpn_bbox_loss, fcis_acc, fcis_acc_fg,
            fcis_cls_loss, fcis_bbox_loss, fcis_mask_loss
    ]:
        eval_metrics.add(child_metric)

    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=config.default.frequent)
    # bbox target normalization is undone at checkpoint time; 2 classes
    # (bg/fg) when class-agnostic, otherwise one set per class
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else cfg_ds.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else cfg_ds.NUM_CLASSES)
    epoch_end_callback = callback.do_checkpoint(prefix, means, stds)

    # print epoch, begin_epoch, end_epoch, lr_step
    # decide learning rate: scale down for steps already passed when resuming
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    # NOTE(review): divides by batch_size (the GPU count), not
    # input_batch_size (images per step) as the sibling train_net does —
    # confirm which denominator is intended for the step boundaries.
    lr_iters = [
        int(epoch * len(sdsdb) / batch_size) for epoch in lr_epoch_diff
    ]
    print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # optimizer
    optimizer_params = {
        'momentum': config.TRAIN.momentum,
        'wd': config.TRAIN.wd,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': 1.0,
        'clip_gradient': None
    }

    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    # del sdsdb
    # a = mx.viz.plot_network(sym)
    # a.render('../example', view=True)
    # print 'prepare sds finished'

    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
def train_net(args, ctx, pretrained_res, pretrained_vgg, epoch, prefix,
              begin_epoch, end_epoch, lr, lr_step):
    """Train an instance-segmentation network initialized from two backbones.

    Merges pretrained ResNet-101 and VGG-16 parameter dictionaries to
    initialize the symbol named by ``config.symbol``, then trains with SGD
    under a warmup multi-factor LR schedule. Seeds MXNet and NumPy for
    reproducible runs.

    Args:
        args: parsed CLI args; must provide ``args.cfg`` and ``args.frequent``.
        ctx: list of mx.context (one per GPU) to train on.
        pretrained_res: checkpoint prefix of the pretrained ResNet-101.
        pretrained_vgg: checkpoint prefix of the pretrained VGG-16.
        epoch: epoch number of both pretrained checkpoints.
        prefix: checkpoint prefix (joined onto the output path below).
        begin_epoch: epoch to resume/start from.
        end_epoch: epoch to stop at.
        lr: base learning rate.
        lr_step: comma-separated epochs at which the LR is scaled by 0.1.
    """
    # fixed seeds for reproducibility
    mx.random.seed(3)
    np.random.seed(3)
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # load symbol; a copy of the symbol file is archived with the run output
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    # NOTE(review): eval() on a config-supplied name — fine for trusted
    # configs, but do not point config.symbol at untrusted input.
    sym_instance = eval(config.symbol)()
    sym = sym_instance.get_symbol(config, is_train=True)
    # RPN score output is needed separately so AnchorLoader can infer the
    # anchor-label shapes from its feature-map size
    feat_sym = sym.get_internals()['rpn_cls_score_output']

    # setup multi-gpu: batch_size counts GPUs, input_batch_size counts images
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # load dataset and prepare imdb for training; '+' joins multiple sets
    image_sets = [iset for iset in config.dataset.image_set.split('+')]
    sdsdbs = [load_gt_sdsdb(config.dataset.dataset, image_set,
                            config.dataset.root_path,
                            config.dataset.dataset_path,
                            mask_size=config.MASK_SIZE,
                            binary_thresh=config.BINARY_THRESH,
                            result_path=final_output_path,
                            flip=config.TRAIN.FLIP)
              for image_set in image_sets]
    sdsdb = merge_roidb(sdsdbs)
    sdsdb = filter_roidb(sdsdb, config)

    # load training data
    train_data = AnchorLoader(feat_sym, sdsdb, config,
                              batch_size=input_batch_size,
                              shuffle=config.TRAIN.SHUFFLE, ctx=ctx,
                              feat_stride=config.network.RPN_FEAT_STRIDE,
                              anchor_scales=config.network.ANCHOR_SCALES,
                              anchor_ratios=config.network.ANCHOR_RATIOS,
                              aspect_grouping=config.TRAIN.ASPECT_GROUPING,
                              allowed_border=config.TRAIN.RPN_ALLOWED_BORDER)

    # infer max shape: image data at the largest scale, plus up to 100
    # ground-truth boxes (x1,y1,x2,y2,cls) and full-size masks per image
    max_data_shape = [('data', (config.TRAIN.BATCH_IMAGES, 3,
                                max([v[0] for v in config.SCALES]),
                                max(v[1] for v in config.SCALES)))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (config.TRAIN.BATCH_IMAGES, 100, 5)))
    max_data_shape.append(('gt_masks', (config.TRAIN.BATCH_IMAGES, 100,
                                        max([v[0] for v in config.SCALES]),
                                        max(v[1] for v in config.SCALES))))
    print 'providing maximum shape', max_data_shape, max_label_shape

    # infer shape of the full symbol from one batch's shapes
    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    print 'data shape:'
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)
    # inshape, outshape, uaxshape = sym_instance.infer_shape(data_shape_dict)
    # print 'symbol inshape: %s ' % (str(inshape))
    # print 'symbol outshape: %s' % (str(outshape))
    '''
    internals = sym.get_internals()
    _, out_shapes, _ = internals.infer_shape(**data_shape_dict)
    print(sym.list_outputs())
    shape_dict = dict(zip(internals.list_outputs(), out_shapes))
    pprint.pprint(shape_dict)
    '''

    # load and initialize params
    if config.TRAIN.RESUME:
        print 'continue training from ', begin_epoch
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        # load vgg-16 & resnet-101 parameters and merge them into one dict;
        # dict(a, **b) lets b's entries win on any name collision
        # pretrained_res = pretrained
        # pretrained_vgg = './model/pretrained_model/VGG_FC_ILSVRC_16'
        arg_params_res, aux_params_res = load_param(pretrained_res, epoch,
                                                    convert=True)
        arg_params_vgg, aux_params_vgg = load_param(pretrained_vgg, epoch,
                                                    convert=True)
        arg_params = dict(arg_params_res, **arg_params_vgg)
        aux_params = dict(aux_params_res, **aux_params_vgg)
        sym_instance.init_weight(config, arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params,
                                        data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]

    mod = MutableModule(sym, data_names=data_names, label_names=label_names,
                        logger=logger, context=ctx,
                        max_data_shapes=[max_data_shape
                                         for _ in xrange(batch_size)],
                        max_label_shapes=[max_label_shape
                                          for _ in xrange(batch_size)],
                        fixed_param_prefix=fixed_param_prefix)

    # decide training metric
    # RPN, classification accuracy/loss, regression loss
    rpn_acc = metric.RPNAccMetric()
    rpn_cls_loss = metric.RPNLogLossMetric()
    rpn_bbox_loss = metric.RPNL1LossMetric()
    fcis_acc = metric.FCISAccMetric(config)
    fcis_acc_fg = metric.FCISAccFGMetric(config)
    fcis_cls_loss = metric.FCISLogLossMetric(config)
    fcis_bbox_loss = metric.FCISL1LossMetric(config)
    fcis_mask_loss = metric.FCISMaskLossMetric(config)
    eval_metrics = mx.metric.CompositeEvalMetric()
    # accumulate all loss, fcn-8s loss should be added here
    for child_metric in [rpn_acc, rpn_cls_loss, rpn_bbox_loss,
                         # fcis_acc_fg, fcis_cls_loss, fcis_bbox_loss, fcis_mask_loss]:
                         fcis_acc, fcis_acc_fg, fcis_cls_loss, fcis_bbox_loss,
                         fcis_mask_loss]:
        eval_metrics.add(child_metric)

    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=args.frequent)
    # bbox target normalization is undone at checkpoint time; 2 classes
    # (bg/fg) when class-agnostic, otherwise one set per class
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS),
                    2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS),
                   2 if config.CLASS_AGNOSTIC else config.dataset.NUM_CLASSES)
    epoch_end_callback = callback.do_checkpoint(prefix, means, stds)

    # print epoch, begin_epoch, end_epoch, lr_step
    # decide learning rate: scale down for steps already passed when resuming
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch
                     if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    # NOTE(review): divides by batch_size (the GPU count), not
    # input_batch_size — confirm which denominator is intended.
    lr_iters = [int(epoch * len(sdsdb) / batch_size)
                for epoch in lr_epoch_diff]
    print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # optimizer
    optimizer_params = {'momentum': config.TRAIN.momentum,
                        'wd': config.TRAIN.wd,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0,
                        'clip_gradient': None}

    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    # del sdsdb
    # a = mx.viz.plot_network(sym)
    # a.render('../example', view=True)
    # print 'prepare sds finished'

    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='sgd', optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params,
            begin_epoch=begin_epoch, num_epoch=end_epoch)