def train(sym, roidb):
    '''
    User function: Start training

    Args:
        sym (mxnet model): Mxnet model returned from set_network() function
        roidb (dataloader): Dataloader returned from set_model() function

    Returns:
        None
    '''
    # print config
    #logger.info('called with system_dict\n{}'.format(pprint.pformat(vars(system_dict))))
    #print(system_dict)

    # setup multi-gpu
    if len(system_dict["gpus"]) == 0:
        ctx = [mx.cpu(0)]
    else:
        ctx = [mx.gpu(int(i)) for i in system_dict["gpus"]]
    batch_size = system_dict["rcnn_batch_size"] * len(ctx)

    # load training data
    feat_sym = sym.get_internals()['rpn_cls_score_output']
    ag = AnchorGenerator(feat_stride=system_dict["rpn_feat_stride"],
                         anchor_scales=system_dict["rpn_anchor_scales"],
                         anchor_ratios=system_dict["rpn_anchor_ratios"])
    asp = AnchorSampler(allowed_border=system_dict["rpn_allowed_border"],
                        batch_rois=system_dict["rpn_batch_rois"],
                        fg_fraction=system_dict["rpn_fg_fraction"],
                        fg_overlap=system_dict["rpn_fg_overlap"],
                        bg_overlap=system_dict["rpn_bg_overlap"])
    train_data = AnchorLoader(roidb, batch_size,
                              system_dict["img_short_side"], system_dict["img_long_side"],
                              system_dict["img_pixel_means"], system_dict["img_pixel_stds"],
                              feat_sym, ag, asp, shuffle=True)

    # produce shape max possible
    _, out_shape, _ = feat_sym.infer_shape(
        data=(1, 3, system_dict["img_long_side"], system_dict["img_long_side"]))
    feat_height, feat_width = out_shape[0][-2:]
    rpn_num_anchors = len(system_dict["rpn_anchor_scales"]) * len(system_dict["rpn_anchor_ratios"])
    data_names = ['data', 'im_info', 'gt_boxes']
    label_names = ['label', 'bbox_target', 'bbox_weight']
    data_shapes = [('data', (batch_size, 3, system_dict["img_long_side"], system_dict["img_long_side"])),
                   ('im_info', (batch_size, 3)),
                   ('gt_boxes', (batch_size, 100, 5))]
    label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)),
                    ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)),
                    ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))]

    # print shapes
    data_shape_dict, out_shape_dict = infer_data_shape(sym, data_shapes + label_shapes)
    logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict))
    logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if system_dict["resume"]:
        arg_params, aux_params = load_param(system_dict["resume"])
    else:
        arg_params, aux_params = load_param(system_dict["pretrained"])
        arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params)

    # check parameter shapes
    check_shape(sym, data_shapes + label_shapes, arg_params, aux_params)

    # check fixed params
    fixed_param_names = get_fixed_params(sym, system_dict["net_fixed_params"])
    logger.info('locking params\n%s' % pprint.pformat(fixed_param_names))

    # metric
    rpn_eval_metric = RPNAccMetric()
    rpn_cls_metric = RPNLogLossMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    eval_metric = RCNNAccMetric()
    cls_metric = RCNNLogLossMetric()
    bbox_metric = RCNNL1LossMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = mx.callback.Speedometer(batch_size,
                                                 frequent=system_dict["log_interval"],
                                                 auto_reset=False)
    epoch_end_callback = mx.callback.do_checkpoint(system_dict["save_prefix"])

    # learning schedule
    base_lr = system_dict["lr"]
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in system_dict["lr_decay_epoch"].split(',')]
    lr_epoch_diff = [epoch - system_dict["start_epoch"]
                     for epoch in lr_epoch if epoch > system_dict["start_epoch"]]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff]
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.0005,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': (1.0 / batch_size),
                        'clip_gradient': 5}

    # train
    mod = Module(sym, data_names=data_names, label_names=label_names,
                 logger=logger, context=ctx, work_load_list=None,
                 fixed_param_names=fixed_param_names)
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore='device', optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params,
            begin_epoch=system_dict["start_epoch"],
            num_epoch=system_dict["epochs"])
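# A minimal, self-contained sketch (not part of the pipeline) of the learning-rate
# bookkeeping used in train() above: decay epochs already passed before start_epoch
# pre-scale the base lr by lr_factor, and the remaining decay epochs are converted
# into iteration indices for mx.lr_scheduler.MultiFactorScheduler. All argument
# values below are illustrative assumptions, not values from this repo.
def _sketch_lr_schedule(base_lr=0.001, lr_factor=0.1, lr_decay_epoch='7,10',
                        start_epoch=0, num_images=5000, batch_size=2):
    lr_epoch = [int(e) for e in lr_decay_epoch.split(',')]
    # decay points still ahead of the starting epoch
    lr_epoch_diff = [e - start_epoch for e in lr_epoch if e > start_epoch]
    # apply any decays already passed when resuming from start_epoch
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    # convert the remaining decay epochs into iteration counts
    lr_iters = [int(e * num_images / batch_size) for e in lr_epoch_diff]
    return lr, lr_iters

# e.g. _sketch_lr_schedule() -> (0.001, [17500, 25000])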
def train_net(sym, roidb, args):
    # print config
    logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))

    # setup multi-gpu
    ctx = [mx.cpu()] if not args.gpus else [mx.gpu(int(i)) for i in args.gpus.split(',')]
    batch_size = args.rcnn_batch_size * len(ctx)

    # load training data
    feat_sym = sym.get_internals()['rpn_cls_score_output']
    ag = AnchorGenerator(feat_stride=args.rpn_feat_stride,
                         anchor_scales=args.rpn_anchor_scales,
                         anchor_ratios=args.rpn_anchor_ratios)
    asp = AnchorSampler(allowed_border=args.rpn_allowed_border,
                        batch_rois=args.rpn_batch_rois,
                        fg_fraction=args.rpn_fg_fraction,
                        fg_overlap=args.rpn_fg_overlap,
                        bg_overlap=args.rpn_bg_overlap)
    train_data = AnchorLoader(roidb, batch_size,
                              args.img_short_side, args.img_long_side,
                              args.img_pixel_means, args.img_pixel_stds,
                              feat_sym, ag, asp, shuffle=True)

    # produce shape max possible
    _, out_shape, _ = feat_sym.infer_shape(
        data=(1, 3, args.img_long_side, args.img_long_side))
    feat_height, feat_width = out_shape[0][-2:]
    rpn_num_anchors = len(args.rpn_anchor_scales) * len(args.rpn_anchor_ratios)
    data_names = ['data', 'im_info', 'gt_boxes']
    label_names = ['label', 'bbox_target', 'bbox_weight']
    data_shapes = [('data', (batch_size, 3, args.img_long_side, args.img_long_side)),
                   ('im_info', (batch_size, 3)),
                   ('gt_boxes', (batch_size, 100, 5))]
    label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)),
                    ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)),
                    ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))]

    # print shapes
    data_shape_dict, out_shape_dict = infer_data_shape(sym, data_shapes + label_shapes)
    logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict))
    logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if args.resume:
        arg_params, aux_params = load_param(args.resume)
    else:
        arg_params, aux_params = load_param(args.pretrained)
        arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params)

    # check parameter shapes
    check_shape(sym, data_shapes + label_shapes, arg_params, aux_params)

    # check fixed params
    fixed_param_names = get_fixed_params(sym, args.net_fixed_params)
    logger.info('locking params\n%s' % pprint.pformat(fixed_param_names))

    # metric
    rpn_eval_metric = RPNAccMetric()
    rpn_cls_metric = RPNLogLossMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    eval_metric = RCNNAccMetric()
    cls_metric = RCNNLogLossMetric()
    bbox_metric = RCNNL1LossMetric()
    eval_metrics = mx.gluon.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = mx.callback.Speedometer(batch_size,
                                                 frequent=args.log_interval,
                                                 auto_reset=False)
    epoch_end_callback = mx.callback.do_checkpoint(args.save_prefix)

    # learning schedule
    base_lr = args.lr
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in args.lr_decay_epoch.split(',')]
    lr_epoch_diff = [epoch - args.start_epoch
                     for epoch in lr_epoch if epoch > args.start_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff]
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.0005,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': (1.0 / batch_size),
                        'clip_gradient': 5}

    # train
    mod = Module(sym, data_names=data_names, label_names=label_names,
                 logger=logger, context=ctx, work_load_list=None,
                 fixed_param_names=fixed_param_names)
    mod.fit(train_data, eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore='device', optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params, aux_params=aux_params,
            begin_epoch=args.start_epoch,
            num_epoch=args.epochs)
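# A hedged usage sketch for train_net(). The field names mirror exactly what the
# function reads from `args`; every value below is an illustrative assumption
# (including the checkpoint path and fixed-param prefixes), not a value shipped
# with this repo, and `sym`/`roidb` must come from the repo's own symbol and
# dataset helpers.
from argparse import Namespace

example_args = Namespace(
    gpus='0',                                   # '' falls back to CPU
    rcnn_batch_size=1,
    rpn_feat_stride=16,
    rpn_anchor_scales=(8, 16, 32),
    rpn_anchor_ratios=(0.5, 1, 2),
    rpn_allowed_border=0,
    rpn_batch_rois=256,
    rpn_fg_fraction=0.5,
    rpn_fg_overlap=0.7,
    rpn_bg_overlap=0.3,
    img_short_side=600,
    img_long_side=1000,
    img_pixel_means=(123.68, 116.779, 103.939),
    img_pixel_stds=(1.0, 1.0, 1.0),
    resume='',                                  # empty -> start from pretrained
    pretrained='model/resnet-50-0000.params',   # hypothetical checkpoint path
    net_fixed_params=['conv0', 'stage1', 'gamma', 'beta'],
    log_interval=100,
    save_prefix='model/frcnn',
    lr=0.001,
    lr_decay_epoch='7',
    start_epoch=0,
    epochs=10,
)
# train_net(sym, roidb, example_args)  # sym and roidb are built elsewhere in the repo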
print('Head detection has finished and been saved in the cache. '
      'Please run training again.')

# system setting
mx.random.seed(42)
ctx = mx.gpu() if mx.test_utils.list_gpus() else mx.cpu()

# load symbol network
net = get_resnet_train_symbol(units=resnet_units, filter_list=resnet_filter_list)

# load training data
train_data = Loader(roidb, roihead, batch_size, img_pixel_means, img_pixel_stds)
data_shape_dict, out_shape_dict = infer_data_shape(
    net, train_data.provide_data + train_data.provide_label)

# load params
if is_init:
    arg_params, aux_params = load_param_resnet_full(pretrained, ctx)
    arg_params, aux_params = initialize_resnet_leaky(net, train_data.provide_data,
                                                     arg_params, aux_params)
    print('initialize and train the new network')
if is_resume:
    arg_params, aux_params = load_param(resume, ctx)
    print('resume from trained network')

# symbol data
data_names = ['pair_im_1', 'pair_im_2', 'loca_map_1', 'loca_map_2', 'r_map',