def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch,
              lr=0.001, lr_step='5'):
    # setup config
    #init_config()
    #print(config)

    # setup multi-gpu
    input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx)

    # print config
    logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = [iset for iset in args.image_set.split('+')]
    roidbs = [
        load_gt_roidb(args.dataset,
                      image_set,
                      args.root_path,
                      args.dataset_path,
                      flip=not args.no_flip) for image_set in image_sets
    ]
    #roidb = merge_roidb(roidbs)
    #roidb = filter_roidb(roidb)
    roidb = roidbs[0]

    # load symbol
    #sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS)
    #feat_sym = sym.get_internals()['rpn_cls_score_output']
    #train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle,
    #                          ctx=ctx, work_load_list=args.work_load_list,
    #                          feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES,
    #                          anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING)

    # load and initialize params
    sym = None
    if len(pretrained) == 0:
        arg_params = {}
        aux_params = {}
    else:
        logger.info('loading %s,%d' % (pretrained, epoch))
        sym, arg_params, aux_params = mx.model.load_checkpoint(pretrained, epoch)
        #arg_params, aux_params = load_param(pretrained, epoch, convert=True)

    #for k in ['rpn_conv_3x3', 'rpn_cls_score', 'rpn_bbox_pred', 'cls_score', 'bbox_pred']:
    #    _k = k + "_weight"
    #    if _k in arg_shape_dict:
    #        v = 0.001 if _k.startswith('bbox_') else 0.01
    #        arg_params[_k] = mx.random.normal(0, v, shape=arg_shape_dict[_k])
    #        print('init %s with normal %.5f' % (_k, v))
    #    _k = k + "_bias"
    #    if _k in arg_shape_dict:
    #        arg_params[_k] = mx.nd.zeros(shape=arg_shape_dict[_k])
    #        print('init %s with zero' % (_k))

    sym = eval('get_' + args.network + '_train')(sym)
    feat_sym = []
    for stride in config.RPN_FEAT_STRIDE:
        feat_sym.append(
            sym.get_internals()['face_rpn_cls_score_stride%s_output' % stride])

    train_data = CropLoader(feat_sym,
                            roidb,
                            batch_size=input_batch_size,
                            shuffle=not args.no_shuffle,
                            ctx=ctx,
                            work_load_list=args.work_load_list)

    # infer max shape
    max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (1, roidb[0]['max_num_boxes'], 5)))
    logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape))

    # infer shape
    data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
    arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict)
    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
    out_shape_dict = dict(zip(sym.list_outputs(), out_shape))
    aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
    logger.info('output shape %s' % pprint.pformat(out_shape_dict))

    # initialize any bilinear upsampling weights that the checkpoint lacks
    for k, v in arg_shape_dict.items():
        if k.find('upsampling') >= 0:
            print('initializing upsampling_weight', k)
            arg_params[k] = mx.nd.zeros(shape=v)
            init = mx.init.Initializer()
            init._init_bilinear(k, arg_params[k])
            #print(args[k])

    # check parameter shapes
    #for k in sym.list_arguments():
    #    if k in data_shape_dict:
    #        continue
    #    assert k in arg_params, k + ' not initialized'
    #    assert arg_params[k].shape == arg_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape)
    #for k in sym.list_auxiliary_states():
    #    assert k in aux_params, k + ' not initialized'
    #    assert aux_params[k].shape == aux_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape)

    # create solver
    fixed_param_prefix = config.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]
    fixed_param_names = get_fixed_params(sym, fixed_param_prefix)
    print('fixed', fixed_param_names, file=sys.stderr)
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=args.work_load_list,
                 fixed_param_names=fixed_param_names)

    # metric: one accuracy and one L1-loss metric per feature stride, with
    # optional landmark and head-box metrics; mid tracks the output offset
    eval_metrics = mx.metric.CompositeEvalMetric()
    mid = 0
    for m in range(len(config.RPN_FEAT_STRIDE)):
        stride = config.RPN_FEAT_STRIDE[m]
        #mid = m*MSTEP
        _metric = metric.RPNAccMetric(pred_idx=mid,
                                      label_idx=mid + 1,
                                      name='RPNAcc_s%s' % stride)
        eval_metrics.add(_metric)
        mid += 2
        #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1)
        #eval_metrics.add(_metric)
        _metric = metric.RPNL1LossMetric(loss_idx=mid,
                                         weight_idx=mid + 1,
                                         name='RPNL1Loss_s%s' % stride)
        eval_metrics.add(_metric)
        mid += 2
        if config.FACE_LANDMARK:
            _metric = metric.RPNL1LossMetric(loss_idx=mid,
                                             weight_idx=mid + 1,
                                             name='RPNLandMarkL1Loss_s%s' % stride)
            eval_metrics.add(_metric)
            mid += 2
        if config.HEAD_BOX:
            _metric = metric.RPNAccMetric(pred_idx=mid,
                                          label_idx=mid + 1,
                                          name='RPNAcc_head_s%s' % stride)
            eval_metrics.add(_metric)
            mid += 2
            #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1)
            #eval_metrics.add(_metric)
            _metric = metric.RPNL1LossMetric(loss_idx=mid,
                                             weight_idx=mid + 1,
                                             name='RPNL1Loss_head_s%s' % stride)
            eval_metrics.add(_metric)
            mid += 2

    # callback
    #means = np.tile(np.array(config.TRAIN.BBOX_MEANS), config.NUM_CLASSES)
    #stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES)
    #epoch_end_callback = callback.do_checkpoint(prefix)
    epoch_end_callback = None

    # decide learning rate
    #base_lr = lr
    #lr_factor = 0.1
    #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_epoch = [int(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr_iters = [
        int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff
    ]
    lr_steps = []
    if len(lr_iters) == 5:
        factors = [0.5, 0.5, 0.4, 0.1, 0.1]
        for i in range(5):
            lr_steps.append((lr_iters[i], factors[i]))
    elif len(lr_iters) == 8:  # warmup: grow lr over the first five steps, then decay
        for li in lr_iters[0:5]:
            lr_steps.append((li, 1.5849))
        for li in lr_iters[5:]:
            lr_steps.append((li, 0.1))
    else:
        for li in lr_iters:
            lr_steps.append((li, 0.1))
    #lr_steps = [(20, 0.1), (40, 0.1)]  #XXX
    end_epoch = 10000
    logger.info('lr %f lr_epoch_diff %s lr_steps %s' % (lr, lr_epoch_diff, lr_steps))

    # optimizer
    opt = optimizer.SGD(learning_rate=lr,
                        momentum=0.9,
                        wd=0.0005,
                        rescale_grad=1.0 / len(ctx),
                        clip_gradient=None)
    initializer = mx.init.Xavier()
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2)  # resnet style

    train_data = mx.io.PrefetchingIter(train_data)

    _cb = mx.callback.Speedometer(train_data.batch_size,
                                  frequent=args.frequent,
                                  auto_reset=False)
    global_step = [0]

    def save_model(epoch):
        # group the per-stride deploy outputs into one symbol and checkpoint it
        arg, aux = mod.get_params()
        all_layers = mod.symbol.get_internals()
        outs = []
        for stride in config.RPN_FEAT_STRIDE:
            num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS']
            _name = 'face_rpn_cls_score_stride%d_output' % stride
            rpn_cls_score = all_layers[_name]
            # prepare rpn data
            rpn_cls_score_reshape = mx.symbol.Reshape(
                data=rpn_cls_score,
                shape=(0, 2, -1, 0),
                name="face_rpn_cls_score_reshape_stride%d" % stride)
            rpn_cls_prob = mx.symbol.SoftmaxActivation(
                data=rpn_cls_score_reshape,
                mode="channel",
                name="face_rpn_cls_prob_stride%d" % stride)
            rpn_cls_prob_reshape = mx.symbol.Reshape(
                data=rpn_cls_prob,
                shape=(0, 2 * num_anchors, -1, 0),
                name='face_rpn_cls_prob_reshape_stride%d' % stride)
            _name = 'face_rpn_bbox_pred_stride%d_output' % stride
            rpn_bbox_pred = all_layers[_name]
            outs.append(rpn_cls_prob_reshape)
            outs.append(rpn_bbox_pred)
            if config.FACE_LANDMARK:
                _name = 'face_rpn_landmark_pred_stride%d_output' % stride
                rpn_landmark_pred = all_layers[_name]
                outs.append(rpn_landmark_pred)
        _sym = mx.sym.Group(outs)
        mx.model.save_checkpoint(prefix, epoch, _sym, arg, aux)

    def _batch_callback(param):
        #global global_step
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for step in lr_steps:
            if mbatch == step[0]:
                opt.lr *= step[1]
                print('lr change to', opt.lr, ' in batch', mbatch, file=sys.stderr)
                break
        if mbatch == lr_steps[-1][0]:
            print('saving final checkpoint', mbatch, file=sys.stderr)
            save_model(0)
            #arg, aux = mod.get_params()
            #mx.model.save_checkpoint(prefix, 99, mod.symbol, arg, aux)
            sys.exit(0)

    if args.checkpoint is not None:
        _, arg_params, aux_params = mx.model.load_checkpoint(args.checkpoint, 0)

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=checkpoint_callback('model/testR50'),
            batch_end_callback=_batch_callback,
            kvstore=args.kvstore,
            optimizer=opt,
            initializer=initializer,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
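# ---------------------------------------------------------------------------
# Hedged sketch (added for illustration; not part of the original script).
# `_batch_callback` above decays `opt.lr` in place by scanning a list of
# (iteration, factor) pairs. `_demo_lr_schedule` below is a hypothetical,
# self-contained replay of that arithmetic, so the schedule can be checked
# without building a Module or loading any data.
def _demo_lr_schedule(base_lr=0.01, lr_steps=((1000, 0.1), (2000, 0.1)),
                      num_batches=2500):
    """Replay the stepwise decay that _batch_callback applies to opt.lr."""
    lr = base_lr
    trace = []
    for mbatch in range(1, num_batches + 1):
        for step_iter, factor in lr_steps:
            if mbatch == step_iter:
                lr *= factor  # mirrors opt.lr *= step[1]
                trace.append((mbatch, lr))
                break
    return trace  # e.g. [(1000, 0.001), (2000, 0.0001)]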
def train(sym, roidb):
    '''
    User function: Start training

    Args:
        sym (mxnet model): Mxnet model returned from set_network() function
        roidb (dataloader): Dataloader returned from set_model() function

    Returns:
        None
    '''
    # print config
    #logger.info('called with system_dict\n{}'.format(pprint.pformat(vars(system_dict))))
    #print(system_dict)

    # setup multi-gpu
    if len(system_dict["gpus"]) == 0:
        ctx = [mx.cpu(0)]
    else:
        ctx = [mx.gpu(int(i)) for i in system_dict["gpus"]]
    batch_size = system_dict["rcnn_batch_size"] * len(ctx)

    # load training data
    feat_sym = sym.get_internals()['rpn_cls_score_output']
    ag = AnchorGenerator(feat_stride=system_dict["rpn_feat_stride"],
                         anchor_scales=system_dict["rpn_anchor_scales"],
                         anchor_ratios=system_dict["rpn_anchor_ratios"])
    asp = AnchorSampler(allowed_border=system_dict["rpn_allowed_border"],
                        batch_rois=system_dict["rpn_batch_rois"],
                        fg_fraction=system_dict["rpn_fg_fraction"],
                        fg_overlap=system_dict["rpn_fg_overlap"],
                        bg_overlap=system_dict["rpn_bg_overlap"])
    train_data = AnchorLoader(roidb, batch_size,
                              system_dict["img_short_side"],
                              system_dict["img_long_side"],
                              system_dict["img_pixel_means"],
                              system_dict["img_pixel_stds"],
                              feat_sym, ag, asp, shuffle=True)

    # produce the maximum possible shape
    _, out_shape, _ = feat_sym.infer_shape(
        data=(1, 3, system_dict["img_long_side"], system_dict["img_long_side"]))
    feat_height, feat_width = out_shape[0][-2:]
    rpn_num_anchors = len(system_dict["rpn_anchor_scales"]) * len(system_dict["rpn_anchor_ratios"])
    data_names = ['data', 'im_info', 'gt_boxes']
    label_names = ['label', 'bbox_target', 'bbox_weight']
    data_shapes = [('data', (batch_size, 3, system_dict["img_long_side"], system_dict["img_long_side"])),
                   ('im_info', (batch_size, 3)),
                   ('gt_boxes', (batch_size, 100, 5))]
    label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)),
                    ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)),
                    ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))]

    # print shapes
    data_shape_dict, out_shape_dict = infer_data_shape(sym, data_shapes + label_shapes)
    logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict))
    logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if system_dict["resume"]:
        arg_params, aux_params = load_param(system_dict["resume"])
    else:
        arg_params, aux_params = load_param(system_dict["pretrained"])
        arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params)

    # check parameter shapes
    check_shape(sym, data_shapes + label_shapes, arg_params, aux_params)

    # check fixed params
    fixed_param_names = get_fixed_params(sym, system_dict["net_fixed_params"])
    logger.info('locking params\n%s' % pprint.pformat(fixed_param_names))

    # metric
    rpn_eval_metric = RPNAccMetric()
    rpn_cls_metric = RPNLogLossMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    eval_metric = RCNNAccMetric()
    cls_metric = RCNNLogLossMetric()
    bbox_metric = RCNNL1LossMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = mx.callback.Speedometer(batch_size,
                                                 frequent=system_dict["log_interval"],
                                                 auto_reset=False)
    epoch_end_callback = mx.callback.do_checkpoint(system_dict["save_prefix"])

    # learning schedule
    base_lr = system_dict["lr"]
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in system_dict["lr_decay_epoch"].split(',')]
    lr_epoch_diff = [epoch - system_dict["start_epoch"] for epoch in lr_epoch
                     if epoch > system_dict["start_epoch"]]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff]
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.0005,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': (1.0 / batch_size),
                        'clip_gradient': 5}

    # train
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=None,
                 fixed_param_names=fixed_param_names)
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore='device',
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=system_dict["start_epoch"],
            num_epoch=system_dict["epochs"])
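# ---------------------------------------------------------------------------
# Hedged worked example (added; `_demo_resume_lr` is a hypothetical name).
# The learning-schedule block above rescales the base lr by lr_factor once for
# every decay epoch that already passed before start_epoch, so a resumed run
# picks up at the lr it would have reached. With base_lr=0.001,
# lr_decay_epoch='7,14' and start_epoch=10, epoch 7 is already past, so
# training resumes at 0.001 * 0.1 ** 1 = 1e-4.
def _demo_resume_lr(base_lr=0.001, lr_decay_epoch='7,14', start_epoch=10,
                    lr_factor=0.1):
    lr_epoch = [int(e) for e in lr_decay_epoch.split(',')]
    lr_epoch_diff = [e - start_epoch for e in lr_epoch if e > start_epoch]
    # one decay step already consumed: 0.001 * 0.1 ** 1 == 1e-4
    return base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))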
def train_net(sym, roidb, args):
    # print config
    logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))

    # setup multi-gpu
    ctx = [mx.cpu()] if not args.gpus else [
        mx.gpu(int(i)) for i in args.gpus.split(',')
    ]
    batch_size = args.rcnn_batch_size * len(ctx)

    # load training data
    feat_sym = sym.get_internals()['rpn_cls_score_output']
    ag = AnchorGenerator(feat_stride=args.rpn_feat_stride,
                         anchor_scales=args.rpn_anchor_scales,
                         anchor_ratios=args.rpn_anchor_ratios)
    asp = AnchorSampler(allowed_border=args.rpn_allowed_border,
                        batch_rois=args.rpn_batch_rois,
                        fg_fraction=args.rpn_fg_fraction,
                        fg_overlap=args.rpn_fg_overlap,
                        bg_overlap=args.rpn_bg_overlap)
    train_data = AnchorLoader(roidb, batch_size, args.img_short_side,
                              args.img_long_side, args.img_pixel_means,
                              args.img_pixel_stds, feat_sym, ag, asp,
                              shuffle=True)

    # produce the maximum possible shape
    _, out_shape, _ = feat_sym.infer_shape(
        data=(1, 3, args.img_long_side, args.img_long_side))
    feat_height, feat_width = out_shape[0][-2:]
    rpn_num_anchors = len(args.rpn_anchor_scales) * len(args.rpn_anchor_ratios)
    data_names = ['data', 'im_info', 'gt_boxes']
    label_names = ['label', 'bbox_target', 'bbox_weight']
    data_shapes = [('data', (batch_size, 3, args.img_long_side, args.img_long_side)),
                   ('im_info', (batch_size, 3)),
                   ('gt_boxes', (batch_size, 100, 5))]
    label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)),
                    ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)),
                    ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))]

    # print shapes
    data_shape_dict, out_shape_dict = infer_data_shape(
        sym, data_shapes + label_shapes)
    logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict))
    logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if args.resume:
        arg_params, aux_params = load_param(args.resume)
    else:
        arg_params, aux_params = load_param(args.pretrained)
        arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params)

    # check parameter shapes
    check_shape(sym, data_shapes + label_shapes, arg_params, aux_params)

    # check fixed params
    fixed_param_names = get_fixed_params(sym, args.net_fixed_params)
    logger.info('locking params\n%s' % pprint.pformat(fixed_param_names))

    # metric
    rpn_eval_metric = RPNAccMetric()
    rpn_cls_metric = RPNLogLossMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    eval_metric = RCNNAccMetric()
    cls_metric = RCNNLogLossMetric()
    bbox_metric = RCNNL1LossMetric()
    eval_metrics = mx.gluon.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = mx.callback.Speedometer(batch_size,
                                                 frequent=args.log_interval,
                                                 auto_reset=False)
    epoch_end_callback = mx.callback.do_checkpoint(args.save_prefix)

    # learning schedule
    base_lr = args.lr
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in args.lr_decay_epoch.split(',')]
    lr_epoch_diff = [
        epoch - args.start_epoch for epoch in lr_epoch
        if epoch > args.start_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff
    ]
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    optimizer_params = {
        'momentum': 0.9,
        'wd': 0.0005,
        'learning_rate': lr,
        'lr_scheduler': lr_scheduler,
        'rescale_grad': (1.0 / batch_size),
        'clip_gradient': 5
    }

    # train
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=None,
                 fixed_param_names=fixed_param_names)
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore='device',
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=args.start_epoch,
            num_epoch=args.epochs)
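# ---------------------------------------------------------------------------
# Hedged sketch (added; names and sizes are illustrative). The scheduler built
# above wants its decay points in iterations, which is why the code converts
# decay epochs via len(roidb) / batch_size. A standalone version of that
# conversion; the result is what would be fed to
# mx.lr_scheduler.MultiFactorScheduler(lr_iters, 0.1):
def _demo_lr_iters(num_images=8000, batch_size=8, lr_decay_epoch='7,10',
                   start_epoch=0):
    lr_epoch = [int(e) for e in lr_decay_epoch.split(',')]
    lr_epoch_diff = [e - start_epoch for e in lr_epoch if e > start_epoch]
    # 7 * 8000 / 8 = 7000 and 10 * 8000 / 8 = 10000 iterations
    return [int(e * num_images / batch_size) for e in lr_epoch_diff]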
def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch,
              lr=0.001, lr_step='5'):
    # setup config
    #init_config()
    #print(config)

    # setup multi-gpu
    input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx)

    # print config
    logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = [iset for iset in args.image_set.split('+')]
    roidbs = [
        load_gt_roidb(args.dataset,
                      image_set,
                      args.root_path,
                      args.dataset_path,
                      flip=not args.no_flip) for image_set in image_sets
    ]
    roidb = merge_roidb(roidbs)
    roidb = filter_roidb(roidb)

    # load symbol
    #sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS)
    #feat_sym = sym.get_internals()['rpn_cls_score_output']
    #train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle,
    #                          ctx=ctx, work_load_list=args.work_load_list,
    #                          feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES,
    #                          anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING)
    sym = eval('get_' + args.network + '_train')()
    #print(sym.get_internals())
    feat_sym = []
    for stride in config.RPN_FEAT_STRIDE:
        feat_sym.append(sym.get_internals()['rpn_cls_score_stride%s_output' % stride])

    #train_data = AnchorLoaderFPN(feat_sym, roidb, batch_size=input_batch_size,
    #                             shuffle=not args.no_shuffle, ctx=ctx, work_load_list=args.work_load_list)
    train_data = CropLoader(feat_sym,
                            roidb,
                            batch_size=input_batch_size,
                            shuffle=not args.no_shuffle,
                            ctx=ctx,
                            work_load_list=args.work_load_list)

    # infer max shape
    max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]),
                                max([v[1] for v in config.SCALES])))]
    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
    max_data_shape.append(('gt_boxes', (1, roidb[0]['max_num_boxes'], 5)))
    logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape))

    # infer shape
    data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
    arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict)
    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
    out_shape_dict = dict(zip(sym.list_outputs(), out_shape))
    aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
    logger.info('output shape %s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if args.resume:
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)

    #for k in ['rpn_conv_3x3', 'rpn_cls_score', 'rpn_bbox_pred', 'cls_score', 'bbox_pred']:
    #    _k = k + "_weight"
    #    if _k in arg_shape_dict:
    #        v = 0.001 if _k.startswith('bbox_') else 0.01
    #        arg_params[_k] = mx.random.normal(0, v, shape=arg_shape_dict[_k])
    #        print('init %s with normal %.5f' % (_k, v))
    #    _k = k + "_bias"
    #    if _k in arg_shape_dict:
    #        arg_params[_k] = mx.nd.zeros(shape=arg_shape_dict[_k])
    #        print('init %s with zero' % (_k))

    # initialize any bilinear upsampling weights that the checkpoint lacks
    for k, v in arg_shape_dict.items():
        if k.find('upsampling') >= 0:
            print('initializing upsampling_weight', k)
            arg_params[k] = mx.nd.zeros(shape=v)
            init = mx.init.Initializer()
            init._init_bilinear(k, arg_params[k])
            #print(args[k])

    # check parameter shapes
    #for k in sym.list_arguments():
    #    if k in data_shape_dict:
    #        continue
    #    assert k in arg_params, k + ' not initialized'
    #    assert arg_params[k].shape == arg_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape)
    #for k in sym.list_auxiliary_states():
    #    assert k in aux_params, k + ' not initialized'
    #    assert aux_params[k].shape == aux_shape_dict[k], \
    #        'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape)

    # create solver
    fixed_param_prefix = config.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]
    #mod = MutableModule(sym, data_names=data_names, label_names=label_names,
    #                    logger=logger, context=ctx, work_load_list=args.work_load_list,
    #                    max_data_shapes=max_data_shape, max_label_shapes=max_label_shape,
    #                    fixed_param_prefix=fixed_param_prefix)
    fixed_param_names = get_fixed_params(sym, fixed_param_prefix)
    print('fixed', fixed_param_names, file=sys.stderr)
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=args.work_load_list,
                 fixed_param_names=fixed_param_names)

    # decide training params
    # metric: outputs are grouped in fours per stride as
    # (cls_pred, cls_label, bbox_loss, bbox_weight), hence mids = [0, 4, 8]
    eval_metrics = mx.metric.CompositeEvalMetric()
    #if len(sym.list_outputs()) > 4:
    #    metric_names = ['RPNAccMetric', 'RPNLogLossMetric', 'RPNL1LossMetric', 'RCNNAccMetric', 'RCNNLogLossMetric', 'RCNNL1LossMetric']
    #else:  # train rpn only
    #print('sym', sym.list_outputs())
    #metric_names = ['RPNAccMetric', 'RPNLogLossMetric', 'RPNL1LossMetric']
    mids = [0, 4, 8]
    for mid in mids:
        _metric = metric.RPNAccMetric(pred_idx=mid, label_idx=mid + 1)
        eval_metrics.add(_metric)
        #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1)
        #eval_metrics.add(_metric)
        _metric = metric.RPNL1LossMetric(loss_idx=mid + 2, weight_idx=mid + 3)
        eval_metrics.add(_metric)
    #rpn_eval_metric = metric.RPNAccMetric()
    #rpn_cls_metric = metric.RPNLogLossMetric()
    #rpn_bbox_metric = metric.RPNL1LossMetric()
    #eval_metric = metric.RCNNAccMetric()
    #cls_metric = metric.RCNNLogLossMetric()
    #bbox_metric = metric.RCNNL1LossMetric()
    #for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]:
    #    eval_metrics.add(child_metric)

    # callback
    means = np.tile(np.array(config.TRAIN.BBOX_MEANS), config.NUM_CLASSES)
    stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES)
    #epoch_end_callback = callback.do_checkpoint(prefix, means, stds)
    epoch_end_callback = None

    # decide learning rate
    base_lr = lr
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [
        epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch
    ]
    lr = base_lr * (lr_factor**(len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [
        int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff
    ]
    #lr_iters = [36000, 42000]  #TODO
    #lr_iters = [40000, 50000, 60000]  #TODO
    #lr_iters = [40, 50, 60]  #TODO
    end_epoch = 10000
    #lr_iters = [4, 8]  #TODO
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    #lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    opt = optimizer.SGD(learning_rate=lr,
                        momentum=0.9,
                        wd=0.0005,
                        rescale_grad=1.0 / len(ctx),
                        clip_gradient=None)
    initializer = mx.init.Xavier()
    #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2)  # resnet style

    if len(ctx) > 1:
        train_data = mx.io.PrefetchingIter(train_data)

    _cb = mx.callback.Speedometer(train_data.batch_size,
                                  frequent=args.frequent,
                                  auto_reset=False)
    global_step = [0]

    def save_model(epoch):
        # group the per-stride deploy outputs into one symbol and checkpoint it
        arg, aux = mod.get_params()
        all_layers = mod.symbol.get_internals()
        outs = []
        for stride in config.RPN_FEAT_STRIDE:
            num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS']
            _name = 'rpn_cls_score_stride%d_output' % stride
            rpn_cls_score = all_layers[_name]
            # prepare rpn data
            rpn_cls_score_reshape = mx.symbol.Reshape(
                data=rpn_cls_score,
                shape=(0, 2, -1, 0),
                name="rpn_cls_score_reshape_stride%d" % stride)
            rpn_cls_prob = mx.symbol.SoftmaxActivation(
                data=rpn_cls_score_reshape,
                mode="channel",
                name="rpn_cls_prob_stride%d" % stride)
            rpn_cls_prob_reshape = mx.symbol.Reshape(
                data=rpn_cls_prob,
                shape=(0, 2 * num_anchors, -1, 0),
                name='rpn_cls_prob_reshape_stride%d' % stride)
            _name = 'rpn_bbox_pred_stride%d_output' % stride
            rpn_bbox_pred = all_layers[_name]
            outs.append(rpn_cls_prob_reshape)
            outs.append(rpn_bbox_pred)
        _sym = mx.sym.Group(outs)
        mx.model.save_checkpoint(prefix, epoch, _sym, arg, aux)

    def _batch_callback(param):
        #global global_step
        _cb(param)
        global_step[0] += 1
        mbatch = global_step[0]
        for _iter in lr_iters:
            if mbatch == _iter:
                opt.lr *= 0.1
                print('lr change to', opt.lr, ' in batch', mbatch, file=sys.stderr)
                break
        if mbatch % 1000 == 0:
            # periodic checkpoint every 1000 batches
            print('saving checkpoint', mbatch, file=sys.stderr)
            save_model(mbatch)
        if mbatch == lr_iters[-1]:
            print('saving final checkpoint', mbatch, file=sys.stderr)
            save_model(0)
            #arg, aux = mod.get_params()
            #mx.model.save_checkpoint(prefix, 99, mod.symbol, arg, aux)
            sys.exit(0)

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=_batch_callback,
            kvstore=args.kvstore,
            optimizer=opt,
            initializer=initializer,
            allow_missing=True,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
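# ---------------------------------------------------------------------------
# Hedged sketch (added; `_demo_rpn_prob_head` is a hypothetical name). The
# save_model closure above rebuilds a deploy head per stride: raw scores are
# reshaped to two channels, softmaxed channel-wise, then reshaped back to
# (N, 2 * num_anchors, H, W). The same three symbolic ops on a free variable:
def _demo_rpn_prob_head(num_anchors=3):
    score = mx.symbol.Variable('score')  # stands in for an rpn_cls_score output
    reshaped = mx.symbol.Reshape(data=score, shape=(0, 2, -1, 0))
    prob = mx.symbol.SoftmaxActivation(data=reshaped, mode='channel')
    return mx.symbol.Reshape(data=prob, shape=(0, 2 * num_anchors, -1, 0))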
def train_net(sym, roidb, args):
    # print config
    logger.info('called with args\n{}'.format(pprint.pformat(vars(args))))

    # setup multi-gpu
    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')]
    batch_size = args.rcnn_batch_size * len(ctx)

    # load training data
    feat_sym = sym.get_internals()['rpn_cls_score_output']
    ag = AnchorGenerator(feat_stride=args.rpn_feat_stride,
                         anchor_scales=args.rpn_anchor_scales,
                         anchor_ratios=args.rpn_anchor_ratios)
    asp = AnchorSampler(allowed_border=args.rpn_allowed_border,
                        batch_rois=args.rpn_batch_rois,
                        fg_fraction=args.rpn_fg_fraction,
                        fg_overlap=args.rpn_fg_overlap,
                        bg_overlap=args.rpn_bg_overlap)
    train_data = AnchorLoader(roidb, batch_size, args.img_short_side,
                              args.img_long_side, args.img_pixel_means,
                              args.img_pixel_stds, feat_sym, ag, asp,
                              shuffle=True)

    # produce the maximum possible shape
    _, out_shape, _ = feat_sym.infer_shape(
        data=(1, 3, args.img_long_side, args.img_long_side))
    feat_height, feat_width = out_shape[0][-2:]
    rpn_num_anchors = len(args.rpn_anchor_scales) * len(args.rpn_anchor_ratios)
    data_names = ['data', 'im_info', 'gt_boxes']
    label_names = ['label', 'bbox_target', 'bbox_weight']
    data_shapes = [('data', (batch_size, 3, args.img_long_side, args.img_long_side)),
                   ('im_info', (batch_size, 3)),
                   ('gt_boxes', (batch_size, 100, 5))]
    label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)),
                    ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)),
                    ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))]

    # print shapes
    data_shape_dict, out_shape_dict = infer_data_shape(sym, data_shapes + label_shapes)
    logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict))
    logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict))

    # load and initialize params
    if args.resume:
        arg_params, aux_params = load_param(args.resume)
    else:
        arg_params, aux_params = load_param(args.pretrained)
        arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params)

    # check parameter shapes
    check_shape(sym, data_shapes + label_shapes, arg_params, aux_params)

    # check fixed params
    fixed_param_names = get_fixed_params(sym, args.net_fixed_params)
    logger.info('locking params\n%s' % pprint.pformat(fixed_param_names))

    # metric
    rpn_eval_metric = RPNAccMetric()
    rpn_cls_metric = RPNLogLossMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    eval_metric = RCNNAccMetric()
    cls_metric = RCNNLogLossMetric()
    bbox_metric = RCNNL1LossMetric()
    eval_metrics = mx.metric.CompositeEvalMetric()
    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric,
                         eval_metric, cls_metric, bbox_metric]:
        eval_metrics.add(child_metric)

    # callback
    batch_end_callback = mx.callback.Speedometer(batch_size,
                                                 frequent=args.log_interval,
                                                 auto_reset=False)
    epoch_end_callback = mx.callback.do_checkpoint(args.save_prefix)

    # learning schedule
    base_lr = args.lr
    lr_factor = 0.1
    lr_epoch = [int(epoch) for epoch in args.lr_decay_epoch.split(',')]
    lr_epoch_diff = [epoch - args.start_epoch for epoch in lr_epoch
                     if epoch > args.start_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff]
    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)

    # optimizer
    optimizer_params = {'momentum': 0.9,
                        'wd': 0.0005,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': (1.0 / batch_size),
                        'clip_gradient': 5}

    # train
    mod = Module(sym,
                 data_names=data_names,
                 label_names=label_names,
                 logger=logger,
                 context=ctx,
                 work_load_list=None,
                 fixed_param_names=fixed_param_names)
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore='device',
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=args.start_epoch,
            num_epoch=args.epochs)
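# ---------------------------------------------------------------------------
# Hedged usage sketch (added; all values illustrative, and parse_args /
# get_network stand in for this repo's own entry points):
#
#   args = parse_args()                      # e.g. --gpus 0,1 --lr 0.001
#   sym = get_network(args.network, args)    # symbol exposing rpn_cls_score
#   roidb = ...                              # prepared list of roidb dicts
#   train_net(sym, roidb, args)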
def train_net(args, ctx, pretrained, pretrained_flow, epoch, prefix,
              begin_epoch, end_epoch, lr, lr_step):
    logger, final_output_path = create_logger(config.output_path, args.cfg,
                                              config.dataset.image_set)
    prefix = os.path.join(final_output_path, prefix)

    # load symbol
    config.symbol = 'resnet_v1_101_flownet_rfcn_ucf101'
    shutil.copy2(os.path.join(curr_path, 'symbols', config.symbol + '.py'),
                 final_output_path)
    sym_instance = eval(config.symbol + '.' + config.symbol)()
    sym = sym_instance.get_train_symbol(config)
    # feat_sym = sym.get_internals()['rpn_cls_score_output']

    # setup multi-gpu
    batch_size = len(ctx)
    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size

    # print config
    pprint.pprint(config)
    logger.info('training config:{}\n'.format(pprint.pformat(config)))

    # load dataset and prepare imdb for training
    config.dataset.dataset = 'UCF101'
    config.dataset.root_path = '/data/weik/data/'
    config.dataset.dataset_path = '/data_ssd2/datasets/UCF101/JPG/'
    config.dataset.traintestlist_path = '/data_ssd2/datasets/UCF101/ucfTrainTestList/'
    config.TRAIN.FLIP = False
    split = '01'
    gtdb = load_gt_imdb(config.dataset.dataset,
                        config.dataset.root_path,
                        config.dataset.dataset_path,
                        config.dataset.traintestlist_path,
                        split=split,
                        flip=config.TRAIN.FLIP)

    # load training data
    train_data = TrainLoader(sym, gtdb, config, batch_size=128, shuffle=False,
                             ctx=ctx, aspect_grouping=True)

    data_shape_dict = dict(train_data.provide_data_single +
                           train_data.provide_label_single)
    pprint.pprint(data_shape_dict)
    sym_instance.infer_shape(data_shape_dict)

    # load and initialize params
    if config.TRAIN.RESUME:
        print('continue training from ', begin_epoch)
        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
    else:
        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
        arg_params_flow, aux_params_flow = load_param(pretrained_flow, epoch, convert=True)
        arg_params.update(arg_params_flow)
        aux_params.update(aux_params_flow)
        sym_instance.init_weight(config, arg_params, aux_params)

    # check parameter shapes
    sym_instance.check_parameter_shapes(arg_params, aux_params, data_shape_dict)

    # create solver
    fixed_param_prefix = config.network.FIXED_PARAMS
    data_names = [k[0] for k in train_data.provide_data_single]
    label_names = [k[0] for k in train_data.provide_label_single]

    # Create Module
    mod = Module(sym, data_names=data_names, label_names=label_names)
    if config.TRAIN.RESUME:
        mod._preload_opt_states = '%s-%04d.states' % (prefix, begin_epoch)

    # decide training params
    # metric
    eval_metrics = mx.metric.Accuracy()

    # callback
    batch_end_callback = mx.callback.Speedometer(train_data.batch_size,
                                                 frequent=args.frequent)
    epoch_end_callback = mx.callback.module_checkpoint(mod, prefix, period=1,
                                                       save_optimizer_states=True)

    # decide learning rate
    base_lr = lr
    lr_factor = config.TRAIN.lr_factor
    lr_epoch = [float(epoch) for epoch in lr_step.split(',')]
    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch
                     if epoch > begin_epoch]
    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
    lr_iters = [int(epoch * len(gtdb) / batch_size) for epoch in lr_epoch_diff]
    print('lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters)
    lr_scheduler = WarmupMultiFactorScheduler(lr_iters, lr_factor,
                                              config.TRAIN.warmup,
                                              config.TRAIN.warmup_lr,
                                              config.TRAIN.warmup_step)

    # optimizer
    optimizer_params = {'momentum': config.TRAIN.momentum,
                        'wd': config.TRAIN.wd,
                        'learning_rate': lr,
                        'lr_scheduler': lr_scheduler,
                        'rescale_grad': 1.0,
                        'clip_gradient': None}

    if not isinstance(train_data, PrefetchingIter):
        train_data = PrefetchingIter(train_data)

    # train
    mod.fit(train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=config.default.kvstore,
            optimizer='sgd',
            optimizer_params=optimizer_params,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)
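# ---------------------------------------------------------------------------
# Hedged usage sketch (added; paths and epochs are illustrative, and the CLI
# parsing is assumed to live elsewhere in this script):
#
#   args = parse_args()
#   ctx = [mx.gpu(int(i)) for i in config.gpus.split(',')]
#   train_net(args, ctx,
#             pretrained='./model/pretrained_model/resnet_v1_101',
#             pretrained_flow='./model/pretrained_model/flownet',
#             epoch=0, prefix='rfcn_ucf101',
#             begin_epoch=0, end_epoch=config.TRAIN.end_epoch,
#             lr=config.TRAIN.lr, lr_step=config.TRAIN.lr_step)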