def get_config(is_train):
    class General:
        # number of iterations between printing metrics to stdout
        log_frequency = 10
        # the directory name for the experiment; the default is the name of the config
        name = __name__.rsplit("/")[-1].rsplit(".")[-1]
        # batch size per GPU
        batch_image = 2 if is_train else 1
        # use FP16 for weights and activations
        # recommended when training on Volta or later GPUs
        fp16 = False
        # number of threads used for the data loader
        # this affects both CPU utilization and memory usage
        # lower this if you are training on a desktop
        loader_worker = 8
        # toggle the built-in profiler to find the bottleneck of the network
        profile = False

    class KvstoreParam:
        # the type of communicator used to sync model parameters
        kvstore = "nccl"  # alternatives: "local", "aggregated"
        batch_image = General.batch_image
        # GPUs to use
        gpus = [0, 1, 2, 3, 4, 5, 6, 7]
        fp16 = General.fp16

    class NormalizeParam:
        # the type of normalizer used for the network; pick exactly one
        # see also ModelParam.pretrain.fixed_param for freezing gamma/beta
        # normalizer = normalizer_factory(type="fixbn")    # freeze bn stats
        # normalizer = normalizer_factory(type="localbn")  # use bn stats in one GPU
        # normalizer = normalizer_factory(
        #     type="syncbn", ndev=len(KvstoreParam.gpus))  # use bn stats across GPUs
        normalizer = normalizer_factory(type="gn")  # use GroupNorm

    class BackboneParam:
        # you can control the FP16 option and normalizer for each individual component
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        # some backbone components accept additional configs, like the depth for ResNet
        depth = 50

    class NeckParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer

    class RpnParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        batch_image = General.batch_image
        # use the ONNX-compatible proposal operator instead of the one written in C++/CUDA
        nnvm_proposal = True
        # use the in-network rpn target operator instead of labels generated by the data loader
        # if your network is quite fast, the CPU might not feed the labels fast enough;
        # otherwise you can offload rpn target generation to the CPU to save GPU resources
        nnvm_rpn_target = False

        # the generated anchor grid is used in rpn target assignment and proposal decoding
        class anchor_generate:
            scale = (8, )
            ratio = (0.5, 1.0, 2.0)
            stride = (4, 8, 16, 32, 64)
            # number of anchors per image
            image_anchor = 256
            # to avoid generating the same anchor grid more than once,
            # we cache an anchor grid in the arg_params;
            # max_side specifies the max side of the resized input image
            # 3000 is a safe bet, increase it if necessary
            max_side = 1400

        # valid when nnvm_rpn_target is used; controls rpn target assignment
        class anchor_assign:
            # number of pixels an anchor box may extend beyond the image border
            allowed_border = 0
            # iou lower bound with a groundtruth box for foreground anchors
            pos_thr = 0.7
            # iou upper bound with a groundtruth box for background anchors
            neg_thr = 0.3
            # every groundtruth box matches the anchors overlapping it most by default;
            # increase this threshold to avoid matching low quality anchors
            min_pos_thr = 0.0
            # number of anchors per image
            image_anchor = 256
            # fraction of foreground anchors per image
            pos_fraction = 0.5

        # rpn head structure
        class head:
            # number of channels for the 3x3 conv in the rpn head
            conv_channel = 256
            # mean and std for the rpn regression target
            mean = (0, 0, 0, 0)
            std = (1, 1, 1, 1)

        # proposal generation for RCNN
        class proposal:
            # number of top-scored proposals to take before NMS
            pre_nms_top_n = 2000 if is_train else 1000
            # number of top-scored proposals to take after NMS
            post_nms_top_n = 2000 if is_train else 1000
            # proposal NMS threshold
            nms_thr = 0.7
            # minimum proposal box side to keep; 0 means keep all
            min_bbox_side = 0

        # proposal sampling for RCNN during training
        class subsample_proposal:
            # whether to exclude gt boxes from the proposals; False appends gt to proposals
            proposal_wo_gt = False
            # number of proposals sampled per image during training
            image_roi = 512
            # the maximum fraction of foreground proposals
            fg_fraction = 0.25
            # iou lower bound with a gt bbox for foreground proposals
            fg_thr = 0.5
            # iou upper bound with a gt bbox for background proposals
            bg_thr_hi = 0.5
            # iou lower bound with a gt bbox for background proposals;
            # setting a non-zero value removes some trivial background proposals
            bg_thr_lo = 0.0

        # target encoding for the RCNN bbox head
        class bbox_target:
            # 1 (background) + num_class;
            # could be num_class if using a sigmoid activation instead of softmax
            num_reg_class = 1 + 80
            # share the regressor across all classes
            class_agnostic = False
            # the weight, mean, and std for the bbox head regression target
            weight = (1.0, 1.0, 1.0, 1.0)
            mean = (0.0, 0.0, 0.0, 0.0)
            std = (0.1, 0.1, 0.2, 0.2)

    class BboxParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        # num_class may differ from RpnParam.bbox_target.num_reg_class
        # if the class_agnostic regressor is adopted
        num_class = 1 + 80
        image_roi = RpnParam.subsample_proposal.image_roi
        batch_image = General.batch_image

        class regress_target:
            class_agnostic = RpnParam.bbox_target.class_agnostic
            mean = RpnParam.bbox_target.mean
            std = RpnParam.bbox_target.std

    class MaskParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        # output resolution of the mask head
        resolution = 28
        # number of channels for the 3x3 convs in the mask head
        dim_reduced = 256
        # the mask head only trains on foreground proposals,
        # so we discard all background proposals to save computation
        num_fg_roi = int(RpnParam.subsample_proposal.image_roi *
                         RpnParam.subsample_proposal.fg_fraction)

    class RoiParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        # each RoI is pooled into an out_size x out_size fixed-length representation
        out_size = 7
        # the total stride of the feature map to pool from
        stride = (4, 8, 16, 32)
        # FPN specific configs
        # objects with size in [224^2, 448^2) will be assigned to P4
        roi_canonical_scale = 224
        roi_canonical_level = 4

    class MaskRoiParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        # each RoI is pooled into an out_size x out_size fixed-length representation
        out_size = 14
        # the total stride of the feature map to pool from
        stride = (4, 8, 16, 32)
        # FPN specific configs
        # objects with size in [224^2, 448^2) will be assigned to P4
        roi_canonical_scale = 224
        roi_canonical_level = 4

    class DatasetParam:
        # specify the roidbs to read for training/validation
        if is_train:
            # == coco_train2017
            image_set = ("coco_train2014", "coco_valminusminival2014")
        else:
            # == coco_val2017
            image_set = ("coco_minival2014", )

    class OptimizeParam:
        class optimizer:
            type = "sgd"
            # the learning rate automatically adapts to different batch sizes;
            # the base learning rate is 0.02 for 16 images
            lr = 0.01 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image
            momentum = 0.9
            wd = 0.0001
            clip_gradient = None

        class schedule:
            # corresponds to the 1x, 2x, ... training schedule
            mult = 2
            begin_epoch = 0
            end_epoch = 6 * mult
            lr_mode = "step"  # or "cosine"
            # lr step factor
            lr_factor = 0.1
            # lr step iterations
            if mult <= 1:
                lr_iter = [
                    60000 * mult * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image),
                    80000 * mult * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)
                ]
            else:
                # follow the practice in arXiv:1811.08883
                # reduce the lr in the last 60k and 20k iterations
                lr_iter = [
                    -60000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image),
                    -20000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)
                ]

        # follow the practice in arXiv:1706.02677
        class warmup:
            type = "gradual"
            lr = 0.01 / 8 * len(
                KvstoreParam.gpus) * KvstoreParam.batch_image / 3
            iter = 500

    class TestParam:
        # detections below min_det_score will be removed in the evaluation
        min_det_score = 0.05
        # only the top max_det_per_image detections will be evaluated
        max_det_per_image = 100
        # callback, useful in multi-scale testing
        process_roidb = lambda x: x
        # callback, useful in scale-aware post-processing;
        # the name inside the lambda resolves at call time to a module-level
        # process_output (class scope is not visible inside a lambda), so
        # concrete configs import the function they want to delegate to
        process_output = lambda x, y: process_output(x, y)

        # the model name and epoch used during test;
        # by default the last checkpoint is employed,
        # users can override this with --epoch N when invoking the script
        class model:
            prefix = "experiments/{}/checkpoint".format(General.name)
            epoch = OptimizeParam.schedule.end_epoch

        class nms:
            type = "nms"  # or "softnms"
            thr = 0.5

        # we make use of the coco test toolchain;
        # if no coco format annotation file is specified,
        # the test script will generate one on the fly from the roidb
        class coco:
            annotation = "data/coco/annotations/instances_minival2014.json"

    # compose the components to form a detector
    backbone = Backbone(BackboneParam)
    neck = Neck(NeckParam)
    rpn_head = RpnHead(RpnParam, MaskParam)
    roi_extractor = RoiExtractor(RoiParam)
    mask_roi_extractor = RoiExtractor(MaskRoiParam)
    bbox_head = BboxHead(BboxParam)
    mask_head = MaskHead(BboxParam, MaskParam, MaskRoiParam)
    bbox_post_processer = BboxPostProcessor(TestParam)
    detector = Detector()

    if is_train:
        train_sym = detector.get_train_symbol(backbone, neck, rpn_head,
                                              roi_extractor, mask_roi_extractor,
                                              bbox_head, mask_head)
        test_sym = None
    else:
        train_sym = None
        test_sym = detector.get_test_symbol(backbone, neck, rpn_head,
                                            roi_extractor, mask_roi_extractor,
                                            bbox_head, mask_head,
                                            bbox_post_processer)

    class ModelParam:
        train_symbol = train_sym
        test_symbol = test_sym
        # train the model from scratch
        from_scratch = False
        # use a random seed when initializing
        random = True
        # sublinear memory checkpointing
        memonger = False
        # checkpoint up to this layer;
        # recomputing the early stages of a network is cheaper
        memonger_until = "stage3_unit21_plus"

        class pretrain:
            # the model name and epoch used for initialization
            prefix = "pretrain_model/resnet%s_v1b" % BackboneParam.depth
            epoch = 0
            # any param partially matching fixed_param will be fixed;
            # fixed params will not be updated
            fixed_param = ["conv0", "stage1", "gamma", "beta"]
            # any param partially matching excluded_param will not be fixed
            excluded_param = ["mask_fcn"]

        # callback, useful for adding cached anchors or complex initialization
        def process_weight(sym, arg, aux):
            for stride in RpnParam.anchor_generate.stride:
                add_anchor_to_arg(sym, arg, aux,
                                  RpnParam.anchor_generate.max_side, stride,
                                  RpnParam.anchor_generate.scale,
                                  RpnParam.anchor_generate.ratio)

    # data processing
    class NormParam:
        # mean/std for the input image
        mean = tuple(i * 255 for i in (0.485, 0.456, 0.406))  # RGB order
        std = tuple(i * 255 for i in (0.229, 0.224, 0.225))

    # data processing
    class ResizeParam:
        # the input is resized to a short side not exceeding short
        # and a long side not exceeding long
        short = 800
        long = 1333

    # SimpleDet is written in the MXNet symbolic API, which features the fastest
    # execution while requiring static input shapes.
    # All inputs are padded to the maximum shape of items in the dataset.
    class PadParam:
        # the resized input is padded to short x long with 0 in the bottom-right corner
        short = 800
        long = 1333
        max_num_gt = 100
        max_len_gt_poly = 2500

    # this controls the rpn target generation offloaded to the CPU data loader;
    # refer to RpnParam.anchor_generate for more info
    class AnchorTarget2DParam:
        def __init__(self):
            self.generate = self._generate()

        class _generate:
            def __init__(self):
                self.stride = (4, 8, 16, 32, 64)
                # the shorts and longs have to be pre-computed since the
                # loader knows nothing about the network;
                # each downsampled side can be calculated as ceil(side / stride)
                self.short = (200, 100, 50, 25, 13)
                self.long = (334, 167, 84, 42, 21)

            scales = (8, )
            aspects = (0.5, 1.0, 2.0)

        class assign:
            allowed_border = 0
            pos_thr = 0.7
            neg_thr = 0.3
            min_pos_thr = 0.0

        class sample:
            image_anchor = 256
            pos_fraction = 0.5

    # align blob names between the loader and the network
    class RenameParam:
        mapping = dict(image="data")

    from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \
        ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \
        RenameRecord, Norm2DImage
    from models.maskrcnn.input import PreprocessGtPoly, EncodeGtPoly, \
        Resize2DImageBboxMask, Flip2DImageBboxMask, Pad2DImageBboxMask
    from models.FPN.input import PyramidAnchorTarget2D

    # modular data augmentation design
    if is_train:
        transform = [
            ReadRoiRecord(None),
            Norm2DImage(NormParam),
            PreprocessGtPoly(),
            Resize2DImageBboxMask(ResizeParam),
            Flip2DImageBboxMask(),
            EncodeGtPoly(PadParam),
            Pad2DImageBboxMask(PadParam),
            ConvertImageFromHwcToChw(),
            RenameRecord(RenameParam.mapping)
        ]
        data_name = ["data"]
        label_name = ["im_info", "gt_bbox", "gt_poly"]
        if not RpnParam.nnvm_rpn_target:
            transform.append(PyramidAnchorTarget2D(AnchorTarget2DParam()))
            label_name += ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"]
    else:
        transform = [
            ReadRoiRecord(None),
            Norm2DImage(NormParam),
            Resize2DImageBbox(ResizeParam),
            ConvertImageFromHwcToChw(),
            RenameRecord(RenameParam.mapping)
        ]
        data_name = ["data", "im_info", "im_id", "rec_id"]
        label_name = []

    import core.detection_metric as metric
    from models.maskrcnn.metric import SigmoidCELossMetric
    from mxboard import SummaryWriter

    # the summary writer logs metrics to tensorboard for better tracking of training
    sw = SummaryWriter(logdir="./tflogs", flush_secs=5)

    rpn_acc_metric = metric.AccWithIgnore(
        name="RpnAcc",
        output_names=["rpn_cls_loss_output", "rpn_cls_label_blockgrad_output"],
        label_names=[],
        summary=sw)
    rpn_l1_metric = metric.L1(
        name="RpnL1",
        output_names=["rpn_reg_loss_output", "rpn_cls_label_blockgrad_output"],
        label_names=[],
        summary=sw)
    box_acc_metric = metric.AccWithIgnore(
        name="RcnnAcc",
        output_names=["bbox_cls_loss_output", "bbox_label_blockgrad_output"],
        label_names=[],
        summary=sw)
    box_l1_metric = metric.L1(
        name="RcnnL1",
        output_names=["bbox_reg_loss_output", "bbox_label_blockgrad_output"],
        label_names=[],
        summary=sw)
    mask_cls_metric = SigmoidCELossMetric(
        name="MaskCE",
        output_names=["mask_loss_output"],
        label_names=[],
        summary=sw)

    metric_list = [
        rpn_acc_metric, rpn_l1_metric, box_acc_metric, box_l1_metric,
        mask_cls_metric
    ]

    return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \
        ModelParam, OptimizeParam, TestParam, \
        transform, data_name, label_name, metric_list
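
# --- Usage sketch (illustrative, not part of the config) ---
# A minimal sketch of how a SimpleDet-style train script might consume a
# config module like the one above. The module path and the unpacking of the
# returned tuple are assumptions made for illustration; the framework's own
# train entry point is the authoritative reference.
import importlib


def load_config(config_path, is_train=True):
    # e.g. "config/mask_r50v1b_fpn_1x.py" -> "config.mask_r50v1b_fpn_1x"
    # (the path is hypothetical)
    module_name = config_path.replace(".py", "").replace("/", ".")
    config = importlib.import_module(module_name)
    return config.get_config(is_train)

# pGen, pKv, pRpn, pRoi, pBbox, pDataset, pModel, pOpt, pTest, \
#     transform, data_name, label_name, metric_list = load_config(
#         "config/mask_r50v1b_fpn_1x.py", is_train=True)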
# A second variant of the same template: GroupNorm throughout, trained from
# scratch on coco_train2017 (cf. arXiv:1811.08883).
def get_config(is_train):
    class General:
        log_frequency = 10
        name = __name__.rsplit("/")[-1].rsplit(".")[-1]
        batch_image = 2 if is_train else 1
        fp16 = False
        loader_worker = 8

    class KvstoreParam:
        kvstore = "nccl"
        batch_image = General.batch_image
        gpus = [0, 1, 2, 3, 4, 5, 6, 7]
        fp16 = General.fp16

    class NormalizeParam:
        normalizer = normalizer_factory(type="gn")

    class BackboneParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        depth = 50

    class NeckParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer

    class RpnParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        batch_image = General.batch_image
        nnvm_proposal = True
        nnvm_rpn_target = False

        class anchor_generate:
            scale = (8, )
            ratio = (0.5, 1.0, 2.0)
            stride = (4, 8, 16, 32, 64)
            image_anchor = 256
            max_side = 1400

        class anchor_assign:
            allowed_border = 0
            pos_thr = 0.7
            neg_thr = 0.3
            min_pos_thr = 0.0
            image_anchor = 256
            pos_fraction = 0.5

        class head:
            conv_channel = 256
            mean = (0, 0, 0, 0)
            std = (1, 1, 1, 1)

        class proposal:
            pre_nms_top_n = 2000 if is_train else 1000
            post_nms_top_n = 2000 if is_train else 1000
            nms_thr = 0.7
            min_bbox_side = 0

        class subsample_proposal:
            proposal_wo_gt = False
            image_roi = 512
            fg_fraction = 0.25
            fg_thr = 0.5
            bg_thr_hi = 0.5
            bg_thr_lo = 0.0

        class bbox_target:
            num_reg_class = 81
            class_agnostic = False
            weight = (1.0, 1.0, 1.0, 1.0)
            mean = (0.0, 0.0, 0.0, 0.0)
            std = (0.1, 0.1, 0.2, 0.2)

    class BboxParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        num_class = 1 + 80
        image_roi = 512
        batch_image = General.batch_image

        class regress_target:
            class_agnostic = False
            mean = (0.0, 0.0, 0.0, 0.0)
            std = (0.1, 0.1, 0.2, 0.2)

    class MaskParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        resolution = 28
        dim_reduced = 256
        num_fg_roi = int(RpnParam.subsample_proposal.image_roi *
                         RpnParam.subsample_proposal.fg_fraction)

    class RoiParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        out_size = 7
        stride = (4, 8, 16, 32)
        roi_canonical_scale = 224
        roi_canonical_level = 4

    class MaskRoiParam:
        fp16 = General.fp16
        normalizer = NormalizeParam.normalizer
        out_size = 14
        stride = (4, 8, 16, 32)
        roi_canonical_scale = 224
        roi_canonical_level = 4

    class DatasetParam:
        if is_train:
            image_set = ("coco_train2017", )
        else:
            image_set = ("coco_val2017", )

    class OptimizeParam:
        class optimizer:
            type = "sgd"
            lr = 0.02 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image
            momentum = 0.9
            wd = 0.0001
            clip_gradient = None

        class schedule:
            mult = 2
            begin_epoch = 0
            end_epoch = 6 * mult
            lr_iter = [
                60000 * mult * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image),
                80000 * mult * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)
            ]

        class warmup:
            type = "gradual"
            lr = 0.01 / 8 * len(
                KvstoreParam.gpus) * KvstoreParam.batch_image / 3.0
            iter = 500

    class TestParam:
        min_det_score = 0.05
        max_det_per_image = 100
        process_roidb = lambda x: x
        # the lambda delegates to a module-level process_output at call time
        process_output = lambda x, y: process_output(x, y)

        class model:
            prefix = "experiments/{}/checkpoint".format(General.name)
            epoch = OptimizeParam.schedule.end_epoch

        class nms:
            type = "nms"
            thr = 0.5

        class coco:
            annotation = "data/coco/annotations/instances_minival2014.json"

    backbone = Backbone(BackboneParam)
    neck = Neck(NeckParam)
    rpn_head = RpnHead(RpnParam, MaskParam)
    roi_extractor = RoiExtractor(RoiParam)
    mask_roi_extractor = RoiExtractor(MaskRoiParam)
    bbox_head = BboxHead(BboxParam)
    mask_head = MaskHead(BboxParam, MaskParam, MaskRoiParam)
    bbox_post_processer = BboxPostProcessor(TestParam)
    detector = Detector()

    if is_train:
        train_sym = detector.get_train_symbol(backbone, neck, rpn_head,
                                              roi_extractor, mask_roi_extractor,
                                              bbox_head, mask_head)
        test_sym = None
    else:
        train_sym = None
        test_sym = detector.get_test_symbol(backbone, neck, rpn_head,
                                            roi_extractor, mask_roi_extractor,
                                            bbox_head, mask_head,
                                            bbox_post_processer)

    class ModelParam:
        train_symbol = train_sym
        test_symbol = test_sym
        from_scratch = True
        random = True
        memonger = False
        memonger_until = "stage3_unit21_plus"

        class pretrain:
            prefix = "pretrain_model/resnet%s_v1b" % BackboneParam.depth
            epoch = 0
            fixed_param = []

        def process_weight(sym, arg, aux):
            for stride in RpnParam.anchor_generate.stride:
                add_anchor_to_arg(sym, arg, aux,
                                  RpnParam.anchor_generate.max_side, stride,
                                  RpnParam.anchor_generate.scale,
                                  RpnParam.anchor_generate.ratio)

    # data processing
    class NormParam:
        mean = tuple(i * 255 for i in (0.485, 0.456, 0.406))  # RGB order
        std = tuple(i * 255 for i in (0.229, 0.224, 0.225))

    # data processing
    class ResizeParam:
        short = 800
        long = 1333

    class PadParam:
        short = 800
        long = 1333
        max_num_gt = 100
        max_len_gt_poly = 2500

    class AnchorTarget2DParam:
        def __init__(self):
            self.generate = self._generate()

        class _generate:
            def __init__(self):
                self.stride = (4, 8, 16, 32, 64)
                self.short = (200, 100, 50, 25, 13)
                self.long = (334, 167, 84, 42, 21)

            scales = (8, )
            aspects = (0.5, 1.0, 2.0)

        class assign:
            allowed_border = 0
            pos_thr = 0.7
            neg_thr = 0.3
            min_pos_thr = 0.0

        class sample:
            image_anchor = 256
            pos_fraction = 0.5

    class RenameParam:
        mapping = dict(image="data")

    from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \
        ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \
        RenameRecord, Norm2DImage
    from models.maskrcnn.input import PreprocessGtPoly, EncodeGtPoly, \
        Resize2DImageBboxMask, Flip2DImageBboxMask, Pad2DImageBboxMask
    from models.FPN.input import PyramidAnchorTarget2D

    if is_train:
        transform = [
            ReadRoiRecord(None),
            Norm2DImage(NormParam),
            PreprocessGtPoly(),
            Resize2DImageBboxMask(ResizeParam),
            Flip2DImageBboxMask(),
            EncodeGtPoly(PadParam),
            Pad2DImageBboxMask(PadParam),
            ConvertImageFromHwcToChw(),
            RenameRecord(RenameParam.mapping)
        ]
        data_name = ["data"]
        label_name = ["im_info", "gt_bbox", "gt_poly"]
        if not RpnParam.nnvm_rpn_target:
            transform.append(PyramidAnchorTarget2D(AnchorTarget2DParam()))
            label_name += ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"]
    else:
        transform = [
            ReadRoiRecord(None),
            Norm2DImage(NormParam),
            Resize2DImageBbox(ResizeParam),
            ConvertImageFromHwcToChw(),
            RenameRecord(RenameParam.mapping)
        ]
        data_name = ["data", "im_info", "im_id", "rec_id"]
        label_name = []

    import core.detection_metric as metric
    from models.maskrcnn.metric import SigmoidCELossMetric

    rpn_acc_metric = metric.AccWithIgnore(
        "RpnAcc",
        ["rpn_cls_loss_output", "rpn_cls_label_blockgrad_output"],
        [])
    rpn_l1_metric = metric.L1(
        "RpnL1",
        ["rpn_reg_loss_output", "rpn_cls_label_blockgrad_output"],
        [])
    # for bbox, the label is generated in the network, so it is an output
    box_acc_metric = metric.AccWithIgnore(
        "RcnnAcc",
        ["bbox_cls_loss_output", "bbox_label_blockgrad_output"],
        [])
    box_l1_metric = metric.L1(
        "RcnnL1",
        ["bbox_reg_loss_output", "bbox_label_blockgrad_output"],
        [])
    mask_cls_metric = SigmoidCELossMetric("MaskCE", ["mask_loss_output"], [])

    metric_list = [
        rpn_acc_metric, rpn_l1_metric, box_acc_metric, box_l1_metric,
        mask_cls_metric
    ]

    return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \
        ModelParam, OptimizeParam, TestParam, \
        transform, data_name, label_name, metric_list
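
# --- Worked example: linear LR scaling with gradual warmup ---
# The configs above scale the learning rate linearly with the effective batch
# size and warm it up gradually (arXiv:1706.02677). A minimal sketch of the
# arithmetic; the helper names are illustrative only.
def scaled_lr(base_lr_16, num_gpus, batch_per_gpu):
    # the lr grows linearly with the effective batch size (16 is the reference)
    return base_lr_16 / 16 * num_gpus * batch_per_gpu


def warmup_lr(final_lr, start_frac, cur_iter, warmup_iter):
    # "gradual" warmup: ramp linearly from start_frac * final_lr to final_lr
    frac = start_frac + (1.0 - start_frac) * min(cur_iter, warmup_iter) / warmup_iter
    return final_lr * frac

# e.g. 8 GPUs x 2 images/GPU with a 0.02 base: scaled_lr(0.02, 8, 2) == 0.02,
# warming up from one third of that over the first 500 iterations.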
# --- Debug snippet ---
# The original fragment begins mid-class; the imports and enclosing class
# headers below are restored from the configs above, and the `generate`
# section of AnchorTarget2DParam is not shown in the source.
import pickle as pkl
import numpy as np

from core.detection_input import ReadRoiRecord, ConvertImageFromHwcToChw, \
    RenameRecord, AnchorTarget2D
from models.maskrcnn.input import PreprocessGtPoly, EncodeGtPoly, \
    Resize2DImageBboxMask, Flip2DImageBboxMask, Pad2DImageBboxMask


class ResizeParam:
    short = 800
    long = 1333


class PadParam:
    short = 800
    long = 1333
    max_num_gt = 100
    max_len_gt_poly = 2500


class AnchorTarget2DParam:
    class assign:
        allowed_border = 0
        pos_thr = 0.7
        neg_thr = 0.3
        min_pos_thr = 0.0

    class sample:
        image_anchor = 256
        pos_fraction = 0.5


class RenameParam:
    mapping = dict(image="data")


transform = [
    ReadRoiRecord(None),
    PreprocessGtPoly(),
    Resize2DImageBboxMask(ResizeParam),
    Flip2DImageBboxMask(),
    EncodeGtPoly(PadParam),
    Pad2DImageBboxMask(PadParam),
    ConvertImageFromHwcToChw(),
    AnchorTarget2D(AnchorTarget2DParam),
    RenameRecord(RenameParam.mapping)
]

DEBUG = True

with open("data/cache/coco_valminusminival2014.roidb", "rb") as fin:
    roidb = pkl.load(fin)

# keep only records with at least one groundtruth box,
# then sample 20 records for a quick check
roidb = [rec for rec in roidb if rec["gt_bbox"].shape[0] > 0]
roidb = [
    roidb[i] for i in np.random.choice(len(roidb), 20, replace=False)
]
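
# --- Debug driver sketch ---
# A minimal sketch of pushing the sampled records through the transform
# pipeline above. It assumes each transform exposes an apply(record) method
# that mutates the record in place, as SimpleDet-style loader augmentations
# do; verify against core.detection_input before relying on it.
if DEBUG:
    for record in roidb:
        for trans in transform:
            trans.apply(record)
        # after RenameRecord the image blob is keyed as "data"
        print(record["data"].shape, record["gt_bbox"].shape)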