def get_symbol_train(num_classes=20, nms_thresh=0.5, force_suppress=True,
                     nms_topk=400, **kwargs):
    """Build the SSD training symbol on a small DenseNet-style backbone
    (stem + three dense blocks), with top-down deconvolution feature fusion,
    MultiBox targets and the classification/localization losses."""
    block_config = [2, 4, 4]
    bottleneck_width = [1, 2, 4]
    growth_rate = [32, 32, 32]
    num_init_features = 64
    total_filters = [128, 256, 448]
    total_filter = num_init_features

    if type(bottleneck_width) is list:
        bottleneck_widths = bottleneck_width
    else:
        bottleneck_widths = [bottleneck_width] * 4
    if type(growth_rate) is list:
        growth_rates = growth_rate
    else:
        growth_rates = [growth_rate] * 4

    #######################################################
    data = mx.sym.var('data')
    label = mx.symbol.Variable(name="label")

    # stem1 = _conv_block(data, 16, 3, 2, 1, 'stem1')
    # stem2 = _conv_block(stem1, 32, 3, 2, 1, 'stem2')
    # stem3 = _conv_block(stem2, 64, 3, 2, 1, 'stem3')
    stem1 = _conv_block(data, 16, 3, 2, 1, 'stem1')
    stem2 = _conv_block(stem1, 32, 3, 2, 1, 'stem2')
    stem3 = _conv_block(stem2, 64, 3, 1, 1, 'stem3')

    from_layer = stem3
    feat_layers = []
    for idx, num_layers in enumerate(block_config):
        from_layer = _dense_block(from_layer, num_layers, growth_rates[idx],
                                  bottleneck_widths[idx],
                                  'stage{}'.format(idx + 1))
        total_filter = total_filters[idx]
        if idx == len(block_config) - 1:
            with_pooling = False
        else:
            with_pooling = True
        from_layer = _transition_block(from_layer, total_filter, with_pooling,
                                       'stage{}_tb'.format(idx + 1))
        if idx >= 1:
            feat_layers.append(from_layer)

    #######################################################
    stage2_tb = from_layer.get_internals()['stage2_tb/relu_output']
    stage4_tb_ext_pm2_b2a = _conv_block(stage2_tb, 128, 1, 1, 0, 'stage4_tb_ext_pm2_b2a')
    stage4_tb_ext_pm2_b2b = _conv_block(stage4_tb_ext_pm2_b2a, 128, 1, 1, 0, 'stage4_tb_ext_pm2_b2b')
    stage4_tb_ext_pm2_b2c = _conv_block(stage4_tb_ext_pm2_b2b, 256, 1, 1, 0, 'stage4_tb_ext_pm2_b2c')
    stage4_tb_ext_pm2 = _conv_block(stage2_tb, 256, 1, 1, 0, 'stage4_tb_ext_pm2')
    stage4_tb_ext_pm2_res = mx.sym.broadcast_add(stage4_tb_ext_pm2, stage4_tb_ext_pm2_b2c)
    stage4_tb_ext_pm2_res_relu = mx.sym.Activation(
        data=stage4_tb_ext_pm2_res, act_type='relu',
        name='stage4_tb_ext_pm2_res/relu')

    stage3_tb = from_layer.get_internals()['stage3_tb/relu_output']
    stage4_tb_ext_pm3_b2a = _conv_block(stage3_tb, 128, 1, 1, 0, 'stage4_tb_ext_pm3_b2a')
    stage4_tb_ext_pm3_b2b = _conv_block(stage4_tb_ext_pm3_b2a, 128, 1, 1, 0, 'stage4_tb_ext_pm3_b2b')
    stage4_tb_ext_pm3_b2c = _conv_block(stage4_tb_ext_pm3_b2b, 256, 1, 1, 0, 'stage4_tb_ext_pm3_b2c')
    stage4_tb_ext_pm3 = _conv_block(stage3_tb, 256, 1, 1, 0, 'stage4_tb_ext_pm3')
    stage4_tb_ext_pm3_res = mx.sym.broadcast_add(stage4_tb_ext_pm3, stage4_tb_ext_pm3_b2c)
    stage4_tb_ext_pm3_res_relu = mx.sym.Activation(
        data=stage4_tb_ext_pm3_res, act_type='relu',
        name='stage4_tb_ext_pm3_res/relu')

    stage4_tb_relu_ext1_fe1_1 = _conv_block(stage3_tb, 256, 1, 1, 0, 'stage4_tb_relu_ext1_fe1_1')
    ext1_fe1_2 = _conv_block(stage4_tb_relu_ext1_fe1_1, 256, 3, 2, 1, 'ext1_fe1_2')
    stage4_tb_ext_pm4_b2a = _conv_block(ext1_fe1_2, 128, 1, 1, 0, 'stage4_tb_ext_pm4_b2a')
    stage4_tb_ext_pm4_b2b = _conv_block(stage4_tb_ext_pm4_b2a, 128, 1, 1, 0, 'stage4_tb_ext_pm4_b2b')
    stage4_tb_ext_pm4_b2c = _conv_block(stage4_tb_ext_pm4_b2b, 256, 1, 1, 0, 'stage4_tb_ext_pm4_b2c')
    stage4_tb_ext_pm4 = _conv_block(ext1_fe1_2, 256, 1, 1, 0, 'stage4_tb_ext_pm4')
    stage4_tb_ext_pm4_res = mx.sym.broadcast_add(stage4_tb_ext_pm4, stage4_tb_ext_pm4_b2c)
    stage4_tb_ext_pm4_res_relu = mx.sym.Activation(
        data=stage4_tb_ext_pm4_res, act_type='relu',
        name='stage4_tb_ext_pm4_res/relu')

    #######################################################
    stage4_tb_ext_pm4_feat_deconv_pre = _conv_block(
        stage4_tb_ext_pm4_res_relu, 256, 1, 1, 0, 'stage4/tb/ext/pm4/feat/deconv/pre')
    stage4_tb_ext_pm4_feat_deconv = _deconv_block(
        stage4_tb_ext_pm4_feat_deconv_pre, 256, 2, 2, 0, 'stage4/tb/ext/pm4/feat/deconv')
    stage4_tb_ext_pm3_res_hyper = _conv_block(
        stage4_tb_ext_pm3_res_relu, 256, 1, 1, 0, 'stage4_tb/ext/pm3/res/hyper/relu')
    stage4_tb_ext_pm3_feat = mx.sym.broadcast_add(
        stage4_tb_ext_pm3_res_hyper, stage4_tb_ext_pm4_feat_deconv)
    stage4_tb_ext_pm3_feat_relu = mx.sym.Activation(
        data=stage4_tb_ext_pm3_feat, act_type='relu',
        name='stage4/tb/ext/pm3/res/deconv/pre/relu')

    stage4_tb_ext_pm3_feat_deconv_pre = _conv_block(
        stage4_tb_ext_pm3_feat_relu, 256, 1, 1, 0, 'stage4/tb/ext/pm3/feat/deconv/pre')
    stage4_tb_ext_pm3_feat_deconv = _deconv_block(
        stage4_tb_ext_pm3_feat_deconv_pre, 256, 2, 2, 0, 'stage4/tb/ext/pm3/feat/deconv')
    stage4_tb_ext_pm2_res_hyper = _conv_block(
        stage4_tb_ext_pm2_res_relu, 256, 1, 1, 0, 'stage4_tb/ext/pm2/res/hyper/relu')
    stage4_tb_ext_pm2_feat = mx.sym.broadcast_add(
        stage4_tb_ext_pm2_res_hyper, stage4_tb_ext_pm3_feat_deconv)
    stage4_tb_ext_pm2_feat_relu = mx.sym.Activation(
        data=stage4_tb_ext_pm2_feat, act_type='relu',
        name='stage4/tb/ext/pm2/res/deconv/pre/relu')

    #######################################################
    from_layers = [
        stage4_tb_ext_pm2_feat_relu,
        stage4_tb_ext_pm3_feat_relu,
        stage4_tb_ext_pm4_res_relu
    ]
    # earlier anchor configurations kept for reference:
    # sizes = [[0.1, 0.16, 0.22], [0.3, 0.38, 0.46], [0.56, 0.66, 0.76]]
    # ratios = [[0.25, 0.5, 1.0], [0.25, 0.5, 1.0, 1.5], [0.33, 0.5, 1.0, 1.5]]
    # sizes = [[0.03, 0.05, 0.08], [0.1, 0.12, 0.15], [0.18, 0.2, 0.25]]
    # ratios = [[2, 3.14, 3.6], [2, 3, 3.14, 3.6], [2, 3, 3.14, 3.6]]
    # reduce the number of anchor boxes
    sizes = [[0.02, 0.04, 0.06], [0.08, 0.10, 0.12], [0.14, 0.16, 0.2]]
    ratios = [[0.8, 1.2, 1.8], [0.8, 1.2, 1.8, 2.1], [0.8, 1.2, 1.8, 2.1]]
    normalizations = [-1, -1, -1]
    steps = []
    num_channels = [-1, -1, -1]

    loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \
        num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \
        num_channels=num_channels, clip=False, interm_layer=0, steps=steps)

    tmp = mx.symbol.contrib.MultiBoxTarget(
        *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \
        ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \
        negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2),
        name="multibox_target")
    loc_target = tmp[0]
    loc_target_mask = tmp[1]
    cls_target = tmp[2]

    cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \
        ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \
        normalization='valid', name="cls_prob")
    loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \
        normalization='valid', name="loc_loss")

    # monitoring training status
    cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label")
    det = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \
        name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
        variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
    det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out")

    # group output
    out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det])
    return out
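# Hedged, illustrative helper (not part of the original network code): under MXNet's
# MultiBoxPrior convention a prediction layer places len(sizes) + len(ratios) - 1
# anchors on every feature-map cell, so the "reduce the number of anchor boxes"
# configuration above yields 5/6/6 anchors per cell on the three prediction layers.
# The function name is made up for this sketch.
def _anchors_per_cell_sketch(sizes, ratios):
    """Return the per-layer anchor count implied by `sizes` and `ratios`.

    >>> _anchors_per_cell_sketch([[0.02, 0.04, 0.06], [0.08, 0.10, 0.12], [0.14, 0.16, 0.2]],
    ...                          [[0.8, 1.2, 1.8], [0.8, 1.2, 1.8, 2.1], [0.8, 1.2, 1.8, 2.1]])
    [5, 6, 6]
    """
    return [len(s) + len(r) - 1 for s, r in zip(sizes, ratios)]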
def get_symbol_train(network, num_classes, from_layers, num_filters, strides, pads,
                     sizes, ratios, normalizations=-1, steps=[], min_filter=128,
                     square_bb=False, per_cls_reg=False,
                     nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs):
    """Build network symbol for training SSD

    Parameters
    ----------
    network : str
        base network symbol name
    num_classes : int
        number of object classes not including background
    from_layers : list of str
        feature extraction layers, use '' for add extra layers
        For example:
        from_layers = ['relu4_3', 'fc7', '', '', '', '']
        which means extract feature from relu4_3 and fc7, adding 4 extra layers
        on top of fc7
    num_filters : list of int
        number of filters for extra layers, you can use -1 for extracted features,
        however, if normalization and scale is applied, the number of filter for
        that layer must be provided.
        For example:
        num_filters = [512, -1, 512, 256, 256, 256]
    strides : list of int
        strides for the 3x3 convolution appended, -1 can be used for extracted
        feature layers
    pads : list of int
        paddings for the 3x3 convolution, -1 can be used for extracted layers
    sizes : list or list of list
        [min_size, max_size] for all layers or [[], [], []...] for specific layers
    ratios : list or list of list
        [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers
    normalizations : int or list of int
        use normalizations value for all layers or [...] for specific layers,
        -1 indicate no normalizations and scales
    steps : list
        specify steps for each MultiBoxPrior layer, leave empty, it will calculate
        according to layer dimensions
    min_filter : int
        minimum number of filters used in 1x1 convolution
    nms_thresh : float
        non-maximum suppression threshold
    force_suppress : boolean
        whether suppress different class objects
    nms_topk : int
        apply NMS to top K detections

    Returns
    -------
    mx.Symbol
    """
    use_python_layer = True
    use_focal_loss = cfg.train['use_focal_loss']
    use_smooth_ce = cfg.train['use_smooth_ce']

    label = mx.sym.Variable('label')

    if 'use_global_stats' not in kwargs:
        kwargs['use_global_stats'] = 0
    mimic_fc = 0 if not 'mimic_fc' in kwargs else kwargs['mimic_fc']
    python_anchor = False if not 'python_anchor' in kwargs else kwargs['python_anchor']
    dense_vh = False if not 'dense_vh' in kwargs else kwargs['dense_vh']
    data_shape = (0, 0) if not 'data_shape' in kwargs else kwargs['data_shape']
    if isinstance(data_shape, int):
        data_shape = (data_shape, data_shape)
    ignore_labels = [] if not 'ignore_labels' in kwargs else kwargs['ignore_labels']

    body = import_module(network).get_symbol(num_classes, **kwargs)
    layers = multi_layer_feature(body, from_layers, num_filters, strides, pads,
                                 min_filter=min_filter)

    loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \
        num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \
        num_channels=num_filters, clip=False, interm_layer=0, steps=steps, dense_vh=dense_vh, \
        data_shape=data_shape, per_cls_reg=per_cls_reg, mimic_fc=mimic_fc,
        python_anchor=python_anchor)

    if use_python_layer:
        neg_ratio = -1 if use_focal_loss else 3
        th_small = 0.04 if not 'th_small' in kwargs else kwargs['th_small']
        cls_probs = mx.sym.SoftmaxActivation(cls_preds, mode='channel')
        tmp = mx.sym.Custom(*[anchor_boxes, label, cls_probs],
                            name='multibox_target', op_type='multibox_target',
                            ignore_labels=ignore_labels, per_cls_reg=per_cls_reg,
                            hard_neg_ratio=neg_ratio, th_small=th_small,
                            square_bb=square_bb)
    else:
        assert not per_cls_reg
        neg_ratio = -1 if use_focal_loss else 3
        tmp = mx.contrib.symbol.MultiBoxTarget(
            *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \
            ignore_label=-1, negative_mining_ratio=neg_ratio, minimum_negative_samples=0, \
            negative_mining_thresh=.4, variances=(0.1, 0.1, 0.2, 0.2),
            name="multibox_target")

    loc_target = tmp[0]
    loc_target_mask = tmp[1]
    cls_target = tmp[2]
    if not use_python_layer:
        cls_target = mx.sym.reshape(cls_target, (0, 1, -1))
    # match_info = tmp[3]

    if use_focal_loss:
        gamma = cfg.train['focal_loss_gamma']
        alpha = cfg.train['focal_loss_alpha']
        cls_prob = mx.sym.SoftmaxActivation(cls_preds, mode='channel')
        if not use_smooth_ce:
            cls_loss = mx.sym.Custom(cls_preds, cls_prob, cls_target,
                                     op_type='reweight_loss', name='cls_loss',
                                     gamma=gamma, alpha=alpha, normalize=True)
        else:
            th_prob = cfg.train['smooth_ce_th']  # / float(num_classes)
            w_reg = cfg.train['smooth_ce_lambda'] * float(num_classes)
            var_th_prob = mx.sym.var(name='th_prob_sce', shape=(1,), dtype=np.float32, \
                init=mx.init.Constant(np.log(th_prob)))
            var_th_prob = mx.sym.exp(var_th_prob)
            cls_loss = mx.sym.Custom(cls_preds, cls_prob, cls_target, var_th_prob,
                                     op_type='smoothed_focal_loss', name='cls_loss',
                                     gamma=gamma, alpha=alpha, th_prob=th_prob,
                                     w_reg=w_reg, normalize=True)
        # cls_loss = mx.sym.MakeLoss(cls_loss, grad_scale=1.0, name='cls_loss')
    elif use_smooth_ce:
        th_prob = cfg.train['smooth_ce_th']
        cls_prob = mx.sym.SoftmaxActivation(cls_preds, mode='channel')
        cls_loss = mx.sym.Custom(cls_preds, cls_prob, cls_target,
                                 op_type='smoothed_softmax_loss', name='cls_loss',
                                 th_prob=th_prob, normalization='valid')
    else:
        cls_loss = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \
            ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \
            normalization='valid', name="cls_loss")

    # loc_preds = mx.sym.Activation(loc_preds, act_type='tanh')
    # loc_loss_ = mx.sym.square(name='loc_loss_', \
    #     data=loc_target_mask * (loc_preds - loc_target)) * 10.0
    loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=cfg.train['smoothl1_weight'], \
        normalization='valid', name="loc_loss")

    # monitoring training status
    cls_label = mx.sym.BlockGrad(cls_target, name="cls_label")
    loc_label = mx.sym.BlockGrad(loc_target_mask, name='loc_label')

    #
    # cls_prob = mx.sym.slice_axis(cls_prob, axis=1, begin=1, end=None)
    # det = mx.sym.Custom(cls_prob, loc_preds, anchor_boxes, name='detection', op_type='multibox_detection',
    #                     th_pos=cfg.valid['th_pos'], th_nms=cfg.valid['th_nms'])
    #
    det = mx.contrib.symbol.MultiBoxDetection(*[cls_loss, loc_preds, anchor_boxes], \
        name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
        variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
    # det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out")

    # group output
    out = [cls_loss, loc_loss, cls_label, loc_label, det]
    # out = [cls_loss, loc_loss, cls_label, loc_label, det, match_info]
    if use_focal_loss and use_smooth_ce:
        out.append(mx.sym.BlockGrad(var_th_prob))
    return mx.sym.Group(out)
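# Hedged usage sketch for the configurable get_symbol_train above, wrapped in a helper
# so nothing runs at import time. The layer/filter lists mirror the docstring example;
# 'vgg16_reduced' as the base-network module name and the stride/pad/size values are
# assumptions for illustration, not a verified training configuration.
def _example_train_symbol_sketch(num_classes=20):
    """Return a grouped training symbol (cls_loss, loc_loss, cls_label, loc_label, det)."""
    return get_symbol_train(
        network='vgg16_reduced', num_classes=num_classes,
        from_layers=['relu4_3', 'fc7', '', '', '', ''],
        num_filters=[512, -1, 512, 256, 256, 256],
        strides=[-1, -1, 2, 2, 2, 2],
        pads=[-1, -1, 1, 1, 1, 1],
        sizes=[[.1, .141], [.2, .272], [.37, .447],
               [.54, .619], [.71, .79], [.88, .961]],
        ratios=[[1, 2, .5]] * 6,
        normalizations=-1, steps=[],
        data_shape=300, use_global_stats=0)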
def get_symbol(network, num_classes, from_layers, num_filters, sizes, ratios, strides, pads, normalizations=-1, steps=[], min_filter=128, per_cls_reg=False, nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): """Build network for testing SSD Parameters ---------- network : str base network symbol name num_classes : int number of object classes not including background from_layers : list of str feature extraction layers, use '' for add extra layers For example: from_layers = ['relu4_3', 'fc7', '', '', '', ''] which means extract feature from relu4_3 and fc7, adding 4 extra layers on top of fc7 num_filters : list of int number of filters for extra layers, you can use -1 for extracted features, however, if normalization and scale is applied, the number of filter for that layer must be provided. For example: num_filters = [512, -1, 512, 256, 256, 256] strides : list of int strides for the 3x3 convolution appended, -1 can be used for extracted feature layers pads : list of int paddings for the 3x3 convolution, -1 can be used for extracted layers sizes : list or list of list [min_size, max_size] for all layers or [[], [], []...] for specific layers ratios : list or list of list [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers normalizations : int or list of int use normalizations value for all layers or [...] for specific layers, -1 indicate no normalizations and scales steps : list specify steps for each MultiBoxPrior layer, leave empty, it will calculate according to layer dimensions min_filter : int minimum number of filters used in 1x1 convolution nms_thresh : float non-maximum suppression threshold force_suppress : boolean whether suppress different class objects nms_topk : int apply NMS to top K detections Returns ------- mx.Symbol """ data_shape = (0, 0) if not 'data_shape' in kwargs else kwargs['data_shape'] if isinstance(data_shape, int): data_shape = (data_shape, data_shape) mimic_fc = 0 if not 'mimic_fc' in kwargs else kwargs['mimic_fc'] python_anchor = False if not 'python_anchor' in kwargs else kwargs[ 'python_anchor'] dense_vh = False if not 'dense_vh' in kwargs else kwargs['dense_vh'] kwargs['use_global_stats'] = True body = import_module(network).get_symbol(num_classes, **kwargs) layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=min_filter) loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_filters, clip=False, interm_layer=0, steps=steps, dense_vh=dense_vh, \ data_shape=data_shape, per_cls_reg=per_cls_reg, mimic_fc=mimic_fc, python_anchor=python_anchor) # body = import_module(network).get_symbol(num_classes, **kwargs) # layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, # min_filter=min_filter) # # loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ # num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ # num_channels=num_filters, clip=False, interm_layer=0, steps=steps) cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', name='cls_prob') ### cls_prob = mx.sym.slice_axis(cls_prob, axis=1, begin=1, end=None) out = mx.sym.Custom(cls_prob, loc_preds, anchor_boxes, name='detection', op_type='multibox_detection', th_pos=cfg.valid['th_pos'], th_nms=cfg.valid['th_nms'], per_cls_reg=per_cls_reg) ### # out = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ # name="detection", nms_threshold=nms_thresh, 
force_suppress=force_suppress, # variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk, clip=False) ### return out
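# The detector output used above packs one candidate per row as
# [class_id, score, xmin, ymin, xmax, ymax], with class_id == -1 marking padded or
# suppressed rows; this is the documented mx.contrib MultiBoxDetection layout, and the
# Custom 'multibox_detection' op is assumed to follow it as well. A small, hypothetical
# convenience parser:
def parse_detection_row_sketch(row):
    """Turn one output row into a dict, or None for padded/invalid rows.

    >>> parse_detection_row_sketch([-1.0, 0.0, 0.0, 0.0, 0.0, 0.0]) is None
    True
    >>> parse_detection_row_sketch([0.0, 0.87, 0.1, 0.2, 0.4, 0.6])['score']
    0.87
    """
    cls_id, score, xmin, ymin, xmax, ymax = row
    if cls_id < 0:
        return None
    return {'class': int(cls_id), 'score': score, 'bbox': (xmin, ymin, xmax, ymax)}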
def get_symbol_train(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): """ Single-shot multi-box detection with VGG 16 layers ConvNet This is a modified version, with fc6/fc7 layers replaced by conv layers And the network is slightly smaller than original VGG 16 network This is a training network with losses Parameters: ---------- num_classes: int number of object classes not including background nms_thresh : float non-maximum suppression threshold force_suppress : boolean whether suppress different class objects nms_topk : int apply NMS to top K detections Returns: ---------- mx.Symbol """ data = mx.symbol.Variable(name="data") label = mx.symbol.Variable(name="label") # group 1 conv1_1 = mx.symbol.Convolution( data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") conv1_2 = mx.symbol.Convolution( data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") pool1 = mx.symbol.Pooling( data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") # group 2 conv2_1 = mx.symbol.Convolution( data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") conv2_2 = mx.symbol.Convolution( data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") pool2 = mx.symbol.Pooling( data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") # group 3 conv3_1 = mx.symbol.Convolution( data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") conv3_2 = mx.symbol.Convolution( data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") conv3_3 = mx.symbol.Convolution( data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") pool3 = mx.symbol.Pooling( data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ pooling_convention="full", name="pool3") # group 4 conv4_1 = mx.symbol.Convolution( data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") conv4_2 = mx.symbol.Convolution( data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") conv4_3 = mx.symbol.Convolution( data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") pool4 = mx.symbol.Pooling( data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") # group 5 conv5_1 = mx.symbol.Convolution( data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") conv5_2 = mx.symbol.Convolution( data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") conv5_3 = mx.symbol.Convolution( data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") relu5_3 = mx.symbol.Activation(data=conv5_3, 
act_type="relu", name="relu5_3") pool5 = mx.symbol.Pooling( data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), pad=(1,1), name="pool5") # group 6 conv6 = mx.symbol.Convolution( data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6), num_filter=1024, name="conv6") relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6") # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") # group 7 conv7 = mx.symbol.Convolution( data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="conv7") relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7") # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") ### ssd extra layers ### conv8_1, relu8_1 = legacy_conv_act_layer(relu7, "8_1", 256, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv8_2, relu8_2 = legacy_conv_act_layer(relu8_1, "8_2", 512, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) conv9_1, relu9_1 = legacy_conv_act_layer(relu8_2, "9_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv9_2, relu9_2 = legacy_conv_act_layer(relu9_1, "9_2", 256, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) conv10_1, relu10_1 = legacy_conv_act_layer(relu9_2, "10_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv10_2, relu10_2 = legacy_conv_act_layer(relu10_1, "10_2", 256, kernel=(3,3), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv11_1, relu11_1 = legacy_conv_act_layer(relu10_2, "11_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv11_2, relu11_2 = legacy_conv_act_layer(relu11_1, "11_2", 256, kernel=(3,3), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) # specific parameters for VGG16 network from_layers = [relu4_3, relu7, relu8_2, relu9_2, relu10_2, relu11_2] sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ [1,2,.5], [1,2,.5]] normalizations = [20, -1, -1, -1, -1, -1] steps = [ x / 300.0 for x in [8, 16, 32, 64, 100, 300]] num_channels = [512] loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_channels, clip=False, interm_layer=0, steps=steps) tmp = mx.symbol.contrib.MultiBoxTarget( *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target") loc_target = tmp[0] loc_target_mask = tmp[1] cls_target = tmp[2] cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \ normalization='valid', name="cls_prob") loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ normalization='valid', name="loc_loss") # monitoring training status cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") det = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") # group output out = 
mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) return out
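# Illustrative helper (not in the original code): the `sizes` and `steps` in the VGG16
# setup above are fractions of the input resolution, so for a 300x300 input the first
# prediction layer (relu4_3, step 8/300) uses anchors of roughly 30 px and 42 px.
def _anchor_pixels_sketch(sizes, data_shape=300):
    """Convert normalized SSD anchor sizes to pixels.

    >>> _anchor_pixels_sketch([[.1, .141], [.2, .272]])
    [[30.0, 42.3], [60.0, 81.6]]
    """
    return [[round(s * data_shape, 1) for s in layer] for layer in sizes]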
def get_symbol_train(num_classes=1, nms_thresh=0.5, force_suppress=False, nms_topk=400): """ Single-shot multi-box detection with VGG 16 layers ConvNet This is a modified version, with fc6/fc7 layers replaced by conv layers And the network is slightly smaller than original VGG 16 network This is a training network with losses Parameters: ---------- num_classes: int number of object classes not including background nms_thresh : float non-maximum suppression threshold force_suppress : boolean whether suppress different class objects nms_topk : int apply NMS to top K detections Returns: ---------- mx.Symbol """ data = mx.symbol.Variable(name="data") label = mx.symbol.Variable(name="label") # group 1 conv1_1 = mx.symbol.Convolution(data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") conv1_2 = mx.symbol.Convolution(data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") pool1 = mx.symbol.Pooling(data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") # group 2 conv2_1 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") conv2_2 = mx.symbol.Convolution(data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") pool2 = mx.symbol.Pooling(data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") # group 3 conv3_1 = mx.symbol.Convolution(data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") conv3_2 = mx.symbol.Convolution(data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") conv3_3 = mx.symbol.Convolution(data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") pool3 = mx.symbol.Pooling( data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ pooling_convention="full", name="pool3") # group 4 conv4_1 = mx.symbol.Convolution(data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") conv4_2 = mx.symbol.Convolution(data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") conv4_3 = mx.symbol.Convolution(data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") pool4 = mx.symbol.Pooling(data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") # group 5 conv5_1 = mx.symbol.Convolution(data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") conv5_2 = mx.symbol.Convolution(data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") conv5_3 = mx.symbol.Convolution(data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") pool5 = 
mx.symbol.Pooling(data=relu5_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name='pool5') P5 = mx.symbol.Convolution(data=pool5, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P5') P5_topdown = mx.symbol.Deconvolution(data=P5, num_filter=256, kernel=(4, 4), stride=(2, 2), pad=(1, 1), name='P5_topdown') P4_lateral = mx.symbol.Convolution(data=pool4, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P4_lateral') P4 = P4_lateral + P5_topdown P4_topdown = mx.symbol.Deconvolution(data=P4, num_filter=256, kernel=(4, 4), stride=(2, 2), pad=(1, 1), name='P4_topdown') P3_lateral = mx.symbol.Convolution(data=pool3, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P3_lateral') P3 = P3_lateral + P4_topdown P3_topdown = mx.symbol.Deconvolution(data=P3, num_filter=256, kernel=(4, 4), stride=(2, 2), pad=(1, 1), name='P3_topdown') P2_lateral = mx.symbol.Convolution(data=pool2, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P2_lateral') P2 = P3_topdown + P2_lateral # specific parameters for VGG16 network from_layers = [P2, P3, P4, P5] sizes = [[0.01, .03], [.05, .07], [.09, .11], [.13, .15]] ratios = [[ 1, ], [ 1, ], [ 1, ], [ 1, ]] normalizations = [20, -1, -1, -1] steps = [x / 640.0 for x in [4, 8, 16, 32]] num_channels = [256] loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_channels, clip=False, interm_layer=0, steps=steps) tmp = mx.contrib.symbol.MultiBoxTarget( *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target") loc_target = tmp[0] loc_target_mask = tmp[1] cls_target = tmp[2] cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \ normalization='valid', name="cls_prob") loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ normalization='valid', name="loc_loss") # monitoring training status cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") det = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") # group output out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) return out
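# Sketch of why the kernel=(4, 4), stride=(2, 2), pad=(1, 1) Deconvolution settings in
# the top-down path above exactly double the spatial size, so P5_topdown aligns with
# pool4, P4_topdown with pool3, and P3_topdown with pool2. The formula is the standard
# transposed-convolution output size; the helper itself is hypothetical.
def _deconv_out_size_sketch(in_size, kernel=4, stride=2, pad=1):
    """Output spatial size of an MXNet Deconvolution (no output adjustment).

    >>> [_deconv_out_size_sketch(s) for s in (20, 40, 80)]
    [40, 80, 160]
    """
    return (in_size - 1) * stride - 2 * pad + kernel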
def pvanet_multibox(data, num_classes, use_global_stats=True, no_bias=False, lr_mult=1.0): ''' pvanet 10.0 ''' conv1 = conv_bn_relu(data, group_name='conv1', num_filter=16, kernel=(4, 4), pad=(1, 1), stride=(2, 2), no_bias=no_bias, use_global_stats=use_global_stats, use_crelu=True, lr_mult=lr_mult) # conv2 conv2 = mcrelu(conv1, prefix_group='conv2', filters=(16, 24, 48), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # conv3 conv3 = mcrelu(conv2, prefix_group='conv3', filters=(24, 48, 96), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc3a inc3a = inception(conv3, prefix_group='inc3a', filters_1=96, filters_3=(16, 64), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, do_pool=True, lr_mult=lr_mult) # inc3b inc3b = inception(inc3a, prefix_group='inc3b', filters_1=96, filters_3=(16, 64), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc3b/residual inc3b, inc3b_elt = residual_inc(conv3, inc3b, prefix_lhs='inc3a', prefix_rhs='inc3b', num_filter=128, stride=(2, 2), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc3c inc3c = inception(inc3b, prefix_group='inc3c', filters_1=96, filters_3=(16, 64), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc3d inc3d = inception(inc3c, prefix_group='inc3d', filters_1=96, filters_3=(16, 64), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc3e inc3e = inception(inc3d, prefix_group='inc3e', filters_1=96, filters_3=(16, 64), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc3e/residual inc3e, _ = residual_inc(inc3b_elt, inc3e, prefix_lhs='inc3c', prefix_rhs='inc3e', num_filter=128, stride=(1, 1), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc4a inc4a = inception(inc3e, prefix_group='inc4a', filters_1=128, filters_3=(32, 96), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, do_pool=True, lr_mult=lr_mult) # inc4b inc4b = inception(inc4a, prefix_group='inc4b', filters_1=128, filters_3=(32, 96), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc4b/residual inc4b, inc4b_elt = residual_inc(inc3e, inc4b, prefix_lhs='inc4a', prefix_rhs='inc4b', num_filter=192, stride=(2, 2), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc4c inc4c = inception(inc4b, prefix_group='inc4c', filters_1=128, filters_3=(32, 96), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc4d inc4d = inception(inc4c, prefix_group='inc4d', filters_1=128, filters_3=(32, 96), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc4e inc4e = inception(inc4d, prefix_group='inc4e', filters_1=128, filters_3=(32, 96), filters_5=(16, 32, 32), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # inc4e/residual inc4e, _ = residual_inc(inc4b_elt, inc4e, prefix_lhs='inc4c', prefix_rhs='inc4e', num_filter=384, stride=(1, 1), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) # hyperfeature downsample = mx.sym.Pooling(conv3, name='downsample', kernel=(3, 3), pad=(0, 0), stride=(2, 2), pool_type='max', pooling_convention='full') upsample = mx.sym.UpSampling(inc4e, name='upsample', scale=2, sample_type='bilinear', num_filter=384, num_args=2) concat = mx.sym.concat(downsample, inc3e, 
upsample) # TODO: feature size tuning # For now I will just use 256. # feature size would be (n, 256, 32, 32) convf = conv_bn_relu(concat, group_name='convf_16', num_filter=256, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) from_layers = [convf] sizes = [(32.0 / 512.0)] feat_strides = [16, 32, 64, 128, 256] for fs in feat_strides[1:]: projf = conv_bn_relu(convf, group_name='projf_{}'.format(fs), num_filter=64, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) convf = conv_bn_relu(projf, group_name='convf_{}'.format(fs), num_filter=256, pad=(1, 1), kernel=(3, 3), stride=(2, 2), no_bias=no_bias, use_global_stats=use_global_stats, lr_mult=lr_mult) from_layers.append(convf) sizes.append((fs * 2.0 / 512.0)) ratios = [(1.0, 0.5, 2.0)] * len(from_layers) normalizations = [(-1)] * len(from_layers) num_channels = [256] loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_channels, clip=False, interm_layer=0, steps=feat_strides) return loc_preds, cls_preds, anchor_boxes
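# Hedged sketch showing how the (loc_preds, cls_preds, anchor_boxes) triple returned by
# pvanet_multibox can be wired into the same training head used by the other
# get_symbol_train variants in this file. It assumes pvanet_multibox and its helpers are
# importable; the loss wiring copies the pattern above rather than defining a new one.
import mxnet as mx

def pvanet_ssd_train_sketch(num_classes=20):
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('label')
    loc_preds, cls_preds, anchor_boxes = pvanet_multibox(data, num_classes)
    # assign anchors to ground truth and mine hard negatives
    tmp = mx.contrib.symbol.MultiBoxTarget(
        *[anchor_boxes, label, cls_preds], overlap_threshold=.5,
        ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0,
        negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2),
        name='multibox_target')
    loc_target, loc_target_mask, cls_target = tmp[0], tmp[1], tmp[2]
    # classification and localization losses, as in the other variants
    cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target,
        ignore_label=-1, use_ignore=True, multi_output=True,
        normalization='valid', name='cls_prob')
    loc_loss_ = mx.symbol.smooth_l1(name='loc_loss_',
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, normalization='valid', name='loc_loss')
    return mx.symbol.Group([cls_prob, loc_loss])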
def get_symbol(network, num_classes, from_layers, num_filters, sizes, ratios, strides, pads, normalizations=-1, steps=[], min_filter=128, nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): """Build network for testing SSD Parameters ---------- network : str base network symbol name num_classes : int number of object classes not including background from_layers : list of str feature extraction layers, use '' for add extra layers For example: from_layers = ['relu4_3', 'fc7', '', '', '', ''] which means extract feature from relu4_3 and fc7, adding 4 extra layers on top of fc7 num_filters : list of int number of filters for extra layers, you can use -1 for extracted features, however, if normalization and scale is applied, the number of filter for that layer must be provided. For example: num_filters = [512, -1, 512, 256, 256, 256] strides : list of int strides for the 3x3 convolution appended, -1 can be used for extracted feature layers pads : list of int paddings for the 3x3 convolution, -1 can be used for extracted layers sizes : list or list of list [min_size, max_size] for all layers or [[], [], []...] for specific layers ratios : list or list of list [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers normalizations : int or list of int use normalizations value for all layers or [...] for specific layers, -1 indicate no normalizations and scales steps : list specify steps for each MultiBoxPrior layer, leave empty, it will calculate according to layer dimensions min_filter : int minimum number of filters used in 1x1 convolution nms_thresh : float non-maximum suppression threshold force_suppress : boolean whether suppress different class objects nms_topk : int apply NMS to top K detections Returns ------- mx.Symbol """ body = import_module(network).get_symbol(num_classes, **kwargs) layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=min_filter) loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_filters, clip=False, interm_layer=0, steps=steps) cls_prob = mx.symbol.SoftmaxActivation(data=cls_preds, mode='channel', \ name='cls_prob') out = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) return out
def get_symbol_train(network, num_classes, from_layers, num_filters, strides, pads, sizes, ratios, normalizations=-1, steps=[], min_filter=128, nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): """Build network symbol for training SSD Parameters ---------- network : str base network symbol name num_classes : int number of object classes not including background from_layers : list of str feature extraction layers, use '' for add extra layers For example: from_layers = ['relu4_3', 'fc7', '', '', '', ''] which means extract feature from relu4_3 and fc7, adding 4 extra layers on top of fc7 num_filters : list of int number of filters for extra layers, you can use -1 for extracted features, however, if normalization and scale is applied, the number of filter for that layer must be provided. For example: num_filters = [512, -1, 512, 256, 256, 256] strides : list of int strides for the 3x3 convolution appended, -1 can be used for extracted feature layers pads : list of int paddings for the 3x3 convolution, -1 can be used for extracted layers sizes : list or list of list [min_size, max_size] for all layers or [[], [], []...] for specific layers ratios : list or list of list [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers normalizations : int or list of int use normalizations value for all layers or [...] for specific layers, -1 indicate no normalizations and scales steps : list specify steps for each MultiBoxPrior layer, leave empty, it will calculate according to layer dimensions min_filter : int minimum number of filters used in 1x1 convolution nms_thresh : float non-maximum suppression threshold force_suppress : boolean whether suppress different class objects nms_topk : int apply NMS to top K detections Returns ------- mx.Symbol """ label = mx.sym.Variable('label') body = import_module(network).get_symbol(num_classes, **kwargs) layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=min_filter) loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_filters, clip=False, interm_layer=0, steps=steps) tmp = mx.contrib.symbol.MultiBoxTarget( *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target") loc_target = tmp[0] loc_target_mask = tmp[1] cls_target = tmp[2] ''' Focal loss related ''' cls_prob_ = mx.sym.SoftmaxActivation(cls_preds, mode='channel') cls_prob = mx.sym.Custom(cls_preds, cls_prob_, cls_target, op_type='focal_loss', name='cls_prob', gamma=2.0, alpha=0.25, normalize=True) # cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ # ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \ # normalization='valid', name="cls_prob") loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ normalization='valid', name="loc_loss") # monitoring training status cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") det = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") # group output out = 
mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) return out
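# The 'focal_loss' Custom op used above is repo-specific; as a reference, a numpy sketch
# of the standard focal loss FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t) (Lin et al.)
# with the same gamma=2.0, alpha=0.25 looks like this. Purely illustrative; it is not
# the op's actual implementation.
import numpy as np

def focal_loss_sketch(p_t, gamma=2.0, alpha=0.25):
    """Mean focal loss given the predicted probability of the true class.

    >>> round(focal_loss_sketch(0.9), 5)
    0.00026
    """
    p_t = np.asarray(p_t, dtype=np.float64)
    return float(np.mean(-alpha * (1.0 - p_t) ** gamma * np.log(p_t)))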
def get_symbol_train(network, num_classes, from_layers, num_filters, strides, pads, sizes, ratios, normalizations=-1, steps=[], min_filter=128, nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): """Build network symbol for training SSD Parameters ---------- network : str base network symbol name num_classes : int number of object classes not including background from_layers : list of str feature extraction layers, use '' for add extra layers For example: from_layers = ['relu4_3', 'fc7', '', '', '', ''] which means extract feature from relu4_3 and fc7, adding 4 extra layers on top of fc7 num_filters : list of int number of filters for extra layers, you can use -1 for extracted features, however, if normalization and scale is applied, the number of filter for that layer must be provided. For example: num_filters = [512, -1, 512, 256, 256, 256] strides : list of int strides for the 3x3 convolution appended, -1 can be used for extracted feature layers pads : list of int paddings for the 3x3 convolution, -1 can be used for extracted layers sizes : list or list of list [min_size, max_size] for all layers or [[], [], []...] for specific layers ratios : list or list of list [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers normalizations : int or list of int use normalizations value for all layers or [...] for specific layers, -1 indicate no normalizations and scales steps : list specify steps for each MultiBoxPrior layer, leave empty, it will calculate according to layer dimensions min_filter : int minimum number of filters used in 1x1 convolution nms_thresh : float non-maximum suppression threshold force_suppress : boolean whether suppress different class objects nms_topk : int apply NMS to top K detections Returns ------- mx.Symbol """ label = mx.sym.Variable('label') body = import_module(network).get_symbol(num_classes, **kwargs) layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=min_filter) loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_filters, clip=False, interm_layer=0, steps=steps) tmp = mx.contrib.symbol.MultiBoxTarget( *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target") loc_target = tmp[0] loc_target_mask = tmp[1] cls_target = tmp[2] cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \ normalization='valid', name="cls_prob") loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ normalization='valid', name="loc_loss") # monitoring training status cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") det = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") # group output out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) return out
def get_symbol_train(num_classes=20): """ Single-shot multi-box detection with VGG 16 layers ConvNet This is a modified version, with fc6/fc7 layers replaced by conv layers And the network is slightly smaller than original VGG 16 network This is a training network with losses Parameters: ---------- num_classes: int number of object classes not including background Returns: ---------- mx.Symbol """ data = mx.symbol.Variable(name="data") label = mx.symbol.Variable(name="label") # group 1 conv1_1 = mx.symbol.Convolution(data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") conv1_2 = mx.symbol.Convolution(data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") pool1 = mx.symbol.Pooling(data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") # group 2 conv2_1 = mx.symbol.Convolution(data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") conv2_2 = mx.symbol.Convolution(data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") pool2 = mx.symbol.Pooling(data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") # group 3 conv3_1 = mx.symbol.Convolution(data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") conv3_2 = mx.symbol.Convolution(data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") conv3_3 = mx.symbol.Convolution(data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") pool3 = mx.symbol.Pooling( data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ pooling_convention="full", name="pool3") # group 4 conv4_1 = mx.symbol.Convolution(data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") conv4_2 = mx.symbol.Convolution(data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") conv4_3 = mx.symbol.Convolution(data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") pool4 = mx.symbol.Pooling(data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") # group 5 conv5_1 = mx.symbol.Convolution(data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") conv5_2 = mx.symbol.Convolution(data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") conv5_3 = mx.symbol.Convolution(data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") pool5 = mx.symbol.Pooling(data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), pad=(1, 1), name="pool5") # group 6 conv6 = mx.symbol.Convolution(data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6), num_filter=1024, 
name="conv6") relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6") # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") # group 7 conv7 = mx.symbol.Convolution(data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="conv7") relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7") # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") ### ssd extra layers ### conv8_1, relu8_1 = conv_act_layer(relu7, "8_1", 256, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv8_2, relu8_2 = conv_act_layer(relu8_1, "8_2", 512, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) conv9_1, relu9_1 = conv_act_layer(relu8_2, "9_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv9_2, relu9_2 = conv_act_layer(relu9_1, "9_2", 256, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) conv10_1, relu10_1 = conv_act_layer(relu9_2, "10_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv10_2, relu10_2 = conv_act_layer(relu10_1, "10_2", 256, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) # global Pooling pool10 = mx.symbol.Pooling(data=relu10_2, pool_type="avg", global_pool=True, kernel=(1, 1), name='pool10') # specific parameters for VGG16 network from_layers = [relu4_3, relu7, relu8_2, relu9_2, relu10_2, pool10] sizes = [[.1], [.2, .276], [.38, .461], [.56, .644], [.74, .825], [.92, 1.01]] ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ [1,2,.5,3,1./3], [1,2,.5,3,1./3]] normalizations = [20, -1, -1, -1, -1, -1] num_channels = [512] loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_channels, clip=True, interm_layer=0) tmp = mx.symbol.MultiBoxTarget( *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target") loc_target = tmp[0] loc_target_mask = tmp[1] cls_target = tmp[2] cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ ignore_label=-1, use_ignore=True, grad_scale=3., multi_output=True, \ normalization='valid', name="cls_prob") loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ normalization='valid', name="loc_loss") # monitoring training status cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") # group output out = mx.symbol.Group([cls_prob, loc_loss, cls_label]) return out
def get_symbol_train(num_classes=20): """ Single-shot multi-box detection with VGG 16 layers ConvNet This is a modified version, with fc6/fc7 layers replaced by conv layers And the network is slightly smaller than original VGG 16 network This is a training network with losses Parameters: ---------- num_classes: int number of object classes not including background Returns: ---------- mx.Symbol """ data = mx.symbol.Variable(name="data") label = mx.symbol.Variable(name="label") # group 1 conv1_1 = mx.symbol.Convolution( data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") conv1_2 = mx.symbol.Convolution( data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") pool1 = mx.symbol.Pooling( data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") # group 2 conv2_1 = mx.symbol.Convolution( data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") conv2_2 = mx.symbol.Convolution( data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") pool2 = mx.symbol.Pooling( data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") # group 3 conv3_1 = mx.symbol.Convolution( data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") conv3_2 = mx.symbol.Convolution( data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") conv3_3 = mx.symbol.Convolution( data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") pool3 = mx.symbol.Pooling( data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ pooling_convention="full", name="pool3") # group 4 conv4_1 = mx.symbol.Convolution( data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") conv4_2 = mx.symbol.Convolution( data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") conv4_3 = mx.symbol.Convolution( data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") pool4 = mx.symbol.Pooling( data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") # group 5 conv5_1 = mx.symbol.Convolution( data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") conv5_2 = mx.symbol.Convolution( data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") conv5_3 = mx.symbol.Convolution( data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") pool5 = mx.symbol.Pooling( data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), pad=(1,1), name="pool5") # group 6 conv6 = mx.symbol.Convolution( data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6), 
num_filter=1024, name="conv6") relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6") # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") # group 7 conv7 = mx.symbol.Convolution( data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="conv7") relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7") # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") ### ssd extra layers ### conv8_1, relu8_1 = conv_act_layer(relu7, "8_1", 256, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv8_2, relu8_2 = conv_act_layer(relu8_1, "8_2", 512, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) conv9_1, relu9_1 = conv_act_layer(relu8_2, "9_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv9_2, relu9_2 = conv_act_layer(relu9_1, "9_2", 256, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) conv10_1, relu10_1 = conv_act_layer(relu9_2, "10_1", 128, kernel=(1,1), pad=(0,0), \ stride=(1,1), act_type="relu", use_batchnorm=False) conv10_2, relu10_2 = conv_act_layer(relu10_1, "10_2", 256, kernel=(3,3), pad=(1,1), \ stride=(2,2), act_type="relu", use_batchnorm=False) # global Pooling pool10 = mx.symbol.Pooling(data=relu10_2, pool_type="avg", global_pool=True, kernel=(1,1), name='pool10') # specific parameters for VGG16 network from_layers = [relu4_3, relu7, relu8_2, relu9_2, relu10_2, pool10] sizes = [[.1], [.2,.276], [.38, .461], [.56, .644], [.74, .825], [.92, 1.01]] ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ [1,2,.5,3,1./3], [1,2,.5,3,1./3]] normalizations = [20, -1, -1, -1, -1, -1] num_channels = [512] loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ num_channels=num_channels, clip=True, interm_layer=0) tmp = mx.contrib.symbol.MultiBoxTarget( *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target") loc_target = tmp[0] loc_target_mask = tmp[1] cls_target = tmp[2] cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ ignore_label=-1, use_ignore=True, grad_scale=3., multi_output=True, \ normalization='valid', name="cls_prob") loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ normalization='valid', name="loc_loss") # monitoring training status cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") # group output out = mx.symbol.Group([cls_prob, loc_loss, cls_label]) return out
def get_symbol_train(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=400):
    label = mx.symbol.Variable(name="label")

    # backbone feature maps from a helper defined elsewhere in this repo
    feature_net1, feature_net2, feature_net3, feature_net4 = get_feature_layer()

    # extra down-sampling layers stacked on the deepest backbone map
    conv1, relu1 = conv_act_layer(feature_net4, "8_1", 512, stride=(2, 2))
    conv2, relu2 = conv_act_layer(relu1, "9_1", 512, stride=(2, 2))
    conv3, relu3 = conv_act_layer(relu2, "10_1", 512, stride=(2, 2))
    conv4, relu4 = conv_act_layer(relu3, "11_1", 512, stride=(1, 1), pad=(0, 0), kernel=(3, 3))

    # top-down deconvolution pathway fusing deeper features back into shallower maps
    deconv1 = deconv_layer(relu4, relu3, deconv_kernel=(3, 3), deconv_pad=(0, 0))
    deconv2 = deconv_layer(deconv1, relu2)
    deconv3 = deconv_layer(deconv2, relu1, deconv_kernel=(2, 2), deconv_pad=(0, 0))
    deconv4 = deconv_layer(deconv3, feature_net4, deconv_kernel=(2, 2), deconv_pad=(0, 0))
    deconv5 = deconv_layer(deconv4, feature_net2, deconv_kernel=(2, 2), deconv_pad=(0, 0))

    # residual prediction modules, one per scale
    layer1 = residual_predict(relu4)
    layer2 = residual_predict(deconv1)
    layer3 = residual_predict(deconv2)
    layer4 = residual_predict(deconv3)
    layer5 = residual_predict(deconv4)
    layer6 = residual_predict(deconv5)

    # prediction layers, ordered from the most upsampled map (deconv5) to the deepest (relu4)
    from_layers = [layer6, layer5, layer4, layer3, layer2, layer1]
    sizes = [[.1, .141], [.2, .272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]]
    ratios = [[1, 2, .5], [1, 2, .5, 3, 1. / 3], [1, 2, .5, 3, 1. / 3],
              [1, 2, .5, 3, 1. / 3], [1, 2, .5], [1, 2, .5]]
    loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, num_classes,
                                                        sizes=sizes, ratios=ratios,
                                                        clip=False, interm_layer=0)

    tmp = mx.contrib.symbol.MultiBoxTarget(
        *[anchor_boxes, label, cls_preds], overlap_threshold=.5,
        ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0,
        negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target")
    loc_target = tmp[0]
    loc_target_mask = tmp[1]
    cls_target = tmp[2]

    cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target,
        ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True,
        normalization='valid', name="cls_prob")
    loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_",
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1.,
        normalization='valid', name="loc_loss")

    # monitoring training status
    cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label")
    det = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes],
        name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
        variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
    det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out")

    # group output
    out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det])
    return out
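
#######################################################
# Hedged usage sketch (not part of the original code): builds the deconvolution-based
# training symbol above and lists its inputs and grouped outputs. It assumes the
# repo-local helpers (get_feature_layer, conv_act_layer, deconv_layer,
# residual_predict, multibox_layer) are importable and that this definition of
# get_symbol_train is the one in scope; num_classes=20 is an illustrative value.
def _check_deconv_ssd_train_symbol():
    net = get_symbol_train(num_classes=20, nms_thresh=0.45, nms_topk=400)
    print(net.list_outputs())     # ends with 'det_out_output' from the MultiBoxDetection branch
    print(net.list_arguments())   # includes the 'label' variable consumed by MultiBoxTarget
    return net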
def get_symbol_train(seq_len):
    """
    Single-shot multi-box detection with a configurable backbone (VGG16, VGG16-BN,
    ResNet or Inception-v3, selected through cfg.NETWORK) and an FPN-style top-down
    pathway. The c3/c4/c5 feature maps are modulated by an RNN feature computed from
    the input expression sequence. This is a training network with losses; the NMS
    settings come from cfg (cfg.NMS_THRESHOLD, cfg.FORCE_SUPPRESS, cfg.NMS_TOPK).

    Parameters:
    ----------
    seq_len : int
        length of the expression token sequence fed to the RNN branch

    Returns:
    ----------
    (mx.Symbol, data_names, label_names)
    """
    network = cfg.NETWORK
    num_classes = cfg.NUM_CLASSES

    data = mx.symbol.Variable(name="data")
    expression = mx.symbol.Variable(name='expression')
    label = mx.symbol.Variable(name="label")

    # backbone
    if network == 'vgg16':
        c5, c4, c3, _ = symbol_vgg(data)
    elif network == 'vgg16_bn':
        c5, c4, c3, _ = symbol_vgg_bn(data)
    elif network.startswith('resnet'):
        num_layers = int(network.split('_')[-1])
        if num_layers >= 50:
            filter_list = [64, 256, 512, 1024, 2048]
            bottle_neck = True
        else:
            filter_list = [64, 64, 128, 256, 512]
            bottle_neck = False
        # num_stages = 4
        if num_layers == 18:
            units = [2, 2, 2, 2]
        elif num_layers == 34:
            units = [3, 4, 6, 3]
        elif num_layers == 50:
            units = [3, 4, 6, 3]
        elif num_layers == 101:
            units = [3, 4, 23, 3]
        elif num_layers == 152:
            units = [3, 8, 36, 3]
        elif num_layers == 200:
            units = [3, 24, 36, 3]
        elif num_layers == 269:
            units = [3, 30, 48, 8]
        else:
            raise ValueError("no experiments done on num_layers {}, "
                             "you can do it yourself".format(num_layers))
        c5, c4, c3, _ = symbol_resnet(data, units, filter_list, bottle_neck)
    elif network == 'inceptionv3':
        c5, c4, c3, _ = symbol_Inception_v3(data)

    # expression feature from the RNN branch, injected into c3/c4/c5 via residual attention units
    rnn_feat = get_rnn_feat(seq_len, expression)
    c5 = residual_att_unit(data=c5, express=rnn_feat, ratio=0.75, num_filter=512, stride=(1, 1),
                           bottle_neck=False, dim_match=False, name='c5', deform=False)
    c4 = residual_att_unit(data=c4, express=rnn_feat, ratio=0.5, num_filter=256, stride=(1, 1),
                           bottle_neck=False, dim_match=False, name='c4', deform=False)
    c3 = residual_att_unit(data=c3, express=rnn_feat, ratio=0.25, num_filter=128, stride=(1, 1),
                           bottle_neck=False, dim_match=False, name='c3', deform=False)

    # FPN-style pyramid
    P6 = mx.symbol.Convolution(data=c5, num_filter=256, kernel=(3, 3), stride=(2, 2), pad=(1, 1), name='P6')
    p6_relu = mx.symbol.Activation(data=P6, act_type='relu', name='p6_relu')
    P7 = mx.symbol.Convolution(data=p6_relu, num_filter=256, kernel=(3, 3), stride=(2, 2), pad=(1, 1), name='P7')
    P5 = mx.symbol.Convolution(data=c5, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P5')
    P5_topdown = mx.symbol.Deconvolution(data=P5, num_filter=256, kernel=(4, 4), stride=(2, 2), pad=(1, 1), name='P5_topdown')
    P4_lateral = mx.symbol.Convolution(data=c4, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P4_lateral')
    P4 = mx.sym.elemwise_add(P4_lateral, P5_topdown, name='P4')
    P4_topdown = mx.symbol.Deconvolution(data=P4, num_filter=256, kernel=(4, 4), stride=(2, 2), pad=(1, 1), name='P4_topdown')
    P3_lateral = mx.symbol.Convolution(data=c3, num_filter=256, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name='P3_lateral')
    P3 = mx.sym.elemwise_add(P3_lateral, P4_topdown, name='P3')

    # specific parameters
    from_layers = [P7, P6, P5, P4, P3]
    sizes = [[0.01, .1], [.2, .3], [.4, .5], [.6, .7], [.9, 1.]]
    ratios = [[1], [1], [1], [1], [1]]
    normalizations = [20, -1, -1, -1, -1]
    steps = [x / 640.0 for x in [4, 8, 16, 32, 32]]
    num_channels = [256]
    loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers,
        num_classes, sizes=sizes, ratios=ratios, normalization=normalizations,
        num_channels=num_channels, clip=False, interm_layer=0, steps=steps)

    tmp = mx.contrib.symbol.MultiBoxTarget(
        *[anchor_boxes, label, cls_preds], overlap_threshold=.5,
        ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0,
        negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target")
    loc_target = tmp[0]
    loc_target_mask = tmp[1]
    cls_target = tmp[2]

    cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target,
        ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True,
        normalization='valid', name="cls_prob")
    loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_",
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1.,
        normalization='valid', name="loc_loss")

    # monitoring training status
    cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label")
    det = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes],
        name="detection", nms_threshold=cfg.NMS_THRESHOLD, force_suppress=cfg.FORCE_SUPPRESS,
        variances=(0.1, 0.1, 0.2, 0.2), nms_topk=cfg.NMS_TOPK)
    det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out")

    # group output
    out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det])
    return out, ('expression', 'data',), ('label',)
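
#######################################################
# Hedged usage sketch (not part of the original code): the function above returns the
# grouped training symbol together with its data and label names, which is the form
# mx.mod.Module expects. cfg, the backbone symbols and the RNN/attention helpers must
# be importable; seq_len=10 and the CPU context are illustrative values only.
def _build_expression_ssd_module(seq_len=10):
    sym, data_names, label_names = get_symbol_train(seq_len)
    mod = mx.mod.Module(symbol=sym, data_names=data_names, label_names=label_names,
                        context=mx.cpu())
    print(sym.list_outputs())
    return mod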
def get_symbol_train(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=400):
    data = mx.symbol.Variable(name="data")
    label = mx.symbol.Variable(name="label")

    # Depthwise-separable (MobileNet-style) backbone. ChannelwiseConvolution is a depthwise
    # convolution operator from a custom MXNet build; stock MXNet expresses the same layer
    # as mx.symbol.Convolution with num_group equal to the input channel count.

    # group 1 (the original VGG conv1_x/pool1 block is replaced by a strided stem convolution)
    conv1 = mx.symbol.Convolution(name='conv1', data=data, num_filter=32, pad=(1, 1), kernel=(3, 3), stride=(2, 2), no_bias=True)
    conv1_bn = mx.symbol.BatchNorm(name='conv1_bn', data=conv1, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv1_scale = conv1_bn
    relu1 = mx.symbol.Activation(name='relu1', data=conv1_scale, act_type='relu')

    # group 2 (replaces the original VGG conv2_x/pool2 block)
    conv2_1_dw = mx.symbol.ChannelwiseConvolution(name='conv2_1_dw', data=relu1, num_filter=32, pad=(1, 1), kernel=(3, 3), stride=(1, 1), no_bias=True, num_group=32)
    conv2_1_dw_bn = mx.symbol.BatchNorm(name='conv2_1_dw_bn', data=conv2_1_dw, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv2_1_dw_scale = conv2_1_dw_bn
    relu2_1_dw = mx.symbol.Activation(name='relu2_1_dw', data=conv2_1_dw_scale, act_type='relu')
    conv2_1_sep = mx.symbol.Convolution(name='conv2_1_sep', data=relu2_1_dw, num_filter=64, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
    conv2_1_sep_bn = mx.symbol.BatchNorm(name='conv2_1_sep_bn', data=conv2_1_sep, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv2_1_sep_scale = conv2_1_sep_bn
    relu2_1_sep = mx.symbol.Activation(name='relu2_1_sep', data=conv2_1_sep_scale, act_type='relu')
    conv2_2_dw = mx.symbol.ChannelwiseConvolution(name='conv2_2_dw', data=relu2_1_sep, num_filter=64, pad=(1, 1), kernel=(3, 3), stride=(2, 2), no_bias=True, num_group=64)
    conv2_2_dw_bn = mx.symbol.BatchNorm(name='conv2_2_dw_bn', data=conv2_2_dw, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv2_2_dw_scale = conv2_2_dw_bn
    relu2_2_dw = mx.symbol.Activation(name='relu2_2_dw', data=conv2_2_dw_scale, act_type='relu')
    conv2_2_sep = mx.symbol.Convolution(name='conv2_2_sep', data=relu2_2_dw, num_filter=128, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
    conv2_2_sep_bn = mx.symbol.BatchNorm(name='conv2_2_sep_bn', data=conv2_2_sep, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv2_2_sep_scale = conv2_2_sep_bn
    relu2_2_sep = mx.symbol.Activation(name='relu2_2_sep', data=conv2_2_sep_scale, act_type='relu')

    # group 3 (replaces the original VGG conv3_x/pool3 block)
    conv3_1_dw = mx.symbol.ChannelwiseConvolution(name='conv3_1_dw', data=relu2_2_sep, num_filter=128, pad=(1, 1), kernel=(3, 3), stride=(1, 1), no_bias=True, num_group=128)
    conv3_1_dw_bn = mx.symbol.BatchNorm(name='conv3_1_dw_bn', data=conv3_1_dw, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv3_1_dw_scale = conv3_1_dw_bn
    relu3_1_dw = mx.symbol.Activation(name='relu3_1_dw', data=conv3_1_dw_scale, act_type='relu')
    conv3_1_sep = mx.symbol.Convolution(name='conv3_1_sep', data=relu3_1_dw, num_filter=128, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
    conv3_1_sep_bn = mx.symbol.BatchNorm(name='conv3_1_sep_bn', data=conv3_1_sep, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv3_1_sep_scale = conv3_1_sep_bn
    relu3_1_sep = mx.symbol.Activation(name='relu3_1_sep', data=conv3_1_sep_scale, act_type='relu')
    conv3_2_dw = mx.symbol.ChannelwiseConvolution(name='conv3_2_dw', data=relu3_1_sep, num_filter=128, pad=(1, 1), kernel=(3, 3), stride=(2, 2), no_bias=True, num_group=128)
    conv3_2_dw_bn = mx.symbol.BatchNorm(name='conv3_2_dw_bn', data=conv3_2_dw, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv3_2_dw_scale = conv3_2_dw_bn
    relu3_2_dw = mx.symbol.Activation(name='relu3_2_dw', data=conv3_2_dw_scale, act_type='relu')
    conv3_2_sep = mx.symbol.Convolution(name='conv3_2_sep', data=relu3_2_dw, num_filter=256, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
    conv3_2_sep_bn = mx.symbol.BatchNorm(name='conv3_2_sep_bn', data=conv3_2_sep, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv3_2_sep_scale = conv3_2_sep_bn
    relu3_2_sep = mx.symbol.Activation(name='relu3_2_sep', data=conv3_2_sep_scale, act_type='relu')

    # group 4 (replaces the original VGG conv4_x/pool4 block)
    conv4_1_dw = mx.symbol.ChannelwiseConvolution(name='conv4_1_dw', data=relu3_2_sep, num_filter=256, pad=(1, 1), kernel=(3, 3), stride=(1, 1), no_bias=True, num_group=256)
    conv4_1_dw_bn = mx.symbol.BatchNorm(name='conv4_1_dw_bn', data=conv4_1_dw, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv4_1_dw_scale = conv4_1_dw_bn
    relu4_1_dw = mx.symbol.Activation(name='relu4_1_dw', data=conv4_1_dw_scale, act_type='relu')
    conv4_1_sep = mx.symbol.Convolution(name='conv4_1_sep', data=relu4_1_dw, num_filter=256, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
    conv4_1_sep_bn = mx.symbol.BatchNorm(name='conv4_1_sep_bn', data=conv4_1_sep, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv4_1_sep_scale = conv4_1_sep_bn
    relu4_1_sep = mx.symbol.Activation(name='relu4_1_sep', data=conv4_1_sep_scale, act_type='relu')
    conv4_2_dw = mx.symbol.ChannelwiseConvolution(name='conv4_2_dw', data=relu4_1_sep, num_filter=256, pad=(1, 1), kernel=(3, 3), stride=(2, 2), no_bias=True, num_group=256)
    conv4_2_dw_bn = mx.symbol.BatchNorm(name='conv4_2_dw_bn', data=conv4_2_dw, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv4_2_dw_scale = conv4_2_dw_bn
    relu4_2_dw = mx.symbol.Activation(name='relu4_2_dw', data=conv4_2_dw_scale, act_type='relu')
    conv4_2_sep = mx.symbol.Convolution(name='conv4_2_sep', data=relu4_2_dw, num_filter=512, pad=(0, 0), kernel=(1, 1), stride=(1, 1), no_bias=True)
    conv4_2_sep_bn = mx.symbol.BatchNorm(name='conv4_2_sep_bn', data=conv4_2_sep, use_global_stats=False, fix_gamma=False, eps=0.000100)
    conv4_2_sep_scale = conv4_2_sep_bn
    relu4_2_sep = mx.symbol.Activation(name='relu4_2_sep', data=conv4_2_sep_scale, act_type='relu')

    # group 5 (standard VGG-style convolutions retained)
    conv5_1 = mx.symbol.Convolution(data=relu4_2_sep, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1")
    relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1")
    conv5_2 = mx.symbol.Convolution(data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2")
    relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2")
    conv5_3 = mx.symbol.Convolution(data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3")
    relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3")
    pool5 = mx.symbol.Pooling(data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), pad=(1, 1), name="pool5")

    # group 6
    conv6 = mx.symbol.Convolution(data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6),
                                  num_filter=1024, name="conv6")
    relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6")
    # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6")
    # group 7
    conv7 = mx.symbol.Convolution(data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="conv7")
    relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7")
    # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7")

    ### ssd extra layers ###
    conv8_2, relu8_2 = conv_act_layer(relu7, "8_2", 512, kernel=(3, 3), pad=(1, 1),
                                      stride=(2, 2), act_type="relu", use_batchnorm=False)
    conv9_2, relu9_2 = conv_act_layer(relu8_2, "9_2", 256, kernel=(3, 3), pad=(1, 1),
                                      stride=(2, 2), act_type="relu", use_batchnorm=False)
    conv10_2, relu10_2 = conv_act_layer(relu9_2, "10_2", 256, kernel=(3, 3), pad=(0, 0),
                                        stride=(1, 1), act_type="relu", use_batchnorm=False)
    conv11_2, relu11_2 = conv_act_layer(relu10_2, "11_2", 256, kernel=(3, 3), pad=(0, 0),
                                        stride=(1, 1), act_type="relu", use_batchnorm=False)

    # specific parameters for the detection heads
    from_layers = [relu4_1_sep, relu7, relu8_2, relu9_2, relu10_2, relu11_2]
    sizes = [[.1, .141], [.2, .272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]]
    ratios = [[1, 2, .5], [1, 2, .5, 3, 1./3], [1, 2, .5, 3, 1./3], [1, 2, .5, 3, 1./3],
              [1, 2, .5], [1, 2, .5]]
    normalizations = [20, -1, -1, -1, -1, -1]
    steps = [x / 300.0 for x in [8, 16, 32, 64, 100, 300]]
    num_channels = [512]
    loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers,
        num_classes, sizes=sizes, ratios=ratios, normalization=normalizations,
        num_channels=num_channels, clip=False, interm_layer=0, steps=steps)

    tmp = mx.contrib.symbol.MultiBoxTarget(
        *[anchor_boxes, label, cls_preds], overlap_threshold=.5,
        ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0,
        negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), name="multibox_target")
    loc_target = tmp[0]
    loc_target_mask = tmp[1]
    cls_target = tmp[2]

    cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target,
        ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True,
        normalization='valid', name="cls_prob")
    loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_",
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1.,
        normalization='valid', name="loc_loss")

    # monitoring training status
    cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label")
    det = mx.contrib.symbol.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes],
        name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress,
        variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk)
    det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out")

    # group output
    out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det])
    return out
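
#######################################################
# Hedged usage sketch (not part of the original code): constructs the depthwise-separable
# SSD training symbol above and prints its grouped outputs. It assumes an MXNet build
# that registers the custom ChannelwiseConvolution operator (stock MXNet would raise an
# AttributeError when the symbol is built) and that the repo helpers such as
# conv_act_layer and multibox_layer are importable; num_classes=20 is illustrative.
def _check_mobilenet_ssd_train_outputs(num_classes=20):
    net = get_symbol_train(num_classes=num_classes, nms_thresh=0.5, nms_topk=400)
    # expected: ['cls_prob_output', 'loc_loss_output', 'cls_label_output', 'det_out_output']
    print(net.list_outputs())
    return net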