Example #1
0
def get_ssd_md_conv_down(conv_feat):
    """Fuse SSD-MD backbone features top-down, FPN style.

    ``conv_feat`` is ordered coarse-to-fine as [C7, C6, C5, C4, C3].
    C7 and C6 are passed through unchanged; C5/C4/C3 are projected to
    256 channels and merged with lateral sums.

    Returns a (dict keyed by "strideN", coarse-to-fine list) pair.
    """
    conv_C7, conv_C6, conv_C5, conv_C4, conv_C3 = conv_feat

    # C5 -> P5: 1x1 lateral projection to 256 channels.  No 2x upsampling
    # before merging with C4 (presumably C5 and C4 share a stride in this
    # dilated backbone variant -- confirm against get_ssd_md_conv).
    P5 = conv_act_layer(from_layer=conv_C5, kernel=(1, 1), num_filter=256,
                        name="P5_lateral", use_act=False)

    # P5 (cropped to C4's size) + lateral C4 -> P4
    P4_la = conv_act_layer(from_layer=conv_C4, kernel=(1, 1), num_filter=256,
                           name="P4_lateral", use_act=False)
    P5_clip = mx.symbol.Crop(*[P5, P4_la], name="P4_clip")
    P4 = mx.sym.ElementWiseSum(*[P5_clip, P4_la], name="P4_sum")
    # Upsample the raw sum (before 3x3 smoothing) to feed the next level.
    P4_up = mx.symbol.UpSampling(P4, scale=2, sample_type='nearest',
                                 workspace=512, name='P4_upsampling', num_args=1)
    P4 = conv_act_layer(from_layer=P4, kernel=(3, 3), pad=(1, 1),
                        num_filter=256, name="P4", use_act=False)

    # 2x-upsampled P4 (cropped) + lateral C3 -> P3
    P3_la = conv_act_layer(from_layer=conv_C3, kernel=(1, 1), num_filter=256,
                           name="P3_lateral", use_act=False)
    P4_clip = mx.symbol.Crop(*[P4_up, P3_la], name="P3_clip")
    P3 = mx.sym.ElementWiseSum(*[P4_clip, P3_la], name="P3_sum")
    P3 = conv_act_layer(from_layer=P3, kernel=(3, 3), pad=(1, 1),
                        num_filter=256, name="P3", use_act=False)

    conv_fpn_feat = {
        "stride64": conv_C7,
        "stride32": conv_C6,
        "stride16": P5,
        "stride8": P4,
        "stride4": P3,
    }
    return conv_fpn_feat, [conv_C7, conv_C6, P5, P4, P3]
Example #2
0
def get_detnet_conv_down(conv_feat):
    """Build the top-down feature pyramid for the DetNet backbone.

    ``conv_feat`` is ordered coarse-to-fine as [C6, C5, C4, C3, C2].
    Each level is projected to 256 channels, merged top-down via lateral
    sums, then smoothed with a 3x3 conv.

    Returns:
    ----------
    (dict mapping "strideN" -> symbol, [P6, P5, P4, P3, P2])
    """
    conv_C6, conv_C5, conv_C4, conv_C3, conv_C2 = conv_feat
    # C6 to P6, 1x1 dimension reduction to 256
    P6 = conv_act_layer(from_layer=conv_C6, kernel=(1, 1), num_filter=256, name="P6_lateral", use_act=False)

    # P6 + C5 = P5 (plain sum: no upsample/crop -- C6/C5 presumably share a
    # stride in DetNet; confirm against the backbone builder)
    P5 = conv_act_layer(from_layer=conv_C5, kernel=(1, 1), num_filter=256, name="P5_lateral", use_act=False)
    P5 = mx.symbol.ElementWiseSum(*[P6, P5], name="P5_sum")

    # P5 + C4 = P4 (again a plain same-resolution sum)
    P4_la   = conv_act_layer(from_layer=conv_C4, kernel=(1, 1), num_filter=256, name="P4_lateral", use_act=False)
    P4      = mx.sym.ElementWiseSum(*[P5, P4_la], name="P4_sum")
    P4_up = mx.symbol.UpSampling(P4, scale=2, sample_type='nearest', workspace=512, name='P4_upsampling', num_args=1)

    # P4 2x upsampling + C3 = P3
    # BUG FIX: the lateral input was conv_feat[2] (== conv_C4, already used
    # above); the comment, layer name, and unpacking all intend conv_C3.
    P3_la   = conv_act_layer(from_layer=conv_C3, kernel=(1, 1), num_filter=256, name="P3_lateral", use_act=False)
    P4_clip = mx.symbol.Crop(*[P4_up, P3_la], name="P3_clip")
    P3      = mx.sym.ElementWiseSum(*[P4_clip, P3_la], name="P3_sum")
    P3_up = mx.symbol.UpSampling(P3, scale=2, sample_type='nearest', workspace=512, name='P3_upsampling', num_args=1)

    # P3 2x upsampling + C2 = P2
    # BUG FIX: likewise, conv_feat[3] (== conv_C3) replaced by conv_C2.
    P2_la   = conv_act_layer(from_layer=conv_C2, kernel=(1, 1), num_filter=256, name="P2_lateral", use_act=False)
    P3_clip = mx.symbol.Crop(*[P3_up, P2_la], name="P2_clip")
    P2      = mx.sym.ElementWiseSum(*[P3_clip, P2_la], name="P2_sum")

    # 3x3 smoothing convs applied after all merges
    P6 = conv_act_layer(from_layer=P6, kernel=(3, 3), pad=(1, 1), num_filter=256, name="P6", use_act=False)
    P5 = conv_act_layer(from_layer=P5, kernel=(3, 3), pad=(1, 1), num_filter=256, name="P5", use_act=False)
    P4 = conv_act_layer(from_layer=P4, kernel=(3, 3), pad=(1, 1), num_filter=256, name="P4", use_act=False)
    P3 = conv_act_layer(from_layer=P3, kernel=(3, 3), pad=(1, 1), num_filter=256, name="P3", use_act=False)
    P2 = conv_act_layer(from_layer=P2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="P2", use_act=False)

    conv_fpn_feat = dict()
    conv_fpn_feat.update({"stride32": P6, "stride16": P5, "stride8": P4, "stride4": P3, "stride2": P2})

    return conv_fpn_feat, [P6, P5, P4, P3, P2]
Example #3
0
def get_deeplabv2_conv(data, num_layers):
    """Build DeepLab-v2 style backbone features with dilated res4/res5.

    Stages 3 and 4 are rebuilt here with stride (1, 1) and dilation
    (2, 2) / (4, 4), so conv_C4 and conv_C5 keep conv_C3's stride.
    Two stride-2 extra conv pairs then produce conv_C6 and conv_C7.

    Parameters:
    ----------
    data : mx.Symbol
        input image symbol
    num_layers : int
        backbone depth, forwarded to get_resnet_conv

    Returns:
    ----------
    list of mx.Symbol, coarse-to-fine: [conv_C7, conv_C6, conv_C5, conv_C4, conv_C3]
    """
    # conv_C2 is unpacked but unused here; only conv_C3 seeds the dilated stages.
    _, _, conv_C3, conv_C2 = get_resnet_conv(data, num_layers)
    #  deeplabv2 res4 stride 8
    unit = residual_unit(data=conv_C3, num_filter=1024, stride=(1, 1), dim_match=False, name='stage3_unit1', dilate=(2,2))
    # `units` is a module-level per-stage block-count table; units[2] is stage 3
    # -- TODO confirm against the resnet builder in this file.
    for i in range(2, units[2] + 1):
        unit = residual_unit(data=unit, num_filter=1024, stride=(1, 1), dim_match=True, dilate=(2, 2),
                             name='stage3_unit%s' % i)
    conv_C4 = unit
    # deeplabv2 res5 stride 8 (dilation doubled to (4, 4), stride still 1)
    unit = residual_unit(data=unit, num_filter=2048, stride=(1, 1), dim_match=False, name='stage4_unit1', dilate=(4, 4))
    for i in range(2, units[3] + 1):
        unit = residual_unit(data=unit, num_filter=2048, stride=(1, 1), dim_match=True, dilate=(4, 4),
                             name='stage4_unit%s' % i)
    conv_C5 = unit

    # extra conv C6 stride 16: 1x1 bottleneck then stride-2 3x3
    conv_1x1 = conv_act_layer(unit, 'multi_feat_3_conv_1x1',
                              128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    conv_C6 = conv_act_layer(conv_1x1, 'multi_feat_3_conv_3x3',
                             256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')

    # extra conv C7 stride 32
    conv_1x1 = conv_act_layer(conv_C6, 'multi_feat_4_conv_1x1',
                              128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    conv_C7 = conv_act_layer(conv_1x1, 'multi_feat_4_conv_3x3',
                             256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')

    conv_feat = [conv_C7, conv_C6, conv_C5, conv_C4, conv_C3]

    # NOTE(review): the block below is a retained alternative (DetNet-style)
    # construction, kept for reference.
    # _, conv_C4, conv_C3, conv_C2 = get_resnet_conv(data, num_layers)
    #
    # # detnet res5 stride 16
    # unit = residual_unit(data=conv_C4, num_filter=2048, stride=(1, 1), dim_match=False, name='stage4_unit1', dilate=(2, 2))
    # for i in range(2, units[3] + 1):
    #     unit = residual_unit(data=unit, num_filter=2048, stride=(1, 1), dim_match=True,
    #                          name='stage4_unit%s' % i)
    # conv_C5 = unit
    #
    # # extra conv C6 stride 16
    # conv_1x1 = conv_act_layer(unit, 'multi_feat_2_conv_1x1',
    #                           256, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    # conv_C6 = conv_act_layer(conv_1x1, 'multi_feat_2_conv_3x3',
    #                          512, kernel=(3, 3), pad=(2, 2), stride=(1, 1), dilate=(2, 2), act_type='relu')
    #
    # # extra conv C7 stride 32
    # conv_1x1 = conv_act_layer(conv_C6, 'multi_feat_3_conv_1x1',
    #                           128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    # conv_C7 = conv_act_layer(conv_1x1, 'multi_feat_3_conv_3x3',
    #                          256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')
    #
    # # extra conv C8 stride 64
    # conv_1x1 = conv_act_layer(conv_C7, 'multi_feat_4_conv_1x1',
    #                           128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    # conv_C8 = conv_act_layer(conv_1x1, 'multi_feat_4_conv_3x3',
    #                          256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')
    #
    # conv_feat = [conv_C8, conv_C7, conv_C6, conv_C5, conv_C4]
    return conv_feat
Example #4
0
def get_ssd_conv(data, num_layers):
    """Build the SSD feature list on top of a ResNet-M backbone.

    Parameters:
    ----------
    data : mx.Symbol
        input image symbol
    num_layers : int
        backbone depth, forwarded to get_resnetm_conv

    Returns:
    ----------
    list of mx.Symbol, coarse-to-fine: [conv_C7, conv_C6, conv_C5, conv_C4, conv_C3]
    """
    conv_C5, conv_C4, conv_C3, _ = get_resnetm_conv(data, num_layers)

    # extra conv C6: 1x1 bottleneck then stride-2 3x3
    conv_1x1 = conv_act_layer(conv_C5, 'multi_feat_2_conv_1x1',
                              128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    conv_C6 = conv_act_layer(conv_1x1, 'multi_feat_2_conv_3x3',
                              256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')

    # extra conv C7
    conv_1x1 = conv_act_layer(conv_C6, 'multi_feat_3_conv_1x1',
                              128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    conv_C7 = conv_act_layer(conv_1x1, 'multi_feat_3_conv_3x3',
                             256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')

    # NOTE: the original also built conv_C8/conv_C9 extra convs, but they
    # never reached the returned feature list (dead symbols), so those
    # branches were removed.  Re-add them here if coarser levels are wanted.
    conv_feat = [conv_C7, conv_C6, conv_C5, conv_C4, conv_C3]
    return conv_feat
Example #5
0
def get_resnetm_conv_down(conv_feat):
    """Assemble an FPN top-down pathway from ResNet-M features.

    ``conv_feat`` holds [C5, C4, C3, C2] coarse-to-fine.  Each level is
    projected to 256 channels, merged with the 2x-upsampled (and cropped)
    coarser level, and smoothed with a 3x3 conv.  P6 is a stride-2
    max-pool of P5.

    Returns a (dict keyed by "strideN", [P6, P5, P4, P3, P2]) pair.
    """

    def _lateral(layer, name):
        # 1x1 projection to the shared 256-channel pyramid width.
        return conv_act_layer(from_layer=layer, kernel=(1, 1), num_filter=256,
                              name=name, use_act=False)

    def _smooth(layer, name):
        # 3x3 conv reducing upsampling aliasing after each merge.
        return conv_act_layer(from_layer=layer, kernel=(3, 3), pad=(1, 1),
                              num_filter=256, name=name, use_act=False)

    def _upsample(layer, name):
        return mx.symbol.UpSampling(layer, scale=2, sample_type='nearest',
                                    workspace=512, name=name, num_args=1)

    # C5 -> P5 (upsample branches off the lateral, before smoothing)
    lat5 = _lateral(conv_feat[0], "P5_lateral")
    up5 = _upsample(lat5, 'P5_upsampling')
    P5 = _smooth(lat5, "P5")

    # up(P5) + C4 -> P4
    lat4 = _lateral(conv_feat[1], "P4_lateral")
    sum4 = mx.sym.ElementWiseSum(*[mx.symbol.Crop(*[up5, lat4], name="P4_clip"), lat4],
                                 name="P4_sum")
    up4 = _upsample(sum4, 'P4_upsampling')
    P4 = _smooth(sum4, "P4")

    # up(P4) + C3 -> P3
    lat3 = _lateral(conv_feat[2], "P3_lateral")
    sum3 = mx.sym.ElementWiseSum(*[mx.symbol.Crop(*[up4, lat3], name="P3_clip"), lat3],
                                 name="P3_sum")
    up3 = _upsample(sum3, 'P3_upsampling')
    P3 = _smooth(sum3, "P3")

    # up(P3) + C2 -> P2 (finest level, no further upsampling)
    lat2 = _lateral(conv_feat[3], "P2_lateral")
    sum2 = mx.sym.ElementWiseSum(*[mx.symbol.Crop(*[up3, lat2], name="P2_clip"), lat2],
                                 name="P2_sum")
    P2 = _smooth(sum2, "P2")

    # P6: extra coarse level, 2x subsampling of the smoothed P5
    P6 = mx.symbol.Pooling(data=P5, kernel=(3, 3), stride=(2, 2), pad=(1, 1),
                           pool_type='max', name='P6')

    conv_fpn_feat = {"stride32": P6, "stride16": P5, "stride8": P4,
                     "stride4": P3, "stride2": P2}
    return conv_fpn_feat, [P6, P5, P4, P3, P2]
Example #6
0
def get_ssd_md_conv(data, num_layers):
    """Build SSD-MD features: ResNet-M trunk, dilated res5, extra convs.

    Stage 4 is rebuilt with stride (1, 1) and dilation (2, 2), so conv_C5
    keeps conv_C4's stride; conv_C6/conv_C7 are stride-2 extra convs.

    Parameters:
    ----------
    data : mx.Symbol
        input image symbol
    num_layers : int
        backbone depth, forwarded to get_resnetm_conv

    Returns:
    ----------
    list of mx.Symbol, coarse-to-fine: [conv_C7, conv_C6, conv_C5, conv_C4, conv_C3]
    """
    _, conv_C4, conv_C3, _ = get_resnetm_conv(data, num_layers)

    # deeplabv2 res5: stride (1, 1) + dilation (2, 2) preserves the stride
    unit = residual_unit(data=conv_C4, num_filter=2048, stride=(1, 1), dim_match=False, name='stage4_unit1', dilate=(2, 2))
    # `units` is a module-level per-stage block-count table; units[3] is stage 4
    for i in range(2, units[3] + 1):
        unit = residual_unit(data=unit, num_filter=2048, stride=(1, 1), dim_match=True, dilate=(2, 2),
                             name='stage4_unit%s' % i)
    conv_C5 = unit

    # extra conv C6: 1x1 bottleneck then stride-2 3x3
    conv_1x1 = conv_act_layer(conv_C5, 'multi_feat_2_conv_1x1',
                              128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    conv_C6 = conv_act_layer(conv_1x1, 'multi_feat_2_conv_3x3',
                              256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')

    # extra conv C7
    conv_1x1 = conv_act_layer(conv_C6, 'multi_feat_3_conv_1x1',
                              128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu')
    conv_C7 = conv_act_layer(conv_1x1, 'multi_feat_3_conv_3x3',
                             256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu')

    # NOTE: the original also built conv_C8/conv_C9 extra convs, but they
    # never reached the returned feature list (dead symbols), so those
    # branches were removed.  Re-add them here if coarser levels are wanted.
    conv_feat = [conv_C7, conv_C6, conv_C5, conv_C4, conv_C3]
    return conv_feat
Example #7
0
def get_symbol_train(num_classes=20):
    """
    Single-shot multi-box detection with VGG 16 layers ConvNet
    This is a modified version, with fc6/fc7 layers replaced by conv layers
    And the network is slightly smaller than original VGG 16 network
    This is a training network with losses

    Parameters:
    ----------
    num_classes: int
        number of object classes not including background

    Returns:
    ----------
    mx.Symbol
        grouped symbol [cls_prob, loc_loss, cls_label]
    """
    data = mx.symbol.Variable(name="data")
    # ground-truth boxes/classes consumed by MultiBoxTarget below
    label = mx.symbol.Variable(name="label")

    # group 1: two 3x3x64 convs, then 2x2 stride-2 max-pool
    conv1_1 = mx.symbol.Convolution(data=data,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=64,
                                    name="conv1_1")
    relu1_1 = mx.symbol.Activation(data=conv1_1,
                                   act_type="relu",
                                   name="relu1_1")
    conv1_2 = mx.symbol.Convolution(data=relu1_1,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=64,
                                    name="conv1_2")
    relu1_2 = mx.symbol.Activation(data=conv1_2,
                                   act_type="relu",
                                   name="relu1_2")
    pool1 = mx.symbol.Pooling(data=relu1_2,
                              pool_type="max",
                              kernel=(2, 2),
                              stride=(2, 2),
                              name="pool1")
    # group 2: two 3x3x128 convs + stride-2 max-pool
    conv2_1 = mx.symbol.Convolution(data=pool1,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=128,
                                    name="conv2_1")
    relu2_1 = mx.symbol.Activation(data=conv2_1,
                                   act_type="relu",
                                   name="relu2_1")
    conv2_2 = mx.symbol.Convolution(data=relu2_1,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=128,
                                    name="conv2_2")
    relu2_2 = mx.symbol.Activation(data=conv2_2,
                                   act_type="relu",
                                   name="relu2_2")
    pool2 = mx.symbol.Pooling(data=relu2_2,
                              pool_type="max",
                              kernel=(2, 2),
                              stride=(2, 2),
                              name="pool2")
    # group 3: three 3x3x256 convs + stride-2 max-pool
    conv3_1 = mx.symbol.Convolution(data=pool2,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=256,
                                    name="conv3_1")
    relu3_1 = mx.symbol.Activation(data=conv3_1,
                                   act_type="relu",
                                   name="relu3_1")
    conv3_2 = mx.symbol.Convolution(data=relu3_1,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=256,
                                    name="conv3_2")
    relu3_2 = mx.symbol.Activation(data=conv3_2,
                                   act_type="relu",
                                   name="relu3_2")
    conv3_3 = mx.symbol.Convolution(data=relu3_2,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=256,
                                    name="conv3_3")
    relu3_3 = mx.symbol.Activation(data=conv3_3,
                                   act_type="relu",
                                   name="relu3_3")
    # pooling_convention="full" rounds output size up (ceil), matching Caffe SSD
    pool3 = mx.symbol.Pooling(
        data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \
        pooling_convention="full", name="pool3")
    # group 4: three 3x3x512 convs; relu4_3 is the first detection feature map
    conv4_1 = mx.symbol.Convolution(data=pool3,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=512,
                                    name="conv4_1")
    relu4_1 = mx.symbol.Activation(data=conv4_1,
                                   act_type="relu",
                                   name="relu4_1")
    conv4_2 = mx.symbol.Convolution(data=relu4_1,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=512,
                                    name="conv4_2")
    relu4_2 = mx.symbol.Activation(data=conv4_2,
                                   act_type="relu",
                                   name="relu4_2")
    conv4_3 = mx.symbol.Convolution(data=relu4_2,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=512,
                                    name="conv4_3")
    relu4_3 = mx.symbol.Activation(data=conv4_3,
                                   act_type="relu",
                                   name="relu4_3")
    pool4 = mx.symbol.Pooling(data=relu4_3,
                              pool_type="max",
                              kernel=(2, 2),
                              stride=(2, 2),
                              name="pool4")
    # group 5: three 3x3x512 convs
    conv5_1 = mx.symbol.Convolution(data=pool4,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=512,
                                    name="conv5_1")
    relu5_1 = mx.symbol.Activation(data=conv5_1,
                                   act_type="relu",
                                   name="relu5_1")
    conv5_2 = mx.symbol.Convolution(data=relu5_1,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=512,
                                    name="conv5_2")
    relu5_2 = mx.symbol.Activation(data=conv5_2,
                                   act_type="relu",
                                   name="relu5_2")
    conv5_3 = mx.symbol.Convolution(data=relu5_2,
                                    kernel=(3, 3),
                                    pad=(1, 1),
                                    num_filter=512,
                                    name="conv5_3")
    relu5_3 = mx.symbol.Activation(data=conv5_3,
                                   act_type="relu",
                                   name="relu5_3")
    # pool5 is 3x3 stride-1 (not stride-2): keeps resolution for the atrous fc6
    pool5 = mx.symbol.Pooling(data=relu5_3,
                              pool_type="max",
                              kernel=(3, 3),
                              stride=(1, 1),
                              pad=(1, 1),
                              name="pool5")
    # group 6: fc6 converted to a dilated (atrous, rate 6) 3x3 conv
    conv6 = mx.symbol.Convolution(data=pool5,
                                  kernel=(3, 3),
                                  pad=(6, 6),
                                  dilate=(6, 6),
                                  num_filter=1024,
                                  name="conv6")
    relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6")
    # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6")
    # group 7: fc7 converted to a 1x1 conv
    conv7 = mx.symbol.Convolution(data=relu6,
                                  kernel=(1, 1),
                                  pad=(0, 0),
                                  num_filter=1024,
                                  name="conv7")
    relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7")
    # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7")

    ### ssd extra layers ###
    # each pair: 1x1 channel reduction, then stride-2 3x3 halving resolution
    conv8_1, relu8_1 = conv_act_layer(relu7, "8_1", 256, kernel=(1,1), pad=(0,0), \
        stride=(1,1), act_type="relu", use_batchnorm=False)
    conv8_2, relu8_2 = conv_act_layer(relu8_1, "8_2", 512, kernel=(3,3), pad=(1,1), \
        stride=(2,2), act_type="relu", use_batchnorm=False)
    conv9_1, relu9_1 = conv_act_layer(relu8_2, "9_1", 128, kernel=(1,1), pad=(0,0), \
        stride=(1,1), act_type="relu", use_batchnorm=False)
    conv9_2, relu9_2 = conv_act_layer(relu9_1, "9_2", 256, kernel=(3,3), pad=(1,1), \
        stride=(2,2), act_type="relu", use_batchnorm=False)
    conv10_1, relu10_1 = conv_act_layer(relu9_2, "10_1", 128, kernel=(1,1), pad=(0,0), \
        stride=(1,1), act_type="relu", use_batchnorm=False)
    conv10_2, relu10_2 = conv_act_layer(relu10_1, "10_2", 256, kernel=(3,3), pad=(1,1), \
        stride=(2,2), act_type="relu", use_batchnorm=False)
    # global Pooling: coarsest (1x1) detection feature
    pool10 = mx.symbol.Pooling(data=relu10_2,
                               pool_type="avg",
                               global_pool=True,
                               kernel=(1, 1),
                               name='pool10')

    # specific parameters for VGG16 network: six detection feature maps,
    # per-map anchor sizes/ratios; normalization=20 applies L2-norm scaling
    # only to relu4_3 (-1 disables it elsewhere)
    from_layers = [relu4_3, relu7, relu8_2, relu9_2, relu10_2, pool10]
    sizes = [[.1], [.2, .276], [.38, .461], [.56, .644], [.74, .825],
             [.92, 1.01]]
    ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \
        [1,2,.5,3,1./3], [1,2,.5,3,1./3]]
    normalizations = [20, -1, -1, -1, -1, -1]

    loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \
        num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \
        clip=True, interm_layer=0)

    # match anchors to ground truth; 3:1 hard-negative mining
    tmp = mx.symbol.MultiBoxTarget(
        *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \
        ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \
        negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2),
        name="multibox_target")
    loc_target = tmp[0]       # regression targets
    loc_target_mask = tmp[1]  # zeroes out unmatched anchors in the loc loss
    cls_target = tmp[2]       # per-anchor class labels (-1 = ignore)

    # classification: softmax over classes, ignoring unassigned anchors
    cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \
        ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \
        normalization='valid', name="cls_prob")
    # localization: smooth-L1 on masked (matched-anchor) offsets
    loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \
        data=loc_target_mask * (loc_preds - loc_target), scalar=1.0)
    loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \
        normalization='valid', name="loc_loss")

    # monitoring training status (grad_scale=0: exposed as output, no gradient)
    cls_label = mx.symbol.MakeLoss(data=cls_target,
                                   grad_scale=0,
                                   name="cls_label")

    # group output
    out = mx.symbol.Group([cls_prob, loc_loss, cls_label])
    # out = mx.symbol.Group([loc_preds, cls_preds, anchor_boxes])
    return out