def bottleneck_csp_block(x, num_filters, num_blocks, depth_multiple, width_multiple, shortcut=False):
    """Build a CSP bottleneck block on top of tensor `x`.

    Reference: https://github.com/WongKinYiu/CrossStagePartialNetworks

    # Arguments
        x: input tensor.
        num_filters: base output channel count; it is multiplied by
            `width_multiple` and passed through `make_divisible(..., 8)`.
        num_blocks: base number of stacked bottleneck units. Only values
            greater than 1 are scaled by `depth_multiple` (never below 1).
        depth_multiple: depth gain applied to `num_blocks`.
        width_multiple: width gain applied to `num_filters`.
        shortcut: when True, each bottleneck unit's output is added to
            its input; otherwise the unit's output replaces it.

    # Returns
        Output tensor of the closing 1x1 DarknetConv2D_BN_Swish, which
        uses the scaled `num_filters`.
    """
    num_filters = make_divisible(num_filters * width_multiple, 8)
    if num_blocks > 1:
        # depth gain
        num_blocks = max(round(num_blocks * depth_multiple), 1)
    half_filters = num_filters // 2

    # Side branch: plain 1x1 conv on the block input, merged back later.
    res_connection = DarknetConv2D(half_filters, (1, 1))(x)

    # Main branch entry conv.
    x = DarknetConv2D_BN_Swish(half_filters, (1, 1))(x)

    # Bottleneck block stack
    for _ in range(num_blocks):
        bottleneck = compose(
            DarknetConv2D_BN_Swish(half_filters, (1, 1)),
            DarknetConv2D_BN_Swish(half_filters, (3, 3)))
        y = bottleneck(x)
        if shortcut:
            x = Add()([x, y])
        else:
            x = y
    x = DarknetConv2D(half_filters, (1, 1))(x)

    # Merge both branches, then normalize and activate the concatenation.
    merged = Concatenate()([x, res_connection])
    merged = CustomBatchNormalization()(merged)
    merged = Activation(swish)(merged)
    return DarknetConv2D_BN_Swish(num_filters, (1, 1))(merged)
def yolo4_body(inputs, num_anchors, num_classes, weights_path=None):
    """Create YOLO_V4 model CNN body in Keras.

    # Arguments
        inputs: model input tensor, fed to `csp_darknet53_body`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        weights_path: optional backbone weights file; when given it is
            loaded into the backbone by layer name.

    # Returns
        A Model mapping `inputs` to `[y1, y2, y3]`.
    """
    darknet = Model(inputs, csp_darknet53_body(inputs))
    if weights_path is not None:
        darknet.load_weights(weights_path, by_name=True)
        print('Load weights {}.'.format(weights_path))

    pred_filters = num_anchors * (num_classes + 5)

    # feature map 1 head (19x19 for 608 input)
    p1 = make_yolo_spp_head(darknet.output, 512)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(256, (1, 1)),
        UpSampling2D(2))
    p1_up = reduce_then_upsample(p1)
    p2 = DarknetConv2D_BN_Leaky(256, (1, 1))(darknet.layers[204].output)
    p2 = Concatenate()([p2, p1_up])

    # feature map 2 head (38x38 for 608 input)
    p2 = make_yolo_head(p2, 256)

    # upsample fpn merge for feature map 2 & 3
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(128, (1, 1)),
        UpSampling2D(2))
    p2_up = reduce_then_upsample(p2)
    p3 = DarknetConv2D_BN_Leaky(128, (1, 1))(darknet.layers[131].output)
    p3 = Concatenate()([p3, p2_up])

    # feature map 3 head & output (76x76 for 608 input)
    p3 = make_yolo_head(p3, 128)
    predict = compose(
        DarknetConv2D_BN_Leaky(256, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1)))
    y3 = predict(p3)

    # downsample fpn merge for feature map 3 & 2
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(256, (3, 3), strides=(2, 2)))
    p3_down = downsample(p3)
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (38x38 for 608 input)
    p2 = make_yolo_head(p2, 256)
    predict = compose(
        DarknetConv2D_BN_Leaky(512, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1)))
    y2 = predict(p2)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(512, (3, 3), strides=(2, 2)))
    p2_down = downsample(p2)
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (19x19 for 608 input)
    p1 = make_yolo_head(p1, 512)
    predict = compose(
        DarknetConv2D_BN_Leaky(1024, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1)))
    y1 = predict(p1)

    return Model(inputs, [y1, y2, y3])
def tiny_yolo4lite_mobilenet_body(inputs, num_anchors, num_classes, alpha=1.0, use_spp=True):
    """Create Tiny YOLO_v4 Lite MobileNet model CNN body in keras.

    # Arguments
        inputs: model input tensor, passed to MobileNet as `input_tensor`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        alpha: MobileNet width multiplier; also scales the head channels.
        use_spp: when True, apply Spp_Conv2D_BN_Leaky on the feature
            map 1 head.

    # Returns
        A Model mapping `inputs` to `[y1, y2]`.
    """
    mobilenet = MobileNet(input_tensor=inputs, weights='imagenet', include_top=False, alpha=alpha)
    pred_filters = num_anchors * (num_classes + 5)

    # input: 416 x 416 x 3
    # conv_pw_13_relu :13 x 13 x (1024*alpha)
    # conv_pw_11_relu :26 x 26 x (512*alpha)
    # conv_pw_5_relu : 52 x 52 x (256*alpha)

    # f1: 13 x 13 x (1024*alpha) for 416 input
    f1 = mobilenet.get_layer('conv_pw_13_relu').output
    # f2: 26 x 26 x (512*alpha) for 416 input
    f2 = mobilenet.get_layer('conv_pw_11_relu').output

    # feature map 1 head (13 x 13 x (512*alpha) for 416 input)
    head_channels = int(512 * alpha)
    top = DarknetConv2D_BN_Leaky(head_channels, (1, 1))(f1)
    if use_spp:
        top = Spp_Conv2D_BN_Leaky(top, head_channels)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(int(256 * alpha), (1, 1)),
        UpSampling2D(2))
    top_up = reduce_then_upsample(top)
    fuse = compose(
        Concatenate(),
        Depthwise_Separable_Conv2D_BN_Leaky(filters=int(512 * alpha), kernel_size=(3, 3), block_id_str='15'))
    mid = fuse([top_up, f2])

    # feature map 2 output (26 x 26 x (512*alpha) for 416 input)
    y2 = DarknetConv2D(pred_filters, (1, 1))(mid)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(int(512 * alpha), (3, 3), strides=(2, 2), block_id_str='16'))
    mid_down = downsample(mid)
    fuse = compose(
        Concatenate(),
        Depthwise_Separable_Conv2D_BN_Leaky(filters=int(1024 * alpha), kernel_size=(3, 3), block_id_str='17'))
    top = fuse([mid_down, top])

    # feature map 1 output (13 x 13 x (1024*alpha) for 416 input)
    y1 = DarknetConv2D(pred_filters, (1, 1))(top)

    return Model(inputs, [y1, y2])
def tiny_yolo4lite_efficientnet_body(inputs, num_anchors, num_classes, level=0, use_spp=True):
    """Create Tiny YOLO_v4 Lite EfficientNet model CNN body in keras.

    # Arguments
        inputs: model input tensor, passed to
            `get_efficientnet_backbone_info`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        level: EfficientNet level number. by default we use
            basic EfficientNetB0 as backbone
        use_spp: when True, apply Spp_Conv2D_BN_Leaky on the feature
            map 1 head.

    # Returns
        A Model mapping `inputs` to `[y1, y2]`.
    """
    efficientnet, feature_map_info = get_efficientnet_backbone_info(inputs, level=level)
    pred_filters = num_anchors * (num_classes + 5)

    f1 = efficientnet.get_layer('top_activation').output
    f2 = efficientnet.get_layer('block6a_expand_activation').output
    f1_channel_num = feature_map_info['f1_channel_num']
    f2_channel_num = feature_map_info['f2_channel_num']
    f1_half = f1_channel_num // 2

    # feature map 1 head (13 x 13 x (f1_channel_num//2) for 416 input)
    top = DarknetConv2D_BN_Leaky(f1_half, (1, 1))(f1)
    if use_spp:
        top = Spp_Conv2D_BN_Leaky(top, f1_half)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(f2_channel_num // 2, (1, 1)),
        UpSampling2D(2))
    top_up = reduce_then_upsample(top)
    fuse = compose(
        Concatenate(),
        Depthwise_Separable_Conv2D_BN_Leaky(filters=f2_channel_num, kernel_size=(3, 3), block_id_str='8'))
    mid = fuse([top_up, f2])

    # feature map 2 output (26 x 26 x f2_channel_num for 416 input)
    y2 = DarknetConv2D(pred_filters, (1, 1))(mid)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(f1_half, (3, 3), strides=(2, 2), block_id_str='9'))
    mid_down = downsample(mid)
    fuse = compose(
        Concatenate(),
        Depthwise_Separable_Conv2D_BN_Leaky(filters=f1_channel_num, kernel_size=(3, 3), block_id_str='10'))
    top = fuse([mid_down, top])

    # feature map 1 output (13 x 13 x f1_channel_num for 416 input)
    y1 = DarknetConv2D(pred_filters, (1, 1))(top)

    return Model(inputs, [y1, y2])
def tiny_yolo4_mobilenetv3small_body(inputs, num_anchors, num_classes, alpha=1.0, use_spp=True):
    """Create Tiny YOLO_v4 MobileNetV3Small model CNN body in keras.

    # Arguments
        inputs: model input tensor, passed to MobileNetV3Small as
            `input_tensor`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        alpha: backbone width multiplier; also scales the head channels.
        use_spp: when True, apply Spp_Conv2D_BN_Leaky on the feature
            map 1 head.

    # Returns
        A Model mapping `inputs` to `[y1, y2]`.
    """
    mobilenetv3small = MobileNetV3Small(input_tensor=inputs, weights='imagenet', include_top=False, alpha=alpha)
    pred_filters = num_anchors * (num_classes + 5)

    # input: 416 x 416 x 3
    # activation_31(layer 165, final feature map): 13 x 13 x (576*alpha)
    # expanded_conv_10/Add(layer 162, end of block10): 13 x 13 x (96*alpha)
    # activation_22(layer 117, middle in block8) : 26 x 26 x (288*alpha)
    # expanded_conv_7/Add(layer 114, end of block7) : 26 x 26 x (48*alpha)
    # activation_7(layer 38, middle in block3) : 52 x 52 x (96*alpha)
    # expanded_conv_2/Add(layer 35, end of block2): 52 x 52 x (24*alpha)

    # NOTE: activation layer names may differ between TF1.x and TF2.x,
    # so the feature layers are fetched by index.
    # f1: 13 x 13 x (576*alpha)
    f1 = mobilenetv3small.layers[165].output
    # f2: 26 x 26 x (288*alpha) for 416 input
    f2 = mobilenetv3small.layers[117].output

    # feature map 1 head (13 x 13 x (288*alpha) for 416 input)
    head_channels = int(288 * alpha)
    top = DarknetConv2D_BN_Leaky(head_channels, (1, 1))(f1)
    if use_spp:
        top = Spp_Conv2D_BN_Leaky(top, head_channels)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(int(144 * alpha), (1, 1)),
        UpSampling2D(2))
    top_up = reduce_then_upsample(top)
    fuse = compose(
        Concatenate(),
        DarknetConv2D_BN_Leaky(int(288 * alpha), (3, 3)))
    mid = fuse([top_up, f2])

    # feature map 2 output (26 x 26 x (288*alpha) for 416 input)
    y2 = DarknetConv2D(pred_filters, (1, 1))(mid)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(int(288 * alpha), (3, 3), strides=(2, 2)))
    mid_down = downsample(mid)
    fuse = compose(
        Concatenate(),
        DarknetConv2D_BN_Leaky(int(576 * alpha), (3, 3)))
    top = fuse([mid_down, top])

    # feature map 1 output (13 x 13 x (576*alpha) for 416 input)
    y1 = DarknetConv2D(pred_filters, (1, 1))(top)

    return Model(inputs, [y1, y2])
def tiny_yolo4_mobilenetv3large_body(inputs, num_anchors, num_classes, alpha=1.0, spp=True, use_spp=None):
    """Create Tiny YOLO_v4 MobileNetV3Large model CNN body in keras.

    # Arguments
        inputs: model input tensor, passed to MobileNetV3Large as
            `input_tensor`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        alpha: backbone width multiplier; also scales the head channels.
        spp: when True, apply Spp_Conv2D_BN_Leaky on the feature map 1
            head.
        use_spp: optional alias for `spp`, matching the keyword used by
            the other tiny YOLO_v4 body builders in this file. When it
            is not None it takes precedence over `spp`.

    # Returns
        A Model mapping `inputs` to `[y1, y2]`.
    """
    # Accept the sibling builders' keyword so callers can pass `use_spp=`
    # uniformly; the original `spp` keyword keeps working unchanged.
    if use_spp is not None:
        spp = use_spp

    mobilenetv3large = MobileNetV3Large(input_tensor=inputs, weights='imagenet', include_top=False, alpha=alpha)
    pred_filters = num_anchors * (num_classes + 5)

    # input: 416 x 416 x 3
    # activation_38(layer 194, final feature map): 13 x 13 x (960*alpha)
    # expanded_conv_14/Add(layer 191, end of block14): 13 x 13 x (160*alpha)
    # activation_29(layer 146, middle in block12) : 26 x 26 x (672*alpha)
    # expanded_conv_11/Add(layer 143, end of block11) : 26 x 26 x (112*alpha)
    # activation_15(layer 79, middle in block6) : 52 x 52 x (240*alpha)
    # expanded_conv_5/Add(layer 76, end of block5): 52 x 52 x (40*alpha)

    # NOTE: activation layer names may differ between TF1.x and TF2.x,
    # so the feature layers are fetched by index.
    # f1: 13 x 13 x (960*alpha)
    f1 = mobilenetv3large.layers[194].output
    # f2: 26 x 26 x (672*alpha) for 416 input
    f2 = mobilenetv3large.layers[146].output

    # feature map 1 head (13 x 13 x (480*alpha) for 416 input)
    x1 = DarknetConv2D_BN_Leaky(int(480 * alpha), (1, 1))(f1)
    if spp:
        x1 = Spp_Conv2D_BN_Leaky(x1, int(480 * alpha))

    # upsample fpn merge for feature map 1 & 2
    x1_upsample = compose(
        DarknetConv2D_BN_Leaky(int(336 * alpha), (1, 1)),
        UpSampling2D(2))(x1)
    x2 = compose(
        Concatenate(),
        DarknetConv2D_BN_Leaky(int(672 * alpha), (3, 3)))([x1_upsample, f2])

    # feature map 2 output (26 x 26 x (672*alpha) for 416 input)
    y2 = DarknetConv2D(pred_filters, (1, 1))(x2)

    # downsample fpn merge for feature map 2 & 1
    x2_downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(int(480 * alpha), (3, 3), strides=(2, 2)))(x2)
    x1 = compose(
        Concatenate(),
        DarknetConv2D_BN_Leaky(int(960 * alpha), (3, 3)))([x2_downsample, x1])

    # feature map 1 output (13 x 13 x (960*alpha) for 416 input)
    y1 = DarknetConv2D(pred_filters, (1, 1))(x1)

    return Model(inputs, [y1, y2])
def yolo5lite_predictions(feature_maps, feature_channel_nums, num_anchors, num_classes, depth_multiple, width_multiple):
    """Build three prediction tensors from three backbone feature maps.

    Uses Swish convolutions and `bottleneck_csp_lite_block` stages.

    NOTE(review): another function named `yolo5lite_predictions`, with a
    different signature, appears later in this file. If both live in the
    same module, the later definition replaces this one - confirm which
    is intended.

    # Arguments
        feature_maps: sequence `(f1, f2, f3)` of backbone tensors.
        feature_channel_nums: sequence of the three matching channel
            counts `(f1_channel_num, f2_channel_num, f3_channel_num)`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        depth_multiple: forwarded to every `bottleneck_csp_lite_block`.
        width_multiple: forwarded to every `bottleneck_csp_lite_block`.

    # Returns
        Tuple `(y1, y2, y3)` of prediction tensors.
    """
    f1, f2, f3 = feature_maps
    f1_channel_num, f2_channel_num, f3_channel_num = feature_channel_nums
    pred_filters = num_anchors * (num_classes + 5)

    def csp_stage(tensor, channels, block_id):
        # Every CSP stage here shares depth 3, no shortcut, and the gains.
        return bottleneck_csp_lite_block(tensor, channels, 3, depth_multiple, width_multiple,
                                         shortcut=False, block_id_str=block_id)

    def downsample(tensor, channels, block_id):
        # Asymmetric top/left zero padding followed by a stride-2 conv.
        return compose(
            ZeroPadding2D(((1, 0), (1, 0))),
            Darknet_Depthwise_Separable_Conv2D_BN_Swish(channels, (3, 3), strides=(2, 2), block_id_str=block_id))(tensor)

    # SPP & BottleneckCSP block, in ultralytics PyTorch version
    # they're defined in backbone
    p1 = make_yolo5_spp_neck(f1, f1_channel_num)
    p1 = csp_stage(p1, f1_channel_num, 'pred_1')

    # feature map 1 head (19x19 for 608 input)
    p1 = DarknetConv2D_BN_Swish(f2_channel_num, (1, 1))(p1)

    # upsample fpn merge for feature map 1 & 2
    p1_up = UpSampling2D(2)(p1)
    p2 = Concatenate()([f2, p1_up])
    p2 = csp_stage(p2, f2_channel_num, 'pred_2')

    # feature map 2 head (38x38 for 608 input)
    p2 = DarknetConv2D_BN_Swish(f3_channel_num, (1, 1))(p2)

    # upsample fpn merge for feature map 2 & 3
    p2_up = UpSampling2D(2)(p2)
    p3 = Concatenate()([f3, p2_up])

    # feature map 3 head & output (76x76 for 608 input)
    p3 = csp_stage(p3, f3_channel_num, 'pred_3')
    y3 = DarknetConv2D(pred_filters, (1, 1), name='predict_conv_3')(p3)

    # downsample fpn merge for feature map 3 & 2
    p3_down = downsample(p3, f3_channel_num, 'pred_3_2')
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (38x38 for 608 input)
    p2 = csp_stage(p2, f2_channel_num, 'pred_4')
    y2 = DarknetConv2D(pred_filters, (1, 1), name='predict_conv_2')(p2)

    # downsample fpn merge for feature map 2 & 1
    p2_down = downsample(p2, f2_channel_num, 'pred_4_2')
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (19x19 for 608 input)
    p1 = csp_stage(p1, f1_channel_num, 'pred_5')
    y1 = DarknetConv2D(pred_filters, (1, 1), name='predict_conv_1')(p1)

    return y1, y2, y3
def DarknetConv2D_BN_Swish(*args, **kwargs):
    """Darknet Convolution2D followed by CustomBatchNormalization and Swish.

    Positional and keyword arguments are forwarded to `DarknetConv2D`.
    `use_bias` defaults to False here, but a caller-supplied `use_bias`
    keyword overrides that default.

    # Returns
        The composition (via `compose`) of the conv, batch-norm and
        swish activation layers, in that order.
    """
    conv_kwargs = {'use_bias': False, **kwargs}
    conv = DarknetConv2D(*args, **conv_kwargs)
    batch_norm = CustomBatchNormalization()
    activation = Activation(swish)
    return compose(conv, batch_norm, activation)
def tiny_yolo5lite_predictions(feature_maps, feature_channel_nums, num_anchors, num_classes, use_spp):
    """Build two prediction tensors from two backbone feature maps.

    Uses Mish convolutions and depthwise-separable fusion convs.

    # Arguments
        feature_maps: sequence `(f1, f2)` of backbone tensors.
        feature_channel_nums: sequence of the two matching channel
            counts `(f1_channel_num, f2_channel_num)`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        use_spp: when True, apply Spp_Conv2D_BN_Mish on the feature
            map 1 head.

    # Returns
        Tuple `(y1, y2)` of prediction tensors.
    """
    f1, f2 = feature_maps
    f1_channel_num, f2_channel_num = feature_channel_nums
    pred_filters = num_anchors * (num_classes + 5)
    f1_half = f1_channel_num // 2

    # feature map 1 head (13 x 13 x f1_channel_num//2 for 416 input)
    top = DarknetConv2D_BN_Mish(f1_half, (1, 1))(f1)
    if use_spp:
        top = Spp_Conv2D_BN_Mish(top, f1_half)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Mish(f2_channel_num // 2, (1, 1)),
        UpSampling2D(2))
    top_up = reduce_then_upsample(top)
    fuse = compose(
        Concatenate(),
        Depthwise_Separable_Conv2D_BN_Mish(filters=f2_channel_num, kernel_size=(3, 3), block_id_str='pred_1'))
    mid = fuse([top_up, f2])

    # feature map 2 output (26 x 26 x f2_channel_num for 416 input)
    y2 = DarknetConv2D(pred_filters, (1, 1), name='predict_conv_2')(mid)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Mish(f1_half, (3, 3), strides=(2, 2), block_id_str='pred_2'))
    mid_down = downsample(mid)
    fuse = compose(
        Concatenate(),
        Depthwise_Separable_Conv2D_BN_Mish(filters=f1_channel_num, kernel_size=(3, 3), block_id_str='pred_3'))
    top = fuse([mid_down, top])

    # feature map 1 output (13 x 13 x f1_channel_num for 416 input)
    y1 = DarknetConv2D(pred_filters, (1, 1), name='predict_conv_1')(top)

    return y1, y2
def yolo5lite_predictions(feature_maps, feature_channel_nums, num_anchors, num_classes):
    """Build three prediction tensors from three backbone feature maps.

    Uses Mish convolutions and the CSP depthwise-separable head helpers.

    NOTE(review): an earlier function in this file is also named
    `yolo5lite_predictions` (it additionally takes `depth_multiple` and
    `width_multiple`). If both live in the same module, this later
    definition replaces the earlier one - confirm which is intended.

    # Arguments
        feature_maps: sequence `(f1, f2, f3)` of backbone tensors.
        feature_channel_nums: sequence of the three matching channel
            counts `(f1_channel_num, f2_channel_num, f3_channel_num)`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.

    # Returns
        Tuple `(y1, y2, y3)` of prediction tensors.
    """
    f1, f2, f3 = feature_maps
    f1_channel_num, f2_channel_num, f3_channel_num = feature_channel_nums
    pred_filters = num_anchors * (num_classes + 5)
    f1_half = f1_channel_num // 2
    f2_half = f2_channel_num // 2
    f3_half = f3_channel_num // 2

    # feature map 1 head (13 x 13 x f1_channel_num//2 for 416 input)
    p1 = make_csp_yolo_spp_depthwise_separable_head(f1, f1_half, block_id_str='pred_1')

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Mish(f2_half, (1, 1)),
        UpSampling2D(2))
    p1_up = reduce_then_upsample(p1)
    p2 = DarknetConv2D_BN_Mish(f2_half, (1, 1))(f2)
    p2 = Concatenate()([p2, p1_up])

    # feature map 2 head (26 x 26 x f2_channel_num//2 for 416 input)
    p2 = make_csp_yolo_depthwise_separable_head(p2, f2_half, block_id_str='pred_2')

    # upsample fpn merge for feature map 2 & 3
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Mish(f3_half, (1, 1)),
        UpSampling2D(2))
    p2_up = reduce_then_upsample(p2)
    p3 = DarknetConv2D_BN_Mish(f3_half, (1, 1))(f3)
    p3 = Concatenate()([p3, p2_up])

    # feature map 3 head & output (52 x 52 x f3_channel_num for 416 input)
    p3 = make_csp_yolo_depthwise_separable_head(p3, f3_half, block_id_str='pred_3')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Mish(f3_channel_num, (3, 3), block_id_str='pred_3_3'),
        DarknetConv2D(pred_filters, (1, 1), name='predict_conv_3'))
    y3 = predict(p3)

    # downsample fpn merge for feature map 3 & 2
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Mish(f2_half, (3, 3), strides=(2, 2), block_id_str='pred_3_4'))
    p3_down = downsample(p3)
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (26 x 26 x f2_channel_num for 416 input)
    p2 = make_csp_yolo_depthwise_separable_head(p2, f2_half, block_id_str='pred_4')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Mish(f2_channel_num, (3, 3), block_id_str='pred_4_3'),
        DarknetConv2D(pred_filters, (1, 1), name='predict_conv_2'))
    y2 = predict(p2)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Mish(f1_half, (3, 3), strides=(2, 2), block_id_str='pred_4_4'))
    p2_down = downsample(p2)
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (13 x 13 x f1_channel_num for 416 input)
    p1 = make_csp_yolo_depthwise_separable_head(p1, f1_half, block_id_str='pred_5')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Mish(f1_channel_num, (3, 3), block_id_str='pred_5_3'),
        DarknetConv2D(pred_filters, (1, 1), name='predict_conv_1'))
    y1 = predict(p1)

    return y1, y2, y3
def yolo5_predictions(feature_maps, feature_channel_nums, num_anchors, num_classes):
    """Build three prediction tensors from three backbone feature maps.

    Uses Mish convolutions with the `make_csp_yolo_spp_head` and
    `make_csp_yolo_head` helpers.

    # Arguments
        feature_maps: sequence `(f1, f2, f3)` of backbone tensors.
        feature_channel_nums: sequence of the three matching channel
            counts `(f1_channel_num, f2_channel_num, f3_channel_num)`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.

    # Returns
        Tuple `(y1, y2, y3)` of prediction tensors.
    """
    f1, f2, f3 = feature_maps
    f1_channel_num, f2_channel_num, f3_channel_num = feature_channel_nums
    pred_filters = num_anchors * (num_classes + 5)
    f1_half = f1_channel_num // 2
    f2_half = f2_channel_num // 2
    f3_half = f3_channel_num // 2

    # feature map 1 head (19x19 for 608 input)
    p1 = make_csp_yolo_spp_head(f1, f1_half)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Mish(f2_half, (1, 1)),
        UpSampling2D(2))
    p1_up = reduce_then_upsample(p1)
    p2 = DarknetConv2D_BN_Mish(f2_half, (1, 1))(f2)
    p2 = Concatenate()([p2, p1_up])

    # feature map 2 head (38x38 for 608 input)
    p2 = make_csp_yolo_head(p2, f2_half)

    # upsample fpn merge for feature map 2 & 3
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Mish(f3_half, (1, 1)),
        UpSampling2D(2))
    p2_up = reduce_then_upsample(p2)
    p3 = DarknetConv2D_BN_Mish(f3_half, (1, 1))(f3)
    p3 = Concatenate()([p3, p2_up])

    # feature map 3 head & output (76x76 for 608 input)
    p3 = make_csp_yolo_head(p3, f3_half)
    predict = compose(
        DarknetConv2D_BN_Mish(f3_channel_num, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1), name='predict_conv_3'))
    y3 = predict(p3)

    # downsample fpn merge for feature map 3 & 2
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Mish(f2_half, (3, 3), strides=(2, 2)))
    p3_down = downsample(p3)
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (38x38 for 608 input)
    p2 = make_csp_yolo_head(p2, f2_half)
    predict = compose(
        DarknetConv2D_BN_Mish(f2_channel_num, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1), name='predict_conv_2'))
    y2 = predict(p2)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Mish(f1_half, (3, 3), strides=(2, 2)))
    p2_down = downsample(p2)
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (19x19 for 608 input)
    p1 = make_csp_yolo_head(p1, f1_half)
    predict = compose(
        DarknetConv2D_BN_Mish(f1_channel_num, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1), name='predict_conv_1'))
    y1 = predict(p1)

    return y1, y2, y3
def CSPDarkNet53(input_shape=None, input_tensor=None, include_top=True, weights='imagenet', pooling=None, classes=1000, **kwargs):
    """Generate cspdarknet53 model for Imagenet classification.

    # Arguments
        input_shape: optional input shape, resolved through
            `_obtain_input_shape` (default size 224, minimum size 28).
        input_tensor: optional existing tensor to use as model input.
        include_top: whether to append the classification head.
        weights: `None` (random initialization), `'imagenet'`, or the
            path of a weights file to load.
        pooling: only used when `include_top` is False; `'avg'` or
            `'max'` appends the matching global pooling layer.
        classes: number of output classes of the classification head.
        **kwargs: accepted but not used by this function.

    # Returns
        A Model instance.

    # Raises
        ValueError: if `weights` is not one of the accepted values, or
            if `weights='imagenet'` with `include_top` and
            `classes != 1000`.
    """
    weights_is_valid = weights in {'imagenet', None} or os.path.exists(weights)
    if not weights_is_valid:
        raise ValueError('The `weights` argument should be either '
                         '`None` (random initialization), `imagenet` '
                         '(pre-training on ImageNet), '
                         'or the path to the weights file to be loaded.')

    if weights == 'imagenet' and include_top and classes != 1000:
        raise ValueError('If using `weights` as `"imagenet"` with `include_top`'
                         ' as true, `classes` should be 1000')

    # Determine proper input shape
    input_shape = _obtain_input_shape(input_shape,
                                      default_size=224,
                                      min_size=28,
                                      data_format=K.image_data_format(),
                                      require_flatten=include_top,
                                      weights=weights)

    img_input = Input(shape=input_shape) if input_tensor is None else input_tensor

    x = csp_darknet53_body(img_input)

    if include_top:
        model_name = 'cspdarknet53'
        x = GlobalAveragePooling2D(name='avg_pool')(x)
        x = Reshape((1, 1, 1024))(x)
        x = DarknetConv2D(classes, (1, 1))(x)
        x = Flatten()(x)
        x = Softmax(name='Predictions/Softmax')(x)
    else:
        model_name = 'cspdarknet53_headless'
        if pooling == 'avg':
            x = GlobalAveragePooling2D(name='avg_pool')(x)
        elif pooling == 'max':
            x = GlobalMaxPooling2D(name='max_pool')(x)

    # Ensure that the model takes into account
    # any potential predecessors of `input_tensor`.
    inputs = img_input if input_tensor is None else get_source_inputs(input_tensor)

    # Create model.
    model = Model(inputs, x, name=model_name)

    # Load weights.
    if weights == 'imagenet':
        if include_top:
            file_name = 'cspdarknet53_weights_tf_dim_ordering_tf_kernels_224.h5'
        else:
            file_name = 'cspdarknet53_weights_tf_dim_ordering_tf_kernels_224_no_top.h5'
        weight_path = BASE_WEIGHT_PATH + file_name
        weights_path = get_file(file_name, weight_path, cache_subdir='models')
        model.load_weights(weights_path)
    elif weights is not None:
        model.load_weights(weights)

    return model
def yolo4lite_mobilenet_body(inputs, num_anchors, num_classes, alpha=1.0):
    """Create YOLO_v4 Lite MobileNet model CNN body in keras.

    # Arguments
        inputs: model input tensor, passed to MobileNet as `input_tensor`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        alpha: MobileNet width multiplier; also scales the head channels.

    # Returns
        A Model mapping `inputs` to `[y1, y2, y3]`.
    """
    mobilenet = MobileNet(input_tensor=inputs, weights='imagenet', include_top=False, alpha=alpha)
    pred_filters = num_anchors * (num_classes + 5)

    # input: 416 x 416 x 3
    # conv_pw_13_relu :13 x 13 x (1024*alpha)
    # conv_pw_11_relu :26 x 26 x (512*alpha)
    # conv_pw_5_relu : 52 x 52 x (256*alpha)

    # f1: 13 x 13 x (1024*alpha) for 416 input
    f1 = mobilenet.get_layer('conv_pw_13_relu').output
    # f2: 26 x 26 x (512*alpha) for 416 input
    f2 = mobilenet.get_layer('conv_pw_11_relu').output
    # f3: 52 x 52 x (256*alpha) for 416 input
    f3 = mobilenet.get_layer('conv_pw_5_relu').output

    # feature map 1 head (13 x 13 x (512*alpha) for 416 input)
    p1 = make_yolo_spp_depthwise_separable_head(f1, int(512 * alpha), block_id_str='14')

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(int(256 * alpha), (1, 1)),
        UpSampling2D(2))
    p1_up = reduce_then_upsample(p1)
    p2 = DarknetConv2D_BN_Leaky(int(256 * alpha), (1, 1))(f2)
    p2 = Concatenate()([p2, p1_up])

    # feature map 2 head (26 x 26 x (256*alpha) for 416 input)
    p2 = make_yolo_depthwise_separable_head(p2, int(256 * alpha), block_id_str='15')

    # upsample fpn merge for feature map 2 & 3
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(int(128 * alpha), (1, 1)),
        UpSampling2D(2))
    p2_up = reduce_then_upsample(p2)
    p3 = DarknetConv2D_BN_Leaky(int(128 * alpha), (1, 1))(f3)
    p3 = Concatenate()([p3, p2_up])

    # feature map 3 head & output (52 x 52 x (256*alpha) for 416 input)
    p3 = make_yolo_depthwise_separable_head(p3, int(128 * alpha), block_id_str='16')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(int(256 * alpha), (3, 3), block_id_str='16_3'),
        DarknetConv2D(pred_filters, (1, 1)))
    y3 = predict(p3)

    # downsample fpn merge for feature map 3 & 2
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(int(256 * alpha), (3, 3), strides=(2, 2), block_id_str='16_4'))
    p3_down = downsample(p3)
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (26 x 26 x (512*alpha) for 416 input)
    p2 = make_yolo_depthwise_separable_head(p2, int(256 * alpha), block_id_str='17')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(int(512 * alpha), (3, 3), block_id_str='17_3'),
        DarknetConv2D(pred_filters, (1, 1)))
    y2 = predict(p2)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(int(512 * alpha), (3, 3), strides=(2, 2), block_id_str='17_4'))
    p2_down = downsample(p2)
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (13 x 13 x (1024*alpha) for 416 input)
    p1 = make_yolo_depthwise_separable_head(p1, int(512 * alpha), block_id_str='18')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(int(1024 * alpha), (3, 3), block_id_str='18_3'),
        DarknetConv2D(pred_filters, (1, 1)))
    y1 = predict(p1)

    return Model(inputs, [y1, y2, y3])
def yolo4_mobilenet_body(inputs, num_anchors, num_classes, alpha=1.0):
    """Create YOLO_V4 MobileNet model CNN body in Keras.

    # Arguments
        inputs: model input tensor, passed to MobileNet as `input_tensor`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        alpha: MobileNet width multiplier; also scales the head channels.

    # Returns
        A Model mapping `inputs` to `[y1, y2, y3]`.
    """
    mobilenet = MobileNet(input_tensor=inputs, weights='imagenet', include_top=False, alpha=alpha)
    pred_filters = num_anchors * (num_classes + 5)

    # input: 416 x 416 x 3
    # conv_pw_13_relu :13 x 13 x (1024*alpha)
    # conv_pw_11_relu :26 x 26 x (512*alpha)
    # conv_pw_5_relu : 52 x 52 x (256*alpha)

    # f1: 13 x 13 x (1024*alpha) for 416 input
    f1 = mobilenet.get_layer('conv_pw_13_relu').output
    # f2: 26 x 26 x (512*alpha) for 416 input
    f2 = mobilenet.get_layer('conv_pw_11_relu').output
    # f3: 52 x 52 x (256*alpha) for 416 input
    f3 = mobilenet.get_layer('conv_pw_5_relu').output

    f1_channel_num = int(1024 * alpha)
    f2_channel_num = int(512 * alpha)
    f3_channel_num = int(256 * alpha)
    f1_half = f1_channel_num // 2
    f2_half = f2_channel_num // 2
    f3_half = f3_channel_num // 2

    # feature map 1 head (13 x 13 x (512*alpha) for 416 input)
    p1 = make_yolo_spp_head(f1, f1_half)

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(f2_half, (1, 1)),
        UpSampling2D(2))
    p1_up = reduce_then_upsample(p1)
    p2 = DarknetConv2D_BN_Leaky(f2_half, (1, 1))(f2)
    p2 = Concatenate()([p2, p1_up])

    # feature map 2 head (26 x 26 x (256*alpha) for 416 input)
    p2 = make_yolo_head(p2, f2_half)

    # upsample fpn merge for feature map 2 & 3
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(f3_half, (1, 1)),
        UpSampling2D(2))
    p2_up = reduce_then_upsample(p2)
    p3 = DarknetConv2D_BN_Leaky(f3_half, (1, 1))(f3)
    p3 = Concatenate()([p3, p2_up])

    # feature map 3 head & output (52 x 52 x (256*alpha) for 416 input)
    p3 = make_yolo_head(p3, f3_half)
    predict = compose(
        DarknetConv2D_BN_Leaky(f3_channel_num, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1)))
    y3 = predict(p3)

    # downsample fpn merge for feature map 3 & 2
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(f2_half, (3, 3), strides=(2, 2)))
    p3_down = downsample(p3)
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (26 x 26 x (512*alpha) for 416 input)
    p2 = make_yolo_head(p2, f2_half)
    predict = compose(
        DarknetConv2D_BN_Leaky(f2_channel_num, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1)))
    y2 = predict(p2)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(f1_half, (3, 3), strides=(2, 2)))
    p2_down = downsample(p2)
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (13 x 13 x (1024*alpha) for 416 input)
    p1 = make_yolo_head(p1, f1_half)
    predict = compose(
        DarknetConv2D_BN_Leaky(f1_channel_num, (3, 3)),
        DarknetConv2D(pred_filters, (1, 1)))
    y1 = predict(p1)

    return Model(inputs, [y1, y2, y3])
def yolo4lite_efficientnet_body(inputs, num_anchors, num_classes, level=1):
    """Create YOLO_v4 Lite EfficientNet model CNN body in keras.

    # Arguments
        inputs: model input tensor, passed to
            `get_efficientnet_backbone_info`.
        num_anchors: anchors per output; together with `num_classes` it
            sets the prediction conv filters to
            `num_anchors * (num_classes + 5)`.
        num_classes: number of object classes.
        level: EfficientNet level number. by default we use
            basic EfficientNetB1 as backbone

    # Returns
        A Model mapping `inputs` to `[y1, y2, y3]`.
    """
    efficientnet, feature_map_info = get_efficientnet_backbone_info(inputs, level=level)
    pred_filters = num_anchors * (num_classes + 5)

    f1 = efficientnet.get_layer('top_activation').output
    f2 = efficientnet.get_layer('block6a_expand_activation').output
    f3 = efficientnet.get_layer('block4a_expand_activation').output

    f1_channel_num = feature_map_info['f1_channel_num']
    f2_channel_num = feature_map_info['f2_channel_num']
    f3_channel_num = feature_map_info['f3_channel_num']
    f1_half = f1_channel_num // 2
    f2_half = f2_channel_num // 2
    f3_half = f3_channel_num // 2

    # feature map 1 head (13x13x(f1_channel_num//2) for 416 input)
    p1 = make_yolo_spp_depthwise_separable_head(f1, f1_half, block_id_str='8')

    # upsample fpn merge for feature map 1 & 2
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(f2_half, (1, 1)),
        UpSampling2D(2))
    p1_up = reduce_then_upsample(p1)
    p2 = DarknetConv2D_BN_Leaky(f2_half, (1, 1))(f2)
    p2 = Concatenate()([p2, p1_up])

    # feature map 2 head (26x26x(f2_channel_num//2) for 416 input)
    p2 = make_yolo_depthwise_separable_head(p2, f2_half, block_id_str='9')

    # upsample fpn merge for feature map 2 & 3
    reduce_then_upsample = compose(
        DarknetConv2D_BN_Leaky(f3_half, (1, 1)),
        UpSampling2D(2))
    p2_up = reduce_then_upsample(p2)
    p3 = DarknetConv2D_BN_Leaky(f3_half, (1, 1))(f3)
    p3 = Concatenate()([p3, p2_up])

    # feature map 3 head & output (52x52xf3_channel_num for 416 input)
    p3 = make_yolo_depthwise_separable_head(p3, f3_half, block_id_str='10')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(f3_channel_num, (3, 3), block_id_str='10_3'),
        DarknetConv2D(pred_filters, (1, 1)))
    y3 = predict(p3)

    # downsample fpn merge for feature map 3 & 2
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(f2_half, (3, 3), strides=(2, 2), block_id_str='10_4'))
    p3_down = downsample(p3)
    p2 = Concatenate()([p3_down, p2])

    # feature map 2 output (26x26xf2_channel_num for 416 input)
    p2 = make_yolo_depthwise_separable_head(p2, f2_half, block_id_str='11')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(f2_channel_num, (3, 3), block_id_str='11_3'),
        DarknetConv2D(pred_filters, (1, 1)))
    y2 = predict(p2)

    # downsample fpn merge for feature map 2 & 1
    downsample = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(f1_half, (3, 3), strides=(2, 2), block_id_str='11_4'))
    p2_down = downsample(p2)
    p1 = Concatenate()([p2_down, p1])

    # feature map 1 output (13x13xf1_channel_num for 416 input)
    p1 = make_yolo_depthwise_separable_head(p1, f1_half, block_id_str='12')
    predict = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(f1_channel_num, (3, 3), block_id_str='12_3'),
        DarknetConv2D(pred_filters, (1, 1)))
    y1 = predict(p1)

    return Model(inputs, [y1, y2, y3])
def yolo4lite_mobilenetv3small_body(inputs, num_anchors, num_classes, alpha=1.0):
    '''
    Build the YOLO_v4 Lite CNN body on top of a MobileNetV3Small backbone.

    # Arguments
        inputs: input tensor handed to the backbone and used as model input.
        num_anchors: anchor count used to size the prediction layers; every
            output has num_anchors * (num_classes + 5) channels.
        num_classes: class count used to size the prediction layers.
        alpha: width multiplier passed to MobileNetV3Small; it also scales
            every head channel count below.

    # Returns
        Model mapping `inputs` to [y1, y2, y3]: the predictions for feature
        maps 1, 2 and 3 (13x13, 26x26 and 52x52 for a 416 input).
    '''
    backbone = MobileNetV3Small(input_tensor=inputs,
                                weights='imagenet',
                                include_top=False,
                                alpha=alpha)

    # Backbone layer reference for a 416 x 416 x 3 input:
    #   activation_31 (layer 165, final feature map):       13 x 13 x (576*alpha)
    #   expanded_conv_10/Add (layer 162, end of block10):   13 x 13 x (96*alpha)
    #   activation_22 (layer 117, middle of block8):        26 x 26 x (288*alpha)
    #   expanded_conv_7/Add (layer 114, end of block7):     26 x 26 x (48*alpha)
    #   activation_7 (layer 38, middle of block3):          52 x 52 x (96*alpha)
    #   expanded_conv_2/Add (layer 35, end of block2):      52 x 52 x (24*alpha)
    #
    # NOTE: activation layer names may differ between TF1.x and TF2.x, so
    # the taps are fetched by index instead of by name.
    coarse_feat = backbone.layers[165].output  # 13 x 13 x (576*alpha)
    mid_feat = backbone.layers[117].output     # 26 x 26 x (288*alpha)
    fine_feat = backbone.layers[38].output     # 52 x 52 x (96*alpha)

    # Channel widths for each scale: "half" feeds the heads and merges,
    # "full" feeds the 3x3 conv in front of each prediction layer.
    coarse_half, coarse_full = int(288 * alpha), int(576 * alpha)
    mid_half, mid_full = int(144 * alpha), int(288 * alpha)
    fine_half, fine_full = int(48 * alpha), int(96 * alpha)

    # Channel count shared by all three prediction layers.
    pred_ch = num_anchors * (num_classes + 5)

    # Feature map 1 head: 13 x 13 x (288*alpha) for a 416 input.
    coarse_x = make_yolo_spp_depthwise_separable_head(
        coarse_feat, coarse_half, block_id_str='11')

    # Top-down merge of feature maps 1 and 2.
    coarse_up = compose(
        DarknetConv2D_BN_Leaky(mid_half, (1, 1)),
        UpSampling2D(2))(coarse_x)
    mid_x = DarknetConv2D_BN_Leaky(mid_half, (1, 1))(mid_feat)
    mid_x = Concatenate()([mid_x, coarse_up])

    # Feature map 2 head: 26 x 26 x (144*alpha) for a 416 input.
    mid_x = make_yolo_depthwise_separable_head(
        mid_x, mid_half, block_id_str='12')

    # Top-down merge of feature maps 2 and 3.
    mid_up = compose(
        DarknetConv2D_BN_Leaky(fine_half, (1, 1)),
        UpSampling2D(2))(mid_x)
    fine_x = DarknetConv2D_BN_Leaky(fine_half, (1, 1))(fine_feat)
    fine_x = Concatenate()([fine_x, mid_up])

    # Feature map 3 head and output: 52 x 52 x (96*alpha) for a 416 input.
    fine_x = make_yolo_depthwise_separable_head(
        fine_x, fine_half, block_id_str='13')
    fine_out = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(
            fine_full, (3, 3), block_id_str='13_3'),
        DarknetConv2D(pred_ch, (1, 1)))(fine_x)

    # Bottom-up merge of feature maps 3 and 2 (strided conv after padding).
    fine_down = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(
            mid_half, (3, 3), strides=(2, 2), block_id_str='13_4'))(fine_x)
    mid_x = Concatenate()([fine_down, mid_x])

    # Feature map 2 output: 26 x 26 x (288*alpha) for a 416 input.
    mid_x = make_yolo_depthwise_separable_head(
        mid_x, mid_half, block_id_str='14')
    mid_out = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(
            mid_full, (3, 3), block_id_str='14_3'),
        DarknetConv2D(pred_ch, (1, 1)))(mid_x)

    # Bottom-up merge of feature maps 2 and 1.
    mid_down = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(
            coarse_half, (3, 3), strides=(2, 2), block_id_str='14_4'))(mid_x)
    coarse_x = Concatenate()([mid_down, coarse_x])

    # Feature map 1 output: 13 x 13 x (576*alpha) for a 416 input.
    coarse_x = make_yolo_depthwise_separable_head(
        coarse_x, coarse_half, block_id_str='15')
    coarse_out = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(
            coarse_full, (3, 3), block_id_str='15_3'),
        DarknetConv2D(pred_ch, (1, 1)))(coarse_x)

    return Model(inputs, [coarse_out, mid_out, fine_out])
def yolo4lite_mobilenetv3large_body(inputs, num_anchors, num_classes, alpha=1.0):
    '''
    Build the YOLO_v4 Lite CNN body on top of a MobileNetV3Large backbone.

    # Arguments
        inputs: input tensor handed to the backbone and used as model input.
        num_anchors: anchor count used to size the prediction layers; every
            output has num_anchors * (num_classes + 5) channels.
        num_classes: class count used to size the prediction layers.
        alpha: width multiplier passed to MobileNetV3Large; it also scales
            every head channel count below.

    # Returns
        Model mapping `inputs` to [y1, y2, y3]: the predictions for feature
        maps 1, 2 and 3 (13x13, 26x26 and 52x52 for a 416 input).
    '''
    backbone = MobileNetV3Large(input_tensor=inputs,
                                weights='imagenet',
                                include_top=False,
                                alpha=alpha)

    # Backbone layer reference for a 416 x 416 x 3 input:
    #   activation_38 (layer 194, final feature map):       13 x 13 x (960*alpha)
    #   expanded_conv_14/Add (layer 191, end of block14):   13 x 13 x (160*alpha)
    #   activation_29 (layer 146, middle of block12):       26 x 26 x (672*alpha)
    #   expanded_conv_11/Add (layer 143, end of block11):   26 x 26 x (112*alpha)
    #   activation_15 (layer 79, middle of block6):         52 x 52 x (240*alpha)
    #   expanded_conv_5/Add (layer 76, end of block5):      52 x 52 x (40*alpha)
    #
    # NOTE: activation layer names may differ between TF1.x and TF2.x, so
    # the taps are fetched by index instead of by name.
    coarse_feat = backbone.layers[194].output  # 13 x 13 x (960*alpha)
    mid_feat = backbone.layers[146].output     # 26 x 26 x (672*alpha)
    fine_feat = backbone.layers[79].output     # 52 x 52 x (240*alpha)

    # Channel widths for each scale: "half" feeds the heads and merges,
    # "full" feeds the 3x3 conv in front of each prediction layer.
    coarse_half, coarse_full = int(480 * alpha), int(960 * alpha)
    mid_half, mid_full = int(336 * alpha), int(672 * alpha)
    fine_half, fine_full = int(120 * alpha), int(240 * alpha)

    # Channel count shared by all three prediction layers.
    pred_ch = num_anchors * (num_classes + 5)

    # Feature map 1 head: 13 x 13 x (480*alpha) for a 416 input.
    coarse_x = make_yolo_spp_depthwise_separable_head(coarse_feat, coarse_half)

    # Top-down merge of feature maps 1 and 2.
    coarse_up = compose(
        DarknetConv2D_BN_Leaky(mid_half, (1, 1)),
        UpSampling2D(2))(coarse_x)
    mid_x = DarknetConv2D_BN_Leaky(mid_half, (1, 1))(mid_feat)
    mid_x = Concatenate()([mid_x, coarse_up])

    # Feature map 2 head: 26 x 26 x (336*alpha) for a 416 input.
    mid_x = make_yolo_depthwise_separable_head(mid_x, mid_half)

    # Top-down merge of feature maps 2 and 3.
    mid_up = compose(
        DarknetConv2D_BN_Leaky(fine_half, (1, 1)),
        UpSampling2D(2))(mid_x)
    fine_x = DarknetConv2D_BN_Leaky(fine_half, (1, 1))(fine_feat)
    fine_x = Concatenate()([fine_x, mid_up])

    # Feature map 3 head and output: 52 x 52 x (240*alpha) for a 416 input.
    fine_x = make_yolo_depthwise_separable_head(fine_x, fine_half)
    fine_out = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(fine_full, (3, 3)),
        DarknetConv2D(pred_ch, (1, 1)))(fine_x)

    # Bottom-up merge of feature maps 3 and 2 (strided conv after padding).
    fine_down = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(
            mid_half, (3, 3), strides=(2, 2)))(fine_x)
    mid_x = Concatenate()([fine_down, mid_x])

    # Feature map 2 output: 26 x 26 x (672*alpha) for a 416 input.
    mid_x = make_yolo_depthwise_separable_head(mid_x, mid_half)
    mid_out = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(mid_full, (3, 3)),
        DarknetConv2D(pred_ch, (1, 1)))(mid_x)

    # Bottom-up merge of feature maps 2 and 1.
    mid_down = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        Darknet_Depthwise_Separable_Conv2D_BN_Leaky(
            coarse_half, (3, 3), strides=(2, 2)))(mid_x)
    coarse_x = Concatenate()([mid_down, coarse_x])

    # Feature map 1 output: 13 x 13 x (960*alpha) for a 416 input.
    coarse_x = make_yolo_depthwise_separable_head(coarse_x, coarse_half)
    coarse_out = compose(
        Depthwise_Separable_Conv2D_BN_Leaky(coarse_full, (3, 3)),
        DarknetConv2D(pred_ch, (1, 1)))(coarse_x)

    return Model(inputs, [coarse_out, mid_out, fine_out])
def yolo4_mobilenetv3small_body(inputs, num_anchors, num_classes, alpha=1.0):
    """Build the YOLO_V4 CNN body on top of a MobileNetV3Small backbone.

    # Arguments
        inputs: input tensor handed to the backbone and used as model input.
        num_anchors: anchor count used to size the prediction layers; every
            output has num_anchors * (num_classes + 5) channels.
        num_classes: class count used to size the prediction layers.
        alpha: width multiplier passed to MobileNetV3Small; it also scales
            the per-scale channel counts used by the heads.

    # Returns
        Model mapping `inputs` to [y1, y2, y3]: the predictions for feature
        maps 1, 2 and 3 (13x13, 26x26 and 52x52 for a 416 input).
    """
    backbone = MobileNetV3Small(input_tensor=inputs,
                                weights='imagenet',
                                include_top=False,
                                alpha=alpha)

    # Backbone layer reference for a 416 x 416 x 3 input:
    #   activation_31 (layer 165, final feature map):       13 x 13 x (576*alpha)
    #   expanded_conv_10/Add (layer 162, end of block10):   13 x 13 x (96*alpha)
    #   activation_22 (layer 117, middle of block8):        26 x 26 x (288*alpha)
    #   expanded_conv_7/Add (layer 114, end of block7):     26 x 26 x (48*alpha)
    #   activation_7 (layer 38, middle of block3):          52 x 52 x (96*alpha)
    #   expanded_conv_2/Add (layer 35, end of block2):      52 x 52 x (24*alpha)
    #
    # NOTE: activation layer names may differ between TF1.x and TF2.x, so
    # the taps are fetched by index instead of by name.
    coarse_feat = backbone.layers[165].output  # 13 x 13 x (576*alpha)
    mid_feat = backbone.layers[117].output     # 26 x 26 x (288*alpha)
    fine_feat = backbone.layers[38].output     # 52 x 52 x (96*alpha)

    # Full channel count per scale; the heads and merges use half of it.
    coarse_ch = int(576 * alpha)
    mid_ch = int(288 * alpha)
    fine_ch = int(96 * alpha)
    coarse_half = coarse_ch // 2
    mid_half = mid_ch // 2
    fine_half = fine_ch // 2

    # Channel count shared by all three prediction layers.
    pred_ch = num_anchors * (num_classes + 5)

    # Feature map 1 head: 13 x 13 x (coarse_ch//2) for a 416 input.
    coarse_x = make_yolo_spp_head(coarse_feat, coarse_half)

    # Top-down merge of feature maps 1 and 2.
    coarse_up = compose(
        DarknetConv2D_BN_Leaky(mid_half, (1, 1)),
        UpSampling2D(2))(coarse_x)
    mid_x = DarknetConv2D_BN_Leaky(mid_half, (1, 1))(mid_feat)
    mid_x = Concatenate()([mid_x, coarse_up])

    # Feature map 2 head: 26 x 26 x (mid_ch//2) for a 416 input.
    mid_x = make_yolo_head(mid_x, mid_half)

    # Top-down merge of feature maps 2 and 3.
    mid_up = compose(
        DarknetConv2D_BN_Leaky(fine_half, (1, 1)),
        UpSampling2D(2))(mid_x)
    fine_x = DarknetConv2D_BN_Leaky(fine_half, (1, 1))(fine_feat)
    fine_x = Concatenate()([fine_x, mid_up])

    # Feature map 3 head and output: 52 x 52 x fine_ch for a 416 input.
    fine_x = make_yolo_head(fine_x, fine_half)
    fine_out = compose(
        DarknetConv2D_BN_Leaky(fine_ch, (3, 3)),
        DarknetConv2D(pred_ch, (1, 1)))(fine_x)

    # Bottom-up merge of feature maps 3 and 2 (strided conv after padding).
    fine_down = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(mid_half, (3, 3), strides=(2, 2)))(fine_x)
    mid_x = Concatenate()([fine_down, mid_x])

    # Feature map 2 output: 26 x 26 x mid_ch for a 416 input.
    mid_x = make_yolo_head(mid_x, mid_half)
    mid_out = compose(
        DarknetConv2D_BN_Leaky(mid_ch, (3, 3)),
        DarknetConv2D(pred_ch, (1, 1)))(mid_x)

    # Bottom-up merge of feature maps 2 and 1.
    mid_down = compose(
        ZeroPadding2D(((1, 0), (1, 0))),
        DarknetConv2D_BN_Leaky(coarse_half, (3, 3), strides=(2, 2)))(mid_x)
    coarse_x = Concatenate()([mid_down, coarse_x])

    # Feature map 1 output: 13 x 13 x coarse_ch for a 416 input.
    coarse_x = make_yolo_head(coarse_x, coarse_half)
    coarse_out = compose(
        DarknetConv2D_BN_Leaky(coarse_ch, (3, 3)),
        DarknetConv2D(pred_ch, (1, 1)))(coarse_x)

    return Model(inputs, [coarse_out, mid_out, fine_out])