def csp_block(inputs, filters, num_blocks):
    """
    Build a CSP (Cross Stage Partial) block on top of ``inputs``.

    The input is first downsampled by a strided 3x3 convolution. The result then feeds two parallel
    1x1 convolutions of ``filters // 2`` channels each:
        - the "route" branch, which is kept as is
        - the main branch, which goes through ``num_blocks`` residual blocks and one more 1x1 convolution
    Both branches are finally concatenated and fused by a 1x1 convolution with ``filters`` channels.

    Args:
        inputs (tf.Tensor): 4D (N,H,W,C) input tensor
        filters (int): Number of filters of the downsampling and of the output convolutions
        num_blocks (int): Number of residual blocks applied on the main branch

    Returns:
        tf.Tensor: 4D (N,H/2,W/2,filters) output tensor
    """

    def pointwise(tensor, n_filters):
        # 1x1 convolution + batch norm + mish, the only flavour used after the downsampling.
        return conv_bn(tensor, filters=n_filters, kernel_size=1, strides=1, activation="mish")

    branch_filters = filters // 2

    # Downsampling: explicit zero padding followed by a "valid" 3x3 convolution with stride 2.
    downsampled = conv_bn(
        inputs,
        filters=filters,
        kernel_size=3,
        strides=2,
        zero_pad=True,
        padding="valid",
        activation="mish",
    )

    # The route branch is created before the main one: layer creation order is kept stable on purpose.
    route = pointwise(downsampled, branch_filters)

    main = pointwise(downsampled, branch_filters)
    main = residual_block(main, num_blocks=num_blocks)
    main = pointwise(main, branch_filters)

    merged = tf.keras.layers.Concatenate()([main, route])
    return pointwise(merged, filters)
def csp_darknet53(input_shape):
    """
    Build the CSPDarknet53 backbone, following the AlexeyAB/darknet config
    https://github.com/AlexeyAB/darknet/blob/master/cfg/yolov4.cfg

    Args:
        input_shape: Shape forwarded unchanged to ``tf.keras.Input`` (``shape`` argument).

    Returns:
        tf.keras.Model: Model named "CSPDarknet53" with three outputs, which are the results of the third,
            fourth and fifth downsampling stages, in that order.
    """

    def mish_conv(tensor, n_filters, kernel_size):
        # Stride-1 convolution + batch norm + mish.
        return conv_bn(tensor, filters=n_filters, kernel_size=kernel_size, strides=1, activation="mish")

    inputs = tf.keras.Input(shape=input_shape)

    # First downsampling: L29 -> L103 (presumably line numbers of the config referenced above)
    stem = mish_conv(inputs, 32, kernel_size=3)

    # This stage is almost a CSPBlock, except that the number of filters changes in the middle of it.
    # It is written out by hand so that csp_block itself stays simple, at the cost of a little redundancy.
    x = conv_bn(
        stem,
        filters=64,
        kernel_size=3,
        strides=2,
        zero_pad=True,
        padding="valid",
        activation="mish",
    )
    route = mish_conv(x, 64, kernel_size=1)

    shortcut = mish_conv(x, 64, kernel_size=1)
    bottleneck = mish_conv(shortcut, 32, kernel_size=1)
    bottleneck = mish_conv(bottleneck, 64, kernel_size=3)
    x = bottleneck + shortcut
    x = mish_conv(x, 64, kernel_size=1)

    x = tf.keras.layers.Concatenate()([x, route])
    x = mish_conv(x, 64, kernel_size=1)

    # Second downsampling: L105 -> L191
    x = csp_block(x, filters=128, num_blocks=2)

    # Third downsampling: L193 -> L400
    # Fourth downsampling: L402 -> L614
    # Fifth downsampling: L616 -> L744
    # Each of these three stages is chained on the previous one and exposed as a model output.
    stage_outputs = []
    for stage_filters, stage_blocks in ((256, 8), (512, 8), (1024, 4)):
        x = csp_block(x, filters=stage_filters, num_blocks=stage_blocks)
        stage_outputs.append(x)

    return tf.keras.Model(inputs, stage_outputs, name="CSPDarknet53")
def residual_block(inputs, num_blocks):
    """
    Chain ``num_blocks`` residual units, each made of a 1x1 and a 3x3 convolution whose result is
    added back to the input of the unit.

    Args:
        inputs (tf.Tensor): 4D (N,H,W,C) input tensor
        num_blocks (int): Number of residual units to chain

    Returns:
        tf.Tensor: 4D (N,H,W,C) output tensor (``inputs`` itself when ``num_blocks`` is 0)
    """
    # Unpacking into exactly four names also enforces that the input is 4D.
    _batch, _height, _width, channels = inputs.shape

    output = inputs
    for _ in range(num_blocks):
        skip = output
        branch = conv_bn(output, channels, kernel_size=1, strides=1, activation="mish")
        branch = conv_bn(branch, channels, kernel_size=3, strides=1, activation="mish")
        output = branch + skip

    return output
def yolov4_neck(input_shapes):
    """
    Implements the neck of YOLOv4: an SPP module on the coarsest feature map, followed by a top-down
    path which upsamples and merges it with the two finer feature maps.

    Args:
        input_shapes (List[Tuple[int]]): List of 3 tuples, which are the output shapes of the backbone,
            ordered from the finest to the coarsest feature map. Falsy dimensions (None, typically the batch
            dimension) are dropped before building the inputs, so every remaining dimension must be known.
            For CSPDarknet53, those are: [(52, 52, 256), (26, 26, 512), (13, 13, 1024)] for a (416, 416) input.

    Returns:
        tf.keras.Model: Neck model named "YOLOv4_neck", taking the 3 backbone feature maps as inputs and
            returning 3 feature maps [output_1, output_2, output_3] with 128, 256 and 512 channels.
    """

    def leaky_conv(tensor, n_filters, kernel_size):
        # Stride-1 convolution + batch norm + leaky ReLU, the only convolution flavour used in the neck.
        return conv_bn(tensor, filters=n_filters, kernel_size=kernel_size, strides=1, activation="leaky_relu")

    # The shapes are materialised as tuples: `filter` returns a lazy one-shot iterator, while
    # tf.keras.Input documents `shape` as a shape tuple.
    input_1 = tf.keras.Input(shape=tuple(filter(None, input_shapes[0])))
    input_2 = tf.keras.Input(shape=tuple(filter(None, input_shapes[1])))
    input_3 = tf.keras.Input(shape=tuple(filter(None, input_shapes[2])))

    # SPP on the coarsest feature map: stride-1 max-poolings with growing windows, concatenated
    # together with the un-pooled tensor.
    x = leaky_conv(input_3, 512, kernel_size=1)
    x = leaky_conv(x, 1024, kernel_size=3)
    x = leaky_conv(x, 512, kernel_size=1)

    maxpool_1 = tf.keras.layers.MaxPool2D((5, 5), strides=1, padding="same")(x)
    maxpool_2 = tf.keras.layers.MaxPool2D((9, 9), strides=1, padding="same")(x)
    maxpool_3 = tf.keras.layers.MaxPool2D((13, 13), strides=1, padding="same")(x)

    spp = tf.keras.layers.Concatenate()([maxpool_3, maxpool_2, maxpool_1, x])

    x = leaky_conv(spp, 512, kernel_size=1)
    x = leaky_conv(x, 1024, kernel_size=3)
    output_3 = leaky_conv(x, 512, kernel_size=1)

    # Top-down step 1: upsample the coarsest features and merge them with the intermediate feature map.
    x = leaky_conv(output_3, 256, kernel_size=1)
    upsampled = tf.keras.layers.UpSampling2D()(x)

    x = leaky_conv(input_2, 256, kernel_size=1)
    x = tf.keras.layers.Concatenate()([x, upsampled])

    x = leaky_conv(x, 256, kernel_size=1)
    x = leaky_conv(x, 512, kernel_size=3)
    x = leaky_conv(x, 256, kernel_size=1)
    x = leaky_conv(x, 512, kernel_size=3)
    output_2 = leaky_conv(x, 256, kernel_size=1)

    # Top-down step 2: same scheme, merging with the finest feature map.
    x = leaky_conv(output_2, 128, kernel_size=1)
    upsampled = tf.keras.layers.UpSampling2D()(x)

    x = leaky_conv(input_1, 128, kernel_size=1)
    x = tf.keras.layers.Concatenate()([x, upsampled])

    x = leaky_conv(x, 128, kernel_size=1)
    x = leaky_conv(x, 256, kernel_size=3)
    x = leaky_conv(x, 128, kernel_size=1)
    x = leaky_conv(x, 256, kernel_size=3)
    output_1 = leaky_conv(x, 128, kernel_size=1)

    return tf.keras.Model(
        [input_1, input_2, input_3],
        [output_1, output_2, output_3],
        name="YOLOv4_neck",
    )
def yolov3_head(
    input_shapes,
    anchors,
    num_classes,
    training,
    yolo_max_boxes,
    yolo_iou_threshold,
    yolo_score_threshold,
):
    """
    Returns the YOLOv3 head, which is used in YOLOv4

    The finest feature map directly produces the first prediction. It is then downsampled by strided
    convolutions and merged with the coarser feature maps to produce the second and third predictions.

    Args:
        input_shapes (List[Tuple[int]]): List of 3 tuples, which are the output shapes of the neck.
            Falsy dimensions (None, typically the batch dimension) are ignored.
            For CSPDarknet53+YOLOv4_neck, those are: [(52, 52, 128), (26, 26, 256), (13, 13, 512)] for a
            (416, 416) input.
        anchors (List[numpy.array[int, 2]]): List of 3 numpy arrays containing the anchor sizes used for each
            stage. The first and second columns of the numpy arrays respectively contain the anchors width and
            height.
        num_classes (int): Number of classes.
        training (boolean): If False, will output boxes computed through YOLO regression and NMS, and YOLO
            features otherwise. Set it True for training, and False for inferences.
        yolo_max_boxes (int): Maximum number of boxes predicted on each image (across all anchors/stages)
        yolo_iou_threshold (float between 0. and 1.): IOU threshold defining whether close boxes will be merged
            during non-max suppression.
        yolo_score_threshold (float between 0. and 1.): Boxes with score lower than this threshold will be
            filtered out during non-max suppression.

    Returns:
        tf.keras.Model: Head model named "YOLOv3_head". When ``training`` is truthy its outputs are the three
            raw prediction tensors; otherwise its output is the result of ``yolo_nms``.
    """
    # The shapes are materialised as tuples: `filter` returns a lazy one-shot iterator, while
    # tf.keras.Input documents `shape` as a shape tuple.
    input_1 = tf.keras.Input(shape=tuple(filter(None, input_shapes[0])))
    input_2 = tf.keras.Input(shape=tuple(filter(None, input_shapes[1])))
    input_3 = tf.keras.Input(shape=tuple(filter(None, input_shapes[2])))

    # First stage: predictions computed directly from the first input.
    x = conv_bn(input_1, filters=256, kernel_size=3, strides=1, activation="leaky_relu")
    output_1 = conv_classes_anchors(x, num_anchors_stage=len(anchors[0]), num_classes=num_classes)

    # Second stage: the first input is downsampled (stride 2) and merged with the second input.
    x = conv_bn(
        input_1,
        filters=256,
        kernel_size=3,
        strides=2,
        zero_pad=True,
        padding="valid",
        activation="leaky_relu",
    )
    x = tf.keras.layers.Concatenate()([x, input_2])

    x = conv_bn(x, filters=256, kernel_size=1, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=512, kernel_size=3, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=256, kernel_size=1, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=512, kernel_size=3, strides=1, activation="leaky_relu")
    # `connection` feeds both the second-stage predictions and the third-stage downsampling.
    connection = conv_bn(x, filters=256, kernel_size=1, strides=1, activation="leaky_relu")

    x = conv_bn(connection, filters=512, kernel_size=3, strides=1, activation="leaky_relu")
    output_2 = conv_classes_anchors(x, num_anchors_stage=len(anchors[1]), num_classes=num_classes)

    # Third stage: `connection` is downsampled (stride 2) and merged with the third input.
    x = conv_bn(
        connection,
        filters=512,
        kernel_size=3,
        strides=2,
        zero_pad=True,
        padding="valid",
        activation="leaky_relu",
    )
    x = tf.keras.layers.Concatenate()([x, input_3])

    x = conv_bn(x, filters=512, kernel_size=1, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=1024, kernel_size=3, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=512, kernel_size=1, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=1024, kernel_size=3, strides=1, activation="leaky_relu")
    x = conv_bn(x, filters=512, kernel_size=1, strides=1, activation="leaky_relu")

    x = conv_bn(x, filters=1024, kernel_size=3, strides=1, activation="leaky_relu")
    output_3 = conv_classes_anchors(x, num_anchors_stage=len(anchors[2]), num_classes=num_classes)

    # Training mode: expose the raw predictions of the three stages.
    if training:
        return tf.keras.Model(
            [input_1, input_2, input_3],
            [output_1, output_2, output_3],
            name="YOLOv3_head",
        )

    # Inference mode: decode each stage with its own anchors, then run NMS over the three stages.
    predictions_1 = tf.keras.layers.Lambda(
        lambda x_input: yolov3_boxes_regression(x_input, anchors[0]),
        name="yolov3_boxes_regression_small_scale",
    )(output_1)
    predictions_2 = tf.keras.layers.Lambda(
        lambda x_input: yolov3_boxes_regression(x_input, anchors[1]),
        name="yolov3_boxes_regression_medium_scale",
    )(output_2)
    predictions_3 = tf.keras.layers.Lambda(
        lambda x_input: yolov3_boxes_regression(x_input, anchors[2]),
        name="yolov3_boxes_regression_large_scale",
    )(output_3)

    output = tf.keras.layers.Lambda(
        lambda x_input: yolo_nms(
            x_input,
            yolo_max_boxes=yolo_max_boxes,
            yolo_iou_threshold=yolo_iou_threshold,
            yolo_score_threshold=yolo_score_threshold,
        ),
        name="yolov4_nms",
    )([predictions_1, predictions_2, predictions_3])

    return tf.keras.Model([input_1, input_2, input_3], output, name="YOLOv3_head")