def _make_predict_layer(self, input_channels: int, inner_channels_list: List[int], output_channels: int) -> nn.ModuleList:
    """
    Build one YOLO prediction head made of seven convolution layers.

    According to the original author's notes, the first five layers extract
    features and the last two layers produce the network's prediction.

    :param input_channels: number of channels fed into the head
    :param inner_channels_list: intermediate channel counts, laid out as
        [down_dimension_channels (feature integration / reduction),
        feature_extract_channels (feature extraction)]; only indices 0 and 1 are read
    :param output_channels: number of channels produced by the last layer
    :return: the seven layers of the head, in order, as an ``nn.ModuleList``
    """
    reduce_channels = inner_channels_list[0]
    extract_channels = inner_channels_list[1]

    head = nn.ModuleList()
    current_channels = input_channels

    # Six project convs whose third argument alternates 1, 3, 1, 3, 1, 3
    # (presumably the kernel size of base_model.Conv2d - confirm against that helper).
    # The "1" layers map to the reduced width (feature integration), the "3"
    # layers map back to the wider width (feature extraction).
    for kernel in (1, 3, 1, 3, 1, 3):
        next_channels = reduce_channels if kernel == 1 else extract_channels
        head.append(base_model.Conv2d(current_channels, next_channels, kernel))
        current_channels = next_channels

    # Final plain 1x1 convolution (with bias) projecting to the output channel count.
    head.append(
        nn.Conv2d(current_channels, output_channels, kernel_size=1, stride=1, padding=0, bias=True)
    )
    return head
def __init__(self, config: dict) -> None:
    """
    Assemble the network: a DarkNet-53 backbone followed by three prediction heads.

    :param config: model configuration; this constructor reads ``config["anchors"]``
        (entries 0, 1 and 2, one per prediction head) and ``config["classes"]``
    """
    super().__init__()
    self.config = config
    self.backbone = darknet.darknet53(False)

    def head_output_channels(scale_index: int) -> int:
        # One group of (5 + classes) channels per anchor of the given scale.
        # The 5 is presumably 4 box values + 1 confidence - confirm against the decoder/loss.
        return len(config["anchors"][scale_index]) * (5 + config["classes"])

    # Output channel count of each head's final layer
    # (e.g. 3 anchors and 80 classes give 3 * 85 = 255).
    self.predict_output_channels_13 = head_output_channels(0)
    self.predict_output_channels_26 = head_output_channels(1)
    self.predict_output_channels_52 = head_output_channels(2)

    backbone_channels = self.backbone.layers_output_channels

    # Head on the last backbone feature map
    # (original note: channels 1024 -> 512 -> 255).
    inner_13 = [512, 1024]
    self.last_layer_13 = self._make_predict_layer(
        backbone_channels[-1], inner_13, self.predict_output_channels_13)

    # Integrate the previous head's intermediate result and upsample it
    # (original note: 512*13*13 -> 256*26*26). The forward pass is not visible
    # here, so where exactly this branch is tapped should be confirmed there.
    lateral_26 = 256
    self.last_layer_26_conv = base_model.Conv2d(inner_13[0], lateral_26, 1)
    self.last_layer_26_upsample = nn.Upsample(scale_factor=2, mode='nearest')

    # Head on the second-to-last backbone feature map; its input width is the
    # backbone width plus the 256 lateral channels
    # (original note: channels 512/2 + 512 -> 256 -> 255).
    inner_26 = [256, 512]
    self.last_layer_26 = self._make_predict_layer(
        backbone_channels[-2] + lateral_26, inner_26, self.predict_output_channels_26)

    # Same integrate-and-upsample step for the next scale
    # (original note: 256*26*26 -> 128*52*52).
    lateral_52 = 128
    self.last_layer_52_conv = base_model.Conv2d(inner_26[0], lateral_52, 1)
    self.last_layer_52_upsample = nn.Upsample(scale_factor=2, mode='nearest')

    # Head on the third-to-last backbone feature map
    # (original note: channels 256/2 + 256 -> 128 -> 255).
    inner_52 = [128, 256]
    self.last_layer_52 = self._make_predict_layer(
        backbone_channels[-3] + lateral_52, inner_52, self.predict_output_channels_52)
def _make_layer(self, input_channels: int, block_channels: int, block_inner_channels: int, block_count: int) -> nn.Module:
    """
    Build one backbone stage: a down-sampling convolution followed by stacked residual blocks.

    :param input_channels: number of channels entering the stage
    :param block_channels: input and output channel count of every residual block;
        they must match because the blocks are stacked one after another
    :param block_inner_channels: intermediate channel count inside each residual block
    :param block_count: number of residual blocks to stack
    :return: an ``nn.Sequential`` whose children are named ``down_sample_conv``
        and ``residual_0`` ... ``residual_{block_count - 1}``
    """
    stage = OrderedDict()

    # 1. Down-sampling conv. The trailing 3 and 2 are passed positionally
    #    (presumably kernel size 3 and stride 2 - confirm against base_model.Conv2d).
    stage["down_sample_conv"] = base_model.Conv2d(input_channels, block_channels, 3, 2)

    # 2. Residual blocks, each mapping block_channels -> block_inner_channels -> block_channels.
    for index in range(block_count):
        stage["residual_{}".format(index)] = base_model.BasicBlock(
            block_channels, block_inner_channels, block_channels)

    # 3. Chain everything in insertion order.
    return nn.Sequential(stage)
def __init__(self, block_counts: list) -> None:
    """
    Build the DarkNet backbone: an initial convolution plus five down-sampling residual stages.

    :param block_counts: number of residual blocks in each stage; indices 0..4 are read
    """
    super().__init__()

    self.input_channels = 3   # channels of the input image
    self.init_channels = 32   # channels produced by the initial convolution

    # Initial convolution (original note: 416,416,3 -> 416,416,32).
    self.init_conv = base_model.Conv2d(self.input_channels, self.init_channels, 3)

    # Five stages of down-sampling followed by residual blocks. Each stage's
    # residual blocks use half of the stage width as their inner width.
    # Shapes in the notes below are the original author's, for a 416x416 input.
    width_1, width_2, width_3, width_4, width_5 = 64, 128, 256, 512, 1024
    self.layer1 = self._make_layer(
        self.init_channels, width_1, width_1 // 2, block_counts[0])  # 416,416,32 -> 208,208,64
    self.layer2 = self._make_layer(
        width_1, width_2, width_2 // 2, block_counts[1])  # 208,208,64 -> 104,104,128
    self.layer3 = self._make_layer(
        width_2, width_3, width_3 // 2, block_counts[2])  # 104,104,128 -> 52,52,256
    self.layer4 = self._make_layer(
        width_3, width_4, width_4 // 2, block_counts[3])  # 52,52,256 -> 26,26,512
    self.layer5 = self._make_layer(
        width_4, width_5, width_5 // 2, block_counts[4])  # 26,26,512 -> 13,13,1024

    # Output channel count of each of the five stages.
    self.layers_output_channels = [width_1, width_2, width_3, width_4, width_5]

    # Weight initialisation. self.modules() walks modules at every nesting
    # level, but only the primitive conv / batch-norm layers are touched.
    for module in self.modules():
        if isinstance(module, nn.BatchNorm2d):
            module.weight.data.fill_(1)   # scale starts at 1
            module.bias.data.zero_()      # shift starts at 0
            continue
        if not isinstance(module, nn.Conv2d):
            continue
        kernel_h, kernel_w = module.kernel_size
        # Kernel area times output channels.
        fan_out = kernel_h * kernel_w * module.out_channels
        # Simplified Kaiming-style Gaussian init: zero mean, std = sqrt(2 / fan_out).
        module.weight.data.normal_(0, math.sqrt(2. / fan_out))