def rpn_graph(feature_map, anchor_per_location, anchor_stride, name=None):
    """Build the RPN head for a single feature map.

    A shared convolution followed by ReLU feeds two 1x1 convolutions: one
    producing object / non-object logits and one producing box regression
    values. Both outputs are flattened so that the second axis enumerates
    anchors.

    :param feature_map: feature map tensor; documented by the original author
        as [batch, height, width, channels]. Only ``shape[0]`` is read here.
    :param anchor_per_location: int, number of anchors generated per position
    :param anchor_stride: forwarded to the shared convolution as its fourth
        positional argument (its stride, judging by the keyword calls to
        ``conv2d`` in this function); the original docs say it is usually 1
    :param name: optional suffix appended (as ``"_<name>"``) to the three layer
        names so the head can be instantiated under distinct names; when None
        the layer names are exactly the historical ones.
        NOTE(review): the suffix format is a choice made here -- confirm it
        against any checkpoints that must stay loadable.
    :return: list ``[rpn_binary_logits, rpn_probs, rpn_bbox]``; the first two
        are reshaped to [batch, -1, 2] and the last to [batch, -1, 4]
    """
    suffix = "" if name is None else "_" + str(name)

    # Only the leading (batch) dimension is needed: the reshapes below use -1
    # for the anchor axis, so the anchor count never has to be computed.
    batch_size = feature_map.shape[0]

    shared = conv2d(feature_map, 512, 3, anchor_stride,
                    name='rpn_conv_shared' + suffix)
    shared = tf.nn.relu(shared)

    # 2 * anchor_per_location channels: one (object, non-object) pair per anchor.
    x = conv2d(inputs=shared,
               out_channal=2 * anchor_per_location,
               kernel_size=1,
               strides=1,
               name='rpn_class_raw' + suffix)
    rpn_binary_logits = tf.reshape(x, [batch_size, -1, 2])
    rpn_probs = tf.nn.softmax(rpn_binary_logits)

    # 4 * anchor_per_location channels: one box regression vector per anchor.
    x = conv2d(inputs=shared,
               out_channal=4 * anchor_per_location,
               kernel_size=1,
               strides=1,
               name='rpn_bbox_pred' + suffix)
    rpn_bbox = tf.reshape(x, [batch_size, -1, 4])

    return [rpn_binary_logits, rpn_probs, rpn_bbox]
def build_fpn_mask_graph(rois, feature_maps, image_shape, pool_size, num_class, train_bn=True, name=None):
    """Build the mask head on top of the pyramid feature maps.

    The ROI-aligned features go through four conv -> batch-norm -> ReLU
    stages, one stride-2 transposed convolution with ReLU, and a final 1x1
    convolution with ``num_class`` output channels.

    :param rois: proposals; documented by the original author as
        [batch, num_rois, (x1, y1, x2, y2)]
    :param feature_maps: list of feature maps, documented as [p2, p3, p4, p5]
    :param image_shape: shape of the input image, documented as
        [height, width, channels]; every image in a batch must share it
    :param pool_size: output size of the ROI pooling step (the original docs
        mention 14x14 for the mask head)
    :param num_class: number of classes; sets the channel count of the output
    :param train_bn: forwarded to ``batch_norm``
    :param name: name given to the final 1x1 convolution only; the other
        layers are created without explicit names
    :return: mask logits; documented by the original author as
        [num_boxes, 28, 28, num_classes]
    """
    # ROI pooling result; documented as [num_boxes, height, width, channels].
    net = pyramidROIAlign(pool_size, rois, image_shape, feature_maps)

    conv_stage_count = 4
    for _ in range(conv_stage_count):
        conv_out = conv2d(inputs=net,
                          out_channal=256,
                          kernel_size=3,
                          strides=1,
                          use_bias=False)
        net = tf.nn.relu(batch_norm(conv_out, train_bn))

    upsampled = conv2d_transpose(inputs=net,
                                 out_channal=256,
                                 kernel_size=3,
                                 strides=2)
    upsampled = tf.nn.relu(upsampled)

    return conv2d(upsampled, num_class, 1, 1, use_bias=True, name=name)
def fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_shape, pool_size, num_classes, name="mrcnn"):
    """Build the classification and box-regression heads of the FPN.

    ROI-aligned features are reduced by a ``pool_size[0]``-sized convolution
    with matching stride (acting like a fully connected layer over each box),
    followed by a 1x1 convolution. The squeezed result feeds two dense heads:
    class logits and per-class box deltas.

    :param rois: proposals; documented by the original author as
        [batch, num_rois, (x1, y1, x2, y2)]
    :param mrcnn_feature_maps: list of feature maps, documented as
        [p2, p3, p4, p5]
    :param input_image_shape: shape of the input image, documented as
        [height, width, channels]; every image in a batch must share it
    :param pool_size: ROI pooling output size (the original docs mention 7x7);
        ``pool_size[0]`` is used as kernel size and stride of the first conv
    :param num_classes: number of classes
    :param name: prefix for the layer names. The default ``"mrcnn"`` yields
        exactly the historical names (``mrcnn_class_conv1`` ...); passing None
        is treated like the default.
    :return: tuple ``(mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox)``;
        the box tensor is reshaped to [num_boxes, num_classes, 4]. The
        original docs describe the first two as [num_boxes, num_classes] and
        the last axis of the box tensor as (dx, dy, log(h), log(w)).
    """
    prefix = "mrcnn" if name is None else name

    # ROI pooling result; documented as [num_boxes, height, width, channels].
    x = pyramidROIAlign(pool_size, rois, input_image_shape, mrcnn_feature_maps)
    # TODO (original author): from this point batch sizes larger than 1 are
    # no longer supported; the leading axis is the box count.
    num_boxes = tf.shape(x)[0]

    # Kernel and stride both equal pool_size[0], so this behaves like a fully
    # connected layer per box; the original author skips batch norm here.
    x = conv2d(inputs=x,
               out_channal=1024,
               kernel_size=pool_size[0],
               strides=pool_size[0],
               use_bias=True,
               name=prefix + "_class_conv1")
    x = tf.nn.relu(x)
    x = conv2d(inputs=x,
               out_channal=1024,
               kernel_size=1,
               strides=1,
               use_bias=True,
               name=prefix + "_class_conv2")

    # Drop the two spatial axes (documented as size 1 each) -> [num_boxes, 1024].
    shared = tf.squeeze(tf.nn.relu(x), axis=[1, 2])

    # Classification head.
    mrcnn_class_logits = dense(inputs=shared,
                               out_dimension=num_classes,
                               use_biase=True,
                               name=prefix + "_class_logits")
    mrcnn_class_probs = tf.nn.softmax(mrcnn_class_logits,
                                      name=prefix + "_class_probs")

    # Box-regression head. The dense layer is intentionally left without an
    # explicit name, as in the original, so its variable names do not change.
    mrcnn_bbox = dense(inputs=shared,
                       out_dimension=4 * num_classes,
                       use_biase=True)
    mrcnn_bbox = tf.reshape(mrcnn_bbox,
                            shape=[num_boxes, num_classes, 4],
                            name=prefix + "_class_bbox")

    return mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox
def build_fpn_mask_graph(rois, feature_maps, image_shape, pool_size, num_class, train_bn=True, name=None):
    """Build the mask head on top of the pyramid feature maps.

    The ROI-aligned features (asserted to contain at least one box) go through
    four named conv -> batch-norm -> ReLU stages, a stride-2 transposed
    convolution with ReLU, and a final 1x1 convolution with ``num_class``
    output channels.

    :param rois: proposals; documented by the original author as
        [num_rois, (x1, y1, x2, y2)]
    :param feature_maps: list of feature maps, documented as [p2, p3, p4, p5]
    :param image_shape: shape of the input image, documented as
        [height, width, channels]; every image in a batch must share it
    :param pool_size: output size of the ROI pooling step (the original docs
        mention 14x14 for the mask head)
    :param num_class: number of classes; sets the channel count of the output
    :param train_bn: forwarded to ``batch_norm``
    :param name: prefix for all layer names; the final convolution is named
        exactly ``name``. When None, ``"mrcnn_mask"`` is used, because the
        layer names are built by string concatenation and None would raise
        a TypeError.
    :return: mask logits; documented by the original author as
        [num_boxes, 28, 28, num_classes]
    """
    prefix = "mrcnn_mask" if name is None else name

    # ROI pooling result; documented as [num_boxes, height, width, channels].
    x = pyramidROIAlign(pool_size, rois, image_shape, feature_maps)

    # Fail at run time if no box reached the mask head.
    asserts = tf.Assert(tf.shape(x)[0] > 0, data=[tf.shape(x)])
    with tf.control_dependencies([asserts]):
        x = tf.identity(x)

    for i in range(4):
        stage = str(i + 1)
        x = conv2d(inputs=x,
                   out_channal=256,
                   kernel_size=3,
                   strides=1,
                   use_bias=False,
                   name=prefix + "_conv" + stage)
        x = batch_norm(x, train_bn, name=prefix + "_bn" + stage)
        x = tf.nn.relu(x)

    x = conv2d_transpose(inputs=x,
                         out_channal=256,
                         kernel_size=2,
                         strides=2,
                         name=prefix + "_deconv")
    x = tf.nn.relu(x)
    x = conv2d(x, num_class, 1, 1, use_bias=True, name=prefix)
    return x
def build_model(self, mode, input_image, gt_boxes=None, class_ids=None, input_gt_mask=None, anchor_labels=None, anchor_deltas=None):
    """Build the full detection graph for training, validation or inference.

    Pipeline: ResNet backbone -> FPN (p2..p5, plus p6 for the RPN only) ->
    RPN on every level -> proposal layer -> either the detection/mask heads
    (inference) or target generation plus the loss terms (otherwise).

    Notes carried over from the original documentation: the five feature
    levels p2..p6 are described as downscaled by 8, 16, 32, 64 and 128
    relative to the input; anchors of all levels are concatenated into
    [batch, num_anchor, (x1, y1, x2, y2)]; coordinates are normalized to
    [0, 1]. For 'training' and 'validation' every argument must be provided;
    for 'inference' only ``input_image`` is needed.

    :param mode: one of 'training', 'validation', 'inference'
    :param input_image: documented as [1, height, width, 3], float32
    :param gt_boxes: documented as [1, num_gt, 4], float32
    :param class_ids: documented as [1, num_gt], int32
    :param input_gt_mask: documented as [1, num_gt, height, width], bool
    :param anchor_labels: documented as [batch, num_anchor], int32; 1 marks a
        positive anchor, 0 a negative one, -1 an ignored one
    :param anchor_deltas: documented as
        [batch, num_anchor, (dx, dy, log(h), log(w))], float32
    :return: ``[boxes, ids, probs, mask]`` in inference mode, otherwise
        ``[rpn_loss, proposal_loss, mask_loss, total_loss]``
    :raises ValueError: if ``mode`` is not one of the three supported values
    """
    # Validate eagerly in Python: ``mode`` is a plain string, so a graph-level
    # tf.Assert attached to no op would never be evaluated.
    if mode not in ('training', 'validation', 'inference'):
        raise ValueError(
            "invalid mode: %r (expected 'training', 'validation' or "
            "'inference')" % (mode,))

    training = mode == 'training'

    resnet = Model(resnetlist=resnet50, version=1)
    # ``layer`` holds c2..c5; ``resolution`` is documented as the matching
    # list of downscale factors [8, 16, 32, 64].
    layer, resolution = resnet(inputs=input_image, training=training)

    # FPN top-down pathway: upsample the coarser level and add the lateral
    # 1x1 projection of the backbone level.
    P5 = conv2d(inputs=layer[3], out_channal=256, kernel_size=1, strides=1,
                name='fpn_c5p5')
    P4 = tf.add_n([
        conv2d_transpose(inputs=P5, out_channal=256, kernel_size=1, strides=2,
                         name="fpn_trans4"),
        conv2d(inputs=layer[2], out_channal=256, kernel_size=1, strides=1,
               name="fpn_c4p4"),
    ], name="fpn_p4add")
    P3 = tf.add_n([
        conv2d_transpose(inputs=P4, out_channal=256, kernel_size=1, strides=2,
                         name="fpn_trans3"),
        conv2d(inputs=layer[1], out_channal=256, kernel_size=1, strides=1,
               name="fpn_c3p3"),
    ], name="fpn_p3add")
    P2 = tf.add_n([
        conv2d_transpose(inputs=P3, out_channal=256, kernel_size=1, strides=2,
                         name="fpn_trans2"),
        conv2d(inputs=layer[0], out_channal=256, kernel_size=1, strides=1,
               name="fpn_c2p2"),
    ], name="fpn_p2add")

    # Final convolution per level, without a non-linearity.
    p2 = conv2d(P2, 256, 3, 1, name="fpn_p2")
    p3 = conv2d(P3, 256, 3, 1, name="fpn_p3")
    p4 = conv2d(P4, 256, 3, 1, name="fpn_p4")
    p5 = conv2d(P5, 256, 3, 1, name="fpn_p5")
    # p6 is only used by the RPN, not by the proposal heads.
    p6 = tf.layers.max_pooling2d(inputs=p5, pool_size=1, strides=2,
                                 name='feature_map6')
    resolution6 = resolution[-1] * 2
    resolution.append(resolution6)

    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    # One [rpn_binary_logits, rpn_probs, rpn_bbox] triple per pyramid level.
    # NOTE(review): confirm that rpn_graph accepts the ``name`` keyword.
    layer_output = [
        rpn_graph(p, config.anchor_per_location, anchor_stride=1, name=str(i))
        for i, p in enumerate(rpn_feature_maps)
    ]

    # Regroup [[a1, b1, c1], [a2, b2, c2]] -> [[a1, a2], [b1, b2], [c1, c2]]
    # and concatenate each group along the anchor axis, lowest level first.
    output_name = ['rpn_binary_logits', 'rpn_binary_probs', 'rpn_bbox']
    outputs = list(zip(*layer_output))
    outputs = [
        tf.concat(list(o), axis=1, name=n)
        for o, n in zip(outputs, output_name)
    ]
    rpn_binary_logits, rpn_binary_probs, rpn_bbox_pred = outputs

    # Number of proposals kept after NMS.
    num_proposal = (config.POST_NMS_ROIS_TRAINING
                    if mode == 'training'
                    else config.POST_NMS_ROIS_INFERENCE)

    if self.anchors is None:
        self.get_anchors(config.batch_size, resolution, config.input_shape,
                         config.smallest_anchor_size)

    # Proposals after non-maximum suppression; documented as [count, 4].
    proposal = proposalLayer(
        inputs=[rpn_binary_probs, rpn_bbox_pred, self.anchors],
        max_proposal=num_proposal,
        nms_thresh=config.RPN_NMS_THRESHOLD,
        name="ROI")

    if mode == 'inference':
        # NOTE(review): confirm that fpn_classifier_graph accepts ``name``.
        _, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
            proposal, mrcnn_feature_maps, config.IMAGE_SHAPE,
            config.POOL_SIZE, config.NUM_CLASSES, name="mrcnn")
        # Final boxes [x1, y1, x2, y2] with their class ids and probabilities.
        boxes, ids, probs = detectionLayer(proposal, mrcnn_class_probs,
                                           mrcnn_bbox, config.IMAGE_SHAPE)
        mask = build_fpn_mask_graph(tf.expand_dims(boxes, 0),
                                    mrcnn_feature_maps,
                                    config.IMAGE_SHAPE,
                                    config.MASK_POOL_SIZE,
                                    config.NUM_CLASSES,
                                    train_bn=training,
                                    name="mrcnn_mask")
        mask = filter_mask(mask, ids)
        mask = tf.nn.sigmoid(mask)
        return [boxes, ids, probs, mask]

    # Training / validation: turn proposals into ROIs with class, box and
    # mask targets. Only the first batch element is used; the original docs
    # give the results as [N, 4]; [N]; [N, 4]; [N, height, width] float32.
    rois, target_class_ids, target_bbox, target_mask = detection_targets(
        proposal,
        gt_class_ids=class_ids[0],
        gt_boxes=gt_boxes[0],
        gt_masks=input_gt_mask[0])

    # NOTE(review): confirm that fpn_classifier_graph accepts ``name``.
    mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
        rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE,
        config.NUM_CLASSES, name="mrcnn")

    # Documented as [num_boxes, 28, 28, num_classes].
    mrcnn_mask_logits = build_fpn_mask_graph(rois,
                                             mrcnn_feature_maps,
                                             config.IMAGE_SHAPE,
                                             config.MASK_POOL_SIZE,
                                             config.NUM_CLASSES,
                                             train_bn=training,
                                             name="mrcnn_mask")

    # RPN losses.
    rpn_binary_loss = rpn_binary_loss_graph(anchor_labels, rpn_binary_logits)
    rpn_bbox_loss = rpn_bbox_loss_graph(anchor_deltas, rpn_bbox_pred,
                                        anchor_labels)

    # Proposal (head) losses; the second return value is not needed here.
    proposal_class_loss, _ = proposal_class_loss_graph(
        target_class_ids, mrcnn_class_logits, config.NUM_CLASSES)
    proposal_bbox_loss = proposal_bbox_loss_graph(
        target_bbox, mrcnn_bbox, target_class_ids)
    mask_loss = mask_loss_graph(target_mask, mrcnn_mask_logits,
                                target_class_ids, config.NUM_CLASSES)

    rpn_loss = rpn_binary_loss + rpn_bbox_loss
    proposal_loss = proposal_class_loss + proposal_bbox_loss
    total_loss = rpn_loss + proposal_loss + mask_loss

    return [rpn_loss, proposal_loss, mask_loss, total_loss]
def build_model(mode, input_image, gt_boxes=None, class_ids=None, input_gt_mask=None, rpn_binary_gt=None, rpn_bbox_gt=None, anchors=None):
    """Build the full detection graph for training, validation or inference.

    Pipeline: ResNet backbone -> FPN (p2..p5, plus p6 for the RPN only) ->
    RPN on every level -> proposal layer -> either the detection/mask heads
    (inference) or per-image target generation plus the loss terms
    (otherwise).

    Notes carried over from the original documentation: the five feature
    levels p2..p6 are described as downscaled by 8, 16, 32, 64 and 128
    relative to the input; each position yields k^2 anchors and the anchors of
    all levels are concatenated into [batch, num_anchor, (x1, y1, x2, y2)];
    coordinates are normalized to [0, 1]. For 'training' and 'validation'
    every argument must be provided; for 'inference' only ``input_image`` is
    documented as required.
    NOTE(review): ``anchors`` is passed to the proposal layer in every mode,
    so inference presumably needs it as well -- confirm with callers.

    :param mode: one of 'training', 'validation', 'inference'
    :param input_image: documented as [1, height, width, 3]
    :param gt_boxes: documented as [1, count, 4]
    :param class_ids: documented as [1, count]
    :param input_gt_mask: documented as [1, height, width]
    :param rpn_binary_gt: documented as [1, None, 1]; anchor labels where 0 is
        background, 1 is an instance and -1 is ignored
    :param rpn_bbox_gt: documented as [1, None, 4]; anchor regression targets
    :param anchors: documented as [1, num_anchor, (x1, y1, x2, y2)]
    :return: ``[boxes, ids, probs, mask]`` in inference mode, otherwise
        ``[rpn_loss, proposal_loss, mask_loss, total_loss]``
    :raises ValueError: if ``mode`` is not one of the three supported values
    """
    # TODO (original author): the order produced at the input layer follows
    # feat_strides in the config, i.e. 128, 64, 32, 16, 8.

    # Validate eagerly in Python: ``mode`` is a plain string, so a graph-level
    # tf.Assert attached to no op would never be evaluated.
    if mode not in ('training', 'validation', 'inference'):
        raise ValueError(
            "invalid mode: %r (expected 'training', 'validation' or "
            "'inference')" % (mode,))

    batch_size = input_image.shape[0]
    training = mode == 'training'

    resnet = Model(resnetlist=resnet50, version=2)
    # ``layer`` holds c2..c5; the second return value is not used here.
    layer, _ = resnet(inputs=input_image, training=training)

    # FPN top-down pathway: upsample the coarser level and add the lateral
    # 1x1 projection of the backbone level. Each transposed convolution gets
    # its own name; reusing "fpn_trans4" for all three would make the layers
    # collide.
    P5 = conv2d(inputs=layer[3], out_channal=256, kernel_size=1, strides=1,
                name='fpn_c5p5')
    P4 = tf.add_n([
        conv2d_transpose(inputs=P5, out_channal=256, kernel_size=1, strides=2,
                         name="fpn_trans4"),
        conv2d(inputs=layer[2], out_channal=256, kernel_size=1, strides=1,
               name="fpn_c4p4"),
    ], name="fpn_p4add")
    P3 = tf.add_n([
        conv2d_transpose(inputs=P4, out_channal=256, kernel_size=1, strides=2,
                         name="fpn_trans3"),
        conv2d(inputs=layer[1], out_channal=256, kernel_size=1, strides=1,
               name="fpn_c3p3"),
    ], name="fpn_p3add")
    P2 = tf.add_n([
        conv2d_transpose(inputs=P3, out_channal=256, kernel_size=1, strides=2,
                         name="fpn_trans2"),
        conv2d(inputs=layer[0], out_channal=256, kernel_size=1, strides=1,
               name="fpn_c2p2"),
    ], name="fpn_p2add")

    # Final convolution per level, without a non-linearity.
    p2 = conv2d(P2, 256, 3, 1, name="fpn_p2")
    p3 = conv2d(P3, 256, 3, 1, name="fpn_p3")
    p4 = conv2d(P4, 256, 3, 1, name="fpn_p4")
    p5 = conv2d(P5, 256, 3, 1, name="fpn_p5")
    # p6 is only used by the RPN, not for classification.
    p6 = tf.layers.max_pooling2d(inputs=p5, pool_size=1, strides=2,
                                 name='feature_map6')

    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    # One [rpn_binary_logits, rpn_probs, rpn_bbox] triple per pyramid level.
    layer_output = [
        rpn_graph(p, config.anchor_per_location, anchor_stride=1)
        for p in rpn_feature_maps
    ]

    # Regroup [[a1, b1, c1], [a2, b2, c2]] -> [[a1, a2], [b1, b2], [c1, c2]]
    # and concatenate each group along the anchor axis.
    output_name = ['rpn_binary_logits', 'rpn_binary_probs', 'rpn_bbox']
    outputs = list(zip(*layer_output))
    outputs = [
        tf.concat(list(o), axis=1, name=n)
        for o, n in zip(outputs, output_name)
    ]
    rpn_binary_logits, rpn_binary_probs, rpn_bbox_pred = outputs

    # Number of proposals kept after NMS.
    num_proposal = (config.POST_NMS_ROIS_TRAINING
                    if mode == 'training'
                    else config.POST_NMS_ROIS_INFERENCE)

    # Proposals after non-maximum suppression; documented as [batch, count, 4].
    proposal = proposalLayer(
        inputs=[rpn_binary_probs, rpn_bbox_pred, anchors],
        max_proposal=num_proposal,
        nms_thresh=config.RPN_NMS_THRESHOLD,
        name="ROI")

    if mode == 'inference':
        _, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
            proposal, mrcnn_feature_maps, config.IMAGE_SHAPE,
            config.POOL_SIZE, config.NUM_CLASSES)
        # Final boxes [x1, y1, x2, y2] with their class ids and probabilities.
        boxes, ids, probs = detectionLayer(proposal, mrcnn_class_probs,
                                           mrcnn_bbox, config.IMAGE_SHAPE)
        # Use the same layer name as the training branch so both graphs refer
        # to the same mask-head variables.
        mask = build_fpn_mask_graph(tf.expand_dims(boxes, 0),
                                    mrcnn_feature_maps,
                                    config.IMAGE_SHAPE,
                                    config.MASK_POOL_SIZE,
                                    config.NUM_CLASSES,
                                    train_bn=training,
                                    name="mrcnn_mask_logits")
        mask = filter_mask(mask, ids)
        mask = tf.nn.sigmoid(mask)
        return [boxes, ids, probs, mask]

    # Training / validation: generate ROI targets image by image, because the
    # target computation is not batched. Per image the original docs give
    # [N, (x1, y1, x2, y2)]; [N]; [N, 4]; [N, height, width].
    rois_list = []
    target_class_ids_list = []
    target_bbox_list = []
    target_mask_list = []
    for i in range(batch_size):
        rois, target_class_ids, target_bbox, target_mask = detection_targets(
            proposal[i],
            gt_class_ids=class_ids[i],
            gt_boxes=gt_boxes[i],
            gt_masks=input_gt_mask[i])
        rois_list.append(rois)
        target_bbox_list.append(target_bbox)
        target_class_ids_list.append(target_class_ids)
        target_mask_list.append(target_mask)

    # Stack the per-image results along a new leading batch axis.
    rois = tf.convert_to_tensor(rois_list)
    target_bbox = tf.convert_to_tensor(target_bbox_list)
    target_class_ids = tf.convert_to_tensor(target_class_ids_list)
    target_mask = tf.convert_to_tensor(target_mask_list)

    mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
        rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE,
        config.NUM_CLASSES)

    # Documented as [num_boxes, 28, 28, num_classes].
    mrcnn_mask_logits = build_fpn_mask_graph(rois,
                                             mrcnn_feature_maps,
                                             config.IMAGE_SHAPE,
                                             config.MASK_POOL_SIZE,
                                             config.NUM_CLASSES,
                                             train_bn=training,
                                             name="mrcnn_mask_logits")

    # RPN losses.
    rpn_binary_loss = rpn_binary_loss_graph(rpn_binary_gt, rpn_binary_logits)
    rpn_bbox_loss = rpn_bbox_loss_graph(rpn_bbox_gt, rpn_bbox_pred,
                                        rpn_binary_gt)

    # Proposal (head) losses.
    proposal_class_loss = proposal_class_loss_graph(
        target_class_ids, mrcnn_class_logits, config.NUM_CLASSES)
    proposal_bbox_loss = proposal_bbox_loss_graph(target_bbox, mrcnn_bbox,
                                                  target_class_ids)
    mask_loss = mask_loss_graph(target_mask, mrcnn_mask_logits,
                                target_class_ids, config.NUM_CLASSES)

    rpn_loss = rpn_binary_loss + rpn_bbox_loss
    proposal_loss = proposal_class_loss + proposal_bbox_loss
    total_loss = rpn_loss + proposal_loss + mask_loss

    return [rpn_loss, proposal_loss, mask_loss, total_loss]