/
model.py
779 lines (652 loc) · 38.5 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
from resnet.resnet50 import conv2d, Model, dense, batch_norm, conv2d_transpose
import tensorflow as tf
import config
resnet50 = [[3, 256], [4, 512], [6, 1024], [3, 2048]]
resnet101 = [[3, 256], [4, 512], [23, 1024], [3, 2048]]
resnet152 = [[3, 256], [8, 512], [36, 1024], [3, 2048]]
def rpn_graph(feature_map, anchor_per_location,anchor_stride):
"""
根据特征图建立RPN网络的计算图,对应网络的输出
:param feature_map: 特征图,形状为[批,高,宽,通道数]
:param anchor_per_location: int,每个像素点产生多少个anchor
:param anchor_stride: 一般取1,表示特征图上,每个点都产生anchor
:return: 一个列表,有三个元素,依次是anchor的logits,probs, bbox回归
"""
batch_size, height, width, channal = feature_map.shape
num_anchor = height*width*anchor_per_location // anchor_stride # 这张特征图一共可以产生num_anchor个anchor
shared = conv2d(feature_map, 512, 3, anchor_stride, name='rpn_conv_shared')
shared = tf.nn.relu(shared)
# out_channal=2 * anchor_per_location,区分是或者不是物体
x = conv2d(inputs=shared,out_channal=2 * anchor_per_location,kernel_size=1,strides=1, name='rpn_class_raw')
# 把形状调整为[批数,anchor数,2], 2 表示object/non-object
# rpn_binary_logits = tf.reshape(x, shape=[batch_size, num_anchor, 2])
rpn_binary_logits = tf.reshape(x, [batch_size, -1, 2])
rpn_probs = tf.nn.softmax(rpn_binary_logits)
x = conv2d(inputs=shared, out_channal=4*anchor_per_location, kernel_size=1,strides=1, name='rpn_bbox_pred')
# 坐标回归
rpn_bbox = tf.reshape(x, [batch_size, -1, 4])
return [rpn_binary_logits, rpn_probs, rpn_bbox]
def apply_box_deltas(boxes, delta):
"""
对boxes进行修正处理
x坐标的回归 = (gt的x - anchor的x) / anchor的宽
高度的回归 = log(gt的高/anchor的高)
:param boxes: [..,anchor数,4], (x1, y1, x2, y2)
:param delta: [..,anchor数,4], (dx, dy, log(dh), log(dw))
:return:
"""
shape = tf.shape(boxes)
with tf.control_dependencies([tf.Assert(shape == tf.shape(delta), data=["shape must be same", shape, tf.shape(delta)])]):
boxes_reshaped = tf.reshape(boxes, [-1, 4]) # [num_box, 4]
delta_reshaped = tf.reshape(delta, [-1, 4]) # [num_box, 4]
height = boxes_reshaped[:, 3] - boxes_reshaped[:, 1]
width = boxes_reshaped[:, 2] - boxes_reshaped[:, 0]
center_x = (boxes_reshaped[:, 0] +boxes_reshaped[:, 2]) / 2
center_y = (boxes_reshaped[:, 3] + boxes_reshaped[:, 1]) / 2
# 修正后的中心点xy坐标
pred_x = delta_reshaped[:,0] * width + center_x
pred_y = delta_reshaped[:,1] * height + center_y
# 修正后的高度和宽度
pred_width = tf.exp(delta_reshaped[:,3]) * width
pred_height = tf.exp(delta_reshaped[:,2]) * height
x1 = pred_x - pred_width/2
y1 = pred_y - pred_height/2
x2 = pred_x + pred_width/2
y2 = pred_y + pred_height/2
# 拼接起来返回
result = tf.stack([x1, y1, x2, y2], axis=1)
return tf.reshape(result,shape)
def nms(boxes, scores, max_count, thresh):
"""
:param boxes: [批数,个数,4]
:param scores: [批数, 个数]
:param max_count: 最多输出的盒子个数
:param thresh: iou阈值
:return: [批数,个数,4]
"""
out = []
batch_size = boxes.shape[0]
for i in range(batch_size):
selected_indices = tf.image.non_max_suppression(boxes[i], scores[i], max_count,thresh)
out.append(tf.gather(boxes[i], selected_indices))
return tf.stack(out, axis=0)
def overlaps_graph(boxes1, boxes2):
"""
:param boxes1: [M, (x1, y1, x2, y2)]
:param boxes2: [N, (x1, y1, x2, y2)]
:return: [M, N]的iou矩阵
"""
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
[1, 1, tf.shape(boxes2)[0]]), [-1, 4])
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
# 计算交集
b1_x1, b1_y1, b1_x2, b1_y2 = tf.split(b1, 4, axis=1)
b2_x1, b2_y1, b2_x2, b2_y2 = tf.split(b2, 4, axis=1)
y1 = tf.maximum(b1_y1, b2_y1)
x1 = tf.maximum(b1_x1, b2_x1)
y2 = tf.minimum(b1_y2, b2_y2)
x2 = tf.minimum(b1_x2, b2_x2)
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
# 计算并集
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
union = b1_area + b2_area - intersection
# 计算iou
iou = intersection / union
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
return overlaps
def detection_targets(proposals, gt_class_ids, gt_boxes, gt_masks):
"""
:param proposals: 形状是[N, (x1, y1, x2, y2)] ,归一化坐标
:param gt_class_ids: 形状是 [MAX_GT_INSTANCES],即,其长度等于每张图片的实例个数
:param gt_boxes: [MAX_GT_INSTANCES, (x1, y1, x2, y2)]
:param gt_masks: 形状是[ MAX_GT_INSTANCES,高, 宽], bool值,每个instance都要有一个mask,与gt_boxes一一对应
:return proposals, [N, (x1, y1, x2, y2)]; roi_gt_class_ids, [N]; gt_deltas, [N, 4]; masks,[N, 高,宽]
上面四个返回值的shape[0]都是 config.TRAIN_ROIS_PER_IMAGE = M,即每张图片的训练proposal个数,
roi_gt_class_ids [M], 反映proposal的分类
gt_deltas [M, (dx,dy, log(h), log(w))] 反映proposal相对于gt的回归
masks [M, 高,宽]
"""
# 保证至少有一个proposal
asserts = [tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals])]
with tf.control_dependencies(asserts):
proposals = tf.identity(proposals)
# 在coco数据里面,可能一个bounding box框住了好多个实例,我们就把用他的标签置为-1,不予训练
crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
crowd_boxes = tf.gather(gt_boxes, crowd_ix)
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
# 把只有一个实例的框框、对应的id、mask选出来
gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
gt_class_ids = tf.gather(gt_class_ids ,non_crowd_ix)
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)
# 计算proposals和gt_boxes之间的iou。返回的shape是[proposals.shape[0], gt_boxes.shape[0]]
overlaps = overlaps_graph(proposals, gt_boxes)
roi_iou_max = tf.reduce_max(overlaps, axis=1)
positive_indices = tf.where(roi_iou_max >= config.posi_anchor_thresh)[:, 0]
# 对于负例,是与GT的iou小于阈值,并且与多实例框框的阈值小于0.001
crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
negative_indices = tf.where(tf.logical_and(roi_iou_max < config.neg_anchor_thresh, crowd_iou_max < 0.001))[:, 0]
# 定义正负例的个数
positive_count = tf.minimum(tf.cast(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO, tf.int32),
tf.size(positive_indices))
r = 1.0/config.ROI_POSITIVE_RATIO
negative_count = tf.cast((r-1.0)*tf.cast(positive_count,tf.float32), tf.int32)
negative_count = tf.minimum(negative_count, tf.size(negative_indices))
# 随意选择正负样本
positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
# 取出正负例的proposal,只选择这些去训练
positive_rois = tf.gather(proposals, positive_indices)
negative_rois = tf.gather(proposals, negative_indices)
# 取出 proposal对应的gt_boxes和相应的class_ids,mask
positive_overlaps = tf.gather(overlaps, positive_indices)
gt_indices = tf.argmax(positive_overlaps, axis=1) # 其长度等于正例个数
roi_gt_boxes = tf.gather(gt_boxes, gt_indices)
roi_class_ids = tf.gather(gt_class_ids, gt_indices)
roi_mask = tf.gather(gt_masks, gt_indices, axis=0)
gt_deltas = box_deltas(positive_rois, roi_gt_boxes)
gt_deltas /= config.RPN_BBOX_STD_DEV
# 把mask调整为[N, 高,宽, 1]
roi_mask = tf.expand_dims(roi_mask, -1)
# 计算目标masks
boxes = positive_rois
if config.USE_MINI_MASK:
# 把roi规范化image坐标转换为规范化mini-mask坐标、
# TODO 有待考察
x1, y1, x2, y2 = tf.split(positive_rois, num_or_size_splits=4, axis=1)
gt_x1, gt_y1, gt_x2, gt_y2 = tf.split(roi_gt_boxes, num_or_size_splits=4, axis=1)
gt_h = gt_y2 - gt_y1
gt_w = gt_x2 - gt_x1
y1 = (y1 - gt_y1) / gt_h
x1 = (x1 - gt_x1) / gt_w
y2 = (y2 - gt_y1) / gt_h
x2 = (x2 - gt_x1) / gt_w
boxes = tf.concat([x1, y1, x2, y2], 1)
# 在gt的proposal的地方裁剪mask,然后resize成[28, 28]的大小
# 蛋疼的是,在tensorflow中,图片左上角为原点,水平向右是x轴,竖直向下是y轴
# 对于这个api,boxes必须是规范化坐标,且为[y1, x1, y2, x2],就是说,第一个参数衡量竖直向下的偏移量
# 所以就有了下面这两句话
x1, y1, x2, y2 = tf.split(boxes, num_or_size_splits=4, axis=1)
boxes = tf.concat([y1,x1,y2,x2], axis=1)
# 把proposal地方的mask截取出来
masks = tf.image.crop_and_resize(
image=tf.cast(roi_mask, tf.float32), boxes=boxes,
box_ind=tf.range(0, tf.shape(roi_mask)[0]), crop_size=config.MASK_SHAPE)
# 把mask的shape还原为[N, 高,宽]
masks = tf.squeeze(masks, axis=3)
# 在resize过程中,可能导致非0和1的float数出现,这里二值化为0和1
masks = tf.round(masks)
# 把正例roi和负例roi拼接起来,并对多余的进行零填充
proposals = tf.concat([positive_rois, negative_rois], axis=0)
a = tf.maximum(config.TRAIN_ROIS_PER_IMAGE-tf.shape(proposals)[0], 0) # 不够凑足一个TRAIN_ROIS_PER_IMAGE的
b = tf.shape(negative_rois)[0] # 负的
proposals = tf.pad(proposals, [(0, a), (0, 0)])
roi_gt_class_ids = tf.pad(roi_class_ids, [(0, b)]) #负例的pad为0
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, a)], constant_values=-1) # 超出的pad为-1
gt_deltas = tf.pad(gt_deltas, [(0, a+b), (0, 0)])
masks = tf.pad(masks, [(0, a+b), (0, 0), (0, 0)])
return proposals, roi_gt_class_ids, gt_deltas, masks
def box_deltas(box, gt_box):
"""
计算box与gt_box之间的框框偏差
y的回归 = (GT的y-anchor的y)/anchor的高
高的回归 = log(GT的高 / anchor的高)
:param box: [N, (x1, y1, x2, y2)]
:param gt_box: [N, (x1, y1, x2, y2)]
:return: [dx, dy, dh, dw]
"""
asserts = [tf.Assert(tf.equal(tf.shape(box)[0], tf.shape(gt_box)[0]), ["The length of box and gt_box must be same"])]
with tf.control_dependencies(asserts):
box = tf.identity(box)
box = tf.cast(box, tf.float32)
gt_box = tf.cast(gt_box, tf.float32)
width = box[:, 2] - box[:, 0]
height = box[:, 3] - box[:, 1]
center_x = (box[:, 0] + box[:, 2])/2
center_y = (box[:, 1] + box[:, 3])/2
gt_width = gt_box[:, 2] - gt_box[:, 0]
gt_height = gt_box[:, 3] - gt_box[:, 1]
gt_center_x = (gt_box[:, 0] + gt_box[:, 2])/2
gt_center_y = (gt_box[:, 1] + gt_box[:, 3])/2
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = tf.log(gt_height / height)
dw = tf.log(gt_width / width)
result = tf.stack([dx, dy, dh, dw], axis=1)
return result
def proposalLayer(inputs, max_proposal,nms_thresh, name=None):
"""
该函数根据特征输出,来制定proposal,这些proposal已经进行了非极大值抑制
:param inputs: 包含三个元素的列表, 依次是rpn_binary_class, rpn_bbox, anchors
:param max_proposal:
:param nms_thresh:
:return:经过非极大值抑制后的proposal,shape是[批数,proposal个数,(x1, y1, x2, y2)],正则化坐标
"""
# rpn_binary_class的形状是[批数,anchor数,2],只取正例的分数
scores = inputs[0][:, :, 1]
# rpn_bbox的形状是[批数,anchor数,4]
deltas = inputs[1]
deltas = deltas * tf.reshape(config.RPN_BBOX_STD_DEV, [1, 1, 4])
anchors = inputs[2] # 取出anchors
# 取anchor数和6000的较小值,只保留这些proposal
pre_nms_limit = tf.minimum(6000, tf.shape(anchors)[1])
# 按照最后一个维度,取出分数最高的前k个的索引,以列表形式返回,这里,ix是一个二维列表
ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True).indices # (批数,选中的编号)
# scores,deltas和anchors的rank都是3, 需要定义一个函数,来对anchor进行选取
# 虽然这种写法很蛋疼,但暂时也找不到更好的写法,待研究
def unmatch_dim_gather(values, indice):
batch_size = values.shape[0] # 批数
outputs = []
for i in range(batch_size):
out = tf.gather(values[i], indice[i])
# if not isinstance(out, (tuple, list)):
# out = [out]
outputs.append(out)
outputs = tf.convert_to_tensor(outputs)
return outputs
scores = tf.expand_dims(scores, dim=-1)
scores = unmatch_dim_gather(scores, ix) # (批数,anchor数,1)
scores = tf.squeeze(scores, -1)
deltas = unmatch_dim_gather(deltas, ix) # (批数,anchor数,4)
pre_nms_anchors = unmatch_dim_gather(anchors, ix) # (批数,anchor数,4)
# 修正坐标
boxes = apply_box_deltas(pre_nms_anchors, deltas)
# 我们使用正则化坐标,所有框框的坐标值都在[0, 1]之间
boxes = tf.clip_by_value(boxes, 0, 1)
boxes = nms(boxes, scores, max_proposal, nms_thresh)
return boxes
def fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_shape, pool_size, num_classes):
"""
构建FPN的分类与回归
:param rois: Proposals, [batch, num_rois, (x1, y1, x2, y2)]
:param mrcnn_feature_maps: 一个列表,[p2, p3, p4, p5] 代表四个层级的特征图
其相对于输入图片的缩放倍数依次是8, 16, 32, 64
:param input_image_shape: 原始输入图片的shape,[高,宽,通道数]。一个批次的所有图片,必须有相同的shape
:param pool_size: ROI Pooling后的大小,一般是7*7
:param num_classes: 分类数,他决定了最终的通道数,因为我们用global avarage pool
:return:
"""
# [num_boxes, height, width, channels], ROI Pooling后的结果
x = pyramidROIAlign(pool_size, rois, input_image_shape, mrcnn_feature_maps)
num_boxes = tf.shape(x)[0]
# TODO 到这里,彻底放弃了批数大于1的情形!!
# 这里其实就是全连接,并且批数由num_boxes代替了
x = conv2d(inputs=x,out_channal=1024, kernel_size=pool_size[0],strides=pool_size[0], use_bias=True, name="mrcnn_class_conv1")
# 这里是全连接,就不来batch_norm了
x = tf.nn.relu(x)
x = conv2d(inputs=x, out_channal=1024, kernel_size=1, strides=1, use_bias=True, name="mrcnn_class_conv2")
# 这时候,x的shape是[num_boxex, 1, 1, 1024], 调用下面这句话以后,变成了[num_box, 1024]
shared = tf.squeeze(tf.nn.relu(x),axis=[1, 2])
# 下面分为两个head,一个用于回归,一个用于分类
mrcnn_class_logits = dense(inputs=shared,out_dimension=num_classes,use_biase=True,name="mrcnn_class_logits")
mrcnn_class_probs = tf.nn.softmax(mrcnn_class_logits,name="mrcnn_class_probs")
mrcnn_bbox = dense(inputs=shared, out_dimension=4*num_classes, use_biase=True)
mrcnn_bbox = tf.reshape(mrcnn_bbox, shape=[num_boxes, num_classes, 4], name="mrcnn_class_bbox")
# mrcnn_class_logits, mrcnn_class_probs的shape都是[num_boxex, num_classes]
# mrcnn_bbox 的shape是[num_boxex, num_classes, (dx, dy, log(h), log(w))]
return mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox
def pyramidROIAlign(pool_size, rois, image_shape, feature_maps):
"""
在不同层次的feature map上执行 ROI Pooling
:param pool_size: 特征图进行feature pool以后的网格,[高,宽],通常是[7,7]
:param rois: [batch, num_boxex, (x1, y1, x2, y2)],归一化坐标,如果不够,就用零填充
:param image_shape: 原始输入图片的shape,[高,宽,通道数]。一个批次的所有图片,必须有相同的shape
:param feature_maps: 一个列表,列表每个元素表示来自不同特征图层级的,shape都是[batch, height, width, channels]
:return: [num_boxes, height, width, channels], ROI Pooling后的结果
"""
x1, y1, x2, y2 = tf.split(rois, 4, axis=2) # shape [batch, num, 4]
h = y2 - y1 # shape [batch, num, 1]
w = x2 - x1
image_area = tf.cast(image_shape[0]*image_shape[1], tf.float32)
# 下面这几行,根据FPN的式(1)而来。我们的proposal明明有指定的来源,即每个proposal来自哪个层级,是清晰的,
# 奈何这里又要根据公式(1)指定层级呢?
# TODO 这里有待研究,论文写得蛋疼
roi_level = tf.log(h*w*image_area/(224.0*224.0))/tf.log(2.0)/2 # shape [batch, num, 1]
roi_level = tf.minimum(5, tf.maximum(2, 4+ tf.cast(tf.round(roi_level), tf.int32)))
roi_level = tf.squeeze(roi_level, 2) # shape [batch, num]
# 对每一个层级进行遍历
pooled, box_to_level = [], []
for i, level in enumerate(range(2, 6)):
# tf.where()的返回值中,有几个真值就返回几行,每一行代表这个真值的坐标值
# 这里,roi_level的rank是2,故,ix的每一行有两个元素,代表了该真值所在坐标位置,坐标中,
# 第一个表示来自哪一批,第二个表示来自该批的第几个
ix = tf.where(tf.equal(roi_level, level))
level_boxes = tf.gather_nd(rois, ix)
batch_indices = tf.cast(ix[:, 0], tf.int32) # 反映批数信息,为下面切片做准备
box_to_level.append(ix) # 记录该层级所拥有的proposal坐标
# 用approximate joint training的法则,把proposal当作常数处理,对梯度不贡献
level_boxes = tf.stop_gradient(level_boxes)
batch_indices = tf.stop_gradient(batch_indices)
with tf.control_dependencies([tf.assert_rank(
level_boxes, 2, data=["may be some wrong in how to use api tf.gather_nd"])]):
x1, y1, x2, y2 = tf.split(level_boxes, 4, 1)
temp = tf.concat([y1, x1, y2, x2], axis=1)
# temp = tf.stack()
# roi是有零填充的,零填充以后,所截取的feature map,还怎么放大到7*7???
# tf.image.crop_and_resize返回的shape是[所截取图片张数,高,宽,通道数]
pooled.append(tf.image.crop_and_resize(
feature_maps[i], temp, batch_indices, pool_size, method='bilinear'))
# 合并成tensor,shape是[num_pic, height, width, channels]
pooled = tf.concat(pooled, axis=0)
# 下面这几条蛋疼的代码,是为了将同一批次的组合起来,然后合并成[批数,num_pic, 高,宽,通道数]
# TODO 蛋疼的是,这段代码有错误。正确的做法是,将批次号相同的取出来,再扩展维度,
# 有时间再看吧,实在懒了,就令batch_size = 1 万事大吉
# 在第三列添加一个标记,记录box的id
box_to_level = tf.concat(box_to_level, axis=0) # shape 是[图片张数, 2]
box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1) # [图片张数,1]
box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis=1) # [num_pic, 3]
# 排序,以批次号为主,批次号相同的,再看box index
sorting_tensor = box_to_level[:, 0] * 1000 + box_to_level[:, 1] # [num_pic]
ix = tf.nn.top_k(sorting_tensor, k=tf.shape(box_to_level)[0], sorted=True).indices[::-1]
pooled = tf.gather(pooled, ix)
return pooled
def build_fpn_mask_graph(rois, feature_maps, image_shape, pool_size, num_class, train_bn=True, name=None):
"""
构建mask
:param rois: Proposals, [batch, num_rois, (x1, y1, x2, y2)]
:param feature_maps: 一个列表,[p2, p3, p4, p5] 代表四个层级的特征图
:param image_shape: 原始输入图片的shape,[高,宽,通道数]。一个批次的所有图片,必须有相同的shape
:param pool_size: ROI Pooling后的大小,在论文中,对于mask是14*14
:param num_class: 分类数,他决定了最终的通道数
:param train_bn:
:return: [num_boxes, 28, 28, num_classes]
"""
# [num_boxes, height, width, channels], ROI Pooling后的结果
x = pyramidROIAlign(pool_size, rois,image_shape,feature_maps)
for _ in range(4):
x = conv2d(inputs=x, out_channal=256, kernel_size=3, strides=1, use_bias=False)
x = batch_norm(x, train_bn)
x = tf.nn.relu(x)
x = conv2d_transpose(inputs=x, out_channal=256, kernel_size=3, strides=2)
x = tf.nn.relu(x)
x = conv2d(x, num_class, 1, 1, use_bias=True, name=name)
return x
def smooth_l1_loss(x):
x = tf.abs(x)
less_than_one = tf.cast(tf.less(x, 1.0), tf.float32)
return less_than_one * x*x/2 + (1-less_than_one)*(x - 0.5)
def rpn_binary_loss_graph(rpn_binary_gt, rpn_binary_logits):
"""
:param rpn_binary_gt: [批数, 个数, 1], 1表示正例,0表示负例,-1则不予考虑
:param rpn_binary_logits: [批数,anchors数,2],最后一个维度里面,分别是负例、正例
:return: 交叉熵
"""
print('==================',rpn_binary_logits.shape)
rpn_binary_gt = tf.reshape(rpn_binary_gt,shape=[-1])
rpn_binary_gt = tf.cast(rpn_binary_gt, tf.int32)
rpn_binary_logits = tf.reshape(rpn_binary_logits, shape=[-1, 2])
with tf.control_dependencies([tf.Assert(
tf.shape(rpn_binary_logits)[0] == tf.shape(rpn_binary_gt)[0], data=["the length must be same", tf.shape(rpn_binary_logits),tf.shape(rpn_binary_gt)])]):
# 取出有效的索引
ix = tf.where(tf.not_equal(rpn_binary_gt, -1))
if tf.size(ix) == 0:
return tf.constant(0.0)
rpn_binary_gt = tf.gather_nd(rpn_binary_gt, ix)
rpn_binary_logits = tf.gather_nd(rpn_binary_logits, ix)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=rpn_binary_gt, logits=rpn_binary_logits)
return tf.reduce_mean(loss)
def rpn_bbox_loss_graph(rpn_bbox_gt, rpn_bbox_pred, rpn_binary_gt):
"""
:param rpn_bbox_gt: [批数, anchor个数, (dx, dy, log(h), log(w))]
:param rpn_bbox_pred: [批数,anchors数,4]
:param rpn_binary_gt: [批数, 个数, 1], 1表示正例,0表示负例,-1则不予考虑,我们只考察正例的回归损失
:return:
"""
rpn_binary_gt = tf.cast(tf.reshape(rpn_binary_gt,shape=[-1]), tf.int32)
rpn_bbox_gt = tf.reshape(rpn_bbox_gt, [-1, 4])
rpn_bbox_pred = tf.reshape(rpn_bbox_pred, [-1, 4])
with tf.control_dependencies([tf.Assert(
tf.shape(rpn_bbox_gt) == tf.shape(rpn_bbox_pred), data=["the shape must be same",tf.shape(rpn_bbox_gt),tf.shape(rpn_bbox_pred)])]):
positive_ix = tf.where(tf.equal(rpn_binary_gt, 1))[:, 0]
if tf.size(positive_ix) == 0:
return tf.constant(0.0)
rpn_bbox_gt = tf.gather(rpn_bbox_gt, positive_ix)
rpn_bbox_pred = tf.gather(rpn_bbox_pred, positive_ix)
loss = tf.reduce_sum(smooth_l1_loss(rpn_bbox_gt - rpn_bbox_pred))
return loss/tf.cast(tf.size(positive_ix), tf.float32)
def proposal_bbox_loss_graph(target_bbox,mrcnn_bbox,target_class_ids):
"""
注意,这里的mrcnn_bbox的shape,每个类别都有自己的回归目标
:param target_bbox: [batch, N, 4]
:param mrcnn_bbox: [num_boxex, num_classes, (dx, dy, log(h), log(w))]
:param target_class_ids: [batch, N]
:return:
"""
target_bbox = tf.reshape(target_bbox, [-1, 4]) # [num_box, 4]
target_class_ids = tf.reshape(target_class_ids, [-1]) # [num_box]
target_class_ids = tf.cast(target_class_ids, tf.int32)
with tf.control_dependencies(
[tf.Assert(tf.logical_and(tf.equal(tf.shape(target_bbox)[0], tf.shape(mrcnn_bbox)[0]),
tf.equal(tf.shape(mrcnn_bbox)[0], tf.shape(target_class_ids)[0])),
data=["the shape must be same"] )]):
positive_ix = tf.cast(tf.where(target_class_ids > 0)[:, 0], tf.int32)
if tf.size(positive_ix) == 0:
return tf.constant(0.0)
positive_class_ids = tf.cast(tf.gather(target_class_ids, positive_ix), tf.int32) # 正例的具体id
indice = tf.stack([positive_ix, positive_class_ids], axis=1) # [在num_box中的索引编号,具体的id号]
mrcnn_bbox = tf.gather_nd(mrcnn_bbox, indice) # [N, 4]
target_bbox = tf.gather(target_bbox, positive_ix) # [N, 4]
loss = tf.reduce_sum(smooth_l1_loss(target_bbox - mrcnn_bbox))
return loss / tf.cast(tf.size(positive_ix), tf.float32)
def proposal_class_loss_graph(target_ids, logits, num_class):
"""是有零填充的,"""
target_ids = tf.reshape(target_ids, shape=[-1])
target_ids = tf.cast(target_ids, tf.int32)
logits = tf.reshape(logits, shape = [-1, num_class])
with tf.control_dependencies([tf.Assert(tf.shape(target_ids)[0] == tf.shape(logits)[0], data=["The length must be same",tf.shape(target_ids), tf.shape(logits)])]):
valid_ix = tf.where(tf.not_equal(target_ids, -1))[:, 0]
if tf.size(valid_ix) == 0:
return tf.constant(0.0)
target_ids = tf.gather(target_ids, valid_ix)
logits = tf.gather(logits, valid_ix)
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_ids, logits=logits)
return tf.reduce_mean(loss)
def mask_loss_graph(target_mask, mrcnn_mask_logits, target_class_ids, num_class):
"""
:param target_mask: [batch, N, 高,宽]
:param mrcnn_mask_logits: [num_boxes, 28, 28, num_classes]
:param target_class_ids: [batch, N]
:param num_class: 类别数
:return:
"""
target_class_ids = tf.reshape(target_class_ids, [-1]) # [num_boxes]
target_shape = tf.shape(target_mask)
target_mask = tf.reshape(target_mask, (-1, target_shape[2], target_shape[3])) # [num_boxex, 高,宽]
mrcnn_mask_logits = tf.transpose(mrcnn_mask_logits, [0, 3, 1, 2]) # [num_boxes, num_classes, 28, 28]
positive_ix =tf.cast(tf.where(target_class_ids > 0)[:, 0], tf.int32) # 正例在num_box中的索引号
if tf.size(positive_ix) == 0:
return tf.constant(0.0)
y_true = tf.gather(target_mask, positive_ix) # [num_boxex, 28, 28]
positive_class_ids = tf.cast(tf.gather(target_class_ids, positive_ix), tf.int32) # 正例的具体id
indice = tf.stack([positive_ix, positive_class_ids], axis=1) # [在num_box中的索引号,类别号]
y_pred = tf.gather_nd(mrcnn_mask_logits, indice) # [num_boxex, 28,28]
with tf.control_dependencies([tf.Assert(tf.shape(y_true) == tf.shape(y_pred), data=["the shape must be same",tf.shape(y_true),tf.shape(y_pred)])]):
loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_true, logits=y_pred))
return loss/tf.cast(tf.size(positive_ix), tf.float32)
def detectionLayer(proposal, probs, bbox, image_shape):
"""
一次检测一张图片
:param proposal: 经过非极大值抑制后, [批数,个数,4]
:param probs: [num_boxex, num_classes]
:param bbox: [num_boxex, num_classes, (dx, dy, log(h), log(w))]
:param image_shape: [高,宽,通道数]
:return: 盒子,ids, 概率
"""
with tf.control_dependencies([tf.Assert(tf.shape(proposal)[0] == 1, data=["A single picture for evaluation each time"])]):
proposal = tf.squeeze(proposal, axis=[0,]) # [num_boxes, 4]
class_ids = tf.argmax(probs, axis=1, output_type=tf.int32) # [num_boxes]
indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1) # [序号,类别号]
class_probs = tf.gather_nd(probs, indices) # [num_box]
deltas = tf.gather_nd(bbox, indices) # [num_box, 4]
refined_rois = apply_box_deltas(proposal, deltas*config.RPN_BBOX_STD_DEV) # [num_boxes, 4]
refined_rois = tf.clip_by_value(refined_rois, 0, 1)
keep = tf.where(class_ids > 0)[:, 0] # 取出前景
if config.DETECTION_MIN_CONFIDENCE:
# 如果使用最小confidence,就取交集
conf_keep = tf.where(class_probs >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
tf.expand_dims(conf_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
###########################下面的,全部重新组合排序########################
pre_nms_class_ids = tf.gather(class_ids, keep) # 取出id [num]
pre_nms_scores = tf.gather(class_probs, keep) # 取出相应的概率值 [num]
pre_nms_rois = tf.gather(refined_rois, keep) # 取出相应的框框 [num, 4]
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0] # 能找到的分类数 [unique]
def nms_keep_map(class_id):
"""
只有属于同一类别的,才进行非极大值抑制
:param class_id: 给定的某一类别号
:return:
"""
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
class_keep = tf.image.non_max_suppression(
boxes=tf.gather(pre_nms_rois, ixs),
scores=tf.gather(pre_nms_scores, ixs),
max_output_size=config.DETECTION_MAX_INSTANCE,
iou_threshold=config.DETECTION_NMS_THRESHHOLD)
# class_keep取出的是相对于ixs的序号,tf.gather(ixs, class_keep)就是ixs的数值本身
# ixs的数值本身,就是pre_nms_class_ids的序号
class_keep = tf.gather(ixs, class_keep)
# 用-1填充,使得都有相同的shape
gap = config.DETECTION_MAX_INSTANCE - tf.shape(class_keep)[0]
class_keep = tf.pad(class_keep, [(0, gap)], mode='CONSTANT', constant_values=-1)
# 设置shape,使得map_fn()能够立即知道她的shape
class_keep.set_shape([config.DETECTION_MAX_INSTANCE])
return class_keep
# nms_keep的shape是,[unique_pre_nms_class_ids的长度,config.DETECTION_MAX_INSTANCE]
nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids, dtype=tf.int32)
nms_keep = tf.reshape(nms_keep, [-1])
# keep就是pre_nms_class_ids的序号
keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
#
class_probs2 = tf.gather(pre_nms_scores, keep)
num_keep = tf.minimum(tf.shape(keep)[0], config.DETECTION_MAX_INSTANCE)
top_ids = tf.nn.top_k(class_probs2, num_keep, sorted=True).indices
keep = tf.gather(keep, top_ids) # 至此,keep是分数最大的,不超过实例个数的保留值了
return tf.gather(pre_nms_rois, keep), tf.gather(pre_nms_class_ids, keep), tf.gather(pre_nms_scores, keep)
def filter_mask(mask, ids):
"""
:param mask: float型,[num, 28, 28, num_classes]
:param ids: int,[num]
:return: [num,28, 28],与ids一一对应
"""
num = tf.size(ids)
mask = tf.transpose(mask, [0, 3, 1, 2])
indice = tf.stack([tf.range(num), ids], axis=1)
mask = tf.gather_nd(mask, indice)
return mask
def build_model(mode, input_image, gt_boxes=None, class_ids=None,
input_gt_mask=None, rpn_binary_gt=None, rpn_bbox_gt=None, anchors=None):
# TODO 在输入层产生的顺序是根据config中的feat_strides产生的 即128, 64, 32, 16, 8
"""
feature map有五个层级,p2, p3, p4, p5, p6,其相对于输入图片的缩放倍数依次是8, 16, 32, 64, 128
在特征图上每个像素点的位置都要产生k^2个anchor.假设第s层有N个像素点,则在s层产生的anchor数是N*k^2,
其shape为[N*k^2, (x1, y1, x2, y2)]。有五个层级,则调用tf.cancat函数在第0维拼接起来,形成的shape是
[num_anchor, (x1, y1, x2, y2)]。最后,按照批数拼接起来,最终的shape是[batch, num_anchor, (x1, y1, x2, y2)]
我们采用正则化坐标,故所有的坐标值的范围都必须在区间[0,1]里面
mode: 必须是'training','validation','inference'三者之一。mode是'training' 或者 'validation'时,所有参数都不能是None,
mode是'inference'时,只需要提供input_image即可
:param mode: 必须是'training','validation','inference'三者之一
:param input_image: [1, 高, 宽, 3] # 简单一点,每次一张图片
:param gt_boxes: shape=[1, 个数, 4]
:param class_ids: shape=[1, 个数]
:param input_gt_mask: [1,高,宽]
:param rpn_binary_gt: shape=[1, None, 1] anchor的标签,0表示背景,1表示instance,-1表示不关心
:param rpn_bbox_gt: shape=[1, None, 4], anchor的回归目标值
:param anchors: [1, num_anchor, (x1, y1, x2, y2)]
:return:
"""
mode_validation = mode in ['training','validation','inference']
with tf.control_dependencies([tf.Assert(mode_validation, data=["invalid mode"])]):
batch_size = input_image.shape[0]
resnet = Model(resnetlist=resnet50, version=2)
training = True if mode == 'training' else False
# layer是一个列表,包含c2, c3, c4, c5,其相对于输入图片的缩放比例依次是8,16,32,64
# resolution是输入图片相对于特征图的分辨率倍数,值为64
layer, resolution = resnet(inputs=input_image, training=training)
P5 = conv2d(inputs=layer[3], out_channal=256, kernel_size=1, strides=1, name='fpn_c5p5')
P4 = tf.add_n([conv2d_transpose(inputs=P5, out_channal=256, kernel_size=1, strides=2, name="fpn_trans4"),
conv2d(inputs=layer[2], out_channal=256, kernel_size=1, strides=1, name="fpn_c4p4")], name="fpn_p4add")
P3 = tf.add_n([conv2d_transpose(inputs=P4, out_channal=256, kernel_size=1, strides=2, name="fpn_trans4"),
conv2d(inputs=layer[1], out_channal=256, kernel_size=1, strides=1, name="fpn_c3p3")],name="fpn_p3add")
P2 = tf.add_n([conv2d_transpose(inputs=P3, out_channal=256, kernel_size=1, strides=2, name="fpn_trans4"),
conv2d(inputs=layer[0], out_channal=256, kernel_size=1, strides=1, name="fpn_c2p2")], name="fpn_p2add")
# 根据FPN,最终来一个卷积,得到最后的特征图。没有非线性函数
p2 = conv2d(P2, 256, 3, 1, name="fpn_p2")
p3 = conv2d(P3, 256, 3, 1, name="fpn_p3")
p4 = conv2d(P4, 256, 3, 1, name="fpn_p4")
p5 = conv2d(P5, 256, 3, 1, name="fpn_p5")
# feature map 6 用来做RPN,不用来做分类
p6 = tf.layers.max_pooling2d(inputs=p5, pool_size=1, strides=2, name='feature_map6')
rpn_feature_maps = [p2, p3, p4, p5, p6]
mrcnn_feature_maps = [p2, p3, p4, p5]
# 定义一个列表,其长度为输出特征图的层级数,用于装入每级特征图的rpn输出。
# rpn输出包含[rpn_binary_logits, rpn_probs, rpn_bbox],
# 其shape依次是[批数,每个层级的anchors数,2],[批数,anchors数,2],[批数,anchors数,4]
layer_output = []
for p in rpn_feature_maps:
layer_output.append(rpn_graph(p, config.anchor_per_location, anchor_stride=1))
# 把各层的输出连接起来,[[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
output_name = ['rpn_binary_logits', 'rpn_binary_probs', 'rpn_bbox']
outputs = list(zip(*layer_output))
# 连接以后,anchor数翻了5倍
outputs = [tf.concat(list(o),axis=1,name=n) for o, n in zip(outputs, output_name)]
# [批数,anchors数,2], [批数,anchors数,2], [批数,anchors数,4]
rpn_binary_logits, rpn_binary_probs, rpn_bbox_pred = outputs
# 保留的proposal个数
num_proposal = config.POST_NMS_ROIS_TRAINING if mode == 'training' else config.POST_NMS_ROIS_INFERENCE
# 生成经过非极大值抑制后的proposal, 形状是 [批数,个数,4]
proposal = proposalLayer(inputs=[rpn_binary_probs, rpn_bbox_pred, anchors],
max_proposal=num_proposal,nms_thresh=config.RPN_NMS_THRESHOLD, name="ROI")
if mode == 'inference':
mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
proposal, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE, config.NUM_CLASSES)
# 经过最终处理以后的盒子,[x1, y1, x2, y2], 对应的类别,概率
boxes, ids, probs = detectionLayer(proposal, mrcnn_class_probs, mrcnn_bbox, config.IMAGE_SHAPE)
mask = build_fpn_mask_graph(
tf.expand_dims(boxes, 0), mrcnn_feature_maps,
config.IMAGE_SHAPE, config.MASK_POOL_SIZE, config.NUM_CLASSES, train_bn=training)
mask = filter_mask(mask, ids)
mask = tf.nn.sigmoid(mask)
return [boxes, ids, probs, mask]
else:
# 调用detection_targets函数,返回proposal,以及相应的类别、回归、masks
# 因为批数不好处理,故只能蛋疼地分成一批一批地处理
rois_list, target_class_ids_list, target_bbox_list, target_mask_list = [], [], [], []
for i in range(batch_size):
# roi_gt_class_ids[M], 反映proposal的分类
# gt_deltas[M, (dx, dy, log(h), log(w))]
# 反映proposal相对于gt的回归
# masks[M, 高,宽]
# [N, (x1, y1, x2, y2)]; [N]; [N, 4]; [N, 高,宽]
rois, target_class_ids, target_bbox, target_mask = detection_targets(
proposal[i], gt_class_ids=class_ids[i],gt_boxes=gt_boxes[i],gt_masks=input_gt_mask[i])
rois_list.append(rois)
target_bbox_list.append(target_bbox)
target_class_ids_list.append(target_class_ids)
target_mask_list.append(target_mask)
rois = tf.convert_to_tensor(rois_list)
target_bbox = tf.convert_to_tensor(target_bbox_list)
target_class_ids = tf.convert_to_tensor(target_class_ids_list)
target_mask = tf.convert_to_tensor(target_mask_list) # [batch, N, 高,宽]
# mrcnn_class_logits, mrcnn_class_probs的shape都是[num_boxex, num_classes]
# mrcnn_bbox 的shape是[num_boxex, num_classes, (dx, dy, log(h), log(w))]
mrcnn_class_logits, mrcnn_class_probs, mrcnn_bbox = fpn_classifier_graph(
rois, mrcnn_feature_maps, config.IMAGE_SHAPE,config.POOL_SIZE, config.NUM_CLASSES)
# [num_boxes, 28, 28, num_classes]
mrcnn_mask_logits = build_fpn_mask_graph(
rois, mrcnn_feature_maps, config.IMAGE_SHAPE,
config.MASK_POOL_SIZE, config.NUM_CLASSES, train_bn=training, name="mrcnn_mask_logits")
# rpn loss
rpn_binary_loss = rpn_binary_loss_graph(rpn_binary_gt, rpn_binary_logits)
rpn_bbox_loss = rpn_bbox_loss_graph(rpn_bbox_gt, rpn_bbox_pred, rpn_binary_gt)
# proposal loss
proposal_class_loss = proposal_class_loss_graph(target_class_ids, mrcnn_class_logits, config.NUM_CLASSES)
proposal_bbox_loss = proposal_bbox_loss_graph(target_bbox,mrcnn_bbox,target_class_ids)
mask_loss = mask_loss_graph(target_mask, mrcnn_mask_logits, target_class_ids, config.NUM_CLASSES)
rpn_loss = rpn_binary_loss + rpn_bbox_loss # rpn的损失
proposal_loss = proposal_class_loss + proposal_bbox_loss # proposal的损失
total_loss = rpn_loss + proposal_loss + mask_loss
# 返回rpn的损失,proposal的损失,mask的损失,和总损失
return [rpn_loss, proposal_loss, mask_loss, total_loss]