def call(self, inputs):
    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]

    # Image meta
    # Holds details about the image. See compose_image_meta()
    image_meta = inputs[1]

    # Feature Maps. List of feature maps from different levels of the
    # feature pyramid. Each is [batch, height, width, channels]
    feature_maps = inputs[2:]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
    h = y2 - y1
    w = x2 - x1
    # Use shape of first image. Images in a batch must have the same size.
    image_shape = parse_image_meta_graph(image_meta)['image_shape'][0]
    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to P4
    image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
    roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
    roi_level = tf.minimum(
        5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
    roi_level = tf.squeeze(roi_level, 2)

    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):
        ix = tf.where(tf.equal(roi_level, level))
        level_boxes = tf.gather_nd(boxes, ix)

        # Box indices for crop_and_resize.
        box_indices = tf.cast(ix[:, 0], tf.int32)

        # Keep track of which box is mapped to which level
        box_to_level.append(ix)

        # Stop gradient propagation to ROI proposals
        level_boxes = tf.stop_gradient(level_boxes)
        box_indices = tf.stop_gradient(box_indices)

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        pooled.append(
            tf.image.crop_and_resize(feature_maps[i], level_boxes,
                                     box_indices, self.pool_shape,
                                     method="bilinear"))

    # Pack pooled features into one tensor
    pooled = tf.concat(pooled, axis=0)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = tf.concat(box_to_level, axis=0)
    box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
    box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                             axis=1)

    # Rearrange pooled features to match the order of the original boxes
    # Sort box_to_level by batch then box index
    # TF doesn't have a way to sort by two columns, so merge them and sort.
    sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
    ix = tf.nn.top_k(sorting_tensor,
                     k=tf.shape(box_to_level)[0]).indices[::-1]
    ix = tf.gather(box_to_level[:, 2], ix)
    pooled = tf.gather(pooled, ix)

    # Re-add the batch dimension
    shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
    pooled = tf.reshape(pooled, shape)
    return pooled
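# The level-assignment rule above is Equation 1 of the FPN paper adapted to
# normalized coordinates: level = 4 + log2(sqrt(h*w) / (224 / sqrt(image_area))),
# clipped to [2, 5]. A minimal NumPy sketch (function name hypothetical, for
# illustration only) that reproduces the comment's example of a 224x224 ROI
# mapping to P4:

import numpy as np

def demo_roi_level(box_hw_pixels, image_hw=(1024, 1024)):
    h = box_hw_pixels[0] / image_hw[0]  # normalized height
    w = box_hw_pixels[1] / image_hw[1]  # normalized width
    image_area = float(image_hw[0] * image_hw[1])
    level = 4 + np.round(np.log2(np.sqrt(h * w) /
                                 (224.0 / np.sqrt(image_area))))
    return int(np.clip(level, 2, 5))

assert demo_roi_level((224, 224)) == 4  # matches the comment above
assert demo_roi_level((112, 112)) == 3  # half the side length -> one level down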
def build(self, mode: str, config: ShapesConfig):
    """Build Mask R-CNN architecture.
    mode: Either "training" or "inference". The inputs and outputs of the
        model differ accordingly.
    """
    assert mode in ['training', 'inference']

    # Image size must be divisible by 2 multiple times
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception(
            "Image size must be divisible by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling. "
            "For example, use 256, 320, 384, 448, 512, ... etc.")

    # Inputs
    input_image = KL.Input(shape=[None, None, config.IMAGE_SHAPE[2]],
                           name="input_image")
    input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                name="input_image_meta")
    if mode == "training":
        # RPN GT
        input_rpn_match = KL.Input(shape=[None, 1],
                                   name="input_rpn_match", dtype=tf.int32)
        input_rpn_bbox = KL.Input(shape=[None, 4],
                                  name="input_rpn_bbox", dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = KL.Input(shape=[None],
                                      name="input_gt_class_ids",
                                      dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = KL.Input(shape=[None, 4],
                                  name="input_gt_boxes", dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        if config.USE_MINI_MASK:
            input_gt_masks = KL.Input(shape=[config.MINI_MASK_SHAPE[0],
                                             config.MINI_MASK_SHAPE[1],
                                             None],
                                      name="input_gt_masks", dtype=bool)
        else:
            input_gt_masks = KL.Input(
                shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
    elif mode == "inference":
        # Anchors in normalized coordinates
        input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
        anchors = input_anchors

    # Build the shared convolutional layers.
    # Bottom-up Layers
    # Returns a list of the last layers of each stage, 5 in total.
    if callable(config.BACKBONE):
        _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True,
                                            train_bn=config.TRAIN_BN)
    else:
        _, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE,
                                         stage5=True,
                                         train_bn=config.TRAIN_BN)
    # Top-down Layers
    # TODO: add assert to verify feature map sizes match what's in config
    P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                   name='fpn_c5p5')(C5)
    P4 = KL.Add(name="fpn_p4add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c4p4')(C4)
    ])
    P3 = KL.Add(name="fpn_p3add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c3p3')(C3)
    ])
    P2 = KL.Add(name="fpn_p2add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c2p2')(C2)
    ])
    # Attach 3x3 conv to all P layers to get the final feature maps.
    P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p2")(P2)
    P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p3")(P3)
    P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p4")(P4)
    P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p5")(P5)
    # P6 is used for the 5th anchor scale in RPN. Generated by
    # subsampling from P5 with stride of 2.
    P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [P2, P3, P4, P5, P6]
    mrcnn_feature_maps = [P2, P3, P4, P5]

    # Anchors
    if mode == "training":
        anchors = self.get_anchors(config.IMAGE_SHAPE)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors,
                                  (config.BATCH_SIZE,) + anchors.shape)
        # A hack to get around Keras's bad support for constants
        anchors = KL.Lambda(lambda x: tf.Variable(anchors),
                            name="anchors")(input_image)

    # RPN Model
    rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE,
                          len(config.RPN_ANCHOR_RATIOS),
                          config.TOP_DOWN_PYRAMID_SIZE)
    # Loop through pyramid layers
    layer_outputs = []  # list of lists
    for p in rpn_feature_maps:
        layer_outputs.append(rpn([p]))
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [
        KL.Concatenate(axis=1, name=n)(list(o))
        for o, n in zip(outputs, output_names)
    ]
    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = ProposalLayer(proposal_count=proposal_count,
                             nms_threshold=config.RPN_NMS_THRESHOLD,
                             name="ROI",
                             config=config)([rpn_class, rpn_bbox, anchors])

    if mode == "training":
        # Class ID mask to mark class IDs supported by the dataset the
        # image came from.
        active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x)[
            "active_class_ids"])(input_image_meta)

        if not config.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                  name="input_roi", dtype=np.int32)
            # Normalize coordinates
            target_rois = KL.Lambda(lambda x: norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = \
            DetectionTargetLayer(config, name="proposal_targets")([
                target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rois, mrcnn_feature_maps,
                                 input_image_meta,
                                 config.POOL_SIZE, config.NUM_CLASSES,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
                                          input_image_meta,
                                          config.MASK_POOL_SIZE,
                                          config.NUM_CLASSES,
                                          train_bn=config.TRAIN_BN)

        # TODO: clean up (use tf.identity if necessary)
        output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x),
                                   name="rpn_class_loss")(
            [input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = KL.Lambda(
            lambda x: rpn_bbox_loss_graph(config, *x),
            name="rpn_bbox_loss")(
            [input_rpn_bbox, input_rpn_match, rpn_bbox])
        class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x),
                               name="mrcnn_class_loss")(
            [target_class_ids, mrcnn_class_logits, active_class_ids])
        bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x),
                              name="mrcnn_bbox_loss")(
            [target_bbox, target_class_ids, mrcnn_bbox])
        mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x),
                              name="mrcnn_mask_loss")(
            [target_mask, target_class_ids, mrcnn_mask])

        # Model
        inputs = [
            input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
            input_gt_class_ids, input_gt_boxes, input_gt_masks
        ]
        if not config.USE_RPN_ROIS:
            inputs.append(input_rois)
        outputs = [
            rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
            mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
            rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
        ]
        model = KM.Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rpn_rois, mrcnn_feature_maps,
                                 input_image_meta,
                                 config.POOL_SIZE, config.NUM_CLASSES,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        # Detections
        # output is [batch, num_detections, (y1, x1, y2, x2, class_id,
        # score)] in normalized coordinates
        detections = DetectionLayer(config, name="mrcnn_detection")(
            [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

        # Create masks for detections
        detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
        mrcnn_mask = build_fpn_mask_graph(detection_boxes,
                                          mrcnn_feature_maps,
                                          input_image_meta,
                                          config.MASK_POOL_SIZE,
                                          config.NUM_CLASSES,
                                          train_bn=config.TRAIN_BN)

        model = KM.Model([input_image, input_image_meta, input_anchors],
                         [detections, mrcnn_class, mrcnn_bbox, mrcnn_mask,
                          rpn_rois, rpn_class, rpn_bbox],
                         name='mask_rcnn')

    # Add multi-GPU support.
    if config.GPU_COUNT > 1:
        from mrcnn.parallel_model import ParallelModel
        model = ParallelModel(model, config.GPU_COUNT)

    return model
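# The "anchors" lines in the training branch above rely on a trick worth
# isolating: a Keras functional model only accepts layer outputs as tensors,
# so a precomputed NumPy constant is injected through a Lambda that ignores
# its input. A minimal sketch of the same pattern (the function and layer
# names here are hypothetical, for illustration only):

def inject_constant(constant_np, any_graph_tensor, name="demo_constant"):
    # The Lambda receives any_graph_tensor only so Keras can trace a graph
    # edge; the value it actually returns is the wrapped constant.
    return KL.Lambda(lambda x: tf.Variable(constant_np),
                     name=name)(any_graph_tensor)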
def build(self, mode, config):
    """Build the Mask R-CNN teacher-student architecture for mimic
    training.
    """
    assert mode in ['training', 'inference']

    # Image size must be divisible by 2 multiple times
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2 ** 6 != int(h / 2 ** 6) or w / 2 ** 6 != int(w / 2 ** 6):
        raise Exception(
            "Image size must be divisible by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling. "
            "For example, use 256, 320, 384, 448, 512, ... etc.")

    # Inputs
    input_image = KL.Input(shape=[None, None, config.IMAGE_SHAPE[2]],
                           name="input_image")
    input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                name="input_image_meta")
    if mode == "training":
        # RPN GT
        input_rpn_match = KL.Input(shape=[None, 1],
                                   name="input_rpn_match", dtype=tf.int32)
        input_rpn_bbox = KL.Input(shape=[None, 4],
                                  name="input_rpn_bbox", dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = KL.Input(shape=[None],
                                      name="input_gt_class_ids",
                                      dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = KL.Input(shape=[None, 4],
                                  name="input_gt_boxes", dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = KL.Lambda(lambda x: modellib.norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        if config.USE_MINI_MASK:
            input_gt_masks = KL.Input(
                shape=[config.MINI_MASK_SHAPE[0],
                       config.MINI_MASK_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
        else:
            input_gt_masks = KL.Input(
                shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
        # Class ID mask to mark class IDs supported by the dataset the
        # image came from.
        active_class_ids = KL.Lambda(
            lambda x: modellib.parse_image_meta_graph(x)["active_class_ids"]
        )(input_image_meta)
    elif mode == "inference":
        pass

    # Build the architecture of the teacher model
    # Backbone
    _, C2, C3, C4, C5 = modellib.resnet_graph(input_image,
                                              config.TEACHER_BACKBONE,
                                              stage5=True, train_bn=False)
    # Top-down Layers
    P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                   name='fpn_c5p5')(C5)
    P4 = KL.Add(name="fpn_p4add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c4p4')(C4)])
    P3 = KL.Add(name="fpn_p3add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c3p3')(C3)])
    P2 = KL.Add(name="fpn_p2add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c2p2')(C2)])
    # Attach 3x3 conv to all P layers to get the final feature maps.
    P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p2")(P2)  # N x 256 x 256 x 256
    P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p3")(P3)  # N x 128 x 128 x 256
    P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p4")(P4)  # N x 64 x 64 x 256
    P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name="fpn_p5")(P5)  # N x 32 x 32 x 256
    # The teacher provides only the classifier feature maps; no P6/RPN
    # branch is built for it.
    t_mrcnn_feature_maps = [P2, P3, P4, P5]

    # Build the architecture of the student model
    s_prefix = 's_'
    # Backbone
    _, S2, S3, S4, S5 = s_resnet_graph(input_image,
                                       config.STUDENT_BACKBONE,
                                       prefix=s_prefix, train_bn=None)
    # Top-down Layers
    Q5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                   name=s_prefix + 'fpn_s5q5')(S5)
    Q4 = KL.Add(name=s_prefix + "fpn_q4add")([
        KL.UpSampling2D(size=(2, 2),
                        name=s_prefix + "fpn_q5upsampled")(Q5),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name=s_prefix + 'fpn_s4q4')(S4)])
    Q3 = KL.Add(name=s_prefix + "fpn_q3add")([
        KL.UpSampling2D(size=(2, 2),
                        name=s_prefix + "fpn_q4upsampled")(Q4),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name=s_prefix + 'fpn_s3q3')(S3)])
    Q2 = KL.Add(name=s_prefix + "fpn_q2add")([
        KL.UpSampling2D(size=(2, 2),
                        name=s_prefix + "fpn_q3upsampled")(Q3),
        KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name=s_prefix + 'fpn_s2q2')(S2)])
    # Attach 3x3 conv to all Q layers to get the final feature maps.
    Q2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name=s_prefix + "fpn_q2")(Q2)  # N x 256 x 256 x 256
    Q3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name=s_prefix + "fpn_q3")(Q3)  # N x 128 x 128 x 256
    Q4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name=s_prefix + "fpn_q4")(Q4)  # N x 64 x 64 x 256
    Q5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME",
                   name=s_prefix + "fpn_q5")(Q5)  # N x 32 x 32 x 256
    # Q6 is used for the 5th anchor scale in RPN. Generated by
    # subsampling from Q5 with stride of 2.
    Q6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2,
                         name=s_prefix + "fpn_q6")(Q5)

    # Note that Q6 is used in RPN, but not in the classifier heads.
    s_rpn_feature_maps = [Q2, Q3, Q4, Q5, Q6]
    s_mrcnn_feature_maps = [Q2, Q3, Q4, Q5]

    # RPN Model
    s_rpn = s_build_rpn_model(config.RPN_ANCHOR_STRIDE,
                              len(config.RPN_ANCHOR_RATIOS),
                              config.TOP_DOWN_PYRAMID_SIZE,
                              prefix=s_prefix)
    # Loop through pyramid layers
    s_layer_outputs = []  # list of lists
    for p in s_rpn_feature_maps:
        s_layer_outputs.append(s_rpn([p]))
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    s_output_names = ["s_rpn_class_logits", "s_rpn_class", "s_rpn_bbox"]
    s_outputs = list(zip(*s_layer_outputs))
    s_outputs = [KL.Concatenate(axis=1, name=n)(list(o))
                 for o, n in zip(s_outputs, s_output_names)]
    s_rpn_class_logits, s_rpn_class, s_rpn_bbox = s_outputs

    # Proposals
    anchors = self.get_anchors(config.IMAGE_SHAPE)
    # Duplicate across the batch dimension because Keras requires it
    # TODO: can this be optimized to avoid duplicating the anchors?
    anchors = np.broadcast_to(anchors,
                              (config.BATCH_SIZE,) + anchors.shape)
    # A hack to get around Keras's bad support for constants
    anchors = KL.Lambda(lambda x: tf.Variable(anchors),
                        name="anchors")(input_image)
    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
        else config.POST_NMS_ROIS_INFERENCE
    s_rpn_rois = s_ProposalLayer(
        proposal_count=proposal_count,
        nms_threshold=config.RPN_NMS_THRESHOLD,
        name="s_ROI",
        config=config)([s_rpn_class, s_rpn_bbox, anchors])

    # Generate detection targets
    # Subsamples proposals and generates target outputs for training
    # Note that proposal class IDs, gt_boxes, and gt_masks are zero
    # padded. Equally, returned rois and targets are zero padded.
    s_rois, target_class_ids, target_bbox, target_mask = \
        modellib.DetectionTargetLayer(config, name="proposal_targets")([
            s_rpn_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
    # s_rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in
    # normalized coordinates

    # Add a transformer layer to the feature maps from the student model
    # before comparison with the teacher model
    transformer = build_transformer_layer(config.TOP_DOWN_PYRAMID_SIZE)
    s_transformed_feature_maps = [transformer([p])
                                  for p in s_mrcnn_feature_maps]

    # Losses
    rpn_class_loss = KL.Lambda(
        lambda x: modellib.rpn_class_loss_graph(*x),
        name="rpn_class_loss")([input_rpn_match, s_rpn_class_logits])
    rpn_bbox_loss = KL.Lambda(
        lambda x: modellib.rpn_bbox_loss_graph(config, *x),
        name="rpn_bbox_loss")(
        [input_rpn_bbox, input_rpn_match, s_rpn_bbox])
    rpn_mimic_loss = KL.Lambda(
        lambda x: rpn_mimic_loss_graph(config, *x),
        name="rpn_mimic_loss")(
        [s_transformed_feature_maps[0], s_transformed_feature_maps[1],
         s_transformed_feature_maps[2], s_transformed_feature_maps[3],
         t_mrcnn_feature_maps[0], t_mrcnn_feature_maps[1],
         t_mrcnn_feature_maps[2], t_mrcnn_feature_maps[3],
         s_rois, input_image_meta])

    # Model
    inputs = [input_image, input_image_meta, input_rpn_match,
              input_rpn_bbox, input_gt_class_ids, input_gt_boxes,
              input_gt_masks]
    outputs = [s_rpn_class_logits, s_rpn_class, s_rpn_bbox, s_rpn_rois,
               rpn_class_loss, rpn_bbox_loss, rpn_mimic_loss]
    model = KM.Model(inputs, outputs, name='mimic_mask_rcnn')

    # Add multi-GPU support.
    if config.GPU_COUNT > 1:
        from mrcnn.parallel_model import ParallelModel
        model = ParallelModel(model, config.GPU_COUNT)

    return model
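# For mimic training, only the student branch should receive gradient
# updates; the teacher feature maps act as fixed regression targets for
# rpn_mimic_loss. A hedged sketch of the usual preparation step (assumptions:
# teacher layers keep the unprefixed names above while student layers carry
# the 's_' prefix; the helper name and the transformer-layer prefix are
# hypothetical and depend on build_transformer_layer):

def freeze_teacher_layers(keras_model, trainable_prefixes=('s_', 'transformer')):
    # Teacher weights come from a pretrained Mask R-CNN checkpoint and are
    # frozen so that only the student branch and its transformer layer train.
    for layer in keras_model.layers:
        if not layer.name.startswith(tuple(trainable_prefixes)):
            layer.trainable = False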
def build(self, mode, config):
    """Build Mask R-CNN architecture.
    mode: Either "training" or "inference". The inputs and outputs of the
        model differ accordingly.
    """
    assert mode in ['training', 'inference']

    # Image size must be divisible by 2 multiple times
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception(
            "Image size must be divisible by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling. "
            "For example, use 256, 320, 384, 448, 512, ... etc.")

    # Inputs
    input_image = KL.Input(
        shape=config.IMAGE_SHAPE.tolist(), name="input_image")
    # CHANGE: add target input
    if not config.NUM_TARGETS:
        config.NUM_TARGETS = 1
    input_target = KL.Input(
        shape=[config.NUM_TARGETS] + config.TARGET_SHAPE.tolist(),
        name="input_target")
    input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                name="input_image_meta")
    if mode == "training":
        # RPN GT
        input_rpn_match = KL.Input(
            shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
        input_rpn_bbox = KL.Input(
            shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = KL.Input(
            shape=[None], name="input_gt_class_ids", dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = KL.Input(
            shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = KL.Lambda(lambda x: modellib.norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        if config.USE_MINI_MASK:
            input_gt_masks = KL.Input(
                shape=[config.MINI_MASK_SHAPE[0],
                       config.MINI_MASK_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
        else:
            input_gt_masks = KL.Input(
                shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                name="input_gt_masks", dtype=bool)
    elif mode == "inference":
        # Anchors in normalized coordinates
        input_anchors = KL.Input(shape=[None, 4], name="input_anchors")

    # Build the shared convolutional layers.
    # CHANGE: Use weight-shared FPN model for image and target
    # Create FPN Model
    resnet = build_resnet_model(self.config)
    fpn = build_fpn_model(feature_maps=self.config.FPN_FEATUREMAPS)
    # Create Image FP
    _, IC2, IC3, IC4, IC5 = resnet(input_image)
    IP2, IP3, IP4, IP5, IP6 = fpn([IC2, IC3, IC4, IC5])
    # Create Target FP
    input_targets = [KL.Lambda(lambda x: x[:, idx, ...])(input_target)
                     for idx in range(input_target.shape[1])]
    for k, one_target in enumerate(input_targets):
        _, TC2, TC3, TC4, TC5 = resnet(one_target)
        out = fpn([TC2, TC3, TC4, TC5])
        if k == 0:
            target_pyramid = out
        else:
            target_pyramid = [
                KL.Add(name="target_adding_{}_{}".format(k, i))(
                    [target_pyramid[i], out[i]])
                for i in range(len(out))]
    TP2, TP3, TP4, TP5, TP6 = [
        KL.Lambda(lambda x: x / config.NUM_TARGETS)(target_pyramid[i])
        for i in range(len(target_pyramid))]
    # one_target = KL.Lambda(lambda x: x[:, 0, ...])(input_target)
    # one_target = input_target[:, 0, ...]
    # _, TC2, TC3, TC4, TC5 = resnet(one_target)
    # TP2, TP3, TP4, TP5, TP6 = fpn([TC2, TC3, TC4, TC5])

    # CHANGE: add siamese distance computation
    # Combine FPs using L1 distance
    P2 = l1_distance_graph(IP2, TP2,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P2')
    P3 = l1_distance_graph(IP3, TP3,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P3')
    P4 = l1_distance_graph(IP4, TP4,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P4')
    P5 = l1_distance_graph(IP5, TP5,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P5')
    P6 = l1_distance_graph(IP6, TP6,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P6')

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [P2, P3, P4, P5, P6]
    mrcnn_feature_maps = [P2, P3, P4, P5]

    # Anchors
    if mode == "training":
        anchors = self.get_anchors(config.IMAGE_SHAPE)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors,
                                  (config.BATCH_SIZE,) + anchors.shape)
        # A hack to get around Keras's bad support for constants
        anchors = KL.Lambda(lambda x: tf.Variable(anchors),
                            name="anchors")(input_image)
    else:
        anchors = input_anchors

    # RPN Model
    # CHANGE: Set number of filters to 3*self.config.FPN_FEATUREMAPS//2
    rpn = modellib.build_rpn_model(config.RPN_ANCHOR_STRIDE,
                                   len(config.RPN_ANCHOR_RATIOS),
                                   3 * self.config.FPN_FEATUREMAPS // 2)
    # Loop through pyramid layers
    layer_outputs = []  # list of lists
    for p in rpn_feature_maps:
        layer_outputs.append(rpn([p]))
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [KL.Concatenate(axis=1, name=n)(list(o))
               for o, n in zip(outputs, output_names)]
    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = modellib.ProposalLayer(
        proposal_count=proposal_count,
        nms_threshold=config.RPN_NMS_THRESHOLD,
        name="ROI",
        config=config)([rpn_class, rpn_bbox, anchors])

    if mode == "training":
        # Class ID mask to mark class IDs supported by the dataset the
        # image came from.
        active_class_ids = KL.Lambda(
            lambda x: modellib.parse_image_meta_graph(x)["active_class_ids"]
        )(input_image_meta)

        if not config.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                  name="input_roi", dtype=np.int32)
            # Normalize coordinates
            target_rois = KL.Lambda(lambda x: modellib.norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = \
            modellib.DetectionTargetLayer(config, name="proposal_targets")([
                target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rois, mrcnn_feature_maps,
                                 input_image_meta,
                                 config.POOL_SIZE, num_classes=2,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        if config.MODEL == 'mrcnn':
            mrcnn_mask = fpn_mask_graph(rois, mrcnn_feature_maps,
                                        input_image_meta,
                                        config.MASK_POOL_SIZE,
                                        num_classes=2,
                                        train_bn=config.TRAIN_BN)

        # TODO: clean up (use tf.identity if necessary)
        output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        rpn_class_loss = KL.Lambda(
            lambda x: modellib.rpn_class_loss_graph(*x),
            name="rpn_class_loss")([input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = KL.Lambda(
            lambda x: modellib.rpn_bbox_loss_graph(config, *x),
            name="rpn_bbox_loss")(
            [input_rpn_bbox, input_rpn_match, rpn_bbox])
        # CHANGE: use custom class loss without using active_class_ids
        class_loss = KL.Lambda(
            lambda x: mrcnn_class_loss_graph(*x),
            name="mrcnn_class_loss")(
            [target_class_ids, mrcnn_class_logits, active_class_ids])
        bbox_loss = KL.Lambda(
            lambda x: modellib.mrcnn_bbox_loss_graph(*x),
            name="mrcnn_bbox_loss")(
            [target_bbox, target_class_ids, mrcnn_bbox])
        if config.MODEL == 'mrcnn':
            mask_loss = KL.Lambda(
                lambda x: modellib.mrcnn_mask_loss_graph(*x),
                name="mrcnn_mask_loss")(
                [target_mask, target_class_ids, mrcnn_mask])

        # Model
        # CHANGE: Added target to inputs
        inputs = [input_image, input_image_meta, input_target,
                  input_rpn_match, input_rpn_bbox, input_gt_class_ids,
                  input_gt_boxes, input_gt_masks]
        if not config.USE_RPN_ROIS:
            inputs.append(input_rois)
        if config.MODEL == 'mrcnn':
            outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                       mrcnn_class_logits, mrcnn_class, mrcnn_bbox,
                       mrcnn_mask, rpn_rois, output_rois,
                       rpn_class_loss, rpn_bbox_loss, class_loss,
                       bbox_loss, mask_loss]
        elif config.MODEL == 'frcnn':
            outputs = [rpn_class_logits, rpn_class, rpn_bbox,
                       mrcnn_class_logits, mrcnn_class, mrcnn_bbox,
                       rpn_rois, output_rois,
                       rpn_class_loss, rpn_bbox_loss, class_loss,
                       bbox_loss]
        model = KM.Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            fpn_classifier_graph(rpn_rois, mrcnn_feature_maps,
                                 input_image_meta,
                                 config.POOL_SIZE, num_classes=2,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        # Detections
        # output is [batch, num_detections, (y1, x1, y2, x2, class_id,
        # score)] in normalized coordinates
        detections = modellib.DetectionLayer(config, name="mrcnn_detection")(
            [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])

        # Create masks for detections
        detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        if config.MODEL == 'mrcnn':
            mrcnn_mask = fpn_mask_graph(detection_boxes,
                                        mrcnn_feature_maps,
                                        input_image_meta,
                                        config.MASK_POOL_SIZE,
                                        num_classes=2,
                                        train_bn=config.TRAIN_BN)

        # CHANGE: Added target to the input
        inputs = [input_image, input_image_meta, input_target,
                  input_anchors]
        if config.MODEL == 'mrcnn':
            outputs = [detections, mrcnn_class, mrcnn_bbox, mrcnn_mask,
                       rpn_rois, rpn_class, rpn_bbox]
        elif config.MODEL == 'frcnn':
            outputs = [detections, mrcnn_class, mrcnn_bbox,
                       rpn_rois, rpn_class, rpn_bbox]
        model = KM.Model(inputs, outputs, name='mask_rcnn')

    # Add multi-GPU support.
    if config.GPU_COUNT > 1:
        from mrcnn.parallel_model import ParallelModel
        model = ParallelModel(model, config.GPU_COUNT)

    return model
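# l1_distance_graph is called above but not defined in this excerpt. A
# plausible sketch of what it computes, following the Siamese idea of
# matching image features against a pooled target embedding (everything
# below is an assumption, not necessarily the repository's implementation):
# pool the target level to one vector, take the broadcast absolute
# difference against the image level, concatenate, and mix down with a 1x1
# conv so each output level has 3*FPN_FEATUREMAPS//2 channels, matching the
# RPN filter count above.

def l1_distance_graph_sketch(IP, TP, feature_maps=384, name='P2'):
    # Global max-pool the target features to a [batch, 1, 1, C] embedding.
    t = KL.Lambda(lambda x: K.max(x, axis=(1, 2), keepdims=True))(TP)
    # Broadcast elementwise L1 distance between image features and target.
    l1 = KL.Lambda(lambda x: K.abs(x[0] - x[1]))([IP, t])
    # Concatenate and reduce to the desired channel count.
    d = KL.Concatenate()([IP, l1])
    return KL.Conv2D(feature_maps, (1, 1), name='fpn_distance_' + name)(d)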
def rpn_mimic_loss_graph(config, Q2, Q3, Q4, Q5, P2, P3, P4, P5, boxes,
                         image_meta):
    student_maps = [Q2, Q3, Q4, Q5]
    teacher_maps = [P2, P3, P4, P5]
    num_boxes_batch = config.BATCH_SIZE * config.TRAIN_ROIS_PER_IMAGE

    # Assign each ROI to a level in the pyramid based on the ROI area.
    # Each split is [BATCH_SIZE, TRAIN_ROIS_PER_IMAGE, 1]
    y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
    h = y2 - y1
    w = x2 - x1
    # Use shape of first image. Images in a batch must have the same size.
    image_shape = modellib.parse_image_meta_graph(image_meta)['image_shape'][0]
    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to P4
    image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
    roi_level = modellib.log2_graph(
        tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
    # 4 + log2(sqrt(h*w) / (224/sqrt(image_area))), clipped to [2, 5]
    roi_level = tf.minimum(
        5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
    roi_level = tf.squeeze(roi_level, 2)  # [BATCH_SIZE, TRAIN_ROIS_PER_IMAGE]

    # Initialize the loss for later accumulation over every pyramid level
    loss = 0.
    for i, level in enumerate(range(2, 6)):
        ix = tf.where(tf.equal(roi_level, level))
        level_boxes = tf.gather_nd(boxes, ix)  # [num_boxes_level, 4]
        # Box indices for crop_and_resize (specify which image in the
        # batch each box refers to)
        box_indices = tf.cast(ix[:, 0], tf.int32)  # [num_boxes_level]

        # Zero-pad boxes and indices to a fixed size so every level can
        # be iterated with the same count.
        P = num_boxes_batch - tf.shape(level_boxes)[0]
        level_boxes = tf.pad(level_boxes, [(0, P), (0, 0)])
        box_indices = tf.pad(box_indices, [(0, P)])

        level_y1, level_x1, level_y2, level_x2 = tf.split(level_boxes, 4,
                                                          axis=-1)
        map_h = image_shape[0] / config.BACKBONE_STRIDES[i]
        map_w = image_shape[1] / config.BACKBONE_STRIDES[i]
        # Convert normalized box coordinates to pixel coordinates at this
        # pyramid level. Each result is [num_boxes_batch].
        level_y1_p = tf.cast(tf.squeeze(tf.floor(level_y1 * map_h),
                                        axis=-1), tf.int32)
        level_x1_p = tf.cast(tf.squeeze(tf.floor(level_x1 * map_w),
                                        axis=-1), tf.int32)
        level_y2_p = tf.cast(tf.squeeze(tf.ceil(level_y2 * map_h),
                                        axis=-1), tf.int32)
        level_x2_p = tf.cast(tf.squeeze(tf.ceil(level_x2 * map_w),
                                        axis=-1), tf.int32)
        level_h_p = level_y2_p - level_y1_p
        level_w_p = level_x2_p - level_x1_p

        # Iterate over bounding boxes and accumulate the L2-distance loss
        def cond(idx, _):
            # return tf.less(idx, level_boxes.shape[0])
            return tf.less(idx, num_boxes_batch)

        def body(idx, loss):
            s_cropped = tf.slice(
                student_maps[i],
                [box_indices[idx], level_y1_p[idx], level_x1_p[idx], 0],
                [1, level_h_p[idx], level_w_p[idx], -1])
            t_cropped = tf.slice(
                teacher_maps[i],
                [box_indices[idx], level_y1_p[idx], level_x1_p[idx], 0],
                [1, level_h_p[idx], level_w_p[idx], -1])
            loss += tf.reduce_mean(tf.square(s_cropped - t_cropped))
            idx = tf.add(idx, 1)
            return [idx, loss]

        loss_temp = tf.while_loop(cond, body, [0, 0.])
        loss += loss_temp[1]

    loss = loss / (2.0 * num_boxes_batch)
    return loss
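# A NumPy reference sketch of the per-ROI term above (hypothetical helper,
# for clarity only): crop the same region from matching student/teacher
# feature maps at one pyramid level and take the mean squared difference.
# In the graph version, padded all-zero boxes produce zero-area slices;
# here they are skipped explicitly.

import numpy as np

def mimic_loss_reference(student_map, teacher_map, boxes_norm, stride,
                         image_hw):
    # student_map, teacher_map: [H, W, C] feature maps of one level.
    # boxes_norm: [num_boxes, (y1, x1, y2, x2)] in normalized coordinates.
    map_h = image_hw[0] / stride
    map_w = image_hw[1] / stride
    loss = 0.0
    for y1, x1, y2, x2 in boxes_norm:
        ya, xa = int(np.floor(y1 * map_h)), int(np.floor(x1 * map_w))
        yb, xb = int(np.ceil(y2 * map_h)), int(np.ceil(x2 * map_w))
        if yb <= ya or xb <= xa:
            continue  # zero-padded box
        diff = student_map[ya:yb, xa:xb] - teacher_map[ya:yb, xa:xb]
        loss += np.mean(diff ** 2)
    return loss / (2.0 * len(boxes_norm))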