def call(self, inputs):
    rois = inputs[0]
    mrcnn_class = inputs[1]
    mrcnn_bbox = inputs[2]
    image_meta = inputs[3]

    # Get windows of images in normalized coordinates. Windows are the area
    # in the image that excludes the padding.
    # Use the shape of the first image in the batch to normalize the window
    # because we know that all images get resized to the same size.
    m = Utils.parse_image_meta_graph(image_meta)
    image_shape = m["image_shape"][0]
    window = Utils.norm_boxes_graph(m["window"], image_shape[:2])

    # Run detection refinement graph on each item in the batch
    detections_batch = Utils.batch_slice(
        [rois, mrcnn_class, mrcnn_bbox, window],
        lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
        self.config.IMAGES_PER_GPU,
    )

    # Reshape output
    # [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
    # normalized coordinates
    return tf.reshape(
        detections_batch,
        [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6],
    )
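# The window returned by parse_image_meta_graph() is in pixel coordinates, so
# it is converted to normalized coordinates before refinement. A minimal
# sketch of the conversion Utils.norm_boxes_graph() is assumed to perform
# (the name and the exclusive-(y2, x2) convention are assumptions, not a
# guarantee about this codebase's helper):
def norm_boxes_graph_sketch(boxes, shape):
    """boxes: [..., (y1, x1, y2, x2)] in pixels. shape: [height, width]."""
    h, w = tf.split(tf.cast(shape, tf.float32), 2)
    scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
    shift = tf.constant([0.0, 0.0, 1.0, 1.0])
    return tf.divide(boxes - shift, scale)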
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
    """Return the RPN bounding box loss graph.

    config: the model config object.
    target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
        Uses 0 padding to fill in unused bbox deltas.
    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
               -1=negative, 0=neutral anchor.
    rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
    """
    # Positive anchors contribute to the loss, but negative and
    # neutral anchors (match value of 0 or -1) don't.
    rpn_match = K.squeeze(rpn_match, -1)
    indices = tf.where(K.equal(rpn_match, 1))

    # Pick bbox deltas that contribute to the loss
    rpn_bbox = tf.gather_nd(rpn_bbox, indices)

    # Trim target bounding box deltas to the same length as rpn_bbox.
    batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1)
    target_bbox = Utils.batch_pack_graph(target_bbox, batch_counts,
                                         config.IMAGES_PER_GPU)

    # TODO: use smooth_l1_loss() rather than reimplementing here
    # to reduce code duplication
    diff = K.abs(target_bbox - rpn_bbox)
    less_than_one = K.cast(K.less(diff, 1.0), "float32")
    loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
    loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
    return loss
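# The TODO above refers to factoring the smooth-L1 math into a shared helper.
# A minimal sketch of such a helper (the name is hypothetical; it mirrors the
# inline computation: 0.5 * x^2 when |x| < 1, and |x| - 0.5 otherwise):
def smooth_l1_loss(y_true, y_pred):
    """Implements the smooth-L1 (Huber) loss.
    y_true and y_pred are typically [N, 4], but can be any shape.
    """
    diff = K.abs(y_true - y_pred)
    less_than_one = K.cast(K.less(diff, 1.0), "float32")
    loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
    return loss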
def call(self, inputs):
    proposals = inputs[0]
    gt_class_ids = inputs[1]
    gt_boxes = inputs[2]
    gt_masks = inputs[3]

    # Slice the batch and run a graph for each slice
    # TODO: Rename target_bbox to target_deltas for clarity
    names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
    outputs = Utils.batch_slice(
        [proposals, gt_class_ids, gt_boxes, gt_masks],
        lambda w, x, y, z: detection_targets_graph(w, x, y, z, self.config),
        self.config.IMAGES_PER_GPU,
        names=names,
    )
    return outputs
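# Utils.batch_slice() appears throughout these layers: it splits the batch
# dimension, applies a per-image graph function to each slice, and re-stacks
# the results. A minimal sketch of the assumed behavior (the name and details
# are assumptions about this codebase's helper, shown only for reference):
def batch_slice_sketch(inputs, graph_fn, batch_size, names=None):
    if not isinstance(inputs, list):
        inputs = [inputs]
    outputs = []
    for i in range(batch_size):
        # Run the per-image graph on slice i of every input tensor
        inputs_slice = [x[i] for x in inputs]
        output_slice = graph_fn(*inputs_slice)
        if not isinstance(output_slice, (tuple, list)):
            output_slice = [output_slice]
        outputs.append(output_slice)
    # Transpose from a list of per-slice outputs to a list of per-output slices
    outputs = list(zip(*outputs))
    if names is None:
        names = [None] * len(outputs)
    result = [tf.stack(o, axis=0, name=n) for o, n in zip(outputs, names)]
    if len(result) == 1:
        result = result[0]
    return result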
config = InferenceConfig()
config.display()

# Create model
if args.command == "train":
    model = MaskRCNN(mode="training", config=config, model_dir=args.logs)
else:
    model = MaskRCNN(mode="inference", config=config, model_dir=args.logs)

# Select weights file to load
COCO_WEIGHTS_PATH = "./mask_rcnn_coco.h5"
if args.weights.lower() == "coco":
    weights_path = COCO_WEIGHTS_PATH
    # Download weights file
    if not os.path.exists(weights_path):
        Utils.download_trained_weights(weights_path)
elif args.weights.lower() == "last":
    # Find last trained weights
    weights_path = model.find_last()
elif args.weights.lower() == "imagenet":
    # Start from ImageNet trained weights
    weights_path = model.get_imagenet_weights()
else:
    weights_path = args.weights

# Load weights
print("Loading weights ", weights_path)
if args.weights.lower() == "coco":
    # Exclude the last layers because they require a matching
    # number of classes
    model.load_weights(
def data_generator(
    dataset,
    config,
    shuffle=True,
    augmentation=None,
    random_rois=0,
    batch_size=1,
    detection_targets=False,
):
    """A generator that returns images and corresponding target class ids,
    bounding box deltas, and masks.

    dataset: The Dataset object to pick data from
    config: The model config object
    shuffle: If True, shuffles the samples before every epoch
    augmentation: Optional. An imgaug (https://github.com/aleju/imgaug)
        augmentation. For example, passing imgaug.augmenters.Fliplr(0.5)
        flips images right/left 50% of the time.
    random_rois: If > 0 then generate proposals to be used to train the
        network classifier and mask heads. Useful if training the Mask RCNN
        part without the RPN.
    batch_size: How many images to return in each call
    detection_targets: If True, generate detection targets (class IDs, bbox
        deltas, and masks). Typically for debugging or visualizations because
        in training detection targets are generated by DetectionTargetLayer.

    Returns a Python generator. Upon calling next() on it, the generator
    returns two lists, inputs and outputs. The contents of the lists differ
    depending on the received arguments:
    inputs list:
    - images: [batch, H, W, C]
    - image_meta: [batch, (meta data)] Image details. See compose_image_meta()
    - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
    - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
    - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
    - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width
      are those of the image unless use_mini_mask is True, in which case they
      are defined in MINI_MASK_SHAPE.

    outputs list: Usually empty in regular training. But if detection_targets
        is True then the outputs list contains target class_ids, bbox deltas,
        and masks.
    """
    b = 0  # batch item index
    image_index = -1
    image_ids = np.copy(dataset.image_ids)
    error_count = 0

    # Anchors
    # [anchor_count, (y1, x1, y2, x2)]
    backbone_shapes = Utils.compute_backbone_shapes(config, config.IMAGE_SHAPE)
    anchors = Utils.generate_pyramid_anchors(
        config.RPN_ANCHOR_SCALES,
        config.RPN_ANCHOR_RATIOS,
        backbone_shapes,
        config.BACKBONE_STRIDES,
        config.RPN_ANCHOR_STRIDE,
    )

    # Keras requires a generator to run indefinitely.
    while True:
        try:
            # Increment index to pick next image. Shuffle if at
            # the start of an epoch.
            image_index = (image_index + 1) % len(image_ids)
            if shuffle and image_index == 0:
                np.random.shuffle(image_ids)

            # Get GT bounding boxes and masks for image.
            image_id = image_ids[image_index]
            res = Utils.load_image_gt(
                dataset,
                config,
                image_id,
                augmentation=augmentation,
                use_mini_mask=config.USE_MINI_MASK,
            )
            image, image_meta, gt_class_ids, gt_boxes, gt_masks = res

            # Skip images that have no instances. This can happen in cases
            # where we train on a subset of classes and the image doesn't
            # have any of the classes we care about.
            if not np.any(gt_class_ids > 0):
                continue

            # RPN Targets
            rpn_match, rpn_bbox = Utils.build_rpn_targets(
                image.shape, anchors, gt_class_ids, gt_boxes, config)

            # Mask R-CNN Targets
            if random_rois:
                rpn_rois = Utils.generate_random_rois(
                    image.shape, random_rois, gt_class_ids, gt_boxes)
                if detection_targets:
                    res = Utils.build_detection_targets(
                        rpn_rois, gt_class_ids, gt_boxes, gt_masks, config)
                    rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask = res

            # Init batch arrays
            if b == 0:
                batch_image_meta = np.zeros(
                    (batch_size,) + image_meta.shape, dtype=image_meta.dtype)
                batch_rpn_match = np.zeros(
                    [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
                batch_rpn_bbox = np.zeros(
                    [batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4],
                    dtype=rpn_bbox.dtype,
                )
                batch_images = np.zeros(
                    (batch_size,) + image.shape, dtype=np.float32)
                batch_gt_class_ids = np.zeros(
                    (batch_size, config.MAX_GT_INSTANCES), dtype=np.int32)
                batch_gt_boxes = np.zeros(
                    (batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
                batch_gt_masks = np.zeros(
                    (
                        batch_size,
                        gt_masks.shape[0],
                        gt_masks.shape[1],
                        config.MAX_GT_INSTANCES,
                    ),
                    dtype=gt_masks.dtype,
                )
                if random_rois:
                    batch_rpn_rois = np.zeros(
                        (batch_size, rpn_rois.shape[0], 4),
                        dtype=rpn_rois.dtype,
                    )
                    if detection_targets:
                        batch_rois = np.zeros(
                            (batch_size,) + rois.shape, dtype=rois.dtype)
                        batch_mrcnn_class_ids = np.zeros(
                            (batch_size,) + mrcnn_class_ids.shape,
                            dtype=mrcnn_class_ids.dtype,
                        )
                        batch_mrcnn_bbox = np.zeros(
                            (batch_size,) + mrcnn_bbox.shape,
                            dtype=mrcnn_bbox.dtype,
                        )
                        batch_mrcnn_mask = np.zeros(
                            (batch_size,) + mrcnn_mask.shape,
                            dtype=mrcnn_mask.dtype,
                        )

            # If there are more instances than fit in the array, sub-sample
            # from them.
            if gt_boxes.shape[0] > config.MAX_GT_INSTANCES:
                ids = np.random.choice(
                    np.arange(gt_boxes.shape[0]),
                    config.MAX_GT_INSTANCES,
                    replace=False,
                )
                gt_class_ids = gt_class_ids[ids]
                gt_boxes = gt_boxes[ids]
                gt_masks = gt_masks[:, :, ids]

            # Add to batch
            batch_image_meta[b] = image_meta
            batch_rpn_match[b] = rpn_match[:, np.newaxis]
            batch_rpn_bbox[b] = rpn_bbox
            batch_images[b] = Utils.mold_image(image.astype(np.float32), config)
            batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
            batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
            batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks
            if random_rois:
                batch_rpn_rois[b] = rpn_rois
                if detection_targets:
                    batch_rois[b] = rois
                    batch_mrcnn_class_ids[b] = mrcnn_class_ids
                    batch_mrcnn_bbox[b] = mrcnn_bbox
                    batch_mrcnn_mask[b] = mrcnn_mask
            b += 1

            # Batch full?
            if b >= batch_size:
                inputs = [
                    batch_images,
                    batch_image_meta,
                    batch_rpn_match,
                    batch_rpn_bbox,
                    batch_gt_class_ids,
                    batch_gt_boxes,
                    batch_gt_masks,
                ]
                outputs = []

                if random_rois:
                    inputs.extend([batch_rpn_rois])
                    if detection_targets:
                        inputs.extend([batch_rois])
                        # Keras requires that output and targets have
                        # the same number of dimensions
                        batch_mrcnn_class_ids = np.expand_dims(
                            batch_mrcnn_class_ids, -1)
                        outputs.extend([
                            batch_mrcnn_class_ids,
                            batch_mrcnn_bbox,
                            batch_mrcnn_mask,
                        ])

                yield inputs, outputs

                # start a new batch
                b = 0

        except (GeneratorExit, KeyboardInterrupt):
            raise
        except:
            # Log it and skip the image
            logging.exception("Error processing image {}".format(
                dataset.image_info[image_id]))
            error_count += 1
            if error_count > 5:
                raise
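# A minimal usage sketch of data_generator() (the `dataset_train` and
# `dataset_val` objects below are hypothetical placeholders for prepared
# Dataset instances; in regular training a generator like this is handed
# to Keras' fit_generator()):
#
#   train_generator = data_generator(dataset_train, config, shuffle=True,
#                                    augmentation=augmentation,
#                                    batch_size=config.BATCH_SIZE)
#   val_generator = data_generator(dataset_val, config, shuffle=True,
#                                  batch_size=config.BATCH_SIZE)
#   inputs, outputs = next(train_generator)
#   # inputs[0] is images [batch, H, W, C]; outputs is [] in regular training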
def call(self, inputs):
    # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
    scores = inputs[0][:, :, 1]
    # Box deltas [batch, num_rois, 4]
    deltas = inputs[1]
    deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])
    # Anchors
    anchors = inputs[2]

    # Improve performance by trimming to top anchors by score
    # and doing the rest on the smaller subset.
    pre_nms_limit = tf.minimum(6000, tf.shape(anchors)[1])
    ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
                     name="top_anchors").indices
    scores = Utils.batch_slice(
        [scores, ix],
        lambda x, y: tf.gather(x, y),
        self.config.IMAGES_PER_GPU,
    )
    deltas = Utils.batch_slice(
        [deltas, ix],
        lambda x, y: tf.gather(x, y),
        self.config.IMAGES_PER_GPU,
    )
    pre_nms_anchors = Utils.batch_slice(
        [anchors, ix],
        lambda a, x: tf.gather(a, x),
        self.config.IMAGES_PER_GPU,
        names=["pre_nms_anchors"],
    )

    # Apply deltas to anchors to get refined anchors.
    # [batch, N, (y1, x1, y2, x2)]
    boxes = Utils.batch_slice(
        [pre_nms_anchors, deltas],
        lambda x, y: apply_box_deltas_graph(x, y),
        self.config.IMAGES_PER_GPU,
        names=["refined_anchors"],
    )

    # Clip to image boundaries. Since we're in normalized coordinates,
    # clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
    window = np.array([0, 0, 1, 1], dtype=np.float32)
    boxes = Utils.batch_slice(
        boxes,
        lambda x: clip_boxes_graph(x, window),
        self.config.IMAGES_PER_GPU,
        names=["refined_anchors_clipped"],
    )

    # Filter out small boxes
    # According to Xinlei Chen's paper, this reduces detection accuracy
    # for small objects, so we're skipping it.

    # Non-max suppression
    def nms(boxes, scores):
        indices = tf.image.non_max_suppression(
            boxes,
            scores,
            self.proposal_count,
            self.nms_threshold,
            name="rpn_non_max_suppression",
        )
        proposals = tf.gather(boxes, indices)
        # Pad if needed
        padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
        proposals = tf.pad(proposals, [(0, padding), (0, 0)])
        return proposals

    proposals = Utils.batch_slice([boxes, scores], nms,
                                  self.config.IMAGES_PER_GPU)
    return proposals
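# apply_box_deltas_graph() above turns (dy, dx, log(dh), log(dw)) deltas into
# refined boxes. A minimal sketch of that conversion, using the standard
# Faster R-CNN box parameterization (the name carries a _sketch suffix because
# it is shown for reference and is not necessarily this codebase's helper):
def apply_box_deltas_graph_sketch(boxes, deltas):
    """boxes: [N, (y1, x1, y2, x2)]. deltas: [N, (dy, dx, log(dh), log(dw))]."""
    # Convert to (center, size) form
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Apply deltas
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= tf.exp(deltas[:, 2])
    width *= tf.exp(deltas[:, 3])
    # Convert back to (y1, x1, y2, x2)
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = center_y + 0.5 * height
    x2 = center_x + 0.5 * width
    return tf.stack([y1, x1, y2, x2], axis=1)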
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
    """Generates detection targets for one image. Subsamples proposals and
    generates target class IDs, bounding box deltas, and masks for each.

    Inputs:
    proposals: [N, (y1, x1, y2, x2)] in normalized coordinates. Might be zero
               padded if there are not enough proposals.
    gt_class_ids: [MAX_GT_INSTANCES] int class IDs
    gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
    gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.

    Returns: Target ROIs and corresponding class IDs, bounding box shifts,
    and masks.
    rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
    class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
    deltas: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
            Class-specific bbox refinements.
    masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
           boundaries and resized to neural network output size.

    Note: Returned arrays might be zero padded if not enough target ROIs.
    """
    # Assertions
    asserts = [
        tf.Assert(
            tf.greater(tf.shape(proposals)[0], 0),
            [proposals],
            name="roi_assertion",
        )
    ]
    with tf.control_dependencies(asserts):
        proposals = tf.identity(proposals)

    # Remove zero padding
    proposals, _ = Utils.trim_zeros_graph(proposals, name="trim_proposals")
    gt_boxes, non_zeros = Utils.trim_zeros_graph(gt_boxes,
                                                 name="trim_gt_boxes")
    gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros,
                                   name="trim_gt_class_ids")
    gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2,
                         name="trim_gt_masks")

    # Handle COCO crowds
    # A crowd box in COCO is a bounding box around several instances. Exclude
    # them from training. A crowd box is given a negative class ID.
    crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
    non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
    crowd_boxes = tf.gather(gt_boxes, crowd_ix)
    crowd_masks = tf.gather(gt_masks, crowd_ix, axis=2)
    gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
    gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
    gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)

    # Compute overlaps matrix [proposals, gt_boxes]
    overlaps = overlaps_graph(proposals, gt_boxes)

    # Compute overlaps with crowd boxes [proposals, crowds]
    crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
    crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
    no_crowd_bool = crowd_iou_max < 0.001

    # Determine positive and negative ROIs
    roi_iou_max = tf.reduce_max(overlaps, axis=1)
    # 1. Positive ROIs are those with >= 0.5 IoU with a GT box
    positive_roi_bool = roi_iou_max >= 0.5
    positive_indices = tf.where(positive_roi_bool)[:, 0]
    # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
    negative_indices = tf.where(
        tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]

    # Subsample ROIs. Aim for 33% positive
    # Positive ROIs
    positive_count = int(config.TRAIN_ROIS_PER_IMAGE *
                         config.ROI_POSITIVE_RATIO)
    positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
    positive_count = tf.shape(positive_indices)[0]
    # Negative ROIs. Add enough to maintain positive:negative ratio.
    r = 1.0 / config.ROI_POSITIVE_RATIO
    negative_count = (
        tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) -
        positive_count)
    negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
    # Gather selected ROIs
    positive_rois = tf.gather(proposals, positive_indices)
    negative_rois = tf.gather(proposals, negative_indices)

    # Assign positive ROIs to GT boxes.
    positive_overlaps = tf.gather(overlaps, positive_indices)
    roi_gt_box_assignment = tf.cond(
        tf.greater(tf.shape(positive_overlaps)[1], 0),
        true_fn=lambda: tf.argmax(positive_overlaps, axis=1),
        false_fn=lambda: tf.cast(tf.constant([]), tf.int64),
    )
    roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
    roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)

    # Compute bbox refinement for positive ROIs
    deltas = Utils.box_refinement_graph(positive_rois, roi_gt_boxes)
    deltas /= config.BBOX_STD_DEV

    # Assign positive ROIs to GT masks
    # Permute masks to [N, height, width, 1]
    transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)
    # Pick the right mask for each ROI
    roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)

    # Compute mask targets
    boxes = positive_rois
    if config.USE_MINI_MASK:
        # Transform ROI coordinates from normalized image space
        # to normalized mini-mask space.
        y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
        gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
        gt_h = gt_y2 - gt_y1
        gt_w = gt_x2 - gt_x1
        y1 = (y1 - gt_y1) / gt_h
        x1 = (x1 - gt_x1) / gt_w
        y2 = (y2 - gt_y1) / gt_h
        x2 = (x2 - gt_x1) / gt_w
        boxes = tf.concat([y1, x1, y2, x2], 1)
    box_ids = tf.range(0, tf.shape(roi_masks)[0])
    masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
                                     box_ids, config.MASK_SHAPE)
    # Remove the extra dimension from masks.
    masks = tf.squeeze(masks, axis=3)

    # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
    # binary cross entropy loss.
    masks = tf.round(masks)

    # Append negative ROIs and pad bbox deltas and masks that
    # are not used for negative ROIs with zeros.
    rois = tf.concat([positive_rois, negative_rois], axis=0)
    N = tf.shape(negative_rois)[0]
    P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
    rois = tf.pad(rois, [(0, P), (0, 0)])
    roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
    roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
    deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
    masks = tf.pad(masks, [(0, N + P), (0, 0), (0, 0)])

    return rois, roi_gt_class_ids, deltas, masks
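# detection_targets_graph() relies on overlaps_graph() for the
# [proposals, gt_boxes] IoU matrix. A minimal sketch of that computation,
# assuming the usual tile-and-broadcast approach (shown for reference;
# the _sketch name marks it as an illustration, not this codebase's helper):
def overlaps_graph_sketch(boxes1, boxes2):
    """Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].
    Returns [boxes1 count, boxes2 count] overlaps.
    """
    # 1. Tile boxes2 and repeat boxes1 so every pair is compared once
    #    without needing TF loops.
    b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
                            [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
    b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
    b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
    y1 = tf.maximum(b1_y1, b2_y1)
    x1 = tf.maximum(b1_x1, b2_x1)
    y2 = tf.minimum(b1_y2, b2_y2)
    x2 = tf.minimum(b1_x2, b2_x2)
    intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area + b2_area - intersection
    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    return tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])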
def call(self, inputs):
    # Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
    boxes = inputs[0]

    # Image meta
    # Holds details about the image. See compose_image_meta()
    image_meta = inputs[1]

    # Feature Maps. List of feature maps from different levels of the
    # feature pyramid. Each is [batch, height, width, channels]
    feature_maps = inputs[2:]

    # Assign each ROI to a level in the pyramid based on the ROI area.
    y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
    h = y2 - y1
    w = x2 - x1
    # Use shape of first image. Images in a batch must have the same size.
    image_shape = Utils.parse_image_meta_graph(image_meta)["image_shape"][0]
    # Equation 1 in the Feature Pyramid Networks paper. Account for
    # the fact that our coordinates are normalized here.
    # e.g. a 224x224 ROI (in pixels) maps to P4
    image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
    roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
    roi_level = tf.minimum(
        5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
    roi_level = tf.squeeze(roi_level, 2)

    # Loop through levels and apply ROI pooling to each. P2 to P5.
    pooled = []
    box_to_level = []
    for i, level in enumerate(range(2, 6)):
        ix = tf.where(tf.equal(roi_level, level))
        level_boxes = tf.gather_nd(boxes, ix)

        # Box indices for crop_and_resize.
        box_indices = tf.cast(ix[:, 0], tf.int32)

        # Keep track of which box is mapped to which level
        box_to_level.append(ix)

        # Stop gradient propagation to ROI proposals
        level_boxes = tf.stop_gradient(level_boxes)
        box_indices = tf.stop_gradient(box_indices)

        # Crop and Resize
        # From Mask R-CNN paper: "We sample four regular locations, so
        # that we can evaluate either max or average pooling. In fact,
        # interpolating only a single value at each bin center (without
        # pooling) is nearly as effective."
        #
        # Here we use the simplified approach of a single value per bin,
        # which is how it's done in tf.crop_and_resize()
        # Result: [batch * num_boxes, pool_height, pool_width, channels]
        pooled.append(
            tf.image.crop_and_resize(
                feature_maps[i],
                level_boxes,
                box_indices,
                self.pool_shape,
                method="bilinear",
            ))

    # Pack pooled features into one tensor
    pooled = tf.concat(pooled, axis=0)

    # Pack box_to_level mapping into one array and add another
    # column representing the order of pooled boxes
    box_to_level = tf.concat(box_to_level, axis=0)
    box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
    box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
                             axis=1)

    # Rearrange pooled features to match the order of the original boxes
    # Sort box_to_level by batch then box index
    # TF doesn't have a way to sort by two columns, so merge them and sort.
    sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
    ix = tf.nn.top_k(sorting_tensor,
                     k=tf.shape(box_to_level)[0]).indices[::-1]
    ix = tf.gather(box_to_level[:, 2], ix)
    pooled = tf.gather(pooled, ix)

    # Re-add the batch dimension
    pooled = tf.expand_dims(pooled, 0)
    return pooled
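# The level assignment above follows FPN Equation 1, k = k0 + log2(sqrt(w*h) / 224)
# with k0 = 4, applied in normalized coordinates. log2_graph() is assumed to be
# the usual log-base-2 helper (TF1 has no dedicated log2 op). A minimal sketch
# and a worked example:
def log2_graph_sketch(x):
    """Implementation of log2 built from tf.log."""
    return tf.log(x) / tf.log(2.0)

# Worked example: a 112x112-pixel ROI in a 1024x1024 image. In normalized
# coordinates sqrt(h*w) = 112/1024 and 224/sqrt(image_area) = 224/1024, so the
# ratio is 0.5, log2(0.5) = -1, and the ROI maps to level 4 + (-1) = P3. A
# 224x224 ROI gives a ratio of 1.0 and maps to P4, matching the comment above.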