class DetectionLayer(KE.Layer):
    """
    Takes classified proposal boxes and their bounding box deltas and
    returns the final detection boxes.

    Returns: [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)]
    where coordinates are normalized.
    """

    def __init__(self, batch_size, **kwargs):
        super(DetectionLayer, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
        self.image_utils = ImageUtils()
        self.bbox_utils = BboxUtil()
        self.misc_utils = MiscUtils()

    def call(self, inputs):
        rois = inputs[0]
        mrcnn_class = inputs[1]
        mrcnn_bbox = inputs[2]
        image_meta = inputs[3]

        # Get windows of images in normalized coordinates. Windows are the area
        # in the image that excludes the padding.
        # Use the shape of the first image in the batch to normalize the window
        # because we know that all images get resized to the same size.
        m = self.image_utils.parse_image_meta_graph(image_meta)
        image_shape = m['image_shape'][0]
        window = self.bbox_utils.norm_boxes_graph(m['window'], image_shape[:2])

        # Run detection refinement graph on each item in the batch
        detections_batch = self.misc_utils.batch_slice(
            [rois, mrcnn_class, mrcnn_bbox, window],
            lambda x, y, w, z: refine_detections_graph(x, y, w, z),
            self.batch_size)

        # Reshape output
        # [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
        # normalized coordinates
        return tf.reshape(detections_batch,
                          [self.batch_size, self.detection_max_instances, 6])

    def compute_output_shape(self, input_shape):
        return (None, self.detection_max_instances, 6)
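# Illustrative sketch (not part of the model graph): the detections tensor produced
# above is zero padded up to DETECTION_MAX_INSTANCES per image, so callers usually
# strip the padding and split the six columns before using the results. The helper
# name below is hypothetical.
def split_detections_example(detections_one_image):
    """detections_one_image: [num_detections, (y1, x1, y2, x2, class_id, score)], zero padded."""
    valid = detections_one_image[np.any(detections_one_image[:, :4] != 0, axis=1)]
    boxes = valid[:, :4]                      # normalized (y1, x1, y2, x2)
    class_ids = valid[:, 4].astype(np.int32)  # integer class IDs
    scores = valid[:, 5]                      # class confidence scores
    return boxes, class_ids, scores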
class AnchorUtils(object): def __init__(self): self.misc_utils = MiscUtils() self.bbox_utils = BboxUtil() # Cache anchors and reuse if image shape is the same self._anchor_cache = {} # self.anchors = None pass def get_anchors(self, image_shape): """ :return: Returns anchor pyramid for the given image size """ if tuple(image_shape) not in self._anchor_cache: # Generate Anchors anchor = self.generate_pyramid_anchors(image_shape) # Keep a copy of the latest anchors in pixel coordinates because # it's used in inspect_model notebooks. # TODO: Remove this after the notebook are refactored to not use it # self.anchors = anchor self._anchor_cache[tuple( image_shape)] = self.bbox_utils.norm_boxes( anchor, image_shape[:2]) pass return self._anchor_cache[tuple(image_shape)] pass def generate_pyramid_anchors(self, image_shape): """ Generate anchors at different levels of a feature pyramid. Each scale is associated with a level of the pyramid, but each ratio is used in all levels of the pyramid. :param image_shape: [h, w, c] :return: anchors: [N, (y1, x1, y2, x2)] All generated anchors in one array. Sorted with the same order of the given scales. So, anchors of scale[0] come first, then anchors of scale[1], and so on. """ backbone_strides = cfg.COMMON.BACKBONE_STRIDES # [N, (height, width)]. Where N is the number of stages backbone_shape = self.misc_utils.compute_backbone_shapes( image_shape, backbone_strides) # Anchors # [anchor_count, (y1, x1, y2, x2)] anchors = [] scales = cfg.COMMON.RPN_ANCHOR_SCALES scales_len = len(scales) for i in range(scales_len): anchor_box = self.generate_anchors(scales[i], backbone_shape[i], backbone_strides[i]) anchors.append(anchor_box) pass return np.concatenate(anchors, axis=0) pass # generate anchor box def generate_anchors(self, scales, backbone_shape, backbone_strides): """ :param scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128] :param backbone_shape: [height, width] spatial shape of the feature map over which to generate anchors. :param backbone_strides: Stride of the feature map relative to the image in pixels. :return: anchor box: Convert to corner coordinates (y1, x1, y2, x2) """ # 1D array of anchor ratios of width/height. Example: [0.5, 1, 2] ratios = cfg.COMMON.RPN_ANCHOR_RATIOS # Stride of anchors on the feature map. For example, # if the value is 2 then generate anchors for every other feature map pixel. anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE # Get all combinations of scales and ratios scales, ratios = np.meshgrid(np.array(scales), np.array(ratios)) scales = scales.flatten() ratios = ratios.flatten() # Enumerate heights and widths from scales and ratios heights = scales / np.sqrt(ratios) widths = scales * np.sqrt(ratios) # Enumerate shifts in feature space shifts_y = np.arange(0, backbone_shape[0], anchor_stride) * backbone_strides shifts_x = np.arange(0, backbone_shape[1], anchor_stride) * backbone_strides shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y) # Enumerate combinations of shifts, widths, and heights box_widths, box_centers_x = np.meshgrid(widths, shifts_x) box_heights, box_centers_y = np.meshgrid(heights, shifts_y) # Reshape to get a list of (y, x) and a list of (h, w) box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2]) box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2]) # Convert to corner coordinates (y1, x1, y2, x2) boxes = np.concatenate( [box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1) return boxes pass
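# Worked example of the anchor math above (illustrative values, not the configured
# scales): for scale 64 and ratios [0.5, 1, 2], generate_anchors builds boxes with
# height = scale / sqrt(ratio) and width = scale * sqrt(ratio), so all three share
# the same area but differ in aspect ratio.
def anchor_shapes_example():
    scale = 64.0
    ratios = np.array([0.5, 1.0, 2.0])
    heights = scale / np.sqrt(ratios)   # ~[90.5, 64.0, 45.3]
    widths = scale * np.sqrt(ratios)    # ~[45.3, 64.0, 90.5]
    return np.stack([heights, widths], axis=1)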
class MaskTrain(object):
    def __init__(self):
        self.anchor_utils = AnchorUtils()
        self.bbox_utils = BboxUtil()
        self.image_utils = ImageUtils()
        # Log file path
        self.log_path = self.log_file_path(cfg.TRAIN.LOGS_PATH, cfg.TRAIN.DATA_SOURCE)
        # Model save path
        self.model_save_path = cfg.TRAIN.SAVE_MODEL_PATH
        # Training data
        self.train_data = CocoDataset(cfg.TRAIN.COCO_TRAIN_ANN_PATH, cfg.TRAIN.COCO_TRAIN_IMAGE_PATH)
        # Validation data
        self.val_data = CocoDataset(cfg.TRAIN.COCO_VAL_ANN_PATH, cfg.TRAIN.COCO_VAL_IMAGE_PATH)
        # Load the Mask R-CNN network model
        self.mask_model = MaskRCNN(train_flag=True)

        # To use the original author's 1 + 80 class weights:
        # self.mask_model.load_weights(cfg.TEST.COCO_MODEL_PATH, by_name=True)

        # Load weights pre-trained on MS COCO, skipping layers whose class counts differ
        self.mask_model.load_weights(cfg.TRAIN.MODEL_PATH, by_name=True,
                                     exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
                                              "mrcnn_bbox", "mrcnn_mask"])
        self.epoch = 0
        pass

    # Build the log directory path
    def log_file_path(self, log_dir, data_source="coco"):
        log_start_time = datetime.now()
        log_file_name = "{}_{:%Y%m%dT%H%M}".format(data_source.lower(), log_start_time)
        log_path = os.path.join(log_dir, log_file_name)
        return log_path
        pass

    def do_mask_train(self):
        # image augmentation
        augmentation = imgaug.augmenters.Fliplr(0.5)

        print("training - stage 1")
        # training - stage 1
        self.train_details(self.train_data, self.val_data,
                           learning_rate=cfg.TRAIN.ROUGH_LEARNING_RATE,
                           epochs=cfg.TRAIN.FIRST_STAGE_N_EPOCH,
                           layers=cfg.TRAIN.HEADS_LAYERS,
                           augmentation=augmentation)

        print("training - stage 2")
        # training - stage 2
        self.train_details(self.train_data, self.val_data,
                           learning_rate=cfg.TRAIN.ROUGH_LEARNING_RATE,
                           epochs=cfg.TRAIN.MIDDLE_STAGE_N_EPOCH,
                           layers=cfg.TRAIN.FOUR_MORE_LAYERS,
                           augmentation=augmentation)

        print("training - stage 3")
        # training - stage 3
        self.train_details(self.train_data, self.val_data,
                           learning_rate=cfg.TRAIN.FINE_LEARNING_RATE,
                           epochs=cfg.TRAIN.LAST_STAGE_N_EPOCH,
                           layers=cfg.TRAIN.ALL_LAYERS,
                           augmentation=augmentation)
        pass

    def train_details(self, train_data, val_data, learning_rate, epochs, layers,
                      augmentation=None, custom_callbacks=None, no_augmentation_sources=None):
        """
        Train the model.
        :param train_data: Training data object
        :param val_data: Validation data object
        :param learning_rate: The learning rate to train with
        :param epochs: Number of training epochs. Note that previous training epochs
            are considered to be done already, so this actually determines the epochs
            to train in total rather than in this particular call.
        :param layers: Allows selecting which layers to train. It can be:
            - A regular expression to match layer names to train
            - One of these predefined values:
              heads: The RPN, classifier and mask heads of the network
              all: All the layers
              3+: Train Resnet stage 3 and up
              4+: Train Resnet stage 4 and up
              5+: Train Resnet stage 5 and up
        :param augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation.
            For example, passing imgaug.augmenters.Fliplr(0.5) flips images right/left
            50% of the time. You can pass complex augmentations as well; such an
            augmentation could apply 50% of the time, and when it does it flips images
            right/left half the time and adds a Gaussian blur with a random sigma in
            range 0 to 5.
        :param custom_callbacks: Optional. Add custom callbacks to be called with the
            keras fit_generator method. Must be a list of type keras.callbacks.
        :param no_augmentation_sources: Optional. List of sources to exclude for
            augmentation. A source is a string that identifies a dataset and is
            defined in the Dataset class.
:return: """ # Pre-defined layer regular expressions layer_regex = cfg.TRAIN.LAYER_REGEX if layers in layer_regex: layers = layer_regex[layers] pass self.set_trainable(layers) if not os.path.exists(self.log_path): os.makedirs(self.log_path) pass # Callbacks callbacks = [ keras.callbacks.TensorBoard(log_dir=self.log_path, histogram_freq=0, write_graph=True, write_images=False), keras.callbacks.ModelCheckpoint(self.model_save_path, verbose=0, save_weights_only=True) ] # Add custom callbacks to the list if custom_callbacks: callbacks += custom_callbacks pass # Data generators train_generator = self.data_generator( train_data, augmentation=augmentation, batch_size=self.mask_model.batch_size, no_augmentation_sources=no_augmentation_sources) val_generator = self.data_generator( val_data, batch_size=self.mask_model.batch_size) self.compile(learning_rate, cfg.TRAIN.LEARNING_MOMENTUM) print("learning_rate: {}, checkpoint path: {}".format( learning_rate, self.model_save_path)) # Work-around for Windows: Keras fails on Windows when using # multiprocessing workers. See discussion here: # https://github.com/matterport/Mask_RCNN/issues/13#issuecomment-353124009 if os.name is 'nt': workers = 0 pass else: workers = multiprocessing.cpu_count() pass self.mask_model.keras_model.fit_generator( generator=train_generator, initial_epoch=self.epoch, epochs=epochs, steps_per_epoch=cfg.TRAIN.STEPS_PER_EPOCH, callbacks=callbacks, validation_data=val_generator, validation_steps=cfg.TRAIN.VALIDATION_STEPS, max_queue_size=100, workers=workers, use_multiprocessing=True, ) self.epoch = max(self.epoch, epochs) pass def data_generator(self, data, augmentation=None, batch_size=1, random_rois=0, detection_targets=False, no_augmentation_sources=None): """ A generator that returns images and corresponding target class ids, bounding box deltas, and masks. :param data: The Dataset object to pick data from :param augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation. For example, passing imgaug.augmenters.Fliplr(0.5) flips images right/left 50% of the time. :param batch_size: How many images to return in each call :param random_rois: If > 0 then generate proposals to be used to train the network classifier and mask heads. Useful if training the Mask RCNN part without the RPN. :param detection_targets: If True, generate detection targets (class IDs, bbox deltas, and masks). Typically for debugging or visualizations because in trainig detection targets are generated by DetectionTargetLayer. :param no_augmentation_sources: Optional. List of sources to exclude for augmentation. A source is string that identifies a dataset and is defined in the Dataset class. :return: Returns a Python generator. Upon calling next() on it, the generator returns two lists, inputs and outputs. The contents of the lists differs depending on the received arguments: inputs list: - images: [batch, H, W, C] - image_meta: [batch, (meta data)] Image details. See compose_image_meta() - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral) - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width are those of the image unless use_mini_mask is True, in which case they are defined in MINI_MASK_SHAPE. outputs list: Usually empty in regular training. 
But if detection_targets is True then the outputs list contains target class_ids, bbox deltas, and masks. """ # batch item index batch_index = 0 image_index = -1 image_ids = np.copy(data.image_ids_list) error_count = 0 no_augmentation_sources = no_augmentation_sources or [] # Anchors # [anchor_count, (y1, x1, y2, x2)] # Generate Anchors anchors = self.anchor_utils.generate_pyramid_anchors( image_shape=cfg.COMMON.IMAGE_SHAPE) image_id = "" mini_mask = cfg.TRAIN.USE_MINI_MASK max_gt_instances = cfg.TRAIN.MAX_GT_INSTANCES mean_pixel = np.array(cfg.COMMON.MEAN_PIXEL) # Keras requires a generator to run indefinitely. while True: try: # Increment index to pick next image. Shuffle if at the start of an epoch. image_index = (image_index + 1) % len(image_ids) if image_index == 0: np.random.shuffle(image_ids) # Get GT bounding boxes and masks for image. image_id = image_ids[image_index] # If the image source is not to be augmented pass None as augmentation if data.image_info_list[image_id][ 'source'] in no_augmentation_sources: image, image_meta, gt_class_ids, gt_boxes, gt_masks = self.bbox_utils.load_image_gt( data, image_id, None, mini_mask) else: image, image_meta, gt_class_ids, gt_boxes, gt_masks = self.bbox_utils.load_image_gt( data, image_id, augmentation, mini_mask) # Skip images that have no instances. This can happen in cases # where we train on a subset of classes and the image doesn't # have any of the classes we care about. if not np.any(gt_class_ids > 0): continue pass # RPN Targets rpn_match, rpn_bbox = common.build_rpn_targets( anchors, gt_class_ids, gt_boxes) # 在这里定义 变量,避免下面使用的时候出现未定义 rpn_rois = None rois = None mrcnn_class_ids = None mrcnn_bbox = None mrcnn_mask = None # Mask R-CNN Targets if random_rois: rpn_rois = self.mask_model.generate_random_rois( image.shape, random_rois, gt_boxes) if detection_targets: rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask = \ self.mask_model.build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks) pass pass # Init batch arrays if batch_index == 0: batch_image_meta = np.zeros( (batch_size, ) + image_meta.shape, dtype=image_meta.dtype) batch_rpn_match = np.zeros( [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype) batch_rpn_bbox = np.zeros( [batch_size, cfg.TRAIN.ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype) batch_images = np.zeros((batch_size, ) + image.shape, dtype=np.float32) batch_gt_class_ids = np.zeros( (batch_size, max_gt_instances), dtype=np.int32) batch_gt_boxes = np.zeros( (batch_size, max_gt_instances, 4), dtype=np.int32) batch_gt_masks = np.zeros( (batch_size, gt_masks.shape[0], gt_masks.shape[1], max_gt_instances), dtype=gt_masks.dtype) if random_rois: batch_rpn_rois = np.zeros( (batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype) if detection_targets: batch_rois = np.zeros((batch_size, ) + rois.shape, dtype=rois.dtype) batch_mrcnn_class_ids = np.zeros( (batch_size, ) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype) batch_mrcnn_bbox = np.zeros( (batch_size, ) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype) batch_mrcnn_mask = np.zeros( (batch_size, ) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype) pass pass pass # If more instances than fits in the array, sub-sample from them. 
if gt_boxes.shape[0] > max_gt_instances: ids = np.random.choice(np.arange(gt_boxes.shape[0]), max_gt_instances, replace=False) gt_class_ids = gt_class_ids[ids] gt_boxes = gt_boxes[ids] gt_masks = gt_masks[:, :, ids] # Add to batch batch_image_meta[batch_index] = image_meta batch_rpn_match[batch_index] = rpn_match[:, np.newaxis] batch_rpn_bbox[batch_index] = rpn_bbox batch_images[batch_index] = self.image_utils.mold_image( image.astype(np.float32), mean_pixel) batch_gt_class_ids[ batch_index, :gt_class_ids.shape[0]] = gt_class_ids batch_gt_boxes[batch_index, :gt_boxes.shape[0]] = gt_boxes batch_gt_masks[ batch_index, :, :, :gt_masks.shape[-1]] = gt_masks if random_rois: batch_rpn_rois[batch_index] = rpn_rois if detection_targets: batch_rois[batch_index] = rois batch_mrcnn_class_ids[batch_index] = mrcnn_class_ids batch_mrcnn_bbox[batch_index] = mrcnn_bbox batch_mrcnn_mask[batch_index] = mrcnn_mask pass pass batch_index += 1 # Batch full? if batch_index >= batch_size: inputs = [ batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox, batch_gt_class_ids, batch_gt_boxes, batch_gt_masks ] outputs = [] if random_rois: inputs.extend([batch_rpn_rois]) if detection_targets: inputs.extend([batch_rois]) # Keras requires that output and targets have the same number of dimensions batch_mrcnn_class_ids = np.expand_dims( batch_mrcnn_class_ids, -1) outputs.extend([ batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask ]) yield inputs, outputs # start a new batch batch_index = 0 pass except (GeneratorExit, KeyboardInterrupt): raise except: # Log it and skip the image logging.exception("Error processing image {}".format( data.image_info_list[image_id])) error_count += 1 if error_count > 5: raise pass pass def set_trainable(self, layer_regex, mask_model=None, indent=0, verbose=1): """ Sets model layers as trainable if their names match the given regular expression. :param layer_regex: :param mask_model: :param indent: :param verbose: :return: """ # Print message on the first call (but not on recursive calls) if verbose > 0 and mask_model is None: print("Selecting layers to train") pass mask_model = mask_model or self.mask_model.keras_model # In multi-GPU training, we wrap the model. Get layers # of the inner model because they have the weights. layers = mask_model.inner_model.layers if hasattr( mask_model, "inner_model") else mask_model.layers for layer in layers: # Is the layer a model? if layer.__class__.__name__ == 'Model': print("In model: ", layer.name) self.set_trainable(layer_regex, mask_model=layer, indent=indent + 4) continue if not layer.weights: continue # Is it trainable? trainable = bool(re.fullmatch(layer_regex, layer.name)) # Update layer. If layer is a container, update inner layer. if layer.__class__.__name__ == 'TimeDistributed': layer.layer.trainable = trainable else: layer.trainable = trainable # Print trainable layer names if trainable and verbose > 0: print("{}{:20} ({})".format(" " * indent, layer.name, layer.__class__.__name__)) pass def compile(self, learning_rate, momentum_param): """ Gets the model ready for training. Adds losses, regularization, and metrics. Then calls the Keras compile() function. 
:param learning_rate: :param momentum_param: :return: """ # Optimizer object optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=momentum_param, clipnorm=cfg.TRAIN.GRADIENT_CLIP_NORM) self.mask_model.keras_model._losses = [] self.mask_model.keras_model._per_input_losses = {} loss_names = [ "rpn_class_loss", "rpn_bbox_loss", "mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss" ] for name in loss_names: layer = self.mask_model.keras_model.get_layer(name) if layer.output in self.mask_model.keras_model.losses: continue loss = (tf.reduce_mean(layer.output, keepdims=True) * cfg.COMMON.LOSS_WEIGHTS.get(name, 1.)) self.mask_model.keras_model.add_loss(loss) pass # Add L2 Regularization # Skip gamma and beta weights of batch normalization layers. reg_losses = [ keras.regularizers.l2(cfg.TRAIN.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32) for w in self.mask_model.keras_model.trainable_weights if 'gamma' not in w.name and 'beta' not in w.name ] self.mask_model.keras_model.add_loss(tf.add_n(reg_losses)) # Compile self.mask_model.keras_model.compile( optimizer=optimizer, loss=[None] * len(self.mask_model.keras_model.outputs)) # Add metrics for losses for name in loss_names: if name in self.mask_model.keras_model.metrics_names: continue pass layer = self.mask_model.keras_model.get_layer(name) self.mask_model.keras_model.metrics_names.append(name) loss = (tf.reduce_mean(layer.output, keepdims=True) * cfg.COMMON.LOSS_WEIGHTS.get(name, 1.)) self.mask_model.keras_model.metrics_tensors.append(loss) pass
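# Usage sketch: the trainer reads every path, batch size and stage length from
# cfg.TRAIN / cfg.COMMON, so running the full three-stage schedule is just two calls.
# (The function name here is only an illustration.)
def run_mask_training_example():
    trainer = MaskTrain()
    trainer.do_mask_train()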
class MiscUtils(object): def __init__(self): self.bbox_util = BboxUtil() pass def compute_backbone_shapes(self, image_shape, backbone_strides): """ Computes the width and height of each stage of the backbone network :param image_shape: [h, w, c] :param backbone_strides: The strides of each layer of the FPN Pyramid. These values are based on a resNet101 backbone. :return: [N, (height, width)]. Where N is the number of stages """ return np.array([[ int(math.ceil(image_shape[0] / stride)), int(math.ceil(image_shape[1] / stride)) ] for stride in backbone_strides]) pass def batch_slice(self, inputs, graph_fn, batch_size, names=None): """ Splits inputs into slices and feeds each slice to a copy of the given computation graph and then combines the results. It allows you to run a graph on a batch of inputs even if the graph is written to support one instance only. :param inputs: list of tensors. All must have the same first dimension length :param graph_fn: A function that returns a TF tensor that's part of a graph. :param batch_size: number of slices to divide the data into. :param names: If provided, assigns names to the resulting tensors. :return: """ if not isinstance(inputs, list): inputs = [inputs] outputs = [] for i in range(batch_size): inputs_slice = [x[i] for x in inputs] output_slice = graph_fn(*inputs_slice) if not isinstance(output_slice, (tuple, list)): output_slice = [output_slice] outputs.append(output_slice) # Change outputs from a list of slices where each is # a list of outputs to a list of outputs and each has # a list of slices outputs = list(zip(*outputs)) if names is None: names = [None] * len(outputs) result = [tf.stack(o, axis=0, name=n) for o, n in zip(outputs, names)] if len(result) == 1: result = result[0] return result pass def trim_zeros_graph(self, boxes, name='trim_zeros'): """ Often boxes are represented with matrices of shape [N, 4] and are padded with zeros. This removes zero boxes. :param boxes: [N, 4] matrix of boxes. :param name: :return: non_zeros: [N] a 1D boolean mask identifying the rows to keep """ non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool) boxes = tf.boolean_mask(boxes, non_zeros, name=name) return boxes, non_zeros pass def detection_targets_graph(self, proposals, gt_class_ids, gt_boxes, gt_masks): """ Generates detection targets for one image. Subsamples proposals and generates target class IDs, bounding box deltas, and masks for each. :param proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates. Might be zero padded if there are not enough proposals. :param gt_class_ids: [MAX_GT_INSTANCES] int class IDs :param gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates. :param gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type. :return: Target ROIs and corresponding class IDs, bounding box shifts, and masks. rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded. deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))] masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox boundaries and resized to neural network output size. Note: Returned arrays might be zero padded if not enough target ROIs. 
""" # Assertions asserts = [ tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals], name="roi_assertion"), ] with tf.control_dependencies(asserts): proposals = tf.identity(proposals) pass # Remove zero padding proposals, _ = self.trim_zeros_graph(proposals, name="trim_proposals") gt_boxes, non_zeros = self.trim_zeros_graph(gt_boxes, name="trim_gt_boxes") gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros, name="trim_gt_class_ids") gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2, name="trim_gt_masks") # Handle COCO crowds # A crowd box in COCO is a bounding box around several instances. Exclude # them from training. A crowd box is given a negative class ID. crowd_ix = tf.where(gt_class_ids < 0)[:, 0] non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0] crowd_boxes = tf.gather(gt_boxes, crowd_ix) gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix) gt_boxes = tf.gather(gt_boxes, non_crowd_ix) gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2) # Compute overlaps matrix [proposals, gt_boxes] overlaps = self.bbox_util.overlaps_graph(proposals, gt_boxes) # Compute overlaps with crowd boxes [proposals, crowd_boxes] crowd_overlaps = self.bbox_util.overlaps_graph(proposals, crowd_boxes) crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1) no_crowd_bool = (crowd_iou_max < 0.001) # Determine positive and negative ROIs roi_iou_max = tf.reduce_max(overlaps, axis=1) # 1. Positive ROIs are those with >= 0.5 IoU with a GT box positive_roi_bool = (roi_iou_max >= 0.5) positive_indices = tf.where(positive_roi_bool)[:, 0] # 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds. negative_indices = tf.where( tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0] # Subsample ROIs. Aim for 33% positive # Positive ROIs positive_count = int(cfg.TRAIN.ROIS_PER_IMAGE * cfg.TRAIN.ROI_POSITIVE_RATIO) positive_indices = tf.random_shuffle(positive_indices)[:positive_count] positive_count = tf.shape(positive_indices)[0] # Negative ROIs. Add enough to maintain positive:negative ratio. r = 1.0 / cfg.TRAIN.ROI_POSITIVE_RATIO negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count negative_indices = tf.random_shuffle(negative_indices)[:negative_count] # Gather selected ROIs positive_rois = tf.gather(proposals, positive_indices) negative_rois = tf.gather(proposals, negative_indices) # Assign positive ROIs to GT boxes. positive_overlaps = tf.gather(overlaps, positive_indices) roi_gt_box_assignment = tf.cond( tf.greater(tf.shape(positive_overlaps)[1], 0), true_fn=lambda: tf.argmax(positive_overlaps, axis=1), false_fn=lambda: tf.cast(tf.constant([]), tf.int64)) roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment) roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment) # Compute bbox refinement for positive ROIs deltas = self.bbox_util.box_refinement_graph(positive_rois, roi_gt_boxes) deltas /= np.array(cfg.COMMON.BBOX_STD_DEV) # Assign positive ROIs to GT masks # Permute masks to [N, height, width, 1] transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1) # Pick the right mask for each ROI roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment) # Compute mask targets boxes = positive_rois if cfg.TRAIN.USE_MINI_MASK: # Transform ROI coordinates from normalized image space # to normalized mini-mask space. 
            y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
            gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
            gt_h = gt_y2 - gt_y1
            gt_w = gt_x2 - gt_x1
            y1 = (y1 - gt_y1) / gt_h
            x1 = (x1 - gt_x1) / gt_w
            y2 = (y2 - gt_y1) / gt_h
            x2 = (x2 - gt_x1) / gt_w
            boxes = tf.concat([y1, x1, y2, x2], 1)

        box_ids = tf.range(0, tf.shape(roi_masks)[0])
        masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes,
                                         box_ids, cfg.TRAIN.MASK_SHAPE)
        # Remove the extra dimension from masks.
        masks = tf.squeeze(masks, axis=3)

        # Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
        # binary cross entropy loss.
        masks = tf.round(masks)

        # Append negative ROIs and pad bbox deltas and masks that
        # are not used for negative ROIs with zeros.
        rois = tf.concat([positive_rois, negative_rois], axis=0)
        N = tf.shape(negative_rois)[0]
        P = tf.maximum(cfg.TRAIN.ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
        rois = tf.pad(rois, [(0, P), (0, 0)])
        # roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
        roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
        deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
        masks = tf.pad(masks, [(0, N + P), (0, 0), (0, 0)])

        return rois, roi_gt_class_ids, deltas, masks
        pass
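# Small usage sketch for batch_slice (illustrative): the per-item graph function only
# needs to handle a single image, and batch_slice maps it over the batch dimension and
# re-stacks the results -- the same pattern ProposalLayer uses with tf.gather below.
def batch_slice_example(scores, ix, batch_size):
    """scores: [batch, N], ix: [batch, k] indices; returns the gathered [batch, k] scores."""
    misc_utils = MiscUtils()
    return misc_utils.batch_slice([scores, ix],
                                  lambda s, i: tf.gather(s, i),
                                  batch_size)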
class ProposalLayer(KE.Layer): """ Receives anchor scores and selects a subset to pass as proposals to the second stage. Filtering is done based on anchor scores and non-max suppression to remove overlaps. It also applies bounding box refinement deltas to anchors. Inputs: rpn_probs: [batch, num_anchors, (bg prob, fg prob)] rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))] anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates Returns: Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)] """ def __init__(self, proposal_count, nms_threshold, batch_size, **kwargs): super(ProposalLayer, self).__init__(**kwargs) self.proposal_count = proposal_count self.nms_threshold = nms_threshold self.batch_size = batch_size self.misc_utils = MiscUtils() self.bbox_utils = BboxUtil() pass def call(self, inputs): """ 这里的 call 方法,会被 __init__() 方法回调 :param inputs: :return: """ # Box Scores. Use the foreground class confidence. [Batch, num_rois, 1] scores = inputs[0][:, :, 1] # Box deltas [batch, num_rois, 4] deltas = inputs[1] rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV) deltas = deltas * np.reshape(rpn_bbox_std_dev, [1, 1, 4]) # Anchors anchors = inputs[2] # Improve performance by trimming to top anchors by score # and doing the rest on the smaller subset. pre_nms_limit = tf.minimum(cfg.COMMON.PRE_NMS_LIMIT, tf.shape(anchors)[1]) ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True, name="top_anchors").indices scores = self.misc_utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y), self.batch_size) deltas = self.misc_utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y), self.batch_size) pre_nms_anchors = self.misc_utils.batch_slice( [anchors, ix], lambda a, x: tf.gather(a, x), self.batch_size, names=["pre_nms_anchors"]) # Apply deltas to anchors to get refined anchors. # [batch, N, (y1, x1, y2, x2)] boxes = self.misc_utils.batch_slice( [pre_nms_anchors, deltas], lambda x, y: self.bbox_utils.apply_box_deltas_graph(x, y), self.batch_size, names=["refined_anchors"]) # Clip to image boundaries. Since we're in normalized coordinates, # clip to 0..1 range. [batch, N, (y1, x1, y2, x2)] window = np.array([0, 0, 1, 1], dtype=np.float32) boxes = self.misc_utils.batch_slice( boxes, lambda x: self.bbox_utils.clip_boxes_graph(x, window), self.batch_size, names=["refined_anchors_clipped"]) # Filter out small boxes # According to Xinlei Chen's paper, this reduces detection accuracy # for small objects, so we're skipping it. # Non-max suppression def nms(boxes, scores): indices = tf.image.non_max_suppression( boxes, scores, self.proposal_count, self.nms_threshold, name="rpn_non_max_suppression") proposals = tf.gather(boxes, indices) # Pad if needed padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0) proposals = tf.pad(proposals, [(0, padding), (0, 0)]) return proposals proposals = self.misc_utils.batch_slice([boxes, scores], nms, self.batch_size) return proposals def compute_output_shape(self, input_shape): return (None, self.proposal_count, 4)
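# Hedged sketch of the refinement step used above: BboxUtil.apply_box_deltas_graph is
# defined elsewhere in the repo, but the standard Mask R-CNN refinement it corresponds
# to looks roughly like this in NumPy (deltas are (dy, dx, log(dh), log(dw))).
def apply_box_deltas_example(boxes, deltas):
    """boxes: [N, (y1, x1, y2, x2)], deltas: [N, (dy, dx, log(dh), log(dw))]."""
    heights = boxes[:, 2] - boxes[:, 0]
    widths = boxes[:, 3] - boxes[:, 1]
    # Shift the box centre by a fraction of the box size, then rescale height/width
    center_y = boxes[:, 0] + 0.5 * heights + deltas[:, 0] * heights
    center_x = boxes[:, 1] + 0.5 * widths + deltas[:, 1] * widths
    heights = heights * np.exp(deltas[:, 2])
    widths = widths * np.exp(deltas[:, 3])
    # Back to corner coordinates
    y1 = center_y - 0.5 * heights
    x1 = center_x - 0.5 * widths
    return np.stack([y1, x1, y1 + heights, x1 + widths], axis=1)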
def refine_detections_graph(rois, probs, deltas, window): """ Refine classified proposals and filter overlaps and return final detections. :param rois: [N, (y1, x1, y2, x2)] in normalized coordinates :param probs: [N, num_classes]. Class probabilities. :param deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific bounding box deltas. :param window: (y1, x1, y2, x2) in normalized coordinates. The part of the image that contains the image excluding the padding. :return: Returns detections shaped: [num_detections, (y1, x1, y2, x2, class_id, score)] where coordinates are normalized. """ # Class IDs per ROI class_ids = tf.argmax(probs, axis=1, output_type=tf.int32) # Class probability of the top class of each ROI indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1) class_scores = tf.gather_nd(probs, indices) # Class-specific bounding box deltas deltas_specific = tf.gather_nd(deltas, indices) # Apply bounding box deltas # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates bbox_utils = BboxUtil() refined_rois = bbox_utils.apply_box_deltas_graph( rois, deltas_specific * cfg.COMMON.BBOX_STD_DEV) # Clip boxes to image window refined_rois = bbox_utils.clip_boxes_graph(refined_rois, window) # TODO: Filter out boxes with zero area # Filter out background boxes keep = tf.where(class_ids > 0)[:, 0] # Filter out low confidence boxes defection_min_confidence = cfg.COMMON.DETECTION_MIN_CONFIDENCE if defection_min_confidence: conf_keep = tf.where(class_scores >= defection_min_confidence)[:, 0] keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), tf.expand_dims(conf_keep, 0)) keep = tf.sparse_tensor_to_dense(keep)[0] # Apply per-class NMS # 1. Prepare variables pre_nms_class_ids = tf.gather(class_ids, keep) pre_nms_scores = tf.gather(class_scores, keep) pre_nms_rois = tf.gather(refined_rois, keep) unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0] def nms_keep_map(class_id): """Apply Non-Maximum Suppression on ROIs of the given class.""" defection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES # Indices of ROIs of the given class ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0] # Apply NMS class_keep = tf.image.non_max_suppression( tf.gather(pre_nms_rois, ixs), tf.gather(pre_nms_scores, ixs), max_output_size=defection_max_instances, iou_threshold=cfg.TEST.DETECTION_NMS_THRESHOLD) # Map indices class_keep = tf.gather(keep, tf.gather(ixs, class_keep)) # Pad with -1 so returned tensors have the same shape gap = defection_max_instances - tf.shape(class_keep)[0] class_keep = tf.pad(class_keep, [(0, gap)], mode='CONSTANT', constant_values=-1) # Set shape so map_fn() can infer result shape class_keep.set_shape([defection_max_instances]) return class_keep # 2. Map over class IDs nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids, dtype=tf.int64) # 3. Merge results into one list, and remove -1 padding nms_keep = tf.reshape(nms_keep, [-1]) nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0]) # 4. Compute intersection between keep and nms_keep keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), tf.expand_dims(nms_keep, 0)) keep = tf.sparse_tensor_to_dense(keep)[0] # Keep top detections roi_count = cfg.TEST.DETECTION_MAX_INSTANCES class_scores_keep = tf.gather(class_scores, keep) num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count) top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1] keep = tf.gather(keep, top_ids) # Arrange output as [N, (y1, x1, y2, x2, class_id, score)] # Coordinates are normalized. 
    detections = tf.concat([
        tf.gather(refined_rois, keep),
        tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
        tf.gather(class_scores, keep)[..., tf.newaxis]
    ], axis=1)

    # Pad with zeros if detections < DETECTION_MAX_INSTANCES
    gap = cfg.TEST.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
    detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")

    return detections
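# Illustrative note on the per-class NMS above: nms_keep_map pads each class's kept
# indices with -1 so that tf.map_fn returns a fixed-shape tensor; after flattening,
# the -1 entries are dropped again. A tiny NumPy analogue of that merge step:
def merge_nms_results_example():
    per_class = np.array([[3, 7, -1, -1],    # class A kept ROIs 3 and 7
                          [1, -1, -1, -1]])  # class B kept ROI 1
    flat = per_class.reshape(-1)
    return flat[flat > -1]                   # -> array([3, 7, 1])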
def build_rpn_targets(anchors, gt_class_ids, gt_boxes): """ Given the anchors and GT boxes, compute overlaps and identify positive anchors and deltas to refine them to match their corresponding GT boxes. :param anchors: [num_anchors, (y1, x1, y2, x2)] :param gt_class_ids: [num_gt_boxes] Integer class IDs. :param gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)] :return: rpn_match: [N] (int32) matches between anchors and GT boxes. 1 = positive anchor, -1 = negative anchor, 0 = neutral rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. """ # RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32) anchor_per_image = cfg.TRAIN.ANCHORS_PER_IMAGE # RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))] rpn_bbox = np.zeros((anchor_per_image, 4)) bbox_util = BboxUtil() # Handle COCO crowds # A crowd box in COCO is a bounding box around several instances. Exclude # them from training. A crowd box is given a negative class ID. crowd_ix = np.where(gt_class_ids < 0)[0] if crowd_ix.shape[0] > 0: # Filter out crowds from ground truth class IDs and boxes non_crowd_ix = np.where(gt_class_ids > 0)[0] crowd_boxes = gt_boxes[crowd_ix] gt_class_ids = gt_class_ids[non_crowd_ix] gt_boxes = gt_boxes[non_crowd_ix] # Compute overlaps with crowd boxes [anchors, crowds] crowd_overlaps = bbox_util.compute_overlaps(anchors, crowd_boxes) crowd_iou_max = np.amax(crowd_overlaps, axis=1) no_crowd_bool = (crowd_iou_max < 0.001) pass else: # All anchors don't intersect a crowd no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool) pass # Compute overlaps [num_anchors, num_gt_boxes] overlaps = bbox_util.compute_overlaps(anchors, gt_boxes) # Match anchors to GT Boxes # If an anchor overlaps a GT box with IoU >= 0.7 then it's positive. # If an anchor overlaps a GT box with IoU < 0.3 then it's negative. # Neutral anchors are those that don't match the conditions above, # and they don't influence the loss function. # However, don't keep any GT box unmatched (rare, but happens). Instead, # match it to the closest anchor (even if its max IoU is < 0.3). # # 1. Set negative anchors first. They get overwritten below if a GT box is # matched to them. Skip boxes in crowd areas. anchor_iou_argmax = np.argmax(overlaps, axis=1) anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax] rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1 # 2. Set an anchor for each GT box (regardless of IoU value). # If multiple anchors have the same IoU match all of them gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0] rpn_match[gt_iou_argmax] = 1 # 3. Set anchors with high overlap as positive. rpn_match[anchor_iou_max >= 0.7] = 1 # Subsample to balance positive and negative anchors # Don't let positives be more than half the anchors ids = np.where(rpn_match == 1)[0] extra = len(ids) - (anchor_per_image // 2) if extra > 0: # Reset the extra ones to neutral ids = np.random.choice(ids, extra, replace=False) rpn_match[ids] = 0 # Same for negative proposals ids = np.where(rpn_match == -1)[0] extra = len(ids) - (anchor_per_image - np.sum(rpn_match == 1)) if extra > 0: # Rest the extra ones to neutral ids = np.random.choice(ids, extra, replace=False) rpn_match[ids] = 0 pass # For positive anchors, compute shift and scale needed to transform them # to match the corresponding GT boxes. 
    ids = np.where(rpn_match == 1)[0]
    ix = 0  # index into rpn_bbox
    # TODO: use box_refinement() rather than duplicating the code here
    for i, a in zip(ids, anchors[ids]):
        # Closest gt box (it might have IoU < 0.7)
        gt = gt_boxes[anchor_iou_argmax[i]]

        # Convert coordinates to center plus width/height.
        # GT Box
        gt_h = gt[2] - gt[0]
        gt_w = gt[3] - gt[1]
        gt_center_y = gt[0] + 0.5 * gt_h
        gt_center_x = gt[1] + 0.5 * gt_w
        # Anchor
        a_h = a[2] - a[0]
        a_w = a[3] - a[1]
        a_center_y = a[0] + 0.5 * a_h
        a_center_x = a[1] + 0.5 * a_w

        # Compute the bbox refinement that the RPN should predict.
        rpn_bbox[ix] = [
            (gt_center_y - a_center_y) / a_h,
            (gt_center_x - a_center_x) / a_w,
            np.log(gt_h / a_h),
            np.log(gt_w / a_w),
        ]
        # Normalize
        rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV)
        rpn_bbox[ix] /= rpn_bbox_std_dev
        ix += 1

    return rpn_match, rpn_bbox
    pass
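# Worked example of the RPN regression target computed above (illustrative numbers):
# an anchor (0, 0, 100, 100) matched to a GT box (10, 10, 90, 110) gets a centre shift
# in anchor units plus log size ratios, before division by RPN_BBOX_STD_DEV.
def rpn_delta_example():
    a = np.array([0.0, 0.0, 100.0, 100.0])    # anchor (y1, x1, y2, x2)
    gt = np.array([10.0, 10.0, 90.0, 110.0])  # GT box (y1, x1, y2, x2)
    a_h, a_w = a[2] - a[0], a[3] - a[1]
    gt_h, gt_w = gt[2] - gt[0], gt[3] - gt[1]
    a_cy, a_cx = a[0] + 0.5 * a_h, a[1] + 0.5 * a_w
    gt_cy, gt_cx = gt[0] + 0.5 * gt_h, gt[1] + 0.5 * gt_w
    return np.array([(gt_cy - a_cy) / a_h,    # dy = 0.0
                     (gt_cx - a_cx) / a_w,    # dx = 0.1
                     np.log(gt_h / a_h),      # log(dh) ~ -0.223
                     np.log(gt_w / a_w)])     # log(dw) = 0.0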
class MaskRCNN(object):
    def __init__(self, train_flag=True):
        """
        :param train_flag: True for training, False for testing/inference
        """
        self.train_flag = train_flag
        self.bbox_util = BboxUtil()
        self.anchor_utils = AnchorUtils()
        self.image_utils = ImageUtils()
        self.mask_util = MaskUtil()

        # Model path
        self.model_path = cfg.TRAIN.MODEL_PATH if self.train_flag else cfg.TEST.COCO_MODEL_PATH
        # batch size
        self.batch_size = cfg.TRAIN.BATCH_SIZE if self.train_flag else cfg.TEST.BATCH_SIZE
        # Model save path
        self.save_model_path = cfg.TRAIN.SAVE_MODEL_PATH

        self.backbone = cfg.COMMON.BACKBONE
        self.backbone_strides = cfg.COMMON.BACKBONE_STRIDES
        # Input image shape
        self.image_shape = np.array(cfg.COMMON.IMAGE_SHAPE)
        # Size of the top-down layers used to build the feature pyramid
        self.top_down_pyramid_size = cfg.COMMON.TOP_DOWN_PYRAMID_SIZE
        self.rpn_anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE
        self.rpn_anchor_ratios = cfg.COMMON.RPN_ANCHOR_RATIOS
        self.rpn_nms_threshold = cfg.COMMON.RPN_NMS_THRESHOLD
        self.class_num = cfg.COMMON.CLASS_NUM
        self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
        self.roi_positive_ratio = cfg.TRAIN.ROI_POSITIVE_RATIO

        self.keras_model = self.build()
        pass

    def build(self):
        # image shape
        h, w, c = self.image_shape[:]
        print("image_shape: {}".format(self.image_shape))
        if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
            raise Exception("Image size must be divisible by 2 at least 6 times "
                            "to avoid fractions when downscaling and upscaling. "
                            "For example, use 256, 320, 384, 448, 512, ... etc. ")

        # Inputs
        input_image = kl.Input(shape=[None, None, c], name="input_image")
        input_image_meta = kl.Input(shape=[cfg.COMMON.IMAGE_META_SIZE], name="input_image_meta")

        # Training
        if self.train_flag:
            # RPN GT
            input_rpn_match = kl.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
            input_rpn_bbox = kl.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)

            # Detection GT (class IDs, bounding boxes, and masks)
            # 1. GT Class IDs (zero padded)
            input_gt_class_ids = kl.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32)

            # 2. GT Boxes in pixels (zero padded)
            # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
            input_gt_boxes = kl.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)

            # Normalize coordinates
            gt_boxes = kl.Lambda(lambda x: self.bbox_util.norm_boxes_graph(
                x, k.shape(input_image)[1:3]))(input_gt_boxes)

            # 3. GT Masks (zero padded)
            # [batch, height, width, MAX_GT_INSTANCES]
            if cfg.TRAIN.USE_MINI_MASK:
                min_h, min_w = cfg.TRAIN.MINI_MASK_SHAPE[:]
                input_gt_masks = kl.Input(shape=[min_h, min_w, None], name="input_gt_masks", dtype=bool)
            else:
                input_gt_masks = kl.Input(shape=[h, w, None], name="input_gt_masks", dtype=bool)
                pass

            # anchor
            anchors = self.anchor_utils.get_anchors(self.image_shape)
            # Duplicate across the batch dimension because Keras requires it
            # TODO: can this be optimized to avoid duplicating the anchors?
            anchors = np.broadcast_to(anchors, (self.batch_size, ) + anchors.shape)
            # A hack to get around Keras's bad support for constants
            anchors = kl.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
            pass
        else:
            # Anchors in normalized coordinates
            anchors = kl.Input(shape=[None, 4], name="input_anchors")

            # The training-only inputs above are not needed at test time, but are
            # defined here in the else branch so they are never left undefined.
            input_rpn_match = None
            input_rpn_bbox = None
            input_gt_class_ids = None
            gt_boxes = None
            input_gt_boxes = None
            input_gt_masks = None
            pass

        # Build the shared convolutional layers.
        # Bottom-up Layers
        # Returns a list of the last layers of each stage, 5 in total.
# Don't create the thead (stage 5), so we pick the 4th item in the list. _, c2, c3, c4, c5 = backbone.resnet_graph(input_image, self.backbone, stage5=True) # Top-down Layers # TODO: add assert to varify feature map sizes match what's in config p5 = kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c5p5')(c5) p4 = kl.Add(name="fpn_p4add")([ kl.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(p5), kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c4p4')(c4) ]) p3 = kl.Add(name="fpn_p3add")([ kl.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(p4), kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c3p3')(c3) ]) p2 = kl.Add(name="fpn_p2add")([ kl.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(p3), kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c2p2')(c2) ]) # Attach 3x3 conv to all P layers to get the final feature maps. p2 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p2")(p2) p3 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p3")(p3) p4 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p4")(p4) p5 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p5")(p5) # P6 is used for the 5th anchor scale in RPN. Generated by # subsampling from P5 with stride of 2. p6 = kl.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(p5) # Note that P6 is used in RPN, but not in the classifier heads. rpn_feature_maps = [p2, p3, p4, p5, p6] mrcnn_feature_maps = [p2, p3, p4, p5] # RPN Model rpn = common.build_rpn_model(self.rpn_anchor_stride, len(self.rpn_anchor_ratios), self.top_down_pyramid_size) # Loop through pyramid layers layer_outputs = [] # list of lists for p in rpn_feature_maps: layer_outputs.append(rpn([p])) pass # Concatenate layer outputs # Convert from list of lists of level outputs to list of lists # of outputs across levels. # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]] output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] outputs = list(zip(*layer_outputs)) outputs = [ kl.Concatenate(axis=1, name=n)(list(o)) for o, n in zip(outputs, output_names) ] rpn_class_logits, rpn_class, rpn_bbox = outputs # Generate proposals # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates # and zero padded. proposal_count = cfg.TRAIN.POST_NMS_ROIS if self.train_flag else cfg.TEST.POST_NMS_ROIS rpn_rois = common.ProposalLayer( proposal_count=proposal_count, nms_threshold=self.rpn_nms_threshold, batch_size=self.batch_size, name="ROI")([rpn_class, rpn_bbox, anchors]) fc_layer_size = cfg.COMMON.FPN_CLASS_FC_LAYERS_SIZE pool_size = cfg.COMMON.POOL_SIZE mask_pool_size = cfg.COMMON.MASK_POOL_SIZE train_or_freeze = cfg.COMMON.TRAIN_FLAG if self.train_flag: # Class ID mask to mark class IDs supported by the dataset the image # came from. active_class_ids = kl.Lambda( lambda x: self.image_utils.parse_image_meta_graph(x)[ "active_class_ids"])(input_image_meta) if not cfg.TRAIN.USE_RPN_ROIS: # Ignore predicted ROIs and use ROIs provided as an input. input_rois = kl.Input(shape=[proposal_count, 4], name="input_roi", dtype=np.int32) # Normalize coordinates target_rois = kl.Lambda( lambda x: self.bbox_util.norm_boxes_graph( x, k.shape(input_image)[1:3]))(input_rois) else: target_rois = rpn_rois input_rois = None # Generate detection targets # Subsamples proposals and generates target outputs for training # Note that proposal class IDs, gt_boxes, and gt_masks are zero # padded. Equally, returned rois and targets are zero padded. 
rois, target_class_ids, target_bbox, target_mask = \ common.DetectionTargetLayer(self.batch_size, name="proposal_targets")([ target_rois, input_gt_class_ids, gt_boxes, input_gt_masks]) # Network Heads # TODO: verify that this handles zero padded ROIs mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph( rois, mrcnn_feature_maps, input_image_meta, pool_size, self.class_num, train_flag=train_or_freeze, fc_layers_size=fc_layer_size) mrcnn_mask = common.build_fpn_mask_graph( rois, mrcnn_feature_maps, input_image_meta, mask_pool_size, self.class_num, train_flag=train_or_freeze) # TODO: clean up (use tf.identify if necessary) output_rois = kl.Lambda(lambda x: x * 1, name="output_rois")(rois) # Losses rpn_class_loss = kl.Lambda( lambda x: common.rpn_class_loss_graph(*x), name="rpn_class_loss")([input_rpn_match, rpn_class_logits]) rpn_bbox_loss = kl.Lambda( lambda x: common.rpn_bbox_loss_graph(self.batch_size, *x), name="rpn_bbox_loss")( [input_rpn_bbox, input_rpn_match, rpn_bbox]) class_loss = kl.Lambda(lambda x: common.mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")([ target_class_ids, mrcnn_class_logits, active_class_ids ]) bbox_loss = kl.Lambda(lambda x: common.mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")([ target_bbox, target_class_ids, mrcnn_bbox ]) mask_loss = kl.Lambda(lambda x: common.mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")([ target_mask, target_class_ids, mrcnn_mask ]) # Model inputs = [ input_image, input_image_meta, input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks ] if not cfg.TRAIN.USE_RPN_ROIS: inputs.append(input_rois) outputs = [ rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois, rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss ] model = km.Model(inputs, outputs, name='mask_rcnn') pass else: # Network Heads # Proposal classifier and BBox regressor heads mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph( rpn_rois, mrcnn_feature_maps, input_image_meta, pool_size, self.class_num, train_flag=train_or_freeze, fc_layers_size=fc_layer_size) # Detections # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in # normalized coordinates detections = common.DetectionLayer(self.batch_size, name="mrcnn_detection")([ rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta ]) # Create masks for detections detection_boxes = kl.Lambda(lambda x: x[..., :4])(detections) mrcnn_mask = common.build_fpn_mask_graph( detection_boxes, mrcnn_feature_maps, input_image_meta, mask_pool_size, self.class_num, train_flag=train_or_freeze) model = km.Model([input_image, input_image_meta, anchors], [ detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox ], name='mask_rcnn') pass # Add multi-GPU support. 多 GPU 操作 gpu_count = cfg.COMMON.GPU_COUNT if gpu_count > 1: from m_rcnn.parallel_model import ParallelModel model = ParallelModel(model, gpu_count) return model pass def load_weights(self, model_path, by_name=False, exclude=None): """ Modified version of the corresponding Keras function with the addition of multi-GPU support and the ability to exclude some layers from loading. 
:param model_path: :param by_name: :param exclude: list of layer names to exclude :return: """ if exclude: by_name = True pass if h5py is None: raise ImportError('`load_weights` requires h5py.') pass model_file = h5py.File(model_path, mode='r') if 'layer_names' not in model_file.attrs and 'model_weights' in model_file: model_file = model_file['model_weights'] # In multi-GPU training, we wrap the model. Get layers # of the inner model because they have the weights. keras_model = self.keras_model layers = keras_model.inner_model.layers if hasattr( keras_model, "inner_model") else keras_model.layers print("layers: {}".format(layers)) # Exclude some layers if exclude: layers = filter(lambda l: l.name not in exclude, layers) if by_name: saving.load_weights_from_hdf5_group_by_name(model_file, layers) else: saving.load_weights_from_hdf5_group(model_file, layers) if hasattr(model_file, 'close'): model_file.close() pass def generate_random_rois(self, image_shape, count, gt_boxes): """ Generates ROI proposals similar to what a region proposal network would generate. :param image_shape: [Height, Width, Depth] :param count: Number of ROIs to generate :param gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels. :return: """ # placeholder rois = np.zeros((count, 4), dtype=np.int32) # Generate random ROIs around GT boxes (90% of count) rois_per_box = int(0.9 * count / gt_boxes.shape[0]) for i in range(gt_boxes.shape[0]): gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i] h = gt_y2 - gt_y1 w = gt_x2 - gt_x1 # random boundaries r_y1 = max(gt_y1 - h, 0) r_y2 = min(gt_y2 + h, image_shape[0]) r_x1 = max(gt_x1 - w, 0) r_x2 = min(gt_x2 + w, image_shape[1]) # To avoid generating boxes with zero area, we generate double what # we need and filter out the extra. If we get fewer valid boxes # than we need, we loop and try again. while True: y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2)) x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2)) # Filter out zero area boxes threshold = 1 y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:rois_per_box] x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:rois_per_box] if y1y2.shape[0] == rois_per_box and x1x2.shape[ 0] == rois_per_box: break # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape # into x1, y1, x2, y2 order x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1) y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1) box_rois = np.hstack([y1, x1, y2, x2]) rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois # Generate random ROIs anywhere in the image (10% of count) remaining_count = count - (rois_per_box * gt_boxes.shape[0]) # To avoid generating boxes with zero area, we generate double what # we need and filter out the extra. If we get fewer valid boxes # than we need, we loop and try again. 
while True: y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2)) x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2)) # Filter out zero area boxes threshold = 1 y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:remaining_count] x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:remaining_count] if y1y2.shape[0] == remaining_count and x1x2.shape[ 0] == remaining_count: break # Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape # into x1, y1, x2, y2 order x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1) y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1) global_rois = np.hstack([y1, x1, y2, x2]) rois[-remaining_count:] = global_rois return rois pass def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes, gt_masks): """ Generate targets for training Stage 2 classifier and mask heads. This is not used in normal training. It's useful for debugging or to train the Mask RCNN heads without using the RPN head. :param rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes. :param gt_class_ids: [instance count] Integer class IDs :param gt_boxes: [instance count, (y1, x1, y2, x2)] :param gt_masks: [height, width, instance count] Ground truth masks. Can be full size or mini-masks. :return: rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific bbox refinements. masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific masks cropped to bbox boundaries and resized to neural network output size. """ assert rpn_rois.shape[0] > 0 assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format( gt_class_ids.dtype) assert gt_boxes.dtype == np.int32, "Expected int but got {}".format( gt_boxes.dtype) assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format( gt_masks.dtype) # It's common to add GT Boxes to ROIs but we don't do that here because # according to XinLei Chen's paper, it doesn't help. # Trim empty padding in gt_boxes and gt_masks parts instance_ids = np.where(gt_class_ids > 0)[0] assert instance_ids.shape[0] > 0, "Image must contain instances." gt_class_ids = gt_class_ids[instance_ids] gt_boxes = gt_boxes[instance_ids] gt_masks = gt_masks[:, :, instance_ids] # Compute areas of ROIs and ground truth boxes. # rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * (rpn_rois[:, 3] - rpn_rois[:, 1]) # gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1]) # Compute overlaps [rpn_rois, gt_boxes] overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0])) for i in range(overlaps.shape[1]): gt = gt_boxes[i] overlaps[:, i] = self.bbox_util.compute_iou(gt, rpn_rois) pass # Assign ROIs to GT boxes rpn_roi_iou_argmax = np.argmax(overlaps, axis=1) rpn_roi_iou_max = overlaps[np.arange(overlaps.shape[0]), rpn_roi_iou_argmax] # GT box assigned to each ROI rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax] rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax] # Positive ROIs are those with >= 0.5 IoU with a GT box. fg_ids = np.where(rpn_roi_iou_max > 0.5)[0] # Negative ROIs are those with max IoU 0.1-0.5 (hard example mining) # TODO: To hard example mine or not to hard example mine, that's the question # bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0] bg_ids = np.where(rpn_roi_iou_max < 0.5)[0] # Subsample ROIs. Aim for 33% foreground. 
        # FG
        fg_roi_count = int(self.rois_per_image * self.roi_positive_ratio)
        if fg_ids.shape[0] > fg_roi_count:
            keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
        else:
            keep_fg_ids = fg_ids

        # BG
        remaining = self.rois_per_image - keep_fg_ids.shape[0]
        if bg_ids.shape[0] > remaining:
            keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
        else:
            keep_bg_ids = bg_ids

        # Combine indices of ROIs to keep
        keep = np.concatenate([keep_fg_ids, keep_bg_ids])

        # Need more?
        remaining = self.rois_per_image - keep.shape[0]
        if remaining > 0:
            # Looks like we don't have enough samples to maintain the desired
            # balance. Reduce requirements and fill in the rest. This is
            # likely different from the Mask RCNN paper.

            # There is a small chance we have neither fg nor bg samples.
            if keep.shape[0] == 0:
                # Pick bg regions with easier IoU threshold
                bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
                assert bg_ids.shape[0] >= remaining
                keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
                assert keep_bg_ids.shape[0] == remaining
                keep = np.concatenate([keep, keep_bg_ids])
            else:
                # Fill the rest with repeated bg rois.
                keep_extra_ids = np.random.choice(keep_bg_ids, remaining, replace=True)
                keep = np.concatenate([keep, keep_extra_ids])

        assert keep.shape[0] == self.rois_per_image, \
            "keep doesn't match ROI batch size {}, {}".format(keep.shape[0], self.rois_per_image)

        # Reset the gt boxes assigned to BG ROIs.
        rpn_roi_gt_boxes[keep_bg_ids, :] = 0
        rpn_roi_gt_class_ids[keep_bg_ids] = 0

        # For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
        rois = rpn_rois[keep]
        roi_gt_boxes = rpn_roi_gt_boxes[keep]
        roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
        roi_gt_assignment = rpn_roi_iou_argmax[keep]

        # Class-aware bbox deltas. [y, x, log(h), log(w)]
        bboxes = np.zeros((self.rois_per_image, self.class_num, 4), dtype=np.float32)
        pos_ids = np.where(roi_gt_class_ids > 0)[0]
        bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = self.bbox_util.box_refinement(
            rois[pos_ids], roi_gt_boxes[pos_ids, :4])

        # Normalize bbox refinements
        bbox_std_dev = np.array(cfg.COMMON.BBOX_STD_DEV)
        bboxes /= bbox_std_dev

        # Generate class-specific target masks
        masks = np.zeros((self.rois_per_image, self.image_shape[0], self.image_shape[1], self.class_num),
                         dtype=np.float32)
        for i in pos_ids:
            class_id = roi_gt_class_ids[i]
            assert class_id > 0, "class id must be greater than 0"
            gt_id = roi_gt_assignment[i]
            class_mask = gt_masks[:, :, gt_id]

            if cfg.TRAIN.USE_MINI_MASK:
                # Create a mask placeholder, the size of the image
                placeholder = np.zeros(self.image_shape[:2], dtype=bool)
                # GT box
                gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
                gt_w = gt_x2 - gt_x1
                gt_h = gt_y2 - gt_y1
                # Resize mini mask to size of GT box
                placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
                    np.round(self.image_utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
                # Place the mini mask in the placeholder
                class_mask = placeholder

            # Pick part of the mask and resize it
            y1, x1, y2, x2 = rois[i].astype(np.int32)
            m = class_mask[y1:y2, x1:x2]
            mask = self.image_utils.resize(m, self.image_shape)
            masks[i, :, :, class_id] = mask

        return rois, roi_gt_class_ids, bboxes, masks
        pass

    # #############################################################################################
    # test
    # #############################################################################################
    def detect(self, images_info_list, verbose=0):
        """
        Runs the detection pipeline.
        :param images_info_list: List of images, potentially of different sizes.
        :param verbose:
        :return: a list of dicts, one dict per image.
            The dict contains:
                rois: [N, (y1, x1, y2, x2)] detection bounding boxes
                class_ids: [N] int class IDs
                scores: [N] float probability scores for the class IDs
                masks: [H, W, N] instance binary masks
        """
        if verbose:
            print("processing {} images.".format(len(images_info_list)))
            for image_info in images_info_list:
                print("image_info: {}".format(image_info))
                pass
            pass

        # Mold inputs to format expected by the neural network
        molded_images_list, image_metas_list, windows_list = self.image_utils.mode_input(
            images_info_list)

        # Validate image sizes
        # All images in a batch MUST be of the same size
        image_shape = molded_images_list[0].shape
        for g in molded_images_list[1:]:
            assert g.shape == image_shape, \
                "After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
            pass

        # Anchors
        anchors = self.anchor_utils.get_anchors(image_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors, (cfg.TEST.BATCH_SIZE, ) + anchors.shape)

        if verbose:
            print("molded_images_list: ", molded_images_list)
            print("image_metas_list: ", image_metas_list)
            print("anchors: ", anchors)
            pass

        # Run object detection
        detections, _, _, mrcnn_mask, _, _, _ = \
            self.keras_model.predict([molded_images_list, image_metas_list, anchors], verbose=0)

        # Process detections
        results_list = []
        for i, image_info in enumerate(images_info_list):
            molded_image_shape = molded_images_list[i].shape
            final_rois, final_class_ids, final_scores, final_masks = self.un_mold_detections(
                detections[i], mrcnn_mask[i], image_info.shape, molded_image_shape, windows_list[i])
            results_list.append({
                "rois": final_rois,
                "class_ids": final_class_ids,
                "scores": final_scores,
                "masks": final_masks,
            })

        return results_list
        pass

    def un_mold_detections(self, detections, mrcnn_mask, original_image_shape, image_shape, window):
        """
        Reformats the detections of one image from the format of the neural network output
        to a format suitable for use in the rest of the application.
        :param detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
        :param mrcnn_mask: [N, height, width, num_classes]
        :param original_image_shape: [H, W, C] Original image shape before resizing
        :param image_shape: [H, W, C] Shape of the image after resizing and padding
        :param window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the real
                       image is, excluding the padding.
        :return:
            boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
            class_ids: [N] Integer class IDs for each bounding box
            scores: [N] Float probability scores of the class_id
            masks: [height, width, num_instances] Instance masks
        """
        # How many detections do we have?
        # Detections array is padded with zeros. Find the first class_id == 0.
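        # Illustrative example (not from the source): if detections[:, 4] were
        # [17, 3, 58, 0, 0, ...], the first class_id == 0 sits at index 3, so only the
        # first 3 rows are treated as real detections and the rest are padding.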
        zero_ix = np.where(detections[:, 4] == 0)[0]
        n = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]

        # Extract boxes, class_ids, scores, and class-specific masks
        boxes = detections[:n, :4]
        class_ids = detections[:n, 4].astype(np.int32)
        scores = detections[:n, 5]
        masks = mrcnn_mask[np.arange(n), :, :, class_ids]

        # Translate normalized coordinates in the resized image to pixel
        # coordinates in the original image before resizing
        window = self.bbox_util.norm_boxes(window, image_shape[:2])
        wy1, wx1, wy2, wx2 = window
        shift = np.array([wy1, wx1, wy1, wx1])
        wh = wy2 - wy1  # window height
        ww = wx2 - wx1  # window width
        scale = np.array([wh, ww, wh, ww])

        # Convert boxes to normalized coordinates on the window
        boxes = np.divide(boxes - shift, scale)
        # Convert boxes to pixel coordinates on the original image
        boxes = self.bbox_util.denorm_boxes(boxes, original_image_shape[:2])

        # Filter out detections with zero area. Happens in early training when
        # network weights are still random
        exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
        if exclude_ix.shape[0] > 0:
            boxes = np.delete(boxes, exclude_ix, axis=0)
            class_ids = np.delete(class_ids, exclude_ix, axis=0)
            scores = np.delete(scores, exclude_ix, axis=0)
            masks = np.delete(masks, exclude_ix, axis=0)
            n = class_ids.shape[0]

        # Resize masks to original image size and set boundary threshold.
        full_masks = []
        for i in range(n):
            # Convert neural network mask to full size mask
            full_mask = self.mask_util.unmold_mask(masks[i], boxes[i], original_image_shape)
            full_masks.append(full_mask)
            pass
        full_masks = np.stack(full_masks, axis=-1) if full_masks \
            else np.empty(original_image_shape[:2] + (0, ))

        return boxes, class_ids, scores, full_masks
        pass
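
# #############################################################################################
# Usage sketch (illustrative, not part of the original project): a minimal example of how the
# inference path above might be driven. It assumes this module exposes MaskRCNN and cfg as
# used above, that trained weights exist at cfg.TEST.COCO_MODEL_PATH, and that skimage is
# installed; "demo.jpg" is a placeholder path, not a file shipped with the project.
# #############################################################################################
if __name__ == "__main__":
    import skimage.io

    # Build the model in inference mode and load the pre-trained COCO weights by layer name.
    mask_model = MaskRCNN(train_flag=False)
    mask_model.load_weights(cfg.TEST.COCO_MODEL_PATH, by_name=True)

    # detect() expects a list of images whose length matches cfg.TEST.BATCH_SIZE (assumed 1 here).
    image = skimage.io.imread("demo.jpg")
    results_list = mask_model.detect([image], verbose=1)

    # Each result dict holds pixel-coordinate boxes, class ids, scores and full-size masks.
    result = results_list[0]
    print("rois: {}".format(result["rois"].shape))
    print("class_ids: {}".format(result["class_ids"]))
    print("scores: {}".format(result["scores"]))
    print("masks: {}".format(result["masks"].shape))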