def build(self, config):
    """Build Mask R-CNN architecture."""
    # Image size must be divisible by 2 multiple times
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception("Image size must be divisible by 2 at least 6 times "
                        "to avoid fractions when downscaling and upscaling. "
                        "For example, use 256, 320, 384, 448, 512, ... etc.")

    # Build the shared convolutional layers.
    # Bottom-up layers
    # Returns a list of the last layers of each stage, 5 in total.
    # The head (stage 5) is included here (stage5=True), so all five stage
    # outputs are used.
    resnet = ResNet.ResNet("resnet101", stage5=True)
    C1, C2, C3, C4, C5 = resnet.stages()

    # Top-down layers
    # TODO: add assert to verify feature map sizes match what's in config
    self.fpn = FPN.FPN(C1, C2, C3, C4, C5, out_channels=256)

    # Generate anchors
    self.anchors = Variable(torch.from_numpy(
        utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                       config.RPN_ANCHOR_RATIOS,
                                       config.BACKBONE_SHAPES,
                                       config.BACKBONE_STRIDES,
                                       config.RPN_ANCHOR_STRIDE)).float(),
                            requires_grad=False)
    if self.config.GPU_COUNT:
        self.anchors = self.anchors.cuda()

    # RPN
    self.rpn = RPN.RPN(len(config.RPN_ANCHOR_RATIOS),
                       config.RPN_ANCHOR_STRIDE, 256)

    # FPN classifier head
    self.classifier = FPN_head.Classifier(256, config.POOL_SIZE,
                                          config.IMAGE_SHAPE,
                                          config.NUM_CLASSES)

    # FPN mask head
    self.mask = FPN_head.Mask(256, config.MASK_POOL_SIZE,
                              config.IMAGE_SHAPE, config.NUM_CLASSES)

    # Freeze batch norm layers
    def set_bn_fix(m):
        classname = m.__class__.__name__
        if classname.find('BatchNorm') != -1:
            for p in m.parameters():
                p.requires_grad = False

    self.apply(set_bn_fix)
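# The snippet above wraps the anchors in torch.autograd.Variable, which is
# deprecated since PyTorch 0.4 (Variable and Tensor were merged). A minimal
# sketch of the same idea with plain tensors; anchors_to_tensor and the dummy
# array are illustrative names, not part of the original code.
import numpy as np
import torch

def anchors_to_tensor(anchors_np, use_cuda=False):
    """Convert a [N, 4] numpy anchor array to a non-trainable float tensor."""
    t = torch.from_numpy(anchors_np).float()
    t.requires_grad_(False)  # anchors are constants, not learnable parameters
    return t.cuda() if use_cuda else t

# Example with dummy anchors:
print(anchors_to_tensor(np.zeros((10, 4), dtype=np.float32)).shape)  # torch.Size([10, 4])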
def get_anchors(self, image_shape):
    backbone_shapes = utils.compute_backbone_shapes(
        self.backbone, self.backbone_strides, image_shape)
    if not hasattr(self, "_anchor_cache"):
        self._anchor_cache = {}
    if not tuple(image_shape) in self._anchor_cache:
        a = utils.generate_pyramid_anchors(self.rpn_anchor_scales,
                                           self.rpn_anchor_ratios,
                                           backbone_shapes,
                                           self.backbone_strides,
                                           self.rpn_anchor_stride)
        self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(
            a, image_shape[:2])
    return self._anchor_cache[tuple(image_shape)]
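# The method above memoizes anchors per image shape, which pays off when input
# sizes repeat across a dataset. The same pattern in isolation (a sketch;
# generate_anchors_fn stands in for generate_pyramid_anchors + norm_boxes).
# Note the cache has to outlive the call, otherwise nothing is ever reused.
_ANCHOR_CACHE = {}

def cached_anchors(image_shape, generate_anchors_fn):
    key = tuple(image_shape)
    if key not in _ANCHOR_CACHE:
        _ANCHOR_CACHE[key] = generate_anchors_fn(image_shape)
    return _ANCHOR_CACHE[key]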
def generate_all_anchors(fpn_shapes, image_shape, config):
    """Generate anchors for the pyramid feature maps."""
    anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                             config.RPN_ANCHOR_RATIOS,
                                             fpn_shapes,
                                             config.BACKBONE_STRIDES,
                                             config.RPN_ANCHOR_STRIDE)
    # Normalize coordinates
    # numpy array [N, 4]
    norm_anchors = utils.norm_boxes(anchors, image_shape)
    anchors_tensor = tf.convert_to_tensor(norm_anchors)
    # Duplicate across the batch dimension
    batch_anchors = tf.broadcast_to(
        anchors_tensor,
        [config.IMAGES_PER_GPU,
         tf.shape(anchors_tensor)[0],
         tf.shape(anchors_tensor)[1]])
    return batch_anchors
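# utils.norm_boxes converts pixel-space boxes to normalized [0, 1] coordinates
# before they enter the graph. A numpy sketch of what that conversion is
# commonly expected to do (matching the convention that (y2, x2) lies one pixel
# past the box); norm_boxes_sketch is an illustrative name.
import numpy as np

def norm_boxes_sketch(boxes, shape):
    """boxes: [N, (y1, x1, y2, x2)] in pixels; shape: (height, width)."""
    h, w = shape
    scale = np.array([h - 1, w - 1, h - 1, w - 1])
    shift = np.array([0, 0, 1, 1])
    return np.divide(boxes - shift, scale).astype(np.float32)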
def __init__(self, dataset, config, augment=True):
    """A generator that returns images and corresponding target class ids,
    bounding box deltas, and masks.

    dataset: The Dataset object to pick data from
    config: The model config object
    augment: If True, applies image augmentation to images (currently only
             horizontal flips are supported)

    Returns a Python generator. Upon calling next() on it, the generator
    returns two lists, inputs and outputs. The contents of the lists differ
    depending on the received arguments:
    inputs list:
    - images: [batch, H, W, C]
    - image_metas: [batch, size of image meta]
    - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral)
    - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs
    - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)]
    - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width
      are those of the image unless use_mini_mask is True, in which case they
      are defined in MINI_MASK_SHAPE.

    outputs list: Usually empty in regular training. But if detection_targets
        is True then the outputs list contains target class_ids, bbox deltas,
        and masks.
    """
    self.b = 0  # batch item index
    self.image_index = -1
    self.image_ids = np.copy(dataset.image_ids)
    self.error_count = 0

    self.dataset = dataset
    self.config = config
    self.augment = augment

    # Anchors
    # [anchor_count, (y1, x1, y2, x2)]
    self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                                  config.RPN_ANCHOR_RATIOS,
                                                  config.BACKBONE_SHAPES,
                                                  config.BACKBONE_STRIDES,
                                                  config.RPN_ANCHOR_STRIDE)
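# The anchor array built above has shape [anchor_count, 4], where anchor_count
# is the number of feature cells per pyramid level times the anchors per cell,
# summed over levels. A quick sanity check with the usual 1024x1024 defaults
# (illustrative values; assumes RPN_ANCHOR_STRIDE = 1).
backbone_shapes = [(256, 256), (128, 128), (64, 64), (32, 32), (16, 16)]
anchors_per_cell = 3  # one per RPN_ANCHOR_RATIO

expected_count = sum(h * w * anchors_per_cell for h, w in backbone_shapes)
print(expected_count)  # 261888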
def get_anchors(image_shape, config):
    """Returns anchor pyramid for the given image size."""
    backbone_shapes = compute_backbone_shapes(config, image_shape)
    # Cache anchors and reuse if image shape is the same
    _anchor_cache = {}
    if not tuple(image_shape) in _anchor_cache:
        # Generate anchors
        a = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                           config.RPN_ANCHOR_RATIOS,
                                           backbone_shapes,
                                           config.BACKBONE_STRIDES,
                                           config.RPN_ANCHOR_STRIDE)
        # Keep a copy of the latest anchors in pixel coordinates because
        # it's used in the inspect_model notebooks.
        # TODO: Remove this after the notebooks are refactored to not use it
        anchors = a
        # Normalize coordinates
        _anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
    return _anchor_cache[tuple(image_shape)]
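# compute_backbone_shapes maps the input image shape to the [height, width] of
# each FPN level by dividing by the backbone strides. A sketch under the
# assumption that it rounds up, as the standard implementation does;
# compute_backbone_shapes_sketch is an illustrative name.
import math
import numpy as np

def compute_backbone_shapes_sketch(image_shape, backbone_strides=(4, 8, 16, 32, 64)):
    return np.array([[int(math.ceil(image_shape[0] / stride)),
                      int(math.ceil(image_shape[1] / stride))]
                     for stride in backbone_strides])

print(compute_backbone_shapes_sketch((1024, 1024, 3)))
# five levels: 256x256 down to 16x16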
def __init__(self, options, config, split, random=True, loadNeighborImage=False, load_semantics=False, load_boundary=False): self.options = options self.config = config self.split = split self.random = random self.dataFolder = options.dataFolder self.scenes = [] self.sceneImageIndices = [] self.loadClassMap() #planenet_scene_ids_val = np.load('datasets/scene_ids_val.npy') #planenet_scene_ids_val = {scene_id.decode('utf-8'): True for scene_id in planenet_scene_ids_val} #print(planenet_scene_ids_val) # with open(self.dataFolder + '/ScanNet/Tasks/Benchmark/scannetv2_' + split + '.txt') as f: # for line in f: # scene_id = line.strip() # if split == 'test': # ## Remove scenes which are in PlaneNet's training set for fair comparison # # if scene_id not in planenet_scene_ids_val: # # continue # pass # scenePath = self.dataFolder + '/scans/' + scene_id # if not os.path.exists(scenePath + '/' + scene_id + '.txt') or not os.path.exists(scenePath + '/annotation/planes.npy'): # # print(scenePath + '/' + scene_id + '.txt') # # print(scenePath + '/annotation/planes.npy') # # print("here") # # if True: # # exit() # continue # scene = CustomScene(options, scenePath, scene_id, self.confident_labels, self.layout_labels, load_semantics=load_semantics, load_boundary=load_boundary) # self.scenes.append(scene) # self.sceneImageIndices += [[len(self.scenes) - 1, imageIndex] for imageIndex in range(len(scene.imagePaths))] # continue # pass scene_id = 'scene0003_02' scenePath = self.dataFolder + '/scans/' + scene_id # Taking class of CustomScene from custom_scene scene = CustomScene(options, scenePath, scene_id, self.confident_labels, self.layout_labels, load_semantics=load_semantics, load_boundary=load_boundary) #print("reached #10132483") self.scenes.append(scene) print("scenes--", self.scenes) self.sceneImageIndices += [[ len(self.scenes) - 1, imageIndex ] for imageIndex in range(len(scene.imagePaths))] #print(self.sceneImageIndices) if random: t = int(time.time() * 1000000) np.random.seed(((t & 0xff000000) >> 24) + ((t & 0x00ff0000) >> 8) + ((t & 0x0000ff00) << 8) + ((t & 0x000000ff) << 24)) else: np.random.seed(0) pass np.random.shuffle(self.sceneImageIndices) print("length of indices----", len(self.sceneImageIndices)) #self.invalid_indices = {} # with open(self.dataFolder + '/invalid_indices_' + split + '.txt', 'r') as f: # for line in f: # tokens = line.split(' ') # if len(tokens) == 3: # assert(int(tokens[2]) < 10000) # invalid_index = int(tokens[1]) * 10000 + int(tokens[2]) # if invalid_index not in self.invalid_indices: # self.invalid_indices[invalid_index] = True # pass # pass # continue # pass self.sceneImageIndices = [[ sceneIndex, imageIndex ] for sceneIndex, imageIndex in self.sceneImageIndices] print('num images', len(self.sceneImageIndices)) # if True: # exit() self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) self.loadNeighborImage = loadNeighborImage return
def build(self, mode, config): """Build Mask R-CNN architecture. input_shape: The shape of the input image. mode: Either "training" or "inference". The inputs and outputs of the model differ accordingly. """ assert mode in ['training', 'inference'] # Image size must be dividable by 2 multiple times h, w = config.IMAGE_SHAPE[:2] if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): raise Exception( "Image size must be dividable by 2 at least 6 times " "to avoid fractions when downscaling and upscaling." "For example, use 256, 320, 384, 448, 512, ... etc. ") # Inputs input_image = KL.Input(shape=config.IMAGE_SHAPE.tolist(), name="input_image") input_image_meta = KL.Input(shape=[None], name="input_image_meta") if mode == "training": # RPN GT input_rpn_match = KL.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32) input_rpn_bbox = KL.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32) # Detection GT (class IDs, bounding boxes, and masks) # 1. GT Class IDs (zero padded) input_gt_class_ids = KL.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32) # 2. GT Boxes in pixels (zero padded) # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates input_gt_boxes = KL.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32) # Normalize coordinates h, w = K.shape(input_image)[1], K.shape(input_image)[2] image_scale = K.cast(K.stack([h, w, h, w], axis=0), tf.float32) gt_boxes = KL.Lambda(lambda x: x / image_scale)(input_gt_boxes) # 3. GT Masks (zero padded) # [batch, height, width, MAX_GT_INSTANCES] if config.USE_MINI_MASK: input_gt_masks = KL.Input(shape=[ config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], None ], name="input_gt_masks", dtype=bool) else: input_gt_masks = KL.Input( shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None], name="input_gt_masks", dtype=bool) # Build the shared convolutional layers. # Bottom-up Layers # Returns a list of the last layers of each stage, 5 in total. # Don't create the thead (stage 5), so we pick the 4th item in the list. _, C2, C3, C4, C5 = resnet_graph(input_image, "resnet101", stage5=True) # Top-down Layers # TODO: add assert to varify feature map sizes match what's in config P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5) P4 = KL.Add(name="fpn_p4add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5), KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4) ]) P3 = KL.Add(name="fpn_p3add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3) ]) P2 = KL.Add(name="fpn_p2add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2) ]) # Attach 3x3 conv to all P layers to get the final feature maps. P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2) P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3) P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4) P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5) # P6 is used for the 5th anchor scale in RPN. Generated by # subsampling from P5 with stride of 2. P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5) # Note that P6 is used in RPN, but not in the classifier heads. 
rpn_feature_maps = [P2, P3, P4, P5, P6] mrcnn_feature_maps = [P2, P3, P4, P5] # Generate Anchors self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) # RPN Model rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, len(config.RPN_ANCHOR_RATIOS), 256) # Loop through pyramid layers layer_outputs = [] # list of lists for p in rpn_feature_maps: layer_outputs.append(rpn([p])) # Concatenate layer outputs # Convert from list of lists of level outputs to list of lists # of outputs across levels. # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]] output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] outputs = list(zip(*layer_outputs)) outputs = [ KL.Concatenate(axis=1, name=n)(list(o)) for o, n in zip(outputs, output_names) ] rpn_class_logits, rpn_class, rpn_bbox = outputs # Generate proposals # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates # and zero padded. proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\ else config.POST_NMS_ROIS_INFERENCE rpn_rois = ProposalLayer(proposal_count=proposal_count, nms_threshold=config.RPN_NMS_THRESHOLD, name="ROI", anchors=self.anchors, config=config)([rpn_class, rpn_bbox]) if mode == "training": # Class ID mask to mark class IDs supported by the dataset the image # came from. _, _, _, active_class_ids = KL.Lambda( lambda x: parse_image_meta_graph(x), mask=[None, None, None, None])(input_image_meta) if not config.USE_RPN_ROIS: # Ignore predicted ROIs and use ROIs provided as an input. input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4], name="input_roi", dtype=np.int32) # Normalize coordinates to 0-1 range. target_rois = KL.Lambda(lambda x: K.cast(x, tf.float32) / image_scale[:4])(input_rois) else: target_rois = rpn_rois # Generate detection targets # Subsamples proposals and generates target outputs for training # Note that proposal class IDs, gt_boxes, and gt_masks are zero # padded. Equally, returned rois and targets are zero padded. 
rois, target_class_ids, target_bbox, target_mask =\ DetectionTargetLayer(config, name="proposal_targets")([ target_rois, input_gt_class_ids, gt_boxes, input_gt_masks]) # Network Heads # TODO: verify that this handles zero padded ROIs mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ fpn_classifier_graph(rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE, config.NUM_CLASSES) mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.MASK_POOL_SIZE, config.NUM_CLASSES) # TODO: clean up (use tf.identify if necessary) output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois) # Losses rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")( [input_rpn_match, rpn_class_logits]) rpn_bbox_loss = KL.Lambda( lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")( [input_rpn_bbox, input_rpn_match, rpn_bbox]) class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")([ target_class_ids, mrcnn_class_logits, active_class_ids ]) bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")([ target_bbox, target_class_ids, mrcnn_bbox ]) mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")([ target_mask, target_class_ids, mrcnn_mask ]) # Model inputs = [ input_image, input_image_meta, input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks ] if not config.USE_RPN_ROIS: inputs.append(input_rois) outputs = [ rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois, rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss ] model = KM.Model(inputs, outputs, name='mask_rcnn') else: # Network Heads # Proposal classifier and BBox regressor heads mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\ fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE, config.NUM_CLASSES) # Detections # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in image coordinates detections = DetectionLayer(config, name="mrcnn_detection")( [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta]) # Convert boxes to normalized coordinates # TODO: let DetectionLayer return normalized coordinates to avoid # unnecessary conversions h, w = config.IMAGE_SHAPE[:2] detection_boxes = KL.Lambda( lambda x: x[..., :4] / np.array([h, w, h, w]))(detections) # Create masks for detections mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps, config.IMAGE_SHAPE, config.MASK_POOL_SIZE, config.NUM_CLASSES) model = KM.Model([input_image, input_image_meta], [ detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox ], name='mask_rcnn') # Add multi-GPU support. if config.GPU_COUNT > 1: from parallel_model import ParallelModel model = ParallelModel(model, config.GPU_COUNT) return model
# Display image and additional stats
print("image_id ", image_id, dataset.image_reference(image_id))
log("image", image)
log("mask", mask)
log("class_ids", class_ids)
log("bbox", bbox)
# Display image and instances
visualize.display_instances(image, bbox, mask, class_ids, dataset.class_names)

BACKBONE_SHAPES = compute_backbone_shapes(config, config.IMAGE_SHAPE)

# Generate anchors
anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                         config.RPN_ANCHOR_RATIOS,
                                         BACKBONE_SHAPES,
                                         config.BACKBONE_STRIDES,
                                         config.RPN_ANCHOR_STRIDE)

# Print summary of anchors
num_levels = len(BACKBONE_SHAPES)
anchors_per_cell = len(config.RPN_ANCHOR_RATIOS)
print("Count: ", anchors.shape[0])
print("Scales: ", config.RPN_ANCHOR_SCALES)
print("Ratios: ", config.RPN_ANCHOR_RATIOS)
print("Anchors per Cell: ", anchors_per_cell)
print("Levels: ", num_levels)
anchors_per_level = []
for l in range(num_levels):
    num_cells = BACKBONE_SHAPES[l][0] * BACKBONE_SHAPES[l][1]
    anchors_per_level.append(anchors_per_cell * num_cells // config.RPN_ANCHOR_STRIDE**2)
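# generate_pyramid_anchors assigns one scale per pyramid level and all ratios
# at every feature cell, then concatenates the per-level results. A simplified
# numpy sketch of the per-level step and the pyramid loop (illustrative names,
# assuming the usual center/size parameterization; boundary clipping omitted).
import numpy as np

def level_anchors_sketch(scale, ratios, feature_shape, feature_stride, anchor_stride):
    """Anchors for one pyramid level, in pixel coordinates of the input image."""
    ratios = np.array(ratios)
    heights = scale / np.sqrt(ratios)          # keep area roughly scale**2
    widths = scale * np.sqrt(ratios)
    # Anchor centers on the image, one per feature cell (times anchor_stride)
    shifts_y = np.arange(0, feature_shape[0], anchor_stride) * feature_stride
    shifts_x = np.arange(0, feature_shape[1], anchor_stride) * feature_stride
    shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
    # All combinations of centers and box sizes
    box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
    box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
    box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
    box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
    # (cy, cx, h, w) -> (y1, x1, y2, x2)
    return np.concatenate([box_centers - 0.5 * box_sizes,
                           box_centers + 0.5 * box_sizes], axis=1)

def pyramid_anchors_sketch(scales, ratios, feature_shapes, feature_strides, anchor_stride):
    return np.concatenate([level_anchors_sketch(scales[i], ratios, feature_shapes[i],
                                                feature_strides[i], anchor_stride)
                           for i in range(len(scales))], axis=0)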
def build(self, mode, config): """Build Mask R-CNN architecture. input_shape: The shape of the input image. mode: Either "training" or "inference". The inputs and outputs of the model differ accordingly. """ assert mode in ['training', 'inference'] # Image size must be dividable by 2 multiple times h, w = config.IMAGE_SHAPE[:2] print("HEIGHT AND WIDTH BELOW") print(h) print(w) if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): raise Exception( "Image size must be dividable by 2 at least 6 times " "to avoid fractions when downscaling and upscaling." "For example, use 256, 320, 384, 448, 512, ... etc. ") # Inputs input_image = KL.Input(shape=config.IMAGE_SHAPE.tolist(), name="input_image") input_image_meta = KL.Input(shape=[None], name="input_image_meta") # Build the shared convolutional layers. # Bottom-up Layers # Returns a list of the last layers of each stage, 5 in total. # Don't create the thead (stage 5), so we pick the 4th item in the list. _, C2, C3, C4, C5 = resnet_graph(input_image, "resnet101", stage5=True) # Top-down Layers P5 = KL.Conv2D(256, (1, 1), name='fpn_c5p5')(C5) P4 = KL.Add(name="fpn_p4add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5), KL.Conv2D(256, (1, 1), name='fpn_c4p4')(C4) ]) P3 = KL.Add(name="fpn_p3add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), KL.Conv2D(256, (1, 1), name='fpn_c3p3')(C3) ]) P2 = KL.Add(name="fpn_p2add")([ KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), KL.Conv2D(256, (1, 1), name='fpn_c2p2')(C2) ]) # Attach 3x3 conv to all P layers to get the final feature maps. P2 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p2")(P2) P3 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p3")(P3) P4 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p4")(P4) P5 = KL.Conv2D(256, (3, 3), padding="SAME", name="fpn_p5")(P5) # P6 is used for the 5th anchor scale in RPN. Generated by # subsampling from P5 with stride of 2. P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5) # Note that P6 is used in RPN, but not in the classifier heads. rpn_feature_maps = [P2, P3, P4, P5, P6] mrcnn_feature_maps = [P2, P3, P4, P5] # Generate Anchors self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) # RPN Model rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, len(config.RPN_ANCHOR_RATIOS), 256) # Loop through pyramid layers layer_outputs = [] # list of lists for p in rpn_feature_maps: layer_outputs.append(rpn([p])) # Concatenate layer outputs # Convert from list of lists of level outputs to list of lists # of outputs across levels. # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]] output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] outputs = list(zip(*layer_outputs)) outputs = [ KL.Concatenate(axis=1, name=n)(list(o)) for o, n in zip(outputs, output_names) ] rpn_class_logits, rpn_class, rpn_bbox = outputs # Generate proposals # Proposals are [N, (y1, x1, y2, x2)] in normalized coordinates. 
# proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training" \ # else config.POST_NMS_ROIS_INFERENCE proposal_count = config.POST_NMS_ROIS_INFERENCE rpn_rois = ProposalLayer(proposal_count=proposal_count, nms_threshold=0.7, name="ROI", anchors=self.anchors, config=config)([rpn_class, rpn_bbox]) # Network Heads # Proposal classifier and BBox regressor heads mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \ fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, config.IMAGE_SHAPE, config.POOL_SIZE, config.NUM_CLASSES) # Detections # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in image coordinates detections = DetectionLayer(config, name="mrcnn_detection")( [rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta]) # Convert boxes to normalized coordinates h, w = config.IMAGE_SHAPE[:2] detection_boxes = KL.Lambda( lambda x: x[..., :4] / np.array([h, w, h, w]))(detections) # Create masks for detections mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps, config.IMAGE_SHAPE, config.MASK_POOL_SIZE, config.NUM_CLASSES) model = KM.Model([input_image, input_image_meta], [ detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox ], name='mask_rcnn') # Add multi-GPU support. if config.GPU_COUNT > 1: from parallel_model import ParallelModel model = ParallelModel(model, config.GPU_COUNT) return model
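# ProposalLayer above turns anchors into proposals by applying the RPN's
# predicted (dy, dx, log(dh), log(dw)) deltas and then running NMS. A numpy
# sketch of the refinement step only, assuming the usual Faster R-CNN box
# parameterization; apply_box_deltas_sketch is an illustrative name.
import numpy as np

def apply_box_deltas_sketch(boxes, deltas):
    """boxes: [N, (y1, x1, y2, x2)]; deltas: [N, (dy, dx, log(dh), log(dw))]."""
    boxes = boxes.astype(np.float32)
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    center_y = boxes[:, 0] + 0.5 * height
    center_x = boxes[:, 1] + 0.5 * width
    # Shift the center, then rescale the size
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= np.exp(deltas[:, 2])
    width *= np.exp(deltas[:, 3])
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    return np.stack([y1, x1, y1 + height, x1 + width], axis=1)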
def data_generator(dataset, shuffle=True, augment=False, augmentation=None, random_rois=0, batch_size=1, detection_targets=False, no_augmentation_sources=None): """A generator that returns images and corresponding target class ids, bounding box deltas, and masks. dataset: The Dataset object to pick data from config: The model config object shuffle: If True, shuffles the samples before every epoch augment: (deprecated. Use augmentation instead). If true, apply random image augmentation. Currently, only horizontal flipping is offered. augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation. For example, passing imgaug.augmenters.Fliplr(0.5) flips images right/left 50% of the time. random_rois: If > 0 then generate proposals to be used to train the network classifier and mask heads. Useful if training the Mask RCNN part without the RPN. batch_size: How many images to return in each call detection_targets: If True, generate detection targets (class IDs, bbox deltas, and masks). Typically for debugging or visualizations because in trainig detection targets are generated by DetectionTargetLayer. no_augmentation_sources: Optional. List of sources to exclude for augmentation. A source is string that identifies a dataset and is defined in the Dataset class. Returns a Python generator. Upon calling next() on it, the generator returns two lists, inputs and outputs. The contents of the lists differs depending on the received arguments: inputs list: - images: [batch, H, W, C] - image_meta: [batch, (meta data)] Image details. See compose_image_meta() - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral) - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width are those of the image unless use_mini_mask is True, in which case they are defined in MINI_MASK_SHAPE. outputs list: Usually empty in regular training. But if detection_targets is True then the outputs list contains target class_ids, bbox deltas, and masks. """ b = 0 # batch item index image_index = -1 image_ids = np.copy(dataset.image_ids) error_count = 0 no_augmentation_sources = no_augmentation_sources or [] backbone_shapes = utils.compute_backbone_shapes( hyper_parameters.FLAGS.BACKBONE, hyper_parameters.FLAGS.BACKBONE_STRIDES, hyper_parameters.FLAGS.IMAGE_SHAPE) anchors = utils.generate_pyramid_anchors( hyper_parameters.FLAGS.RPN_ANCHOR_SCALES, hyper_parameters.FLAGS.RPN_ANCHOR_RATIOS, backbone_shapes, hyper_parameters.FLAGS.BACKBONE_STRIDES, hyper_parameters.FLAGS.RPN_ANCHOR_STRIDE) while True: try: # Increment index to pick next image. Shuffle if at the start of an epoch. image_index = (image_index + 1) % len(image_ids) if shuffle and image_index == 0: np.random.shuffle(image_ids) # Get GT bounding boxes and masks for image. image_id = image_ids[image_index] if dataset.image_info[image_id][ 'source'] in no_augmentation_sources: image, image_meta, gt_class_ids, gt_boxes, gt_masks = \ load_image_gt(dataset, image_id, augment=augment, augmentation=None, use_mini_mask=hyper_parameters.FLAGS.USE_MINI_MASK) else: image, image_meta, gt_class_ids, gt_boxes, gt_masks = \ load_image_gt(dataset, image_id, augment=augment, augmentation=augmentation, use_mini_mask=hyper_parameters.FLAGS.USE_MINI_MASK) # Skip images that have no instances. 
This can happen in cases # where we train on a subset of classes and the image doesn't # have any of the classes we care about. if not np.any(gt_class_ids > 0): continue # RPN Targets rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors, gt_class_ids, gt_boxes) # Mask R-CNN Targets if random_rois: rpn_rois = generate_random_rois(image.shape, random_rois, gt_class_ids, gt_boxes) if detection_targets: rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask = \ build_detection_targets( rpn_rois, gt_class_ids, gt_boxes, gt_masks) if b == 0: batch_image_meta = np.zeros((batch_size, ) + image_meta.shape, dtype=image_meta.dtype) batch_rpn_match = np.zeros([batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype) batch_rpn_bbox = np.zeros([ batch_size, hyper_parameters.FLAGS.RPN_TRAIN_ANCHORS_PER_IMAGE, 4 ], dtype=rpn_bbox.dtype) batch_images = np.zeros((batch_size, ) + image.shape, dtype=np.float32) batch_gt_class_ids = np.zeros( (batch_size, hyper_parameters.FLAGS.MAX_GT_INSTANCES), dtype=np.int32) batch_gt_boxes = np.zeros( (batch_size, hyper_parameters.FLAGS.MAX_GT_INSTANCES, 4), dtype=np.int32) batch_gt_masks = np.zeros( (batch_size, gt_masks.shape[0], gt_masks.shape[1], hyper_parameters.FLAGS.MAX_GT_INSTANCES), dtype=gt_masks.dtype) if random_rois: batch_rpn_rois = np.zeros((batch_size, random_rois, 4), dtype=np.int32) if detection_targets: batch_rois = np.zeros((batch_size, ) + rois.shape, dtype=rois.dtype) batch_mrcnn_class_ids = np.zeros( (batch_size, ) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype) batch_mrcnn_bbox = np.zeros( (batch_size, ) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype) batch_mrcnn_mask = np.zeros( (batch_size, ) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype) if gt_boxes.shape[0] > hyper_parameters.FLAGS.MAX_GT_INSTANCES: ids = np.random.choice(np.arange(gt_boxes.shape[0]), hyper_parameters.FLAGS.MAX_GT_INSTANCES, replace=False) gt_boxes = gt_boxes[ids] gt_class_ids = gt_class_ids[ids] gt_masks = gt_masks[:, :, ids] batch_image_meta[b] = image_meta batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes batch_rpn_match[b] = rpn_match[:, np.newaxis] batch_rpn_bbox[b] = rpn_bbox batch_images[b] = utils.mold_image( image.astype(np.float32), hyper_parameters.FLAGS.MEAN_PIXEL) batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks if random_rois: batch_rpn_rois[b] = rpn_rois if detection_targets: batch_rois[b] = rois batch_mrcnn_class_ids[b] = mrcnn_class_ids batch_mrcnn_bbox[b] = mrcnn_bbox batch_mrcnn_mask[b] = mrcnn_mask b += 1 if b >= batch_size: inputs = [ batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox, batch_gt_class_ids, batch_gt_boxes, batch_gt_masks ] outputs = [] if random_rois: inputs.extend([batch_rpn_rois]) if detection_targets: inputs.extend([batch_rois]) # Keras requires that output and targets have the same number of dimensions batch_mrcnn_class_ids = np.expand_dims( batch_mrcnn_class_ids, -1) outputs.extend([ batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask ]) yield inputs, outputs b = 0 except (GeneratorExit, KeyboardInterrupt): raise except: # Log it and skip the image logging.exception("Error processing image {}".format( dataset.image_info[image_id])) error_count += 1 if error_count > 5: raise
def data_generator(dataset, config, shuffle=True, augment=True, random_rois=0, batch_size=1, detection_targets=False): """A generator that returns images and corresponding target class ids, bounding box deltas, and masks. dataset: The Dataset object to pick data from tf_config: The model tf_config object shuffle: If True, shuffles the samples before every epoch augment: If True, applies image augmentation to images (currently only horizontal flips are supported) random_rois: If > 0 then generate proposals to be used to train the network classifier and mask heads. Useful if training the Mask RCNN part without the RPN. batch_size: How many images to return in each call detection_targets: If True, generate detection targets (class IDs, bbox deltas, and masks). Typically for debugging or visualizations because in trainig detection targets are generated by DetectionTargetLayer. Returns a Python generator. Upon calling next() on it, the generator returns two lists, inputs and outputs. The containtes of the lists differs depending on the received arguments: inputs list: - images: [batch, H, W, C] - image_meta: [batch, size of image meta] - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral) - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width are those of the image unless use_mini_mask is True, in which case they are defined in MINI_MASK_SHAPE. outputs list: Usually empty in regular training. But if detection_targets is True then the outputs list contains target class_ids, bbox deltas, and masks. """ b = 0 # batch item index image_index = -1 image_ids = np.copy(dataset.image_ids) error_count = 0 # Anchors # [anchor_count, (y1, x1, y2, x2)] anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) # Keras requires a generator to run indefinately. while True: try: # Increment index to pick next image. Shuffle if at the start of an epoch. image_index = (image_index + 1) % len(image_ids) if shuffle and image_index == 0: np.random.shuffle(image_ids) # Get GT bounding boxes and masks for image. image_id = image_ids[image_index] image, image_meta, gt_class_ids, gt_boxes, gt_masks = \ load_image_gt(dataset, config, image_id, augment=augment, use_mini_mask=config.USE_MINI_MASK) # Skip images that have no instances. This can happen in cases # where we train on a subset of classes and the image doesn't # have any of the classes we care about. 
if not np.any(gt_class_ids > 0): continue # RPN Targets rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors, gt_class_ids, gt_boxes, config) # Mask R-CNN Targets if random_rois: rpn_rois = generate_random_rois( image.shape, random_rois, gt_class_ids, gt_boxes) if detection_targets: rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask =\ build_detection_targets( rpn_rois, gt_class_ids, gt_boxes, gt_masks, config) # Init batch arrays if b == 0: batch_image_meta = np.zeros( (batch_size,) + image_meta.shape, dtype=image_meta.dtype) batch_rpn_match = np.zeros( [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype) batch_rpn_bbox = np.zeros( [batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype) batch_images = np.zeros( (batch_size,) + image.shape, dtype=np.float32) batch_gt_class_ids = np.zeros( (batch_size, config.MAX_GT_INSTANCES), dtype=np.int32) batch_gt_boxes = np.zeros( (batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32) if config.USE_MINI_MASK: batch_gt_masks = np.zeros((batch_size, config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], config.MAX_GT_INSTANCES)) else: batch_gt_masks = np.zeros( (batch_size, image.shape[0], image.shape[1], config.MAX_GT_INSTANCES)) if random_rois: batch_rpn_rois = np.zeros( (batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype) if detection_targets: batch_rois = np.zeros( (batch_size,) + rois.shape, dtype=rois.dtype) batch_mrcnn_class_ids = np.zeros( (batch_size,) + mrcnn_class_ids.shape, dtype=mrcnn_class_ids.dtype) batch_mrcnn_bbox = np.zeros( (batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype) batch_mrcnn_mask = np.zeros( (batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype) # If more instances than fits in the array, sub-sample from them. if gt_boxes.shape[0] > config.MAX_GT_INSTANCES: ids = np.random.choice( np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False) gt_class_ids = gt_class_ids[ids] gt_boxes = gt_boxes[ids] gt_masks = gt_masks[:, :, ids] # Add to batch batch_image_meta[b] = image_meta batch_rpn_match[b] = rpn_match[:, np.newaxis] batch_rpn_bbox[b] = rpn_bbox batch_images[b] = mold_image(image.astype(np.float32), config) batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks if random_rois: batch_rpn_rois[b] = rpn_rois if detection_targets: batch_rois[b] = rois batch_mrcnn_class_ids[b] = mrcnn_class_ids batch_mrcnn_bbox[b] = mrcnn_bbox batch_mrcnn_mask[b] = mrcnn_mask b += 1 # Batch full? if b >= batch_size: inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox, batch_gt_class_ids, batch_gt_boxes, batch_gt_masks] outputs = [] if random_rois: inputs.extend([batch_rpn_rois]) if detection_targets: inputs.extend([batch_rois]) # Keras requires that output and targets have the same number of dimensions batch_mrcnn_class_ids = np.expand_dims( batch_mrcnn_class_ids, -1) outputs.extend( [batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask]) yield inputs, outputs # start a new batch b = 0 except (GeneratorExit, KeyboardInterrupt): raise except: # Log it and skip the image logging.exception("Error processing image {}".format( dataset.image_info[image_id])) error_count += 1 if error_count > 5: raise
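# build_rpn_targets (called above) labels each anchor by its overlap with the
# ground-truth boxes. A heavily simplified numpy sketch of the matching rule
# only (illustrative names; the per-image subsampling to
# RPN_TRAIN_ANCHORS_PER_IMAGE and crowd-box handling are omitted).
import numpy as np

def iou_matrix_sketch(anchors, gt_boxes):
    """IoU between every anchor and every GT box, both [N, (y1, x1, y2, x2)]."""
    y1 = np.maximum(anchors[:, None, 0], gt_boxes[None, :, 0])
    x1 = np.maximum(anchors[:, None, 1], gt_boxes[None, :, 1])
    y2 = np.minimum(anchors[:, None, 2], gt_boxes[None, :, 2])
    x2 = np.minimum(anchors[:, None, 3], gt_boxes[None, :, 3])
    inter = np.maximum(y2 - y1, 0) * np.maximum(x2 - x1, 0)
    area_a = (anchors[:, 2] - anchors[:, 0]) * (anchors[:, 3] - anchors[:, 1])
    area_g = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
    return inter / np.maximum(area_a[:, None] + area_g[None, :] - inter, 1e-8)

def rpn_match_sketch(anchors, gt_boxes, pos_thresh=0.7, neg_thresh=0.3):
    """Returns 1 for positive anchors, -1 for negative, 0 for neutral."""
    iou = iou_matrix_sketch(anchors, gt_boxes)
    best_iou = iou.max(axis=1)
    match = np.zeros(anchors.shape[0], dtype=np.int32)
    match[best_iou < neg_thresh] = -1
    match[best_iou >= pos_thresh] = 1
    # Each GT box also claims its single best anchor, so no object is skipped.
    match[iou.argmax(axis=0)] = 1
    return match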
def __init__(self, options, config, split, random=True, loadNeighborImage=False, load_semantics=False, load_boundary=False): self.options = options self.config = config self.split = split self.random = random self.dataFolder = options.dataFolder self.scenes = [] self.sceneImageIndices = [] self.loadClassMap() planenet_scene_ids_val = np.load('datasets/scene_ids_val.npy') planenet_scene_ids_val = { scene_id.decode('utf-8'): True for scene_id in planenet_scene_ids_val } with open(self.dataFolder + '/ScanNet/Tasks/Benchmark/scannetv1_' + split + '.txt') as f: for line in f: scene_id = line.strip() if split == 'test': ## Remove scenes which are in PlaneNet's training set for fair comparison if scene_id not in planenet_scene_ids_val: continue pass scenePath = self.dataFolder + '/scans/' + scene_id if not os.path.exists(scenePath + '/' + scene_id + '.txt') or not os.path.exists( scenePath + '/annotation/planes.npy'): continue scene = ScanNetScene(options, scenePath, scene_id, self.confident_labels, self.layout_labels, load_semantics=load_semantics, load_boundary=load_boundary) self.scenes.append(scene) self.sceneImageIndices += [[ len(self.scenes) - 1, imageIndex ] for imageIndex in range(len(scene.imagePaths))] continue pass if random: t = int(time.time() * 1000000) np.random.seed(((t & 0xff000000) >> 24) + ((t & 0x00ff0000) >> 8) + ((t & 0x0000ff00) << 8) + ((t & 0x000000ff) << 24)) else: np.random.seed(0) pass np.random.shuffle(self.sceneImageIndices) self.invalid_indices = {} with open(self.dataFolder + '/invalid_indices_' + split + '.txt', 'r') as f: for line in f: tokens = line.split(' ') if len(tokens) == 3: assert (int(tokens[2]) < 10000) invalid_index = int(tokens[1]) * 10000 + int(tokens[2]) if invalid_index not in self.invalid_indices: self.invalid_indices[invalid_index] = True pass pass continue pass self.sceneImageIndices = [ [sceneIndex, imageIndex] for sceneIndex, imageIndex in self.sceneImageIndices if (sceneIndex * 10000 + imageIndex) not in self.invalid_indices ] print('num images', len(self.sceneImageIndices)) self.anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) self.loadNeighborImage = loadNeighborImage return
def build(self, mode, config, images): assert mode in ['training', 'inference'] # Image size must be dividable by 2 multiple times # h, w = config.IMAGE_SHAPE[:2] # if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): # raise Exception("Image size must be dividable by 2 at least 6 times " # "to avoid fractions when downscaling and upscaling." # "For example, use 256, 320, 384, 448, 512, ... etc. ") # input_image = tf.placeholder(shape=config.IMAGE_SHAPE.tolist(), name="input_image") C2, C3, C4, C5 = resnet_graph(images, "resnet50", stage5=True) #128*4*4*256 P5 = setool.conv_op(input_op=C5, name='fpn_c5p5', kh=1, kw=1, n_out=256) P4 = setool.conv_op(input_op=C4, name='fpn_c4p4',kh=1, kw=1, n_out=256) + \ tf.image.resize_images(P5, [64,64]) P3= setool.conv_op(input_op=C3, name='fpn_c3p3',kh=1, kw=1, n_out=256) + \ tf.image.resize_images(P4, [128, 128]) P2= setool.conv_op(input_op=C2, name='fpn_c2p2',kh=1, kw=1, n_out=256) + \ tf.image.resize_images(P3, [256, 256]) P2 = setool.conv_op(input_op=P2, name='fpn_p2', n_out=256) P3 = setool.conv_op(input_op=P3, name='fpn_p3', n_out=256) P4 = setool.conv_op(input_op=P4, name='fpn_p4', n_out=256) P5 = setool.conv_op(input_op=P5, name='fpn_p5', n_out=256) P6 = setool.mpool_op(input_tensor=P5, k=1, s=2, name="fpn_p6") rpn_feature_maps = [P2, P3, P4, P5, P6] mrcnn_feature_maps = [P2, P3, P4, P5] # Generate Anchors self.anchors = utils.generate_pyramid_anchors( self.config.RPN_ANCHOR_SCALES, self.config.RPN_ANCHOR_RATIOS, self.config.BACKBONE_SHAPES, self.config.BACKBONE_STRIDES, self.config.RPN_ANCHOR_STRIDE) #(32, 64, 128, 256, 512) 3, [256,128,64,32,16], [4, 8, 16, 32, 64], 1 rpn_P6 = RPN_net( P6, anchor_stride=self.config.RPN_ANCHOR_STRIDE).build_rpn_model() rpn_P5 = RPN_net( P5, anchor_stride=self.config.RPN_ANCHOR_STRIDE).build_rpn_model() rpn_P4 = RPN_net( P4, anchor_stride=self.config.RPN_ANCHOR_STRIDE).build_rpn_model() rpn_P3 = RPN_net( P3, anchor_stride=self.config.RPN_ANCHOR_STRIDE).build_rpn_model() rpn_P2 = RPN_net( P2, anchor_stride=self.config.RPN_ANCHOR_STRIDE).build_rpn_model() rpn_class_logits = tf.concat( [rpn_P2[0], rpn_P3[0], rpn_P4[0], rpn_P5[0], rpn_P6[0]], 1) rpn_class = tf.concat( [rpn_P2[1], rpn_P3[1], rpn_P4[1], rpn_P5[1], rpn_P6[1]], 1) rpn_bbox = tf.concat( [rpn_P2[2], rpn_P3[2], rpn_P4[2], rpn_P5[2], rpn_P6[2]], 1) # print(rpn_class_logits.shape) # print(rpn_class.shape) # print(rpn_bbox.shape) return rpn_class_logits, rpn_bbox, rpn_class
molded_image = mold_image(molded_image)
print("Molded image shape is : ", molded_image.shape)
image_meta = compose_image_meta(
    0, image.shape, molded_image.shape, inferwindow, scale,
    np.zeros([inferconfig.NUM_CLASSES], dtype=np.int32))
#image = image[np.newaxis,:]
#anchors = anchors[np.newaxis, :]
image_meta = image_meta.reshape(1, -1)
backbone_shapes = compute_backbone_shapes(inferconfig, molded_image.shape)
imageshapeinfer = molded_image.shape
molded_image = molded_image[np.newaxis, :]
#print("Backbone shape is : ", backbone_shapes)
anchors = utils.generate_pyramid_anchors(inferconfig.RPN_ANCHOR_SCALES,
                                         inferconfig.RPN_ANCHOR_RATIOS,
                                         backbone_shapes,
                                         inferconfig.BACKBONE_STRIDES,
                                         inferconfig.RPN_ANCHOR_STRIDE)
#print("Anchor generate parameter : ", inferconfig.RPN_ANCHOR_SCALES)
#print("Anchor generate parameter : ", inferconfig.RPN_ANCHOR_RATIOS)
#print("Anchor generate parameter : ", backbone_shapes)
#print("Anchor generate parameter : ", inferconfig.BACKBONE_STRIDES)
#print("Anchor generate parameter : ", inferconfig.RPN_ANCHOR_STRIDE)
#print("Original anchor shape is :", anchors.shape)
anchors = np.broadcast_to(anchors, (inferconfig.BATCH_SIZE,) + anchors.shape)
anchors = utils.norm_boxes(anchors, imageshapeinfer[:2])
print("The input anchors shape is : ", anchors.shape)
print('The input anchors are : \n', anchors)
#print(image.shape)
test_list = []
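# mold_image (used above) subtracts the per-channel pixel means so the network
# sees zero-centered inputs. A sketch assuming the default MEAN_PIXEL values
# from the stock config; both function names are illustrative.
import numpy as np

MEAN_PIXEL = np.array([123.7, 116.8, 103.9])

def mold_image_sketch(image):
    return image.astype(np.float32) - MEAN_PIXEL

def unmold_image_sketch(molded):
    return (molded + MEAN_PIXEL).astype(np.uint8)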
def inference(): # Root directory of the project ROOT_DIR = os.getcwd() # Directory to save logs and model checkpoints, if not provided # through the command line argument --logs LOG_DIR = os.path.join(ROOT_DIR, "output/logs") MODEL_DIR = os.path.join(ROOT_DIR, "output/training") dataset_path = os.path.join(ROOT_DIR, 'data/coco') config = InferenceConfig() config.display() dataset_val = gen_cocodb.CocoDataSet() dataset_val.load_coco(dataset_path, "minival", year="2014", auto_download=False) dataset_val.prepare() print("Images: {}\nClasses: {}".format(len(dataset_val.image_ids), dataset_val.class_names)) image_id = random.choice(dataset_val.image_ids) # image, image_meta, gt_class_id, gt_bbox, gt_mask = dataset_val.load_image_gt(dataset_val, config, image_id, use_mini_mask=False) # info = dataset_val.image_info[image_id] # print("image ID: {}.{} ({}) {}".format(info["source"], info["id"], image_id, # dataset_val.image_reference(image_id))) image = dataset_val.load_image(image_id) images = np.expand_dims(image, axis=0) molded_images, image_metas, windows = gen_cocodb.mold_inputs( images, config) print(molded_images.shape, image_metas.shape, windows.shape) anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) with tf.device('/device:CPU:0'): model = modellib.MaskRCNN(mode='inference', config=config, model_dir=LOG_DIR, anchors=anchors) print(len(model.outputs)) feed_dict = { model.input_image: molded_images, model.input_image_meta: image_metas } detections = model.outputs['detections'] mrcnn_class = model.outputs['mrcnn_class'] mrcnn_bbox = model.outputs['mrcnn_bbox'] mrcnn_mask = model.outputs['mrcnn_mask'] saver = tf.train.Saver() init_op = tf.global_variables_initializer() with tf.device('/device:CPU:0'): with tf.Session() as sess: sess.run(init_op) # saver.restore(sess, "output/training/mrcnn.ckpt-96000") ckpt = tf.train.get_checkpoint_state(MODEL_DIR) """ resotre checkpoint of Backbone network """ if ckpt is not None: ckpt_path = tf.train.latest_checkpoint(MODEL_DIR) # ckpt_path = FLAGS.checkpoint_model saver.restore(sess, ckpt_path) else: ckpt_path = "output/training/mrcnn.ckpt-96000" saver.restore(sess, ckpt_path) print('ckpt_path', ckpt_path) pre_nms_anchors = sess.graph.get_tensor_by_name( "pre_nms_anchors:0") refined_anchors = sess.graph.get_tensor_by_name( "refined_anchors:0") refined_anchors_clipped = sess.graph.get_tensor_by_name( "refined_anchors_clipped:0") print(pre_nms_anchors) print(refined_anchors) print(refined_anchors_clipped) detect, pred_class, pred_bbox, pred_mask = sess.run( [detections, mrcnn_class, mrcnn_bbox, mrcnn_mask], feed_dict=feed_dict) print(detect.shape, pred_class.shape, pred_bbox.shape, pred_mask.shape) # Process detections final_rois, final_class_ids, final_scores, final_masks = gen_cocodb.unmold_detections( detect[0], pred_mask[0], image.shape, windows[0]) ax = get_ax(1) visualize.display_instances(image, final_rois, final_masks, final_class_ids, dataset_val.class_names, final_scores, ax=ax, title="Predictions") print(final_rois.shape, final_class_ids.shape, final_scores.shape, final_masks.shape) print(final_class_ids) print(final_scores) print(final_rois)
def train(train_dataset, config, lr, train_layers, epochs): anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, config.BACKBONE_SHAPES, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) with tf.Graph().as_default(): deploy_config = model_deploy.DeploymentConfig(num_clones=config.GPU_COUNT, clone_on_cpu=False, replica_id=0, num_replicas=1, num_ps_tasks=0) with tf.device(deploy_config.variables_device()): print(deploy_config.variables_device()) global_step = tf.train.create_global_step() with tf.device(deploy_config.inputs_device()): print(deploy_config.inputs_device()) with tf.name_scope('coco_data_generator'): train_generator = data_generator(train_dataset, config, anchors, shuffle=True) models =[] def clone_fn(): model = modellib.MaskRCNN(mode=mode, config=config, model_dir=DEFAULT_LOGS_DIR, anchors=anchors) models.append(model) losses = tf.get_collection(tf.GraphKeys.LOSSES) model_loss = tf.add_n(losses) return model_loss clones = model_deploy.create_clones(deploy_config, clone_fn) first_clone_scope = deploy_config.clone_scope(0) print(first_clone_scope) # Gather update_ops from the first clone. These contain, for example, # the updates for the batch_norm variables created by network_fn. update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) # Gather initial summaries. summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) for loss in tf.get_collection(tf.GraphKeys, first_clone_scope): summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss)) ######################################### # Configure the optimization procedure. # ######################################### print(deploy_config.optimizer_device()) with tf.device(deploy_config.optimizer_device()): learning_rate = tf.placeholder(dtype=tf.float32, shape=(), name='learning_rate') optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=config.LEARNING_MOMENTUM, name='Momentum') summaries.add(tf.summary.scalar('learning_rate', learning_rate)) variables_to_train = set_trainable(train_layers) total_loss, clones_gradients = model_deploy.optimize_clones(clones, optimizer, var_list=variables_to_train) # Create gradient updates. grad_updates = optimizer.apply_gradients(clones_gradients, global_step=global_step) update_ops.append(grad_updates) update_op = tf.group(*update_ops) # Add total_loss to summary. summaries.add(tf.summary.scalar('total_loss', total_loss)) print(total_loss) summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope)) summary_op = tf.summary.merge(list(summaries), name='summary_op') summary_writer = tf.summary.FileWriter(models[0].log_dir, graph=tf.Session().graph) """ set saver for saving final model and backbone model for restore """ # variables_to_restore = _get_restore_vars('FeatureExtractor/MobilenetV1') # re_saver = tf.train.Saver(var_list=variables_to_restore) saver = tf.train.Saver(max_to_keep=3) """ Set Gpu Env """ init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) """ Starting Training..... 
""" gpu_opt = tf.GPUOptions(per_process_gpu_memory_fraction=0.9, allow_growth=True) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_opt)) as sess: sess.run(init_op) # re_saver.restore(sess, 'data/pretrained_models/mobilenet_v1_coco/model.ckpt') ckpt = tf.train.get_checkpoint_state("output/training") """ resotre checkpoint of Backbone network """ if ckpt: lastest_ckpt = tf.train.latest_checkpoint("output/training") print('lastest', lastest_ckpt) saver.restore(sess, lastest_ckpt) try: while True: feed_dict={learning_rate:lr} inputs = train_generator.next() num_epoch = inputs[7] for i in range(len(clones)): s = 2*i e = 2*(i+1) feed_dict[models[i].input_image] = inputs[0][s:e, :] feed_dict[models[i].input_image_meta] = inputs[1][s:e, :] feed_dict[models[i].input_rpn_match] = inputs[2][s:e, :] feed_dict[models[i].input_rpn_bbox] = inputs[3][s:e, :] feed_dict[models[i].input_gt_class_ids] = inputs[4][s:e, :] feed_dict[models[i].input_gt_boxes] = inputs[5][s:e, :] feed_dict[models[i].input_gt_masks] = inputs[6][s:e, :] _, loss, current_step, summary = sess.run([update_op, total_loss, global_step, summary_op], feed_dict=feed_dict) print ("""iter %d : total-loss %.4f """ %(current_step, loss)) if np.isnan(loss) or np.isinf(loss): print('isnan or isinf', loss) raise if current_step % 1000 == 0: # write summary # summary = sess.run(summary_op, feed_dict=feed_dict) summary_writer.add_summary(summary, current_step) summary_writer.flush() if current_step % 3000 == 0: # Save a checkpoint save_path = 'output/training/mrcnn.ckpt' saver.save(sess, save_path, global_step=current_step) if num_epoch > epochs: print("num epoch : %d and training End!!!" % num_epoch) break except Exception as ex: print('Error occured!!!! => ', ex) finally: print("Final!!") saver.save(sess, 'output/models/mrcnn_final.ckpt', write_meta_graph=False)
def data_generator(config, shuffle=True, augmentation=None,batch_size=1): """ A generator that returns images and corresponding target class ids, bounding box deltas, and masks. Returns a Python generator. Upon calling next() on it, the generator returns two lists, inputs and outputs. The contents of the lists differs depending on the received arguments: inputs list: - images: [batch, H, W, C] - image_meta: [batch, (meta data)] Image details. See compose_image_meta() - rpn_match: [batch, N] Integer (1=positive anchor, -1=negative, 0=neutral) - rpn_bbox: [batch, N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas. - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] - gt_masks: [batch, height, width, MAX_GT_INSTANCES]. The height and width are those of the image unless use_mini_mask is True, in which case they are defined in MINI_MASK_SHAPE. outputs list: Usually empty in regular training. But if detection_targets is True then the outputs list contains target class_ids, bbox deltas, and masks. """ b = 0 ix = 0 image_files = glob.glob("./data/train/*.jpg") # Anchors # [anchor_count, (y1, x1, y2, x2)] backbone_shapes = compute_backbone_shapes(config, config.IMAGE_SHAPE) anchors = utils.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, config.RPN_ANCHOR_RATIOS, backbone_shapes, config.BACKBONE_STRIDES, config.RPN_ANCHOR_STRIDE) while True: if shuffle and ix == 0: np.random.shuffle(image_files) image_path = image_files[ix] json_path = image_files[ix].replace("jpg", "json") image = load_image(image_path) original_shape = image.shape mask, class_ids = load_mask(json_path) image, window, scale, padding, crop = utils.resize_image( image, min_dim=config.IMAGE_MIN_DIM, min_scale=config.IMAGE_MIN_SCALE, max_dim=config.IMAGE_MAX_DIM, mode=config.IMAGE_RESIZE_MODE) mask = utils.resize_mask(mask, scale, padding, crop) # Augmentation # This requires the imgaug lib (https://github.com/aleju/imgaug) if augmentation: import imgaug # Augmenters that are safe to apply to masks # Some, such as Affine, have settings that make them unsafe, so always # test your augmentation on masks MASK_AUGMENTERS = ["Sequential", "SomeOf", "OneOf", "Sometimes", "Fliplr", "Flipud", "CropAndPad", "Affine", "PiecewiseAffine"] def hook(images, augmenter, parents, default): """Determines which augmenters to apply to masks.""" return augmenter.__class__.__name__ in MASK_AUGMENTERS # Store shapes before augmentation to compare image_shape = image.shape mask_shape = mask.shape # Make augmenters deterministic to apply similarly to images and masks det = augmentation.to_deterministic() image = det.augment_image(image) # Change mask to np.uint8 because imgaug doesn't support np.bool mask = det.augment_image(mask.astype(np.uint8), hooks=imgaug.HooksImages(activator=hook)) # Verify that shapes didn't change assert image.shape == image_shape, "Augmentation shouldn't change image size" assert mask.shape == mask_shape, "Augmentation shouldn't change mask size" # Change mask back to bool mask = mask.astype(np.bool) bbox = utils.extract_bboxes(mask) use_mini_mask = True if use_mini_mask: mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE) # image_meta is for debug image_meta = compose_image_meta(0, original_shape, image.shape, window, scale, np.ones(len(class_name2idx))) # RPN Targets rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors, class_ids, bbox, config) if b == 0: batch_image_meta = np.zeros( (batch_size,) + image_meta.shape, dtype=image_meta.dtype) 
batch_rpn_match = np.zeros( [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype) batch_rpn_bbox = np.zeros( [batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype) batch_images = np.zeros( (batch_size,) + image.shape, dtype=np.float32) batch_gt_class_ids = np.zeros( (batch_size, config.MAX_GT_INSTANCES), dtype=np.int32) batch_gt_boxes = np.zeros( (batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32) batch_gt_masks = np.zeros( (batch_size, mask.shape[0], mask.shape[1], config.MAX_GT_INSTANCES), dtype=mask.dtype) # Add to batch batch_image_meta[b] = image_meta batch_rpn_match[b] = rpn_match[:, np.newaxis] batch_rpn_bbox[b] = rpn_bbox batch_images[b] = mold_image(image.astype(np.float32), config) batch_gt_class_ids[b, :class_ids.shape[0]] = class_ids batch_gt_boxes[b, :bbox.shape[0]] = bbox batch_gt_masks[b, :, :, :mask.shape[-1]] = mask b += 1 ix = (ix + 1) % len(image_files) if b >= batch_size: inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox, batch_gt_class_ids, batch_gt_boxes, batch_gt_masks] outputs = [] yield inputs,outputs b = 0
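# Illustrative usage of the generator above (assumes a config object and the
# ./data/train folder it expects): pull one batch and check that the target
# shapes line up with the anchor count.
gen = data_generator(config, shuffle=True, batch_size=2)
inputs, _ = next(gen)
images, image_meta, rpn_match, rpn_bbox, gt_class_ids, gt_boxes, gt_masks = inputs
print(images.shape)     # (2, H, W, 3)
print(rpn_match.shape)  # (2, anchor_count, 1)
print(rpn_bbox.shape)   # (2, RPN_TRAIN_ANCHORS_PER_IMAGE, 4)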