def __init__(self, feature_depth, cfg, mode='all', phase='train'):
    """Build the layers of the region proposal network.

    feature_depth: number of channels of the incoming feature map; used
                   as the input width of the 3x3 RPN convolution --> int
    cfg:           config indexed with the keys 'sliding_windows_scales',
                   'sliding_windows_ratio' and 'feature_stride'
    mode:          stored on the instance as ``self.mode``; not otherwise
                   used by this constructor (default 'all')
    phase:         stored on the instance as ``self.phase``; 'train' / 'test'
    """
    super(RegionProposalNetwork, self).__init__()

    hidden_channels = 256

    self.depth = feature_depth

    # Anchor description: one anchor per (scale, ratio) combination.
    scales = cfg['sliding_windows_scales']
    ratios = cfg['sliding_windows_ratio']
    self.anchor_scales = scales
    self.anchor_ratios = ratios
    self.k = len(scales) * len(ratios)

    self.feat_stride = cfg['feature_stride']
    self.mode = mode
    self.phase = phase

    # Shared 3x3 convolution followed by an in-place ReLU.
    shared_conv = nn.Conv2d(
        in_channels=self.depth,
        out_channels=hidden_channels,
        kernel_size=3,
        stride=1,
        padding=1,
    )
    self.rpn_conv = nn.Sequential(shared_conv, nn.ReLU(inplace=True))

    # 1x1 heads: 2 outputs per anchor for classification and
    # 4 outputs per anchor for box regression.
    self.cls_layer = nn.Conv2d(
        in_channels=hidden_channels,
        out_channels=2 * self.k,
        kernel_size=1,
        stride=1,
        padding=0,
    )
    self.reg_layer = nn.Conv2d(
        in_channels=hidden_channels,
        out_channels=4 * self.k,
        kernel_size=1,
        stride=1,
        padding=0,
    )

    # Proposal layer, configured from the same stride / scales / ratios.
    self.RPN_proposal = ProposalLayer(
        self.feat_stride, self.anchor_scales, self.anchor_ratios)
def __init__(self, in_ch=512, out_ch=512, n_anchors=9, feat_stride=16,
             anchor_scales=None, num_classes=21, rpn_sigma=3.0):
    """Set up the RPN links and its anchor-target / proposal layers.

    Args:
        in_ch: input channels of the 3x3 convolution.
        out_ch: output channels of the 3x3 convolution, which is also the
            input width of both 1x1 heads.
        n_anchors: anchors per location; the score head has
            ``2 * n_anchors`` outputs and the bbox head ``4 * n_anchors``.
        feat_stride: passed to ``AnchorTargetLayer`` and ``ProposalLayer``.
        anchor_scales: passed to ``ProposalLayer``. ``None`` (the default)
            means ``[8, 16, 32]``.
        num_classes: accepted for interface compatibility; not used by
            this constructor.
        rpn_sigma: stored as ``self.rpn_sigma``.
    """
    # A list literal in the signature would be created once and shared by
    # every instance; build a fresh default per call instead.
    if anchor_scales is None:
        anchor_scales = [8, 16, 32]

    super(RPN, self).__init__(
        rpn_conv_3x3=L.Convolution2D(in_ch, out_ch, 3, 1, 1),
        rpn_cls_score=L.Convolution2D(out_ch, 2 * n_anchors, 1, 1, 0),
        rpn_bbox_pred=L.Convolution2D(out_ch, 4 * n_anchors, 1, 1, 0))
    self.anchor_target_layer = AnchorTargetLayer(feat_stride)
    self.proposal_layer = ProposalLayer(feat_stride, anchor_scales)
    self.rpn_sigma = rpn_sigma
def __init__(self, input_dim, num_anchors_per_frame, output_dim):
    """Create the linear score / bbox heads and the proposal layer.

    input_dim:             input feature size of both linear heads
    num_anchors_per_frame: number of anchors per frame; scales the output
                           size of both heads
    output_dim:            number of classes scored per anchor
                           (stored as ``self.num_class``)
    """
    super(RPN, self).__init__()

    self.num_class = output_dim
    self.input_dim = input_dim
    # Global value: the number of anchors used in the experiments.
    self.num_anchors_per_frame = num_anchors_per_frame

    # Window size limits come from the module-level config.
    self.min_window_size = cfg.MIN_WINDOW_SIZE
    self.max_window_size = cfg.MAX_WINDOW_SIZE

    n_anchors = self.num_anchors_per_frame
    # One score per class for every anchor.
    self.num_score_out = n_anchors * self.num_class
    # Two coordinates for every anchor.
    self.num_bbox_out = n_anchors * 2

    self.cls_score_RPN = nn.Linear(
        in_features=self.input_dim,
        out_features=self.num_score_out,
        bias=True,
    )
    self.bbox_score_RPN = nn.Linear(
        in_features=self.input_dim,
        out_features=self.num_bbox_out,
        bias=True,
    )

    self.RPN_proposal_layer = ProposalLayer(
        n_anchors, self.min_window_size, self.max_window_size)
def build_model(dataset, frcn_rois_per_img, train_pre_nms_N=12000, train_post_nms_N=2000,
                test_pre_nms_N=6000, test_post_nms_N=300, inference=False):
    """
    Assemble the Faster-RCNN model; for inference also hand back the
    proposal layer.

    The model is a tree made of VGG, the Region Proposal Network (RPN) and
    the classification network (ROI-pooling + fully connected layers):

    VGG -> b1 -> Conv (3x3) -> b2 -> Conv (1x1) -> CrossEntropyMulti (objectness label)
                               b2 -> Conv (1x1) -> SmoothL1Loss (bounding box targets)
           b1 -> PropLayer -> ROI -> Affine -> Affine -> b3 -> Affine -> CrossEntropyMulti
                                                         b3 -> Affine -> SmoothL1Loss

    Differences between training and inference construction:
        - The number of regions kept before / after non-max suppression is
          (train_pre_nms_N, train_post_nms_N) for training and
          (test_pre_nms_N, test_post_nms_N) for inference.
        - The inference flag is forwarded to the ProposalLayer constructor.
          NOTE(review): the original notes say this makes the layer's
          out_shape equal to post_nms_N -- confirm in ProposalLayer.

    Arguments:
        dataset (objectlocalization): Dataset object; supplies num_classes
                                      and is passed to the ProposalLayer.
        frcn_rois_per_img (int): Number of ROIs per image considered by the
                                 classification network.
        train_pre_nms_N (int): Regions kept before NMS when training. Default 12000.
        train_post_nms_N (int): Regions kept after NMS when training. Default 2000.
        test_pre_nms_N (int): Regions kept before NMS for inference. Default 6000.
        test_post_nms_N (int): Regions kept after NMS for inference. Default 300.
        inference (bool): Construct the model for inference. Default is False.

    Returns:
        model (Model): Faster-RCNN model.
        proposalLayer (proposalLayer): Reference to proposalLayer in the model.
                                       Returned only for inference=True, as the
                                       second element of a (model, proposalLayer)
                                       tuple.
    """
    n_classes = dataset.num_classes

    # branch points of the tree
    conv_branch = BranchNode(name="conv_branch")
    rpn_branch = BranchNode(name="rpn_branch")
    roi_branch = BranchNode(name="roi_branch")

    # VGG trunk
    vgg_layers = util.add_vgg_layers()

    # RPN convolutions; the same initializer objects are shared by all three.
    rpn_init = dict(strides=1, init=Gaussian(scale=0.01), bias=Constant(0))
    rpn_conv = Conv((3, 3, 512), activation=Rectlin(), padding=1, **rpn_init)
    # The two 1x1 heads are also handed to the ProposalLayer below.
    rpn_objectness = Conv((1, 1, 18), activation=PixelwiseSoftmax(c=2), padding=0, **rpn_init)
    rpn_bbox = Conv((1, 1, 36), activation=Identity(), padding=0, **rpn_init)

    # NMS budget depends on whether we build for training or inference.
    if inference:
        nms_before, nms_after = test_pre_nms_N, test_post_nms_N
    else:
        nms_before, nms_after = train_pre_nms_N, train_post_nms_N

    proposal = ProposalLayer([rpn_objectness, rpn_bbox], dataset,
                             pre_nms_N=nms_before, post_nms_N=nms_after,
                             num_rois=frcn_rois_per_img, inference=inference)

    def fc_block():
        # One fully connected layer of the ROI head followed by dropout.
        return [
            Affine(nout=4096, init=Gaussian(scale=0.005),
                   bias=Constant(.1), activation=Rectlin()),
            Dropout(keep=0.5),
        ]

    # ROI classification network
    roi_layers = [proposal, RoiPooling(HW=(7, 7))] + fc_block() + fc_block()

    category_head = Affine(nout=n_classes, init=Gaussian(scale=0.01),
                           bias=Constant(0), activation=Softmax())
    bbox_head = Affine(nout=4 * n_classes, init=Gaussian(scale=0.001),
                       bias=Constant(0), activation=Identity())

    # The four branches of the tree mirror the diagram in the docstring.
    frcn_tree = Tree([
        roi_layers + [roi_branch, category_head],
        [roi_branch, bbox_head],
    ])
    model = Model(layers=Tree([
        vgg_layers + [conv_branch, rpn_conv, rpn_branch, rpn_objectness],
        [rpn_branch, rpn_bbox],
        [conv_branch] + [frcn_tree],
    ]))

    return (model, proposal) if inference else model
def test_proposal_layer(backend_default, fargs):
    """Check ProposalLayer output against the PyCaffe reference layers.

    The layer is run on random objectness scores and bbox deltas. Its
    proposals and scores are compared with ``PyCaffeProposalLayer``, and
    the label / bbox target buffers with ``PyCaffeProposalTargetLayer``.

    Arguments:
        backend_default: backend object used for tensor allocation.
        fargs: tuple ``(conv_size, im_shape_arr, SCALE, pre_nms_topN,
               post_nms_topN, nms_thresh, min_size)``.
    """
    np.random.seed(seed=0)

    # Get a backend for tensor allocation
    be = backend_default
    be.bsz = 1

    _conv_size, im_shape_arr, SCALE, pre_nms_topN, post_nms_topN, nms_thresh, min_size = fargs

    im_shape = be.zeros((2, 1), dtype=np.float32)
    im_shape[:] = np.array(im_shape_arr)
    im_scale = be.ones((1, 1), dtype=np.float32).fill(1.0 / 16.0)
    SCALE = be.ones((1, 1), dtype=np.float32).fill(SCALE)

    real_H = np.round(im_shape.get()[1] * im_scale.get()).astype(int).reshape((1,))[0]
    real_W = np.round(im_shape.get()[0] * im_scale.get()).astype(int).reshape((1,))[0]

    frcn_labels = be.zeros((21, 128), dtype=np.int32)
    frcn_labels_mask = be.zeros(frcn_labels.shape, dtype=np.int32)
    frcn_bbtargets = be.zeros((21 * 4, 128), dtype=np.float32)
    frcn_bbmask = be.zeros(frcn_bbtargets.shape, dtype=np.float32)

    gt_boxes = be.zeros((64, 4), dtype=np.float32)
    gt_boxes[:3, :] = np.array([[262, 210, 323, 338],
                                [164, 263, 252, 371],
                                [240, 193, 294, 298]])

    gt_classes = be.zeros((64, 1), dtype=np.int32)
    gt_classes[:3, :] = np.array([[9], [9], [9]])

    num_gt_boxes = be.zeros((1, 1), dtype=np.int32).fill(3)

    # Random, distinct objectness scores and random bbox deltas.
    num_scores = 2 * 9 * _conv_size * _conv_size
    rpn_obj_scores_dev = be.array(
        np.random.choice(num_scores * 2, size=num_scores, replace=False) /
        float(num_scores * 2.0))
    rpn_bbox_deltas_dev = be.array(np.random.random((4 * 9 * _conv_size * _conv_size, 1)))

    # mock RPN_1x1_obj and RPN_1x1_bbox
    RPN_1x1_obj = mock_layer(rpn_obj_scores_dev)
    RPN_1x1_bbox = mock_layer(rpn_bbox_deltas_dev)

    # Mock loader holding the image metadata, ground truth and target buffers.
    mock_loader = mock_dataloader(_conv_size, im_scale, im_shape, SCALE,
                                  gt_boxes, gt_classes, num_gt_boxes,
                                  frcn_labels, frcn_labels_mask,
                                  frcn_bbtargets, frcn_bbmask)

    # inference=False: the target buffers given to the mock loader are
    # checked against the reference proposal target layer further down.
    prop_layer = ProposalLayer([[RPN_1x1_obj], [RPN_1x1_bbox]], mock_loader,
                               pre_nms_N=pre_nms_topN,
                               post_nms_N=post_nms_topN,
                               nms_thresh=nms_thresh,
                               min_bbox_size=min_size,
                               num_rois=128,
                               deterministic=True,
                               inference=False,
                               debug=True)

    prop_layer.configure(mock_layer([]))
    prop_layer.allocate()

    # mock input (is not used); the returned values are not needed either
    _inputs, _dev_proposals = prop_layer.fprop([], inference=False)

    # extract final proposals and scores from the layer without buffered
    # memory like the dev_proposals returned by fprop
    target_proposals = prop_layer.proposals
    target_scores = prop_layer.scores

    # Prepare PyCaffe Reference Layer
    prop_layer_ref = PyCaffeProposalLayer()

    # Re-initialize inputs to the same values as above
    rpn_obj_scores = rpn_obj_scores_dev.get()
    rpn_bbox_deltas = rpn_bbox_deltas_dev.get()

    # bbox deltas: (4KHW, 1) -> (4, K, H, W), crop to the real feature map
    # size, then flatten to one row of 4 deltas per anchor position.
    # NB: pycaffe uses A where we use K
    rpn_bbox_deltas = rpn_bbox_deltas.reshape((4, -1, _conv_size, _conv_size))
    rpn_bbox_deltas = rpn_bbox_deltas[:, :, :real_H, :real_W].reshape((4, -1)).T

    # reshape from (2KHW, 1) -> (1, K2, H, W)
    rpn_obj_scores = rpn_obj_scores.reshape((2, -1, _conv_size, _conv_size))
    rpn_obj_scores = rpn_obj_scores[:, :, :real_H, :real_W]
    rpn_obj_scores = rpn_obj_scores.reshape((1, -1, real_H, real_W))

    bottom = [rpn_obj_scores, rpn_bbox_deltas, [im_shape[1], im_shape[0], SCALE]]
    top = [None, None]

    prop_layer_ref.setup(bottom, top, pre_nms_topN=pre_nms_topN,
                         post_nms_topN=post_nms_topN,
                         nms_thresh=nms_thresh, min_size=min_size)
    prop_layer_ref.forward(bottom, top)

    # Compare proposals and scores from proposal layer
    assert np.allclose(top[0][:, 1:], target_proposals, atol=1e-5, rtol=1e-4)
    assert np.allclose(top[1], target_scores, atol=1e-5, rtol=1e-4)

    # Now testing proposal target layer
    # use target proposals from neon RPN, prefixed with a zero column
    zeros = np.zeros((target_proposals.shape[0], 1), dtype=target_proposals.dtype)
    rois_ref = np.hstack((zeros, target_proposals))

    # convert format of gt_boxes from (num_classes, 4) to (num_gt_boxes, 5):
    # concat the boxes and the classes and clip to num_gt_boxes
    gt_ref = np.hstack((prop_layer.gt_boxes.get(),
                        prop_layer.gt_classes.get()))[:prop_layer.num_gt_boxes.get()[0][0]]

    t_bottom = [rois_ref, gt_ref]
    t_top = [None, None, None, None, None]

    prop_target_layer_ref = PyCaffeProposalTargetLayer()
    prop_target_layer_ref.setup(t_bottom, t_top, deterministic=True)
    prop_target_layer_ref.forward(t_bottom, t_top)

    # Lay the reference outputs out like the neon buffers (transposed, zero padded).
    frcn_bbtargets_reference = np.zeros(frcn_bbtargets.shape, dtype=np.float32)
    frcn_bbmask_reference = np.zeros(frcn_bbmask.shape, dtype=np.float32)
    frcn_bbtargets_reference[:t_top[2].shape[0]] = t_top[2].T
    frcn_bbmask_reference[:t_top[3].shape[0]] = t_top[3].T

    # Convert the masked neon label matrix into one class index per ROI;
    # rows are visited in increasing class order, so a later class wins.
    neon_labels = np.zeros((frcn_labels.shape[1],))
    label_mat = frcn_labels.get() * frcn_labels_mask.get()
    for cls, row in enumerate(label_mat):
        neon_labels[row != 0] = cls

    # Test proposal layer targets against pycaffe layer
    # (np.all instead of np.alltrue, which NumPy 2.0 removed)
    assert np.all(t_top[1] == neon_labels)  # target labels
    assert np.allclose(frcn_bbtargets_reference, frcn_bbtargets.get(), atol=1e-4)  # target bbox
    assert np.all(frcn_bbmask_reference == frcn_bbmask.get())  # target bbox mask
def test_proposal_layer(backend_default, fargs):
    """Validate ProposalLayer against the PyCaffe reference implementation.

    Random objectness scores and bbox deltas are pushed through the layer.
    The resulting proposals / scores are compared with
    ``PyCaffeProposalLayer`` and the label / bbox target buffers with
    ``PyCaffeProposalTargetLayer``.

    Arguments:
        backend_default: backend object used for tensor allocation.
        fargs: tuple ``(conv_size, im_shape_arr, SCALE, pre_nms_topN,
               post_nms_topN, nms_thresh, min_size)``.
    """
    np.random.seed(seed=0)

    # Get a backend for tensor allocation
    be = backend_default
    be.bsz = 1

    (_conv_size, im_shape_arr, SCALE, pre_nms_topN, post_nms_topN,
     nms_thresh, min_size) = fargs

    im_shape = be.zeros((2, 1), dtype=np.float32)
    im_shape[:] = np.array(im_shape_arr)
    im_scale = be.ones((1, 1), dtype=np.float32).fill(1.0 / 16.0)
    SCALE = be.ones((1, 1), dtype=np.float32).fill(SCALE)

    real_H = np.round(im_shape.get()[1] * im_scale.get()).astype(int).reshape(
        (1, ))[0]
    real_W = np.round(im_shape.get()[0] * im_scale.get()).astype(int).reshape(
        (1, ))[0]

    frcn_labels = be.zeros((21, 128), dtype=np.int32)
    frcn_labels_mask = be.zeros(frcn_labels.shape, dtype=np.int32)
    frcn_bbtargets = be.zeros((21 * 4, 128), dtype=np.float32)
    frcn_bbmask = be.zeros(frcn_bbtargets.shape, dtype=np.float32)

    gt_boxes = be.zeros((64, 4), dtype=np.float32)
    gt_boxes[:3, :] = np.array([[262, 210, 323, 338],
                                [164, 263, 252, 371],
                                [240, 193, 294, 298]])

    gt_classes = be.zeros((64, 1), dtype=np.int32)
    gt_classes[:3, :] = np.array([[9], [9], [9]])

    num_gt_boxes = be.zeros((1, 1), dtype=np.int32).fill(3)

    # Random, distinct objectness scores and random bbox deltas.
    num_scores = 2 * 9 * _conv_size * _conv_size
    rpn_obj_scores_dev = be.array(
        np.random.choice(num_scores * 2, size=num_scores, replace=False) /
        float(num_scores * 2.0))
    rpn_bbox_deltas_dev = be.array(
        np.random.random((4 * 9 * _conv_size * _conv_size, 1)))

    # mock RPN_1x1_obj and RPN_1x1_bbox
    RPN_1x1_obj = mock_layer(rpn_obj_scores_dev)
    RPN_1x1_bbox = mock_layer(rpn_bbox_deltas_dev)

    # Mock loader carrying image metadata, ground truth and target buffers.
    mock_loader = mock_dataloader(_conv_size, im_scale, im_shape, SCALE,
                                  gt_boxes, gt_classes, num_gt_boxes,
                                  frcn_labels, frcn_labels_mask,
                                  frcn_bbtargets, frcn_bbmask)

    # inference=False here: the target buffers passed to the mock loader
    # are compared with the reference proposal target layer below.
    prop_layer = ProposalLayer([[RPN_1x1_obj], [RPN_1x1_bbox]],
                               mock_loader,
                               pre_nms_N=pre_nms_topN,
                               post_nms_N=post_nms_topN,
                               nms_thresh=nms_thresh,
                               min_bbox_size=min_size,
                               num_rois=128,
                               deterministic=True,
                               inference=False,
                               debug=True)

    prop_layer.configure(mock_layer([]))
    prop_layer.allocate()

    # mock input (is not used); fprop's return values are not needed
    _inputs, _dev_proposals = prop_layer.fprop([], inference=False)

    # extract final proposals and scores from the layer without buffered
    # memory like the dev_proposals returned by fprop
    target_proposals = prop_layer.proposals
    target_scores = prop_layer.scores

    # Prepare PyCaffe Reference Layer
    prop_layer_ref = PyCaffeProposalLayer()

    # Re-initialize inputs to the same values as above
    rpn_obj_scores = rpn_obj_scores_dev.get()
    rpn_bbox_deltas = rpn_bbox_deltas_dev.get()

    # bbox deltas: (4KHW, 1) -> (4, K, H, W), crop to the real feature map
    # size, then flatten to one row of 4 deltas per anchor position.
    # NB: pycaffe uses A where we use K
    rpn_bbox_deltas = rpn_bbox_deltas.reshape((4, -1, _conv_size, _conv_size))
    rpn_bbox_deltas = rpn_bbox_deltas[:, :, :real_H, :real_W].reshape(
        (4, -1)).T

    # reshape from (2KHW, 1) -> (1, K2, H, W)
    rpn_obj_scores = rpn_obj_scores.reshape((2, -1, _conv_size, _conv_size))
    rpn_obj_scores = rpn_obj_scores[:, :, :real_H, :real_W]
    rpn_obj_scores = rpn_obj_scores.reshape((1, -1, real_H, real_W))

    bottom = [
        rpn_obj_scores,
        rpn_bbox_deltas,
        [im_shape[1], im_shape[0], SCALE],
    ]
    top = [None, None]

    prop_layer_ref.setup(bottom,
                         top,
                         pre_nms_topN=pre_nms_topN,
                         post_nms_topN=post_nms_topN,
                         nms_thresh=nms_thresh,
                         min_size=min_size)
    prop_layer_ref.forward(bottom, top)

    # Compare proposals and scores from proposal layer
    assert np.allclose(top[0][:, 1:], target_proposals, atol=1e-5, rtol=1e-4)
    assert np.allclose(top[1], target_scores, atol=1e-5, rtol=1e-4)

    # Now testing proposal target layer
    # use target proposals from neon RPN, prefixed with a zero column
    zeros = np.zeros((target_proposals.shape[0], 1),
                     dtype=target_proposals.dtype)
    rois_ref = np.hstack((zeros, target_proposals))

    # convert format of gt_boxes from (num_classes, 4) to (num_gt_boxes, 5):
    # concat the boxes and the classes and clip to num_gt_boxes
    gt_ref = np.hstack(
        (prop_layer.gt_boxes.get(),
         prop_layer.gt_classes.get()))[:prop_layer.num_gt_boxes.get()[0][0]]

    t_bottom = [rois_ref, gt_ref]
    t_top = [None, None, None, None, None]

    prop_target_layer_ref = PyCaffeProposalTargetLayer()
    prop_target_layer_ref.setup(t_bottom, t_top, deterministic=True)
    prop_target_layer_ref.forward(t_bottom, t_top)

    # Lay the reference outputs out like the neon buffers
    # (transposed, zero padded).
    frcn_bbtargets_reference = np.zeros(frcn_bbtargets.shape, dtype=np.float32)
    frcn_bbmask_reference = np.zeros(frcn_bbmask.shape, dtype=np.float32)
    frcn_bbtargets_reference[:t_top[2].shape[0]] = t_top[2].T
    frcn_bbmask_reference[:t_top[3].shape[0]] = t_top[3].T

    # Convert the masked neon label matrix into one class index per ROI;
    # rows are visited in increasing class order, so a later class wins.
    neon_labels = np.zeros((frcn_labels.shape[1], ))
    label_mat = frcn_labels.get() * frcn_labels_mask.get()
    for cls, row in enumerate(label_mat):
        neon_labels[row != 0] = cls

    # Test proposal layer targets against pycaffe layer
    # (np.all instead of np.alltrue, which NumPy 2.0 removed)
    assert np.all(t_top[1] == neon_labels)  # target labels
    assert np.allclose(frcn_bbtargets_reference,
                       frcn_bbtargets.get(),
                       atol=1e-4)  # target bbox
    assert np.all(frcn_bbmask_reference == frcn_bbmask.get())  # target bbox mask