def test_targets_to_anchors(self):
    boxes = torch.zeros((0, 4), dtype=torch.float32)
    negative_target = {
        "boxes": boxes,
        "labels": torch.zeros((1, 1), dtype=torch.int64),
        "image_id": 4,
        "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
        "iscrowd": torch.zeros((0,), dtype=torch.int64),
    }

    anchors = [torch.randint(-50, 50, (3, 4), dtype=torch.float32)]
    targets = [negative_target]

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    rpn_head = RPNHead(4, rpn_anchor_generator.num_anchors_per_location()[0])

    head = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        0.5, 0.3, 256, 0.5, 2000, 2000, 0.7)

    labels, matched_gt_boxes = head.assign_targets_to_anchors(anchors, targets)

    self.assertEqual(labels[0].sum(), 0)
    self.assertEqual(labels[0].shape, torch.Size([anchors[0].shape[0]]))
    self.assertEqual(labels[0].dtype, torch.float32)

    self.assertEqual(matched_gt_boxes[0].sum(), 0)
    self.assertEqual(matched_gt_boxes[0].shape, anchors[0].shape)
    self.assertEqual(matched_gt_boxes[0].dtype, torch.float32)
def test_targets_to_anchors(self):
    _, targets = self._make_empty_sample()
    anchors = [torch.randint(-50, 50, (3, 4), dtype=torch.float32)]

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    rpn_head = RPNHead(4, rpn_anchor_generator.num_anchors_per_location()[0])

    head = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        0.5, 0.3, 256, 0.5, 2000, 2000, 0.7, 0.05)

    labels, matched_gt_boxes = head.assign_targets_to_anchors(anchors, targets)

    self.assertEqual(labels[0].sum(), 0)
    self.assertEqual(labels[0].shape, torch.Size([anchors[0].shape[0]]))
    self.assertEqual(labels[0].dtype, torch.float32)

    self.assertEqual(matched_gt_boxes[0].sum(), 0)
    self.assertEqual(matched_gt_boxes[0].shape, anchors[0].shape)
    self.assertEqual(matched_gt_boxes[0].dtype, torch.float32)
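# The helper below is a minimal sketch of the _make_empty_sample() called by
# the test above; it is not part of the original snippet and is reconstructed
# from the inline negative_target of the first test variant, so treat it as an
# assumption rather than the exact original helper.
def _make_empty_sample(self):
    images = [torch.rand((3, 100, 100), dtype=torch.float32)]
    boxes = torch.zeros((0, 4), dtype=torch.float32)
    negative_target = {
        "boxes": boxes,
        "labels": torch.zeros(0, dtype=torch.int64),
        "image_id": 4,
        "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]),
        "iscrowd": torch.zeros((0,), dtype=torch.int64),
    }
    return images, [negative_target]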
def __init__(self):
    super(RPN, self).__init__()

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    # Generate anchor boxes
    anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

    # Define RPN head
    # rpn_head = RPNHead(256, 9)
    rpn_head = RPNHead(256, anchor_generator.num_anchors_per_location()[0])

    # RPN parameters
    rpn_pre_nms_top_n_train = 2000
    rpn_pre_nms_top_n_test = 1000
    rpn_post_nms_top_n_train = 2000
    rpn_post_nms_top_n_test = 1000
    rpn_nms_thresh = 0.7
    rpn_fg_iou_thresh = 0.7
    rpn_bg_iou_thresh = 0.3
    rpn_batch_size_per_image = 256
    rpn_positive_fraction = 0.5

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    # Create RPN
    self.rpn = RegionProposalNetwork(
        anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)
def __init__(self, backbone_out_channels,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             rpn_nms_thresh=0.7,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000):
    super(RPN, self).__init__()

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    rpn_head = RPNHead(backbone_out_channels,
                       rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    self.rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)
def get_faster_rcnn(n_classes: int):
    faster_rcnn = torchvision.models.detection.fasterrcnn_resnet50_fpn(
        pretrained=True)

    anchor_generator = AnchorGenerator(
        sizes=tuple([(16, 32, 64, 128, 256) for _ in range(5)]),
        aspect_ratios=tuple([(0.75, 0.5, 1.25) for _ in range(5)]))
    rpn_head = RPNHead(256, anchor_generator.num_anchors_per_location()[0])

    faster_rcnn.rpn = RegionProposalNetwork(
        anchor_generator=anchor_generator, head=rpn_head,
        fg_iou_thresh=0.7, bg_iou_thresh=0.3,
        batch_size_per_image=48, positive_fraction=0.5,
        pre_nms_top_n=dict(training=200, testing=100),
        post_nms_top_n=dict(training=160, testing=80),
        nms_thresh=0.7)

    in_features = faster_rcnn.roi_heads.box_predictor.cls_score.in_features
    faster_rcnn.roi_heads.box_predictor = FastRCNNPredictor(
        in_features, n_classes)
    faster_rcnn.roi_heads.fg_bg_sampler.batch_size_per_image = 24
    faster_rcnn.roi_heads.fg_bg_sampler.positive_fraction = 0.5

    return faster_rcnn
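# Hedged usage sketch for get_faster_rcnn (not in the original source): build
# a detector for two object classes plus background and run a dummy inference.
# Note the function downloads COCO-pretrained weights on first use.
import torch

model = get_faster_rcnn(n_classes=3)
model.eval()
with torch.no_grad():
    predictions = model([torch.rand(3, 512, 512)])
print(predictions[0]["boxes"].shape, predictions[0]["scores"].shape)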
def _init_test_rpn(self):
    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    out_channels = 256
    rpn_head = RPNHead(out_channels,
                       rpn_anchor_generator.num_anchors_per_location()[0])
    rpn_fg_iou_thresh = 0.7
    rpn_bg_iou_thresh = 0.3
    rpn_batch_size_per_image = 256
    rpn_positive_fraction = 0.5
    rpn_pre_nms_top_n = dict(training=2000, testing=1000)
    rpn_post_nms_top_n = dict(training=2000, testing=1000)
    rpn_nms_thresh = 0.7
    rpn_score_thresh = 0.0

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh,
        score_thresh=rpn_score_thresh)
    return rpn
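# A minimal smoke test for the RPN built by _init_test_rpn (assumed usage, not
# from the original source; it assumes this code runs inside the same test
# class): feed one image and five random FPN-style feature maps, one per
# anchor-size tuple, and check that proposals come back.
import torch
from torchvision.models.detection.image_list import ImageList

rpn = self._init_test_rpn()
rpn.eval()
images = ImageList(torch.rand(1, 3, 256, 256), [(256, 256)])
features = {str(i): torch.rand(1, 256, 256 // 2 ** (i + 2), 256 // 2 ** (i + 2))
            for i in range(5)}
with torch.no_grad():
    proposals, losses = rpn(images, features)
assert len(proposals) == 1  # one proposal tensor per input image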
def __init__(self, backbone, dope_roi_pool, dope_head, dope_predictor,
             # transform parameters
             min_size=800, max_size=1333,
             image_mean=None, image_std=None,
             # RPN parameters
             rpn_anchor_generator=None, rpn_head=None,
             rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
             # others
             num_anchor_poses={'body': 20, 'hand': 10, 'face': 10},
             pose2d_reg_weights={part: 5.0 for part in parts},
             pose3d_reg_weights={part: 5.0 for part in parts}):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(dope_roi_pool, (MultiScaleRoIAlign, type(None)))

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    dope_heads = Dope_RoIHeads(dope_roi_pool, dope_head, dope_predictor,
                               num_anchor_poses,
                               pose2d_reg_weights=pose2d_reg_weights,
                               pose3d_reg_weights=pose3d_reg_weights)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = Dope_Transform(min_size, max_size, image_mean, image_std)

    super(Dope_RCNN, self).__init__(backbone, rpn, dope_heads, transform)
def __init__(self):
    super(RPN, self).__init__()

    # Define FPN backbone
    self.fpn = resnet_fpn_backbone(backbone_name='resnet101', pretrained=True)

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    # Generate anchor boxes
    anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

    # Define RPN head
    # rpn_head = RPNHead(256, 9)
    rpn_head = RPNHead(256, anchor_generator.num_anchors_per_location()[0])

    # RPN parameters
    rpn_pre_nms_top_n_train = 2000
    rpn_pre_nms_top_n_test = 1000
    rpn_post_nms_top_n_train = 2000
    rpn_post_nms_top_n_test = 1000
    rpn_nms_thresh = 0.7
    rpn_fg_iou_thresh = 0.7
    rpn_bg_iou_thresh = 0.3
    rpn_batch_size_per_image = 256
    rpn_positive_fraction = 0.5

    # Transform parameters
    min_size = 800
    max_size = 1333
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                              image_mean, image_std)

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    # Create RPN
    self.rpn = RegionProposalNetwork(
        anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)
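# Hedged usage sketch for the RPN wrapper above (not from the original source;
# it drives the submodules directly, since the snippet only defines __init__):
# wire the transform, the FPN features, and the RPN together for one image.
# Instantiation downloads ResNet-101 FPN weights on first use.
import torch

rpn_module = RPN()
rpn_module.eval()
images = [torch.rand(3, 600, 800)]
image_list, _ = rpn_module.transform(images)
features = rpn_module.fpn(image_list.tensors)
with torch.no_grad():
    proposals, losses = rpn_module.rpn(image_list, features)
print(proposals[0].shape)  # at most rpn_post_nms_top_n_test boxes, 4 coords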
def __init__(self):
    super(RPN, self).__init__()

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    # Generate anchor boxes
    anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

    # Define RPN head
    rpn_head = RPNHead(256, anchor_generator.num_anchors_per_location()[0])

    RPN_PRE_NMS_TOP_N = dict(training=cfg.RPN.PRE_NMS_TOP_N_TRAIN,
                             testing=cfg.RPN.PRE_NMS_TOP_N_TEST)
    RPN_POST_NMS_TOP_N = dict(training=cfg.RPN.POST_NMS_TOP_N_TRAIN,
                              testing=cfg.RPN.POST_NMS_TOP_N_TEST)

    # Create RPN
    self.rpn = RegionProposalNetwork(
        anchor_generator, rpn_head,
        cfg.RPN.FG_IOU_THRESH, cfg.RPN.BG_IOU_THRESH,
        cfg.RPN.BATCH_SIZE_PER_IMAGE, cfg.RPN.POSITIVE_FRACTION,
        RPN_PRE_NMS_TOP_N, RPN_POST_NMS_TOP_N, cfg.RPN.NMS_THRESH)
def __init__(
        self,
        num_classes=2,
        # transform parameters
        backbone_name='resnet50',
        min_size=256, max_size=512,
        image_mean=None, image_std=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        rpn_score_thresh=0.0,
        # Box parameters
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512, box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # Ellipse regressor
        ellipse_roi_pool=None, ellipse_head=None, ellipse_predictor=None,
        ellipse_loss_metric="gaussian-angle"):
    backbone = resnet_fpn_backbone(backbone_name, pretrained=True,
                                   trainable_layers=5)
    # Input image is grayscale -> in_channels = 1 instead of 3 (COCO)
    backbone.body.conv1 = Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2),
                                 padding=(3, 3), bias=False)

    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh,
        score_thresh=rpn_score_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7, sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    if ellipse_roi_pool is None:
        ellipse_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7, sampling_ratio=2)
    if ellipse_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        ellipse_head = TwoMLPHead(out_channels * resolution ** 2,
                                  representation_size)
    if ellipse_predictor is None:
        representation_size = 1024
        ellipse_predictor = EllipseRegressor(representation_size, num_classes)

    roi_heads = EllipseRoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        # Ellipse
        ellipse_roi_pool=ellipse_roi_pool,
        ellipse_head=ellipse_head,
        ellipse_predictor=ellipse_predictor,
        ellipse_loss_metric=ellipse_loss_metric)

    if image_mean is None:
        image_mean = [0.156]
    if image_std is None:
        image_std = [0.272]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super().__init__(backbone, rpn, roi_heads, transform)
def __init__(self, cfg):
    super(SeqNet, self).__init__()

    backbone, box_head = build_resnet(name="resnet50", pretrained=True)

    anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                       aspect_ratios=((0.5, 1.0, 2.0),))
    head = RPNHead(
        in_channels=backbone.out_channels,
        num_anchors=anchor_generator.num_anchors_per_location()[0],
    )
    pre_nms_top_n = dict(training=cfg.MODEL.RPN.PRE_NMS_TOPN_TRAIN,
                         testing=cfg.MODEL.RPN.PRE_NMS_TOPN_TEST)
    post_nms_top_n = dict(training=cfg.MODEL.RPN.POST_NMS_TOPN_TRAIN,
                          testing=cfg.MODEL.RPN.POST_NMS_TOPN_TEST)
    rpn = RegionProposalNetwork(
        anchor_generator=anchor_generator,
        head=head,
        fg_iou_thresh=cfg.MODEL.RPN.POS_THRESH_TRAIN,
        bg_iou_thresh=cfg.MODEL.RPN.NEG_THRESH_TRAIN,
        batch_size_per_image=cfg.MODEL.RPN.BATCH_SIZE_TRAIN,
        positive_fraction=cfg.MODEL.RPN.POS_FRAC_TRAIN,
        pre_nms_top_n=pre_nms_top_n,
        post_nms_top_n=post_nms_top_n,
        nms_thresh=cfg.MODEL.RPN.NMS_THRESH,
    )

    faster_rcnn_predictor = FastRCNNPredictor(2048, 2)
    reid_head = deepcopy(box_head)
    box_roi_pool = MultiScaleRoIAlign(featmap_names=["feat_res4"],
                                      output_size=14, sampling_ratio=2)
    box_predictor = BBoxRegressor(2048, num_classes=2,
                                  bn_neck=cfg.MODEL.ROI_HEAD.BN_NECK)
    roi_heads = SeqRoIHeads(
        # OIM
        num_pids=cfg.MODEL.LOSS.LUT_SIZE,
        num_cq_size=cfg.MODEL.LOSS.CQ_SIZE,
        oim_momentum=cfg.MODEL.LOSS.OIM_MOMENTUM,
        oim_scalar=cfg.MODEL.LOSS.OIM_SCALAR,
        # SeqNet
        faster_rcnn_predictor=faster_rcnn_predictor,
        reid_head=reid_head,
        # parent class
        box_roi_pool=box_roi_pool,
        box_head=box_head,
        box_predictor=box_predictor,
        fg_iou_thresh=cfg.MODEL.ROI_HEAD.POS_THRESH_TRAIN,
        bg_iou_thresh=cfg.MODEL.ROI_HEAD.NEG_THRESH_TRAIN,
        batch_size_per_image=cfg.MODEL.ROI_HEAD.BATCH_SIZE_TRAIN,
        positive_fraction=cfg.MODEL.ROI_HEAD.POS_FRAC_TRAIN,
        bbox_reg_weights=None,
        score_thresh=cfg.MODEL.ROI_HEAD.SCORE_THRESH_TEST,
        nms_thresh=cfg.MODEL.ROI_HEAD.NMS_THRESH_TEST,
        detections_per_img=cfg.MODEL.ROI_HEAD.DETECTIONS_PER_IMAGE_TEST,
    )

    transform = GeneralizedRCNNTransform(
        min_size=cfg.INPUT.MIN_SIZE,
        max_size=cfg.INPUT.MAX_SIZE,
        image_mean=[0.485, 0.456, 0.406],
        image_std=[0.229, 0.224, 0.225],
    )

    self.backbone = backbone
    self.rpn = rpn
    self.roi_heads = roi_heads
    self.transform = transform

    # loss weights
    self.lw_rpn_reg = cfg.SOLVER.LW_RPN_REG
    self.lw_rpn_cls = cfg.SOLVER.LW_RPN_CLS
    self.lw_proposal_reg = cfg.SOLVER.LW_PROPOSAL_REG
    self.lw_proposal_cls = cfg.SOLVER.LW_PROPOSAL_CLS
    self.lw_box_reg = cfg.SOLVER.LW_BOX_REG
    self.lw_box_cls = cfg.SOLVER.LW_BOX_CLS
    self.lw_box_reid = cfg.SOLVER.LW_BOX_REID
class ACFNetwork(nn.Module):
    """Wrapper for pre-built PyTorch models.

    Based off:
    https://pytorch.org/docs/stable/_modules/torchvision/models/detection/mask_rcnn.html
    https://pytorch.org/docs/stable/_modules/torchvision/models/detection/faster_rcnn.html
    https://github.com/pytorch/vision/blob/master/torchvision/models/detection/generalized_rcnn.py
    """

    def __init__(
            self, arch, pretrained, num_classes, input_mode,
            acf_head='endpoints',
            # transform parameters
            min_size=800, max_size=1333,
            image_mean=None, image_std=None,
            # RPN parameters
            rpn_anchor_generator=None, rpn_head=None,
            rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
            rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
            rpn_nms_thresh=0.5,
            rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
            rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
            # Box parameters
            box_roi_pool=None, box_head=None, box_predictor=None,
            box_score_thresh=0.05, box_nms_thresh=0.5,
            box_detections_per_img=100,
            box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
            box_batch_size_per_image=512, box_positive_fraction=0.25,
            bbox_reg_weights=None):
        super(ACFNetwork, self).__init__()

        self.input_mode = input_mode
        self.backbone = resnet_fpn_backbone(arch, pretrained)

        # Change the first layer to 4 channels for early fusion with a
        # 1-channel depth map: load pretrained weights on the RGB channels
        # and initialize the depth channel with their mean.
        # self.backbone.body.conv1 = nn.Conv2d(4, 64, kernel_size=7, stride=2, padding=3, bias=False)
        conv1_weight_old = nn.Parameter(self.backbone.body.conv1.weight.data)
        conv1_weight = torch.zeros((64, 4, 7, 7))
        conv1_weight[:, 0:3, :, :] = conv1_weight_old
        avg_weight = conv1_weight_old.mean(dim=1, keepdim=False)
        conv1_weight[:, 3, :, :] = avg_weight
        self.backbone.body.conv1.weight = torch.nn.Parameter(conv1_weight)

        out_channels = self.backbone.out_channels

        if rpn_anchor_generator is None:
            anchor_sizes = ((16,), (32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels,
                rpn_anchor_generator.num_anchors_per_location()[0])

        rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                                 testing=rpn_pre_nms_top_n_test)
        rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                                  testing=rpn_post_nms_top_n_test)

        self.rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            rpn_fg_iou_thresh, rpn_bg_iou_thresh,
            rpn_batch_size_per_image, rpn_positive_fraction,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

        self.roi_heads = RoIHeadsExtend(out_channels, num_classes,
                                        self.input_mode, acf_head)

        # Freeze RGB backbone and RPN when training on poses
        if self.input_mode == config.INPUT_RGBD:
            for param in self.rpn.parameters():
                param.requires_grad = False
            for param in self.backbone.parameters():
                param.requires_grad = False
        # self.backbone_depth = resnet_fpn_backbone(arch, pretrained)

    def forward(self, images, targets=None):
        """
        Arguments:
            images: Image batch, normalized [NxCxHxW]
            targets (list[Dict[Tensor]]): ground-truth boxes present in the
                image (optional)

        Returns:
            result (list[BoxList] or dict[Tensor]): the output from the model.
                During training, it returns a dict[Tensor] which contains the
                losses. During testing, it returns a list[BoxList] with
                additional fields like `scores`, `labels` and `mask`
                (for Mask R-CNN models).
        """
        image_sizes = [tuple(images.shape[-2:])] * images.shape[0]
        features = self.backbone(images)
        # Might need to torch.chunk the features because it wants it to be a
        # list for some reason.
        image_list = ImageList(images, image_sizes)
        try:
            proposals, proposal_losses = self.rpn(image_list, features,
                                                  targets)
        except Exception as e:
            print(e)  # dirty data not cleaned
            raise  # re-raise: `proposals` would be undefined below otherwise
        detections, detector_losses = self.roi_heads(features, proposals,
                                                     image_sizes, targets)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)

        if targets is not None:
            return detections, features, losses
        return detections, features
# NOTE: the hyperparameter values below are assumed defaults for this
# fragment; the original context did not define them.
rpn_pre_nms_top_n_train, rpn_pre_nms_top_n_test = 2000, 1000
rpn_post_nms_top_n_train, rpn_post_nms_top_n_test = 2000, 1000
rpn_fg_iou_thresh, rpn_bg_iou_thresh = 0.7, 0.3
rpn_batch_size_per_image, rpn_positive_fraction = 256, 0.5
rpn_nms_thresh = 0.7

rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                         testing=rpn_pre_nms_top_n_test)
rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                          testing=rpn_post_nms_top_n_test)

anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
rpn_head = RPNHead(512, rpn_anchor_generator.num_anchors_per_location()[0])

rpn = RegionProposalNetwork(
    rpn_anchor_generator, rpn_head,
    rpn_fg_iou_thresh, rpn_bg_iou_thresh,
    rpn_batch_size_per_image, rpn_positive_fraction,
    rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

trainable_backbone_layers = None
pretrained = True
pretrained_backbone = True  # assumed: the original fragment referenced this
                            # name before ever assigning it
trainable_backbone_layers = _validate_resnet_trainable_layers(
    pretrained or pretrained_backbone, trainable_backbone_layers)
if pretrained:
    # no need to download the backbone if pretrained weights are loaded
    pretrained_backbone = False
backbone = resnet_fpn_backbone('resnet50', pretrained_backbone,
                               trainable_layers=trainable_backbone_layers)
fasterrcnn = FasterRCNN(backbone, 8)
maskrcnn = MaskRCNN(backbone, 8)
def __init__(
        self, backbone, num_ID, num_classes=2, len_embeddings=128,
        # transform parameters
        min_size=720, max_size=960,
        image_mean=None, image_std=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.5, rpn_bg_iou_thresh=0.4,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512, box_positive_fraction=0.25,
        bbox_reg_weights=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362))
        aspect_ratios = ((1 / 3,),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=7, sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)

    emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1

    if box_predictor is None:
        representation_size = 1024
        box_predictor = JDEPredictor(representation_size, num_classes,
                                     len_embeddings, emb_scale)

    roi_heads = JDE_RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        len_embeddings, num_ID)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)
    self.eval_embed = False
def __init__(
        self, backbone, num_classes=None,
        # transform parameters
        scale_factor=2.5, scale_factor_jitter=0.25,
        image_mean=None, image_std=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512, box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # added by Mohamed
        batched_nms=True,
        indep_classif_boxes=False, classification_bbox_size=None,
        n_fc_classif_layers=1, fc_classif_dropout=0.1,
        cconvhead=None, sattention_head=None,
        ignore_label: int = None,
        proposal_augmenter=None, n_testtime_augmentations=0):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=7, sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(
            in_channels=representation_size,
            num_classes=num_classes,
            n_fc_classif_layers=n_fc_classif_layers,
            dropout=fc_classif_dropout,
            batched_nms=batched_nms,
        )

    roi_heads = RoIHeads(
        # Box
        box_roi_pool=box_roi_pool,
        box_head=box_head,
        box_predictor=box_predictor,
        fg_iou_thresh=box_fg_iou_thresh,
        bg_iou_thresh=box_bg_iou_thresh,
        batch_size_per_image=box_batch_size_per_image,
        positive_fraction=box_positive_fraction,
        bbox_reg_weights=bbox_reg_weights,
        score_thresh=box_score_thresh,
        nms_thresh=box_nms_thresh,
        detections_per_img=box_detections_per_img,
        # added by Mohamed
        batched_nms=batched_nms,
        indep_classif_boxes=indep_classif_boxes,
        classification_bbox_size=classification_bbox_size,
        cconvhead=cconvhead,
        sattention_head=sattention_head,
        ignore_label=ignore_label,
    )

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]

    # Mohamed: I changed GeneralizedRCNNTransform to take a scale factor
    # as opposed to a fixed size to allow free-size images in inference
    transform = GeneralizedRCNNTransform(
        scale_factor=scale_factor,
        scale_factor_jitter=scale_factor_jitter,
        image_mean=image_mean,
        image_std=image_std)

    super(FasterRCNN, self).__init__(
        backbone=backbone,
        rpn=rpn,
        roi_heads=roi_heads,
        transform=transform,
        # Mohamed: added this
        proposal_augmenter=proposal_augmenter,
        n_testtime_augmentations=n_testtime_augmentations,
    )
def __init__(
        self, backbone, num_classes=2, num_pids=5532, num_cq_size=5000,
        # transform parameters
        min_size=900, max_size=1500,
        image_mean=None, image_std=None,
        # Anchor settings
        anchor_scales=None, anchor_ratios=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=12000, rpn_pre_nms_top_n_test=6000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=300,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        # Box parameters
        rcnn_bbox_bn=True,
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.4,
        box_detections_per_img=300,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.1,
        box_batch_size_per_image=128, box_positive_fraction=0.5,
        bbox_reg_weights=None,
        # ReID parameters
        feat_head=None, reid_head=None, reid_loss=None):
    if rpn_anchor_generator is None:
        anchor_sizes = ((32, 64, 128, 256, 512),)
        aspect_ratios = ((0.5, 1.0, 2.0),)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    rpn_head = RPNHead(backbone.out_channels,
                       rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=['feat2rpn'],
                                          output_size=[14, 14],
                                          sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 2048
        box_head = GAP_BOX_HEAD(resolution, feat_head, representation_size)
    if box_predictor is None:
        representation_size = 2048
        box_predictor = FastRCNNPredictor(representation_size, num_classes,
                                          RCNN_bbox_bn=False)
    if reid_head is None:
        reid_head = REID_HEAD(box_head.out_dims, 256)
    if reid_loss is None:
        reid_loss = OIMLoss(256, num_pids, num_cq_size, 0.5, 30)

    roi_heads = OIM_ROI_HEAD(
        reid_head, reid_loss,
        # box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(FasterRCNN_OIM, self).__init__(backbone, rpn, roi_heads, transform)
def __init__(self):
    super(FasterRCNN, self).__init__()

    # Define FPN backbone
    self.fpn = resnet_fpn_backbone(backbone_name='resnet101', pretrained=True)

    anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
    aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
    # Generate anchor boxes
    anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)

    # Define RPN head
    # rpn_head = RPNHead(256, 9)
    rpn_head = RPNHead(256, anchor_generator.num_anchors_per_location()[0])

    # RPN parameters
    rpn_pre_nms_top_n_train = 2000
    rpn_pre_nms_top_n_test = 1000
    rpn_post_nms_top_n_train = 2000
    rpn_post_nms_top_n_test = 1000
    rpn_nms_thresh = 0.7
    rpn_fg_iou_thresh = 0.7
    rpn_bg_iou_thresh = 0.3
    # rpn_nms_thresh = 0.45
    # rpn_fg_iou_thresh = 0.5
    # rpn_bg_iou_thresh = 0.5
    rpn_batch_size_per_image = 256
    rpn_positive_fraction = 0.5

    # Transform parameters
    min_size = 800
    max_size = 1333
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                              image_mean, image_std)

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    # Create RPN
    self.rpn = RegionProposalNetwork(
        anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    # Box parameters
    box_roi_pool = None
    box_head = None
    box_predictor = None
    box_score_thresh = 0.05
    box_nms_thresh = 0.5
    box_detections_per_img = 100
    box_fg_iou_thresh = 0.5
    box_bg_iou_thresh = 0.5
    box_batch_size_per_image = 512
    box_positive_fraction = 0.25
    bbox_reg_weights = None
    num_classes = 101

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7, sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(256 * resolution ** 2, representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    self.roi_heads = RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)
def _set_rpn(self, *args):
    return RegionProposalNetwork(*args)
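# Hedged illustration of the _set_rpn hook above (not from the original
# source; LoggingRPN is a hypothetical name): factoring RPN construction into
# a method lets a subclass swap in a customized RPN, e.g. by overriding
# _set_rpn to `return LoggingRPN(*args)` with the same arguments.
from torchvision.models.detection.rpn import RegionProposalNetwork

class LoggingRPN(RegionProposalNetwork):
    def forward(self, images, features, targets=None):
        proposals, losses = super().forward(images, features, targets)
        # report how many proposals survived filtering, across the batch
        print(f"kept {sum(p.shape[0] for p in proposals)} proposals")
        return proposals, losses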
def __init__(
        self, backbone, num_classes=None,
        # transform parameters
        min_size=800, max_size=1333,
        image_mean=None, image_std=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7, rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512, box_positive_fraction=0.25,
        bbox_reg_weights=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified")
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels

    if rpn_anchor_generator is None:
        anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
        aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7, sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    roi_heads = RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
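# Hedged usage sketch for the FasterRCNN __init__ above (standard torchvision
# wiring): any backbone exposing an out_channels attribute works, e.g. a
# ResNet-50 FPN from torchvision's backbone utilities.
import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

backbone = resnet_fpn_backbone('resnet50', pretrained=False)
model = FasterRCNN(backbone, num_classes=91)
model.eval()
with torch.no_grad():
    predictions = model([torch.rand(3, 300, 400)])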
def __init__(
        self, backbone, num_ID, num_classes=2, version='v1',
        # transform parameters
        min_size=800, max_size=1333,
        image_mean=None, image_std=None,
        # RPN parameters
        rpn_anchor_generator=None, rpn_head=None,
        rpn_pre_nms_top_n_train=2000, rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000, rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        # FIXME: these two thresholds follow the paper
        # "Towards Real-Time Multi-Object Tracking"
        rpn_fg_iou_thresh=0.5, rpn_bg_iou_thresh=0.4,
        rpn_batch_size_per_image=256, rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None, box_head=None, box_predictor=None,
        box_score_thresh=0.05, box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5, box_bg_iou_thresh=0.5,
        box_batch_size_per_image=256, box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # Embedding parameters  # FIXME: added parameters
        len_embeddings=128, embed_head=None, embed_extractor=None):
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")
    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    out_channels = backbone.out_channels

    # FIXME: changed the anchor sizes and kept only aspect-ratio-1/3 anchors,
    # following "Towards Real-Time Multi-Object Tracking"
    if rpn_anchor_generator is None:
        anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362))
        aspect_ratios = ((1 / 3,),) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels, rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_nms_thresh)

    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=11, sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        representation_size = 1024
        box_head = TwoMLPHead(out_channels * resolution ** 2,
                              representation_size)

    emb_scale = math.sqrt(2) * math.log(num_ID - 1) if num_ID > 1 else 1

    # FIXME: v1 is what is currently used
    if embed_head is None:
        if version == 'v1':
            resolution = box_roi_pool.output_size[0]
            representation_size = 1024
            embed_head = featureHead(out_channels * resolution ** 2,
                                     representation_size)
        if version == 'v2':
            embed_head = None
    if box_predictor is None:
        representation_size = 1024
        box_predictor = FastRCNNPredictor(representation_size, num_classes)
    if embed_extractor is None:
        representation_size = 1024
        embed_extractor = featureExtractor(representation_size,
                                           len_embeddings, emb_scale)

    roi_heads = JDE_RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        len_embeddings, num_ID,
        embed_head, embed_extractor)
    roi_heads.version = version

    # FIXME: this part is copied verbatim from the Faster R-CNN code
    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size,
                                         image_mean, image_std)

    super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)

    # FIXME: parameters used during tracking, unrelated to training
    self.version = version
    self.original_image_sizes = None
    self.preprocessed_images = None
    self.features = None
    self.box_features = None
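# Hypothetical construction sketch for Jde_RCNN (not from the original source;
# assumes the surrounding repo's JDE_RoIHeads, featureHead, and
# featureExtractor are importable):
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

backbone = resnet_fpn_backbone('resnet50', pretrained=False)
model = Jde_RCNN(backbone, num_ID=1000)  # 2 classes: person vs. background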