def get_box_head(self):
    """Create a ``TwoMLPHead`` box head and load the stored weights into it.

    The input width of the head is the detector backbone's ``out_channels``
    multiplied by the squared first entry of the box RoI pooler's
    ``output_size``.  The head is moved to ``device`` (a name resolved from
    the enclosing scope) before ``self.bbox_head_weights`` is loaded.

    Returns:
        The ``TwoMLPHead`` instance with the stored state dict loaded.
    """
    detector = self.obj_detect
    channels = detector.backbone.out_channels
    pooled_side = detector.roi_heads.box_roi_pool.output_size[0]

    head = TwoMLPHead(channels * pooled_side**2, representation_size=1024)
    head = head.to(device)
    head.load_state_dict(self.bbox_head_weights)
    return head
def __init__(self, in_features, num_classes, pretrained=False):
    """Set up the fully connected head and the convolutional head.

    Args:
        in_features: Representation size passed to the ``TwoMLPHead``.
        num_classes: Accepted but not used by this constructor.
        pretrained: Accepted but not used by this constructor.
    """
    super(RoIFeatureExtractor_new, self).__init__()

    self.fc_head = TwoMLPHead(in_channels=1280 * 7 * 7,
                              representation_size=in_features)

    narrow = 256 * 5
    wide = 1024 * 5
    stages = []
    # Three identical (BasicBlock, Bottleneck) pairs, appended in order.
    for _ in range(3):
        stages.append(BasicBlock(narrow, wide))
        stages.append(Bottleneck(wide, wide))
    self.conv_head = nn.Sequential(*stages)
def _init_test_roi_heads_faster_rcnn(self):
    """Build a ``RoIHeads`` instance with fixed Faster R-CNN settings.

    Returns:
        A ``RoIHeads`` built from a four-level ``MultiScaleRoIAlign`` pooler,
        a ``TwoMLPHead`` and a ``FastRCNNPredictor`` for 91 classes.
    """
    out_channels = 256
    num_classes = 91
    representation_size = 1024

    pooler = ops.MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                    output_size=7,
                                    sampling_ratio=2)
    pooled_side = pooler.output_size[0]
    head = TwoMLPHead(out_channels * pooled_side**2, representation_size)
    predictor = FastRCNNPredictor(representation_size, num_classes)

    # Positional order: fg IoU, bg IoU, batch size per image,
    # positive fraction, bbox regression weights.
    training_settings = (0.5, 0.5, 512, 0.25, None)
    # Positional order: score threshold, NMS threshold, detections per image.
    inference_settings = (0.05, 0.5, 100)

    return RoIHeads(pooler, head, predictor,
                    *training_settings, *inference_settings)
def __call__(self,
             classes=3,
             sizes=((32, 64, 128, 256, 512), ),
             aspect_ratios=((0.5, 1.0, 2.0), )):
    """Build a ``FasterRCNN`` on top of SqueezeNet 1.1 features.

    Args:
        classes: Number of classes given to the ``FastRCNNPredictor``.
        sizes: Anchor sizes forwarded to ``AnchorGenerator``.
        aspect_ratios: Anchor aspect ratios forwarded to ``AnchorGenerator``.

    Returns:
        The assembled ``FasterRCNN`` model.
    """
    import torchvision
    from torchvision.models.detection import FasterRCNN
    from torchvision.models.detection.faster_rcnn import (FastRCNNPredictor,
                                                          TwoMLPHead)
    from torchvision.models.detection.rpn import AnchorGenerator

    # Keep only the feature extractor of the pre-trained classifier.
    feature_extractor = torchvision.models.squeezenet1_1(
        pretrained=True).features
    # FasterRCNN reads the backbone's channel count from this attribute;
    # for squeezenet1_1 it is set to 512.
    feature_extractor.out_channels = 512

    anchors = AnchorGenerator(sizes=sizes, aspect_ratios=aspect_ratios)

    pooled_side = 7
    pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                output_size=pooled_side,
                                                sampling_ratio=2)

    # Scaled down from 1024 in the original implementation to reduce the
    # number of parameters considerably.
    hidden_dim = 256
    head = TwoMLPHead(feature_extractor.out_channels * pooled_side**2,
                      hidden_dim)
    predictor = FastRCNNPredictor(hidden_dim, classes)

    return FasterRCNN(feature_extractor,
                      rpn_anchor_generator=anchors,
                      box_roi_pool=pooler,
                      box_head=head,
                      box_predictor=predictor)
def __init__(self):
    """Assemble the FPN backbone, RPN, input transform and box RoI heads.

    All settings are fixed inside this constructor; it takes no arguments.
    """
    super(FasterRCNN, self).__init__()

    # Feature pyramid network on a pre-trained ResNet-101.
    self.fpn = resnet_fpn_backbone(backbone_name='resnet101',
                                   pretrained=True)
    self.rpn = RPN()

    # Input transform settings.
    min_size, max_size = 800, 1333
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                              image_mean, image_std)

    # Box branch: pooler, two-layer MLP head and predictor.
    num_classes = 101
    representation_size = 1024
    pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                output_size=7,
                                sampling_ratio=2)
    pooled_side = pooler.output_size[0]
    head = TwoMLPHead(256 * pooled_side**2, representation_size)
    predictor = FastRCNNPredictor(representation_size, num_classes)

    # Positional order: fg IoU, bg IoU, batch size per image,
    # positive fraction, bbox regression weights.
    training_settings = (0.5, 0.5, 512, 0.25, None)
    # Positional order: score threshold, NMS threshold, detections per image.
    inference_settings = (0.5, 0.5, 100)

    self.roi_heads = RoIHeads(pooler, head, predictor,
                              *training_settings, *inference_settings)
def test_assign_targets_to_proposals(self):
    """Check ``assign_targets_to_proposals`` when the image has no GT boxes.

    With an empty ground-truth box tensor, both the matched indices and the
    labels are expected to sum to zero, have one entry per proposal and be
    of dtype ``int64``.
    """
    proposals = [torch.randint(-50, 50, (20, 4), dtype=torch.float32)]
    gt_boxes = [torch.zeros((0, 4), dtype=torch.float32)]
    gt_labels = [torch.tensor([[0]], dtype=torch.int64)]

    pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                output_size=7,
                                sampling_ratio=2)
    pooled_side = pooler.output_size[0]
    hidden_dim = 1024
    head = TwoMLPHead(4 * pooled_side**2, hidden_dim)
    predictor = FastRCNNPredictor(hidden_dim, 2)

    roi_heads = RoIHeads(pooler, head, predictor,
                         0.5, 0.5, 512, 0.25, None,
                         0.05, 0.5, 100)

    matched_idxs, labels = roi_heads.assign_targets_to_proposals(
        proposals, gt_boxes, gt_labels)

    expected_shape = torch.Size([proposals[0].shape[0]])
    for per_image in (matched_idxs[0], labels[0]):
        self.assertEqual(per_image.sum(), 0)
        self.assertEqual(per_image.shape, expected_shape)
        self.assertEqual(per_image.dtype, torch.int64)
def mask_rcnn(pretrained=False,
              num_classes=1 + 90,
              representation=1024,
              backbone=None,
              with_mask=True,
              **kwargs):
    """Create a Mask R-CNN detector and adapt its heads to the request.

    Args:
        pretrained: Forwarded to ``maskrcnn_resnet50_fpn``.
        num_classes: Desired number of outputs of the box classifier.
        representation: Desired representation size of the box head.
        backbone: Optional replacement assigned to ``model.backbone``.  When
            given, the model is created with ``pretrained_backbone=False``.
        with_mask: When false, the mask pooler, head and predictor are set
            to ``None``.
        **kwargs: Forwarded to ``maskrcnn_resnet50_fpn``.

    Returns:
        The model wrapped in ``THDetector``.
    """
    custom_backbone = backbone is not None
    model = maskrcnn_resnet50_fpn(
        pretrained,
        pretrained_backbone=False if custom_backbone else not pretrained,
        progress=True,
        **kwargs)
    if custom_backbone:
        model.backbone = backbone

    cls_score = model.roi_heads.box_predictor.cls_score
    in_features = cls_score.in_features
    out_features = cls_score.out_features
    head_differs = representation != in_features

    if head_differs:
        logging.info(
            f"Replaced box_head with representation size of {representation}")
        out_channels = model.backbone.out_channels
        resolution = model.roi_heads.box_roi_pool.output_size[0]
        model.roi_heads.box_head = TwoMLPHead(out_channels * resolution**2,
                                              representation)

    # A new box head needs a matching predictor, as does a class-count change.
    if head_differs or num_classes != out_features:
        logging.info(
            f"Replaced box_predictor with (representation, num_classes) = ({representation}, {num_classes})"
        )
        model.roi_heads.box_predictor = FastRCNNPredictor(
            representation, num_classes)

    if not with_mask:
        model.roi_heads.mask_roi_pool = None
        model.roi_heads.mask_head = None
        model.roi_heads.mask_predictor = None

    return THDetector(model)
def __init__(self):
    """Create the box, relationship and sampling components from ``cfg``.

    Builds the box RoI pooler, the box head and a deep copy of it
    (``rlp_head``), the box predictor, the ``RelDN`` head, the proposal
    matcher, three balanced samplers and the box coder.
    """
    super(RoIHeads, self).__init__()

    self.box_roi_pool = MultiScaleRoIAlign(
        featmap_names=['0', '1', '2', '3'],
        output_size=7,
        sampling_ratio=2)

    pooled_side = self.box_roi_pool.output_size[0]
    hidden_dim = 1024
    self.box_head = TwoMLPHead(256 * pooled_side**2, hidden_dim)
    # Separate copy of the box head, stored as rlp_head.
    self.rlp_head = copy.deepcopy(self.box_head)

    self.box_predictor = FastRCNNPredictor(hidden_dim, cfg.BOX.NUM_CLASSES)

    # Input width is three times the box head output (concat of SPO).
    spo_width = self.box_head.fc7.out_features * 3
    self.RelDN = reldn_heads.reldn_head(spo_width)

    self.box_similarity = box_ops.box_iou

    # Assign ground-truth boxes for each proposal.
    self.proposal_matcher = det_utils.Matcher(
        cfg.BOX.FG_IOU_THRESH,
        cfg.BOX.BG_IOU_THRESH,
        allow_low_quality_matches=False)

    make_sampler = det_utils.BalancedPositiveNegativeSampler
    self.fg_bg_sampler = make_sampler(cfg.BOX.BATCH_SIZE_PER_IMAGE,
                                      cfg.BOX.POSITIVE_FRACTION)
    self.fg_bg_sampler_so = make_sampler(cfg.MODEL.BATCH_SIZE_PER_IMAGE_SO,
                                         cfg.MODEL.POSITIVE_FRACTION_SO)
    self.fg_bg_sampler_rlp = make_sampler(
        cfg.MODEL.BATCH_SIZE_PER_IMAGE_REL,
        cfg.MODEL.POSITIVE_FRACTION_REL)

    self.box_coder = det_utils.BoxCoder((10., 10., 5., 5.))
class RoIFeatureExtractor_new(nn.Module):
    """Extract a fully connected and a convolutional feature per RoI.

    The module owns two heads that are both applied to the same input:
    ``fc_head`` (a ``TwoMLPHead``) and ``conv_head`` (a sequence of
    ``BasicBlock``/``Bottleneck`` pairs).
    """

    def __init__(self, in_features, num_classes, pretrained=False):
        """Set up the fully connected head and the convolutional head.

        Args:
            in_features: Representation size passed to the ``TwoMLPHead``.
            num_classes: Accepted but not used by this constructor.
            pretrained: Accepted but not used by this constructor.
        """
        super(RoIFeatureExtractor_new, self).__init__()
        self.fc_head = TwoMLPHead(in_channels=1280 * 7 * 7,
                                  representation_size=in_features)

        narrow = 256 * 5
        wide = 1024 * 5
        # NOTE(review): every BasicBlock is declared with `narrow` inputs,
        # including those that follow a Bottleneck declared with `wide`
        # outputs -- confirm against the BasicBlock/Bottleneck definitions.
        layers = []
        for _ in range(3):
            layers.append(BasicBlock(narrow, wide))
            layers.append(Bottleneck(wide, wide))
        self.conv_head = nn.Sequential(*layers)

    def forward(self, features):
        """Run both heads on ``features``.

        Args:
            features: Input tensor; the original comment gives its shape as
                ``(N, 1280, 7, 7)``.

        Returns:
            A tuple ``(fc_feature, conv_feature)`` where ``conv_feature`` is
            the convolutional head output averaged over its full spatial
            extent.
        """
        # Call the modules themselves rather than ``.forward`` so that any
        # registered hooks run.
        fc_feature = self.fc_head(features)
        conv_feature = self.conv_head(features)

        # Average over the whole spatial extent (kernel == feature map size).
        spatial_extent = (conv_feature.shape[2], conv_feature.shape[3])
        conv_feature = nn.functional.avg_pool2d(conv_feature,
                                                kernel_size=spatial_extent)
        return (fc_feature, conv_feature)
'min_size': 512, 'max_size': 1024, 'box_detections_per_img': 128, 'box_nms_thresh': 0.25, 'box_score_thresh': .75, 'rpn_nms_thresh': 0.25 } print(inference_args) # many small anchors anchor_generator = AnchorGenerator(sizes=tuple([(2, 4, 8, 16, 32) for r in range(5)]), aspect_ratios=tuple([(0.1, 0.25, 0.5, 1, 1.5, 2) for rh in range(5)])) box_head = TwoMLPHead(in_channels=7 * 7 * 256, representation_size=128) box_predictor = FastRCNNPredictor(in_channels=128, num_classes=3) mask_roi_pool = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3], output_size=14, sampling_ratio=2) mask_predictor = MaskRCNNPredictor(in_channels=256, dim_reduced=256, num_classes=3) inference_args['box_head'] = box_head inference_args['rpn_anchor_generator'] = anchor_generator inference_args['mask_roi_pool'] = mask_roi_pool inference_args['mask_predictor'] = mask_predictor inference_args['box_predictor'] = box_predictor maskrcnn_model = maskrcnn_resnet50_fpn(pretrained=False, **inference_args)
def __init__(
        self,
        num_classes=2,
        # transform parameters
        backbone_name='resnet50',
        min_size=256,
        max_size=512,
        image_mean=None,
        image_std=None,
        # RPN parameters
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7,
        rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        rpn_score_thresh=0.0,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512,
        box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # Ellipse regressor
        ellipse_roi_pool=None,
        ellipse_head=None,
        ellipse_predictor=None,
        ellipse_loss_metric="gaussian-angle"):
    """Assemble backbone, RPN, box and ellipse heads and the transform.

    The backbone is a ResNet-FPN created from ``backbone_name`` whose first
    convolution is replaced by a new single-input-channel ``Conv2d`` so the
    network accepts grayscale images.  Every component left as ``None`` is
    replaced by a default built here.

    Args:
        num_classes: Number of classes for the default box predictor and
            ellipse regressor.  Must be ``None`` when ``box_predictor`` is
            supplied, and not ``None`` otherwise.
        backbone_name: Name passed to ``resnet_fpn_backbone``.
        min_size, max_size, image_mean, image_std: Settings for
            ``GeneralizedRCNNTransform``; mean/std default to ``[0.156]``
            and ``[0.272]``.
        rpn_*: Region proposal network components and settings.
        box_*, bbox_reg_weights: Box branch components and settings.
        ellipse_roi_pool, ellipse_head, ellipse_predictor: Ellipse branch
            components.
        ellipse_loss_metric: Forwarded to ``EllipseRoIHeads``.

    Raises:
        ValueError: If the backbone has no ``out_channels`` attribute, or
            if ``num_classes`` and ``box_predictor`` are inconsistent.
    """
    backbone = resnet_fpn_backbone(backbone_name,
                                   pretrained=True,
                                   trainable_layers=5)
    # Input image is grayscale -> in_channels = 1 instead of 3 (COCO).
    backbone.body.conv1 = Conv2d(1,
                                 64,
                                 kernel_size=(7, 7),
                                 stride=(2, 2),
                                 padding=(3, 3),
                                 bias=False)

    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if box_predictor is not None:
            raise ValueError(
                "num_classes should be None when box_predictor is specified"
            )
    else:
        if box_predictor is None:
            raise ValueError(
                "num_classes should not be None when box_predictor "
                "is not specified")

    out_channels = backbone.out_channels
    representation_size = 1024

    # Region proposal network.
    if rpn_anchor_generator is None:
        anchor_sizes = ((32, ), (64, ), (128, ), (256, ), (512, ))
        aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        rpn_head = RPNHead(
            out_channels,
            rpn_anchor_generator.num_anchors_per_location()[0])

    rpn_pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                             testing=rpn_pre_nms_top_n_test)
    rpn_post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                              testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(rpn_anchor_generator,
                                rpn_head,
                                rpn_fg_iou_thresh,
                                rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction,
                                rpn_pre_nms_top_n,
                                rpn_post_nms_top_n,
                                rpn_nms_thresh,
                                score_thresh=rpn_score_thresh)

    # Box branch.
    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7,
            sampling_ratio=2)
    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        box_head = TwoMLPHead(out_channels * resolution**2,
                              representation_size)
    if box_predictor is None:
        box_predictor = FastRCNNPredictor(representation_size, num_classes)

    # Ellipse branch.
    if ellipse_roi_pool is None:
        ellipse_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7,
            sampling_ratio=2)
    if ellipse_head is None:
        # Size the head from the ellipse pooler (not the box pooler) so a
        # caller-supplied ellipse_roi_pool with a different output size
        # still gets a head of matching width.
        resolution = ellipse_roi_pool.output_size[0]
        ellipse_head = TwoMLPHead(out_channels * resolution**2,
                                  representation_size)
    if ellipse_predictor is None:
        ellipse_predictor = EllipseRegressor(representation_size,
                                             num_classes)

    roi_heads = EllipseRoIHeads(
        # Box
        box_roi_pool,
        box_head,
        box_predictor,
        box_fg_iou_thresh,
        box_bg_iou_thresh,
        box_batch_size_per_image,
        box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh,
        box_nms_thresh,
        box_detections_per_img,
        # Ellipse
        ellipse_roi_pool=ellipse_roi_pool,
        ellipse_head=ellipse_head,
        ellipse_predictor=ellipse_predictor,
        ellipse_loss_metric=ellipse_loss_metric)

    # Single-channel normalisation defaults.
    if image_mean is None:
        image_mean = [0.156]
    if image_std is None:
        image_std = [0.272]
    transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                         image_std)

    super().__init__(backbone, rpn, roi_heads, transform)
def faster_rcnn_resnet_fpn(backbone_name,
                           image_size,
                           num_classes,
                           max_objs_per_image,
                           backbone_pretrained=False,
                           logger=None,
                           obj_thresh=0.1):
    """Build a ``FasterRCNN`` on a ResNet backbone wrapped with an FPN.

    Args:
        backbone_name: Key looked up in ``models.resnet.__dict__`` to obtain
            the ResNet constructor.
        image_size: Pair whose two entries are divided by each stride to get
            the per-level sizes.
        num_classes: Forwarded to ``FasterRCNN``.
        max_objs_per_image: Forwarded to ``FasterRCNN``.
        backbone_pretrained: Passed as ``pretrained`` to the ResNet
            constructor.
        logger: Forwarded to ``FasterRCNN``.
        obj_thresh: Forwarded to ``FasterRCNN``.

    Returns:
        The configured ``FasterRCNN`` instance.
    """
    from torchvision.models.detection.backbone_utils import BackboneWithFPN
    from torchvision.models.detection.faster_rcnn import TwoMLPHead

    build_resnet = models.resnet.__dict__[backbone_name]
    resnet = build_resnet(pretrained=backbone_pretrained)

    return_layers = {
        'layer1': 'c2',
        'layer2': 'c3',
        'layer3': 'c4',
        'layer4': 'c5',
    }
    stage2_channels = resnet.inplanes // 8
    in_channels_list = [stage2_channels * 2**level for level in range(4)]
    out_channels = 256

    backbone = BackboneWithFPN(resnet, return_layers, in_channels_list,
                               out_channels)

    rpn_in_channels = out_channels
    roi_pooling_output_size = 7
    dim_roi_features = 1024  # length of the RoI feature vector
    # A conv + batch-norm + ReLU stem in front of the MLP head was tried
    # here previously and left disabled.
    roi_head = TwoMLPHead(out_channels * roi_pooling_output_size**2,
                          dim_roi_features)

    strides = tuple(2**exponent for exponent in range(2, 7))  # strides of P*
    sizes = tuple((ceil(image_size[0] / stride), ceil(image_size[1] / stride))
                  for stride in strides)
    scales = tuple((side**2, ) for side in (32, 64, 128, 256, 512))
    ratios = ((0.5, 1, 2), ) * len(scales)

    return FasterRCNN(
        backbone=backbone,
        roi_head=roi_head,
        dim_roi_features=dim_roi_features,
        image_size=image_size,
        num_classes=num_classes,
        strides=strides,
        sizes=sizes,
        scales=scales,
        ratios=ratios,
        rpn_in_channels=rpn_in_channels,
        max_objs_per_image=max_objs_per_image,
        roi_pooling="roi_align",
        roi_pooling_output_size=roi_pooling_output_size,
        obj_thresh=obj_thresh,
        logger=logger,
    )
box_detections_per_img = 100, box_fg_iou_thresh = 0.5 box_bg_iou_thresh = 0.5 box_batch_size_per_image = 256 box_positive_fraction = 0.25 bbox_reg_weights = None if box_roi_pool is None: box_roi_pool = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=7, sampling_ratio=2) if box_head is None: resolution = box_roi_pool.output_size[0] representation_size = 1024 box_head = TwoMLPHead(256 * resolution**2, representation_size) if box_predictor is None: representation_size = 1024 box_predictor = FastRCNNPredictor(representation_size, num_classes=21) rpn = RPN() roi_heads = RoIHeads( # Box box_roi_pool, box_head, box_predictor, box_fg_iou_thresh, box_bg_iou_thresh, box_batch_size_per_image, box_positive_fraction,
def __init__(self,
             train_data,
             mode='sgcls',
             require_overlap_det=True,
             use_bias=False,
             test_bias=False,
             backbone='vgg16',
             RELS_PER_IMG=1024,
             min_size=None,
             max_size=None,
             edge_model='motifs'):
    """Set up the detector and RoI feature modules of an SGG model.

    :param train_data: Provides ``ind_to_classes`` and ``ind_to_predicates``;
        also handed to ``FrequencyBias`` when ``use_bias`` is set
    :param mode: (sgcls, predcls, or sgdet)
    :param require_overlap_det: Whether two objects must intersect; only
        takes effect when ``mode`` is ``'sgdet'``
    :param use_bias: Whether to create ``self.freq_bias``
    :param test_bias: Stored on the instance as ``self.test_bias``
    :param backbone: ``'resnet50'`` or ``'vgg16'``; anything else raises
        ``NotImplementedError``
    :param RELS_PER_IMG: Stored on the instance as ``self.RELS_PER_IMG``
    :param min_size: Detector ``min_size``; defaults to 1333 for resnet50
        and ``IM_SCALE`` for vgg16
    :param max_size: Detector ``max_size``; same defaults as ``min_size``
    :param edge_model: Forwarded to ``UnionBoxesAndFeats``
    """
    super(RelModelBase, self).__init__()

    self.classes = train_data.ind_to_classes
    self.rel_classes = train_data.ind_to_predicates
    self.mode = mode
    self.backbone = backbone
    self.RELS_PER_IMG = RELS_PER_IMG
    self.pool_sz = 7
    self.stride = 16
    self.use_bias = use_bias
    self.test_bias = test_bias
    self.require_overlap = require_overlap_det and self.mode == 'sgdet'

    if self.backbone == 'resnet50':
        self.obj_dim = 1024
        self.fmap_sz = 21

        min_size = 1333 if min_size is None else min_size
        max_size = 1333 if max_size is None else max_size

        print('\nLoading COCO pretrained model maskrcnn_resnet50_fpn...\n')
        # See https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
        self.detector = torchvision.models.detection.maskrcnn_resnet50_fpn(
            pretrained=True,
            min_size=min_size,
            max_size=max_size,
            box_detections_per_img=50,
            box_score_thresh=0.2)

        # Swap the pre-trained predictor for one sized to our classes.
        roi_heads = self.detector.roi_heads
        in_features = roi_heads.box_predictor.cls_score.in_features
        roi_heads.box_predictor = FastRCNNPredictor(in_features,
                                                    len(self.classes))
        roi_heads.mask_predictor = None

        # First two children of roi_heads: index 0 is copied as the pooler,
        # index 1 as the RoI feature maps.
        first_child, second_child = list(roi_heads.children())[:2]
        self.roi_fmap_obj = copy.deepcopy(second_child)
        self.roi_fmap = copy.deepcopy(second_child)
        self.roi_pool = copy.deepcopy(first_child)

    elif self.backbone == 'vgg16':
        self.obj_dim = 4096
        self.fmap_sz = 38

        min_size = IM_SCALE if min_size is None else min_size
        max_size = IM_SCALE if max_size is None else max_size

        vgg = load_vgg(use_dropout=False,
                       use_relu=False,
                       use_linear=True,
                       pretrained=False)
        vgg.features.out_channels = 512

        anchors = AnchorGenerator(sizes=((32, 64, 128, 256, 512), ),
                                  aspect_ratios=((0.5, 1.0, 2.0), ))
        pooler = torchvision.ops.MultiScaleRoIAlign(
            featmap_names=['0'],
            output_size=self.pool_sz,
            sampling_ratio=2)
        head = TwoMLPHead(vgg.features.out_channels * self.pool_sz**2,
                          self.obj_dim)
        predictor = FastRCNNPredictor(self.obj_dim,
                                      len(train_data.ind_to_classes))

        self.detector = FasterRCNN(vgg.features,
                                   min_size=min_size,
                                   max_size=max_size,
                                   rpn_anchor_generator=anchors,
                                   box_head=head,
                                   box_predictor=predictor,
                                   box_roi_pool=pooler,
                                   box_detections_per_img=50,
                                   box_score_thresh=0.2)

        self.roi_fmap = nn.Sequential(nn.Flatten(), vgg.classifier)
        self.roi_fmap_obj = load_vgg(pretrained=False).classifier
        first_child = list(self.detector.roi_heads.children())[0]
        self.roi_pool = copy.deepcopy(first_child)

    else:
        raise NotImplementedError(self.backbone)

    self.edge_dim = self.detector.backbone.out_channels

    self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pool_sz,
                                          stride=self.stride,
                                          dim=self.edge_dim,
                                          edge_model=edge_model)

    if self.use_bias:
        self.freq_bias = FrequencyBias(train_data)
def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        out_channels,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
        pose_mean=None,
        pose_stddev=None,
        threed_68_points=None,
        threed_5_points=None,
        bbox_x_factor=1.1,
        bbox_y_factor=1.1,
        expand_forehead=0.3,
):
    """Store the RoI components and build an extra two-class branch.

    In addition to the supplied box, mask and keypoint components, this
    constructor creates ``class_roi_pool``, ``class_head`` and
    ``class_predictor`` for two classes.

    Args:
        box_roi_pool, box_head, box_predictor: Box branch modules.
        fg_iou_thresh, bg_iou_thresh: Thresholds for the proposal matcher.
        batch_size_per_image, positive_fraction: Sampler settings.
        bbox_reg_weights: Box coder weights; ``None`` selects
            ``(10.0, 10.0, 5.0, 5.0)``.
        score_thresh, nms_thresh, detections_per_img: Stored on the
            instance.
        out_channels: Channel count used to size ``class_head``.
        mask_*, keypoint_*: Optional modules stored on the instance.
        pose_mean, pose_stddev, threed_68_points, threed_5_points,
        bbox_x_factor, bbox_y_factor, expand_forehead: Stored on the
            instance unchanged.
    """
    super(RoIHeads, self).__init__()

    self.box_similarity = box_ops.box_iou

    # Assign ground-truth boxes for each proposal.
    self.proposal_matcher = det_utils.Matcher(
        fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)
    self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
        batch_size_per_image, positive_fraction)

    coder_weights = bbox_reg_weights
    if coder_weights is None:
        coder_weights = (10.0, 10.0, 5.0, 5.0)
    self.box_coder = det_utils.BoxCoder(coder_weights)

    # Box branch.
    self.box_roi_pool = box_roi_pool
    self.box_head = box_head
    self.box_predictor = box_predictor

    # Two-class branch.
    class_count = 2
    hidden_dim = 1024
    self.class_roi_pool = MultiScaleRoIAlign(
        featmap_names=["0", "1", "2", "3"],
        output_size=7,
        sampling_ratio=2)
    # NOTE(review): the class head is sized from box_roi_pool's output
    # size while class_roi_pool is built with output_size=7; the two agree
    # only when box_roi_pool also pools to 7 -- confirm which pooler
    # feeds class_head.
    pooled_side = box_roi_pool.output_size[0]
    self.class_head = TwoMLPHead(out_channels * pooled_side**2, hidden_dim)
    self.class_predictor = FastRCNNClassPredictor(hidden_dim, class_count)

    # Inference settings.
    self.score_thresh = score_thresh
    self.nms_thresh = nms_thresh
    self.detections_per_img = detections_per_img

    # Optional mask branch.
    self.mask_roi_pool = mask_roi_pool
    self.mask_head = mask_head
    self.mask_predictor = mask_predictor

    # Optional keypoint branch.
    self.keypoint_roi_pool = keypoint_roi_pool
    self.keypoint_head = keypoint_head
    self.keypoint_predictor = keypoint_predictor

    # Pose statistics and 3D reference points.
    self.pose_mean = pose_mean
    self.pose_stddev = pose_stddev
    self.threed_68_points = threed_68_points
    self.threed_5_points = threed_5_points

    # Bounding-box expansion factors.
    self.bbox_x_factor = bbox_x_factor
    self.bbox_y_factor = bbox_y_factor
    self.expand_forehead = expand_forehead
def __init__(self,
             out_channels,
             num_classes,
             input_mode,
             acf_head,
             fg_iou_thresh=0.5,
             bg_iou_thresh=0.5,
             batch_size_per_image=512,
             positive_fraction=0.25,
             bbox_reg_weights=None,
             box_score_thresh=0.05,
             box_nms_thresh=0.5,
             box_detections_per_img=100):
    """Build the detection, segmentation, PAF and keypoint branches.

    Args:
        out_channels: Channel count of the incoming feature maps; used to
            size the box head and the mask/PAF heads.
        num_classes: Number of classes for the box and mask predictors; the
            PAF and keypoint predictors use multiples of
            ``num_classes - 1``.
        input_mode: When equal to ``config.INPUT_RGBD`` an attention block
            is created.
        acf_head: One of ``'endpoints'``, ``'scatters'`` or
            ``'norm_vector'``; selects which extra keypoint branch is
            built.
        fg_iou_thresh, bg_iou_thresh: Thresholds for the proposal matcher.
        batch_size_per_image, positive_fraction: Sampler settings.
        bbox_reg_weights: Box coder weights; ``None`` selects
            ``(10., 10., 5., 5.)``.
        box_score_thresh, box_nms_thresh, box_detections_per_img: Stored on
            the instance.

    Raises:
        ValueError: If ``acf_head`` is not one of the supported values.
            (Previously this printed a message and called ``exit()``,
            which ended the process with a success status.)
    """
    # Validate before building any module so a bad value fails fast.
    supported_acf_heads = ('endpoints', 'scatters', 'norm_vector')
    if acf_head not in supported_acf_heads:
        raise ValueError(
            f"Unsupported acf_head {acf_head!r}; expected one of "
            f"{supported_acf_heads}")

    super(RoIHeadsExtend, self).__init__()

    self.in_channels = out_channels
    self.input_mode = input_mode
    self.score_thresh = box_score_thresh
    self.nms_thresh = box_nms_thresh
    self.detections_per_img = box_detections_per_img
    self.fg_iou_thresh = fg_iou_thresh
    self.bg_iou_thresh = bg_iou_thresh
    self.batch_size_per_image = batch_size_per_image
    self.positive_fraction = positive_fraction
    self.num_classes = num_classes

    # Detection
    self.box_similarity = box_ops.box_iou
    # Assign ground-truth boxes for each proposal.
    self.proposal_matcher = det_utils.Matcher(
        fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)
    self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(
        batch_size_per_image, positive_fraction)

    if bbox_reg_weights is None:
        bbox_reg_weights = (10., 10., 5., 5.)
    self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

    self.box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                           output_size=7,
                                           sampling_ratio=2)
    representation_size = 1024
    resolution = self.box_roi_pool.output_size[0]
    self.box_head = TwoMLPHead(out_channels * resolution**2,
                               representation_size)
    self.box_predictor = FastRCNNPredictor(representation_size, num_classes)

    # Segmentation
    self.shared_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                              output_size=14,
                                              sampling_ratio=2)
    mask_layers = (256, 256, 256, 256, 256, 256, 256, 256)
    mask_dilation = 1
    self.mask_head = MaskRCNNHeads(out_channels, mask_layers, mask_dilation)

    mask_predictor_in_channels = 256  # == mask_layers[-1]
    mask_dim_reduced = 256
    self.mask_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                            mask_dim_reduced, num_classes)

    # PAF branch: same head layout as the mask branch, with
    # 2 * (num_classes - 1) output channels.
    self.with_paf_branch = True
    if self.with_paf_branch:
        self.paf_head = MaskRCNNHeads(out_channels, mask_layers,
                                      mask_dilation)
        self.paf_predictor = MaskRCNNPredictor(mask_predictor_in_channels,
                                               mask_dim_reduced,
                                               2 * (num_classes - 1))

    if self.input_mode == config.INPUT_RGBD:
        self.attention_block = ContextBlock(256, 2)
    # Set unconditionally: the keypoint heads below always read it.
    self.global_feature_dim = 256

    # Exactly one of the three optional branches is enabled by acf_head.
    self.with_3d_keypoints = True
    self.with_axis_keypoints = acf_head == 'endpoints'
    self.regress_axis = acf_head == 'scatters'
    self.estimate_norm_vector = acf_head == 'norm_vector'

    keypoint_layers = (256, ) * 4
    self.keypoint_dim_reduced = keypoint_layers[-1]

    if self.with_3d_keypoints:
        self.vote_keypoint_head = Vote_Kpoints_head(
            self.global_feature_dim, keypoint_layers, "conv2d")
        self.vote_keypoint_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 3 * (num_classes - 1))

    if self.with_axis_keypoints:
        self.orientation_keypoint_head = Vote_Kpoints_head(
            self.global_feature_dim, keypoint_layers, "conv2d")
        self.orientation_keypoint_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 6 * (num_classes - 1))

    if self.regress_axis:
        self.axis_head = Vote_Kpoints_head(self.global_feature_dim,
                                           keypoint_layers, "conv2d")
        self.axis_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 4 * (num_classes - 1))

    if self.estimate_norm_vector:
        self.norm_vector_head = Vote_Kpoints_head(
            self.global_feature_dim, keypoint_layers, "conv2d")
        self.norm_vector_predictor = Vote_Kpoints_Predictor(
            self.keypoint_dim_reduced, 3 * (num_classes - 1))
def __init__(
        self,
        backbone,
        num_ID,
        num_classes=2,
        len_embeddings=128,
        # transform parameters
        min_size=720,
        max_size=960,
        image_mean=None,
        image_std=None,
        # RPN parameters
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.5,
        rpn_bg_iou_thresh=0.4,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512,
        box_positive_fraction=0.25,
        bbox_reg_weights=None):
    """Assemble the RPN, JDE RoI heads and transform around ``backbone``.

    Args:
        backbone: Feature extractor; must expose ``out_channels``.
        num_ID: Number of identities; used to derive the embedding scale
            and forwarded to ``JDE_RoIHeads``.
        num_classes: Number of classes for the default ``JDEPredictor``.
        len_embeddings: Embedding length for the predictor and RoI heads.
        min_size, max_size, image_mean, image_std: Settings for
            ``GeneralizedRCNNTransform``.
        rpn_*: Region proposal network components and settings.
        box_*, bbox_reg_weights: Box branch components and settings.

    Raises:
        ValueError: If ``backbone`` has no ``out_channels`` attribute.
    """
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    out_channels = backbone.out_channels

    # Region proposal network.
    if rpn_anchor_generator is None:
        anchor_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362))
        aspect_ratios = ((1 / 3, ), ) * len(anchor_sizes)
        rpn_anchor_generator = AnchorGenerator(anchor_sizes, aspect_ratios)
    if rpn_head is None:
        anchors_per_location = (
            rpn_anchor_generator.num_anchors_per_location()[0])
        rpn_head = RPNHead(out_channels, anchors_per_location)

    pre_nms_top_n = dict(training=rpn_pre_nms_top_n_train,
                         testing=rpn_pre_nms_top_n_test)
    post_nms_top_n = dict(training=rpn_post_nms_top_n_train,
                          testing=rpn_post_nms_top_n_test)

    rpn = RegionProposalNetwork(rpn_anchor_generator, rpn_head,
                                rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                rpn_batch_size_per_image,
                                rpn_positive_fraction, pre_nms_top_n,
                                post_nms_top_n, rpn_nms_thresh)

    # Box branch.
    hidden_dim = 1024
    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(featmap_names=[0, 1, 2, 3],
                                          output_size=7,
                                          sampling_ratio=2)
    if box_head is None:
        pooled_side = box_roi_pool.output_size[0]
        box_head = TwoMLPHead(out_channels * pooled_side**2, hidden_dim)

    # Embedding scale: sqrt(2) * log(num_ID - 1) when num_ID > 1, else 1.
    if num_ID > 1:
        emb_scale = math.sqrt(2) * math.log(num_ID - 1)
    else:
        emb_scale = 1

    if box_predictor is None:
        box_predictor = JDEPredictor(hidden_dim, num_classes, len_embeddings,
                                     emb_scale)

    roi_heads = JDE_RoIHeads(
        # Box
        box_roi_pool,
        box_head,
        box_predictor,
        box_fg_iou_thresh,
        box_bg_iou_thresh,
        box_batch_size_per_image,
        box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh,
        box_nms_thresh,
        box_detections_per_img,
        len_embeddings,
        num_ID)

    # Input normalisation defaults.
    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(min_size, max_size, image_mean,
                                         image_std)

    super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)
    self.eval_embed = False
def __init__(self):
    """Assemble the FPN backbone, transform, RPN and box RoI heads.

    All settings are fixed inside this constructor; it takes no arguments.
    """
    super(FasterRCNN, self).__init__()

    # Feature pyramid network on a pre-trained ResNet-101.
    self.fpn = resnet_fpn_backbone(backbone_name='resnet101',
                                   pretrained=True)

    # Anchors: one size per level, three aspect ratios each.
    anchor_sizes = ((32, ), (64, ), (128, ), (256, ), (512, ))
    aspect_ratios = ((0.5, 1.0, 2.0), ) * len(anchor_sizes)
    anchors = AnchorGenerator(anchor_sizes, aspect_ratios)

    # RPN head over 256-channel feature maps.
    head_for_rpn = RPNHead(256, anchors.num_anchors_per_location()[0])

    # Input transform.
    min_size, max_size = 800, 1333
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    self.transform = GeneralizedRCNNTransform(min_size, max_size,
                                              image_mean, image_std)

    # Region proposal network.
    pre_nms_top_n = dict(training=2000, testing=1000)
    post_nms_top_n = dict(training=2000, testing=1000)
    rpn_fg_iou_thresh = 0.7
    rpn_bg_iou_thresh = 0.3
    rpn_batch_size_per_image = 256
    rpn_positive_fraction = 0.5
    rpn_nms_thresh = 0.7
    self.rpn = RegionProposalNetwork(anchors, head_for_rpn,
                                     rpn_fg_iou_thresh, rpn_bg_iou_thresh,
                                     rpn_batch_size_per_image,
                                     rpn_positive_fraction, pre_nms_top_n,
                                     post_nms_top_n, rpn_nms_thresh)

    # Box branch: pooler, two-layer MLP head and predictor.
    num_classes = 101
    representation_size = 1024
    pooler = MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'],
                                output_size=7,
                                sampling_ratio=2)
    pooled_side = pooler.output_size[0]
    box_head = TwoMLPHead(256 * pooled_side**2, representation_size)
    box_predictor = FastRCNNPredictor(representation_size, num_classes)

    # Positional order: fg IoU, bg IoU, batch size per image,
    # positive fraction, bbox regression weights.
    training_settings = (0.5, 0.5, 512, 0.25, None)
    # Positional order: score threshold, NMS threshold, detections per image.
    inference_settings = (0.05, 0.5, 100)

    self.roi_heads = RoIHeads(pooler, box_head, box_predictor,
                              *training_settings, *inference_settings)
def __init__(self,
             backbone,
             num_classes=None,
             # transform parameters
             min_size=800,
             max_size=1333,
             image_mean=None,
             image_std=None,
             # RPN parameters
             rpn_anchor_generator=None,
             rpn_head=None,
             rpn_pre_nms_top_n_train=2000,
             rpn_pre_nms_top_n_test=1000,
             rpn_post_nms_top_n_train=2000,
             rpn_post_nms_top_n_test=1000,
             rpn_nms_thresh=0.7,
             rpn_fg_iou_thresh=0.7,
             rpn_bg_iou_thresh=0.3,
             rpn_batch_size_per_image=256,
             rpn_positive_fraction=0.5,
             # Box parameters
             box_roi_pool=None,
             box_head=None,
             box_predictor=None,
             box_score_thresh=0.05,
             box_nms_thresh=0.5,
             box_detections_per_img=100,
             box_fg_iou_thresh=0.5,
             box_bg_iou_thresh=0.5,
             box_batch_size_per_image=512,
             box_positive_fraction=0.25,
             bbox_reg_weights=None,
             # Pose parameters
             pose_roi_pool=None,
             pose_head=None,
             pose_predictor=None,
             # Translation
             translation_head=None,
             translation_predictor=None):
    """Initialise the parent detector, then install pose-aware RoI heads.

    The parent constructor is called with the transform, RPN and box
    arguments.  Afterwards ``self.roi_heads`` is replaced by a
    ``MyRoIHeads`` that also carries the pose and translation branches.

    Args:
        backbone: Feature extractor; its ``out_channels`` sizes the default
            heads.
        num_classes: Number of classes for the default predictors.  Must be
            ``None`` when ``pose_predictor`` is supplied.
        min_size, max_size, image_mean, image_std: Forwarded to the parent.
        rpn_*: Forwarded to the parent.
        box_*, bbox_reg_weights: Forwarded to the parent and reused for
            ``MyRoIHeads``.
        pose_roi_pool, pose_head, pose_predictor: Pose branch components;
            defaults are built for any left as ``None``.
        translation_head, translation_predictor: Translation branch
            components; defaults are built for any left as ``None``.

    Raises:
        ValueError: If both ``num_classes`` and ``pose_predictor`` are
            given.
    """
    assert isinstance(pose_roi_pool, (MultiScaleRoIAlign, type(None)))

    if num_classes is not None:
        if pose_predictor is not None:
            raise ValueError(
                "num_classes should be None when pose_predictor is specified")

    out_channels = backbone.out_channels
    representation_size = 1024

    # Pose
    if pose_roi_pool is None:
        pose_roi_pool = MultiScaleRoIAlign(
            featmap_names=[0, 1, 2, 3],
            output_size=7,
            sampling_ratio=2)

    if pose_head is None:
        resolution = pose_roi_pool.output_size[0]  # 7
        pose_head = TwoMLPHead(
            out_channels * resolution ** 2,
            representation_size)

    # Only build the default when the caller did not supply a predictor;
    # previously a supplied pose_predictor was overwritten.
    if pose_predictor is None:
        pose_predictor = PoseRCNNPredictor(representation_size, num_classes)

    # Translation
    if translation_head is None:
        translation_head = MLPFeatureExtractor()

    if translation_predictor is None:
        translation_predictor = MLPCONCATPredictor(representation_size)

    super(PoseRCNN, self).__init__(
        backbone, num_classes,
        # transform parameters
        min_size, max_size,
        image_mean, image_std,
        # RPN-specific parameters
        rpn_anchor_generator, rpn_head,
        rpn_pre_nms_top_n_train, rpn_pre_nms_top_n_test,
        rpn_post_nms_top_n_train, rpn_post_nms_top_n_test,
        rpn_nms_thresh,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        # Box parameters
        box_roi_pool, box_head, box_predictor,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights)

    # Build the box components for the replacement RoI heads; the parent
    # call above does not rebind these local names.
    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=[0, 1, 2, 3],
            output_size=7,
            sampling_ratio=2)

    if box_head is None:
        resolution = box_roi_pool.output_size[0]
        box_head = TwoMLPHead(
            out_channels * resolution ** 2,
            representation_size)

    if box_predictor is None:
        box_predictor = FastRCNNPredictor(
            representation_size,
            num_classes)

    # Replace the parent's RoI heads with the pose-aware version.
    self.roi_heads = MyRoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        pose_roi_pool=pose_roi_pool,
        pose_head=pose_head,
        pose_predictor=pose_predictor,
        translation_head=translation_head,
        translation_predictor=translation_predictor)
def build_model(cls, args, task):
    """Build a new model instance.

    Optional components (anchor generator, RPN head, RoI pooler, box head,
    box predictor) are read from ``task``; any that are ``None`` are
    replaced by defaults. Hyper-parameters are read from ``args``.

    Raises:
        ValueError: if the backbone lacks ``out_channels``, or if
            ``task.num_classes`` and ``task.box_predictor`` are
            inconsistent (exactly one of them must be usable).
    """
    # make sure that all args are properly defaulted
    # (in case there are any new ones)
    base_architecture(args)

    anchor_gen = task.rpn_anchor_generator
    rpn_head_module = task.rpn_head
    roi_pooler = task.box_roi_pool
    predictor = task.box_predictor
    head = task.box_head

    # setup backbone
    backbone = resnet_fpn_backbone(args.backbone, args.backbone_pretrained)
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)"
        )

    assert isinstance(anchor_gen, (AnchorGenerator, type(None)))
    assert isinstance(roi_pooler, (MultiScaleRoIAlign, type(None)))

    # A positive class count and an explicit predictor are mutually
    # exclusive, and one of the two is required.
    has_classes = task.num_classes > 0
    if has_classes and predictor is not None:
        raise ValueError("num_classes should be -1 when box_predictor is specified")
    if not has_classes and predictor is None:
        raise ValueError("num_classes should be > 0 when box_predictor is not specified")

    feature_channels = backbone.out_channels

    # --- region proposal network -------------------------------------
    if anchor_gen is None:
        default_sizes = ((32,), (64,), (128,), (256,), (512,))
        default_ratios = tuple((0.5, 1.0, 2.0) for _ in default_sizes)
        anchor_gen = AnchorGenerator(default_sizes, default_ratios)
    if rpn_head_module is None:
        anchors_per_location = anchor_gen.num_anchors_per_location()[0]
        rpn_head_module = RPNHead(feature_channels, anchors_per_location)

    pre_nms_top_n = {
        "training": args.rpn_pre_nms_top_n_train,
        "testing": args.rpn_pre_nms_top_n_test,
    }
    post_nms_top_n = {
        "training": args.rpn_post_nms_top_n_train,
        "testing": args.rpn_post_nms_top_n_test,
    }

    rpn = RPN(
        anchor_gen,
        rpn_head_module,
        args.rpn_fg_iou_thresh,
        args.rpn_bg_iou_thresh,
        args.rpn_batch_size_per_image,
        args.rpn_positive_fraction,
        pre_nms_top_n,
        post_nms_top_n,
        args.rpn_nms_thresh,
    )

    # --- box branch ----------------------------------------------------
    if roi_pooler is None:
        roi_pooler = MultiScaleRoIAlign(
            featmap_names=[0, 1, 2, 3],
            output_size=7,
            sampling_ratio=2,
        )

    hidden_dim = 1024
    if head is None:
        pooled_side = roi_pooler.output_size[0]
        head = TwoMLPHead(feature_channels * pooled_side ** 2, hidden_dim)

    if predictor is None:
        predictor = FastRCNNPredictor(hidden_dim, task.num_classes)

    roi_heads = RegionOfInterestHeads(
        # Box
        roi_pooler,
        head,
        predictor,
        args.box_fg_iou_thresh,
        args.box_bg_iou_thresh,
        args.box_batch_size_per_image,
        args.box_positive_fraction,
        args.bbox_reg_weights,
        args.box_score_thresh,
        args.box_nms_thresh,
        args.box_detections_per_img,
    )

    # --- input transform -----------------------------------------------
    # The defaults are written back onto ``args``, as the caller may read
    # them afterwards.
    if args.image_mean is None:
        args.image_mean = [0.485, 0.456, 0.406]
    if args.image_std is None:
        args.image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(
        args.min_size,
        args.max_size,
        args.image_mean,
        args.image_std,
    )

    return cls(backbone, rpn, roi_heads, transform)
def __init__(self,
             word_vectors_path='/Users/pranoyr/Downloads/GoogleNews-vectors-negative300.bin'):
    """Assemble the FPN backbone, RPN, transform and relation-aware RoI heads.

    Args:
        word_vectors_path: location of the word-vector file handed to
            ``get_obj_prd_vecs``. The default is the path that used to be
            hard-coded here, so existing no-argument callers are
            unaffected; pass a different path on other machines.

    NOTE(review): ``dataset_path`` is not defined in this method and is
    presumably a module-level name — confirm.
    """
    super(FasterRCNN, self).__init__()

    # Define FPN
    self.fpn = resnet_fpn_backbone(backbone_name='resnet101', pretrained=True)
    self.rpn = RPN()

    # transform parameters
    min_size = 800
    max_size = 1333
    image_mean = [0.485, 0.456, 0.406]
    image_std = [0.229, 0.224, 0.225]
    self.transform = GeneralizedRCNNTransform(
        min_size, max_size, image_mean, image_std)

    # Box parameters
    box_score_thresh = 0.5
    box_nms_thresh = 0.5
    box_detections_per_img = 100
    box_fg_iou_thresh = 0.5
    box_bg_iou_thresh = 0.5
    box_batch_size_per_image = 512
    box_positive_fraction = 0.25
    bbox_reg_weights = None
    num_classes = 101
    representation_size = 1024

    # The pooler, head and predictor were previously set to None and then
    # guarded by always-true ``is None`` checks; they are built directly.
    box_roi_pool = MultiScaleRoIAlign(
        featmap_names=['0', '1', '2', '3'],
        output_size=7,
        sampling_ratio=2)

    resolution = box_roi_pool.output_size[0]
    # 256 is the channel count assumed for the pooled feature maps.
    box_head = TwoMLPHead(
        256 * resolution ** 2,
        representation_size)

    box_predictor = FastRCNNPredictor(
        representation_size,
        num_classes)

    # initialize word vectors
    self.obj_vecs, self.prd_vecs = get_obj_prd_vecs(word_vectors_path, dataset_path)

    # The relation head's input is three times the box head's output
    # width (concat of SPO).
    self.RelDN = reldn_heads.reldn_head(
        box_head.fc7.out_features * 3, self.obj_vecs, self.prd_vecs)

    self.roi_heads = RoIHeads(
        # Box
        self.RelDN,
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)
def main(config, main_step):
    """Train and validate a Mask R-CNN segmentation model.

    Args:
        config: object exposing the attributes read below:
            ``use_pretrained_model``, ``model`` (checkpoint path or None),
            ``start_epoch``, ``model_name``,
            ``use_pretrained_resnet_backbone``, ``num_epochs``,
            ``save_dir``, ``train_data_dir``, ``val_data_dir``,
            ``imgs_dir``, ``gt_dir``, ``batch_size``, ``device``
            (``'cpu'`` or ``'cuda'``), ``save_every`` and ``lrate``.
        main_step: callable run once per epoch for the ``"train"`` stage
            and once for the ``"eval"`` stage; its return value is printed
            as that stage's loss.

    Raises:
        AssertionError: if ``config.device`` is not ``'cpu'`` or ``'cuda'``.
    """
    devices = ['cpu', 'cuda']

    # use pretrained?
    use_pretrained_model = config.use_pretrained_model
    pretrained_model = config.model
    if use_pretrained_model and pretrained_model is None:
        print("Model not provided, training from scratch")
        use_pretrained_model = False
    # This check used to read the local ``model`` before it was assigned,
    # which raised UnboundLocalError whenever no pretrained model was
    # requested. The checkpoint path is what is actually known here.
    if not use_pretrained_model and pretrained_model is not None:
        print("It seems you want to load the weights")
        use_pretrained_model = True

    model = None
    if use_pretrained_model:
        # NOTE(review): torch.load unpickles the file; only load
        # checkpoints from a trusted source.
        model = torch.load(pretrained_model)

    # import arguments from the config file
    start_epoch = config.start_epoch
    model_name = config.model_name
    backbone = config.use_pretrained_resnet_backbone
    num_epochs = config.num_epochs
    save_dir = config.save_dir
    train_data_dir = config.train_data_dir
    val_data_dir = config.val_data_dir
    imgs_dir = config.imgs_dir
    gt_dir = config.gt_dir
    batch_size = config.batch_size
    device = config.device
    save_every = config.save_every
    lrate = config.lrate

    # A full checkpoint already carries backbone weights, so the
    # pretrained backbone is not requested on top of it.
    if use_pretrained_model:
        backbone = False

    assert device in devices

    # makedirs also handles nested/absolute paths and an existing directory.
    os.makedirs(save_dir, exist_ok=True)

    if batch_size > 1:
        print("The model was implemented for batch size of one")

    if device == 'cuda' and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(device)

    torch.manual_seed(int(time.time()))

    ###########################################################################
    # DATASETS + DATALOADERS
    # Alex: could be added in the config file in the future
    # parameters for the dataset
    dataset_covid_pars_train = {
        'stage': 'train',
        'gt': os.path.join(train_data_dir, gt_dir),
        'data': os.path.join(train_data_dir, imgs_dir),
    }
    datapoint_covid_train = dataset.CovidCTData(**dataset_covid_pars_train)

    dataset_covid_pars_eval = {
        'stage': 'eval',
        'gt': os.path.join(val_data_dir, gt_dir),
        'data': os.path.join(val_data_dir, imgs_dir),
    }
    datapoint_covid_eval = dataset.CovidCTData(**dataset_covid_pars_eval)

    ###########################################################################
    dataloader_covid_pars_train = {'shuffle': True, 'batch_size': batch_size}
    dataloader_covid_train = data.DataLoader(
        datapoint_covid_train, **dataloader_covid_pars_train)

    dataloader_covid_pars_eval = {'shuffle': True, 'batch_size': batch_size}
    dataloader_covid_eval = data.DataLoader(
        datapoint_covid_eval, **dataloader_covid_pars_eval)

    ###########################################################################
    # MASK R-CNN model
    # Alex: these settings could also be added to the config
    maskrcnn_args = {
        'min_size': 512,
        'max_size': 1024,
        'rpn_batch_size_per_image': 1024,
        'rpn_positive_fraction': 0.75,
        'box_positive_fraction': 0.75,
        'box_fg_iou_thresh': 0.75,
        'box_bg_iou_thresh': 0.5,
        'num_classes': None,
        'box_batch_size_per_image': 1024,
        'box_nms_thresh': 0.75,
        'rpn_nms_thresh': 0.75,
    }

    # Alex: for ground glass opacity and consolidation segmentation
    # many small anchors
    # use all outputs of FPN
    # IMPORTANT!! For the pretrained weights, this determines the size of
    # the anchor layer in RPN!!!!
    # pretrained model must have anchors
    if not use_pretrained_model:
        anchor_generator = AnchorGenerator(
            sizes=((2, 4, 8, 16, 32),) * 5,
            aspect_ratios=((0.1, 0.25, 0.5, 1, 1.5, 2),) * 5)
    else:
        sizes = model['anchor_generator'].sizes
        aspect_ratios = model['anchor_generator'].aspect_ratios
        anchor_generator = AnchorGenerator(sizes, aspect_ratios)

    # num_classes: 3 (1 + 2)
    box_head_input_size = 256 * 7 * 7
    box_head = TwoMLPHead(in_channels=box_head_input_size,
                          representation_size=128)
    box_predictor = FastRCNNPredictor(in_channels=128, num_classes=3)
    mask_roi_pool = torchvision.ops.MultiScaleRoIAlign(
        featmap_names=[0, 1, 2, 3], output_size=14, sampling_ratio=2)
    mask_predictor = MaskRCNNPredictor(in_channels=256, dim_reduced=256,
                                       num_classes=3)

    maskrcnn_args['rpn_anchor_generator'] = anchor_generator
    maskrcnn_args['mask_roi_pool'] = mask_roi_pool
    maskrcnn_args['mask_predictor'] = mask_predictor
    maskrcnn_args['box_predictor'] = box_predictor
    maskrcnn_args['box_head'] = box_head

    # Instantiate the segmentation model
    maskrcnn_model = torchvision.models.detection.maskrcnn_resnet50_fpn(
        pretrained=False, pretrained_backbone=backbone, progress=True,
        **maskrcnn_args)

    # pretrained?
    if use_pretrained_model:
        maskrcnn_model.load_state_dict(model['model_weights'])
        if model['epoch']:
            start_epoch = int(model['epoch'])
        if model['model_name']:
            model_name = model['model_name']

    # Set to training mode
    print(maskrcnn_model)
    maskrcnn_model.train().to(device)

    optimizer_pars = {'lr': lrate, 'weight_decay': 1e-3}
    optimizer = torch.optim.Adam(list(maskrcnn_model.parameters()),
                                 **optimizer_pars)
    if use_pretrained_model and model['optimizer_state']:
        optimizer.load_state_dict(model['optimizer_state'])

    start_time = time.time()
    for e in range(start_epoch, num_epochs):
        train_loss_epoch = main_step(
            "train", e, dataloader_covid_train, optimizer, device,
            maskrcnn_model, save_every, lrate, model_name, None, None)
        eval_loss_epoch = main_step(
            "eval", e, dataloader_covid_eval, optimizer, device,
            maskrcnn_model, save_every, lrate, model_name,
            anchor_generator, save_dir)
        print("Epoch {0:d}: train loss = {1:.3f}, validation loss = {2:.3f}".
              format(e, train_loss_epoch, eval_loss_epoch))
    end_time = time.time()

    print("Training took {0:.1f} seconds".format(end_time - start_time))
def __init__(
        self,
        backbone,
        num_classes=None,
        # transform parameters
        min_size=800,
        max_size=1333,
        image_mean=None,
        image_std=None,
        # RPN parameters
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        rpn_fg_iou_thresh=0.7,
        rpn_bg_iou_thresh=0.3,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=512,
        box_positive_fraction=0.25,
        bbox_reg_weights=None):
    """Wire a backbone, an RPN, box RoI heads and an input transform together.

    Every component argument left as ``None`` is replaced by a default
    built from the remaining hyper-parameters. ``self.ssm`` is set to
    ``False`` before the parent constructor receives the four assembled
    parts.

    Raises:
        ValueError: if ``backbone`` has no ``out_channels`` attribute, or
            if ``num_classes`` and ``box_predictor`` are both given or
            both omitted.
    """
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    # Exactly one of num_classes / box_predictor has to be provided.
    classes_given = num_classes is not None
    predictor_given = box_predictor is not None
    if classes_given and predictor_given:
        raise ValueError(
            "num_classes should be None when box_predictor is specified")
    if not classes_given and not predictor_given:
        raise ValueError(
            "num_classes should not be None when box_predictor "
            "is not specified")

    feature_channels = backbone.out_channels

    # --- region proposal network -------------------------------------
    if rpn_anchor_generator is None:
        default_sizes = ((32, ), (64, ), (128, ), (256, ), (512, ))
        default_ratios = tuple((0.5, 1.0, 2.0) for _ in default_sizes)
        rpn_anchor_generator = AnchorGenerator(default_sizes, default_ratios)
    if rpn_head is None:
        anchors_per_location = (
            rpn_anchor_generator.num_anchors_per_location()[0])
        rpn_head = RPNHead(feature_channels, anchors_per_location)

    pre_nms_top_n = {
        "training": rpn_pre_nms_top_n_train,
        "testing": rpn_pre_nms_top_n_test,
    }
    post_nms_top_n = {
        "training": rpn_post_nms_top_n_train,
        "testing": rpn_post_nms_top_n_test,
    }

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        pre_nms_top_n, post_nms_top_n, rpn_nms_thresh)

    # --- box branch ----------------------------------------------------
    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=['0', '1', '2', '3'],
            output_size=7,
            sampling_ratio=2)

    hidden_dim = 1024
    if box_head is None:
        pooled_side = box_roi_pool.output_size[0]
        box_head = TwoMLPHead(feature_channels * pooled_side**2, hidden_dim)

    if box_predictor is None:
        box_predictor = FastRCNNPredictor(hidden_dim, num_classes)

    roi_heads = RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img)

    # --- input transform -----------------------------------------------
    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(
        min_size, max_size, image_mean, image_std)

    self.ssm = False
    super(FasterRCNN, self).__init__(backbone, rpn, roi_heads, transform)
def __init__(
        self,
        backbone,
        num_ID,
        num_classes=2,
        version='v1',
        # transform parameters
        min_size=800,
        max_size=1333,
        image_mean=None,
        image_std=None,
        # RPN parameters
        rpn_anchor_generator=None,
        rpn_head=None,
        rpn_pre_nms_top_n_train=2000,
        rpn_pre_nms_top_n_test=1000,
        rpn_post_nms_top_n_train=2000,
        rpn_post_nms_top_n_test=1000,
        rpn_nms_thresh=0.7,
        # FIXME: these two thresholds follow the paper
        # "Towards Real-Time Multi-Object Tracking".
        rpn_fg_iou_thresh=0.5,
        rpn_bg_iou_thresh=0.4,
        rpn_batch_size_per_image=256,
        rpn_positive_fraction=0.5,
        # Box parameters
        box_roi_pool=None,
        box_head=None,
        box_predictor=None,
        box_score_thresh=0.05,
        box_nms_thresh=0.5,
        box_detections_per_img=100,
        box_fg_iou_thresh=0.5,
        box_bg_iou_thresh=0.5,
        box_batch_size_per_image=256,
        box_positive_fraction=0.25,
        bbox_reg_weights=None,
        # Embedding parameters (FIXME: parameters added for this model)
        len_embeddings=128,
        embed_head=None,
        embed_extractor=None):
    """Assemble a detector whose RoI heads also produce identity embeddings.

    Components passed as ``None`` are replaced by defaults. In addition to
    the usual box branch, an embedding head and an embedding extractor are
    built and handed to ``JDE_RoIHeads`` together with ``len_embeddings``
    and ``num_ID``.

    Args:
        backbone: feature extractor exposing ``out_channels``.
        num_ID: number of identities; sets the embedding scale
            ``sqrt(2) * log(num_ID - 1)`` (1 when ``num_ID <= 1``).
        num_classes: class count for the default box predictor.
        version: ``'v1'`` builds a default ``featureHead``; ``'v2'`` leaves
            ``embed_head`` as ``None``. Stored on the RoI heads and on
            ``self``.
        len_embeddings: embedding length passed to the default extractor
            and to the RoI heads.

    The remaining arguments configure the transform, RPN and box branch.

    Raises:
        ValueError: if ``backbone`` has no ``out_channels`` attribute.
    """
    if not hasattr(backbone, "out_channels"):
        raise ValueError(
            "backbone should contain an attribute out_channels "
            "specifying the number of output channels (assumed to be the "
            "same for all the levels)")

    assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
    assert isinstance(box_roi_pool, (MultiScaleRoIAlign, type(None)))

    feature_channels = backbone.out_channels

    # --- region proposal network -------------------------------------
    # FIXME: anchor sizes were changed and only anchors with a
    # width-to-height ratio of 1/3 are used, following
    # "Towards Real-Time Multi-Object Tracking".
    # NOTE(review): torchvision's AnchorGenerator appears to interpret
    # aspect ratios as height / width — confirm that 1/3 yields the
    # intended anchor shape.
    if rpn_anchor_generator is None:
        default_sizes = ((16, 22), (32, 45), (64, 90), (128, 181), (256, 362))
        default_ratios = tuple((1 / 3, ) for _ in default_sizes)
        rpn_anchor_generator = AnchorGenerator(default_sizes, default_ratios)
    if rpn_head is None:
        anchors_per_location = (
            rpn_anchor_generator.num_anchors_per_location()[0])
        rpn_head = RPNHead(feature_channels, anchors_per_location)

    pre_nms_top_n = {
        "training": rpn_pre_nms_top_n_train,
        "testing": rpn_pre_nms_top_n_test,
    }
    post_nms_top_n = {
        "training": rpn_post_nms_top_n_train,
        "testing": rpn_post_nms_top_n_test,
    }

    rpn = RegionProposalNetwork(
        rpn_anchor_generator, rpn_head,
        rpn_fg_iou_thresh, rpn_bg_iou_thresh,
        rpn_batch_size_per_image, rpn_positive_fraction,
        pre_nms_top_n, post_nms_top_n, rpn_nms_thresh)

    # --- box branch ----------------------------------------------------
    if box_roi_pool is None:
        box_roi_pool = MultiScaleRoIAlign(
            featmap_names=[0, 1, 2, 3],
            output_size=11,
            sampling_ratio=2)

    hidden_dim = 1024
    if box_head is None:
        pooled_side = box_roi_pool.output_size[0]
        box_head = TwoMLPHead(feature_channels * pooled_side**2, hidden_dim)

    # --- embedding branch ----------------------------------------------
    if num_ID > 1:
        emb_scale = math.sqrt(2) * math.log(num_ID - 1)
    else:
        emb_scale = 1

    # FIXME: v1 is the version currently in use.
    if embed_head is None:
        if version == 'v1':
            pooled_side = box_roi_pool.output_size[0]
            embed_head = featureHead(
                feature_channels * pooled_side**2, hidden_dim)
        if version == 'v2':
            embed_head = None

    if box_predictor is None:
        box_predictor = FastRCNNPredictor(hidden_dim, num_classes)

    if embed_extractor is None:
        embed_extractor = featureExtractor(
            hidden_dim, len_embeddings, emb_scale)

    roi_heads = JDE_RoIHeads(
        # Box
        box_roi_pool, box_head, box_predictor,
        box_fg_iou_thresh, box_bg_iou_thresh,
        box_batch_size_per_image, box_positive_fraction,
        bbox_reg_weights,
        box_score_thresh, box_nms_thresh, box_detections_per_img,
        len_embeddings, num_ID, embed_head, embed_extractor)
    roi_heads.version = version

    # FIXME: this part is copied verbatim from the Faster R-CNN code.
    if image_mean is None:
        image_mean = [0.485, 0.456, 0.406]
    if image_std is None:
        image_std = [0.229, 0.224, 0.225]
    transform = GeneralizedRCNNTransform(
        min_size, max_size, image_mean, image_std)

    super(Jde_RCNN, self).__init__(backbone, rpn, roi_heads, transform)

    # FIXME: state used only while tracking; unrelated to training.
    self.version = version
    self.original_image_sizes = None
    self.preprocessed_images = None
    self.features = None
    self.box_features = None