def from_config(cls, cfg):
    backbone = build_backbone(cfg)
    backbone_shape = backbone.output_shape()
    try:
        feature_shapes = [backbone_shape[f] for f in cfg.MODEL.FCOS.IN_FEATURES]
    except KeyError:
        raise KeyError(
            f"Available keys: {backbone_shape.keys()}. Requested keys: {cfg.MODEL.FCOS.IN_FEATURES}"
        )
    head = FCOSHead(
        input_shape=feature_shapes,
        num_classes=cfg.MODEL.FCOS.NUM_CLASSES,
        conv_dims=[feature_shapes[0].channels] * cfg.MODEL.FCOS.NUM_CONVS,
        norm=cfg.MODEL.FCOS.HEAD_NORM,
    )
    return {
        "backbone": backbone,
        "head": head,
        "pixel_mean": cfg.MODEL.PIXEL_MEAN,
        "pixel_std": cfg.MODEL.PIXEL_STD,
        "num_classes": cfg.MODEL.FCOS.NUM_CLASSES,
        "head_in_features": cfg.MODEL.FCOS.IN_FEATURES,
        # Loss parameters:
        "focal_loss_alpha": cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA,
        "focal_loss_gamma": cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA,
        # Inference parameters:
        "test_score_thresh": cfg.MODEL.FCOS.SCORE_THRESH_TEST,
        "test_topk_candidates": cfg.MODEL.FCOS.TOPK_CANDIDATES_TEST,
        "test_nms_thresh": cfg.MODEL.FCOS.NMS_THRESH_TEST,
        "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
    }
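# A minimal sketch (names like ToyDetector are illustrative, not taken from the
# snippet above) of how a from_config() classmethod is normally paired with
# detectron2's @configurable decorator, so the same class can be built either
# from a cfg node or from explicit keyword arguments.
import torch
import torch.nn as nn
from detectron2.config import configurable
from detectron2.modeling import build_backbone


class ToyDetector(nn.Module):
    @configurable
    def __init__(self, *, backbone, head, pixel_mean, pixel_std):
        super().__init__()
        self.backbone = backbone
        self.head = head
        # buffers follow the module across .to(device) and state_dict round-trips
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1))
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1))

    @classmethod
    def from_config(cls, cfg):
        return {
            "backbone": build_backbone(cfg),
            "head": nn.Identity(),  # placeholder head for the sketch
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }


# Both construction paths then work:
#   model = ToyDetector(cfg)                          # kwargs supplied by from_config(cfg)
#   model = ToyDetector(backbone=b, head=h, pixel_mean=m, pixel_std=s)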
def from_config(cls, cfg): backbone = build_backbone(cfg) return { "backbone": backbone, # "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), "proposal_generator": None, "load_proposals": cfg.MODEL.LOAD_PROPOSALS, "roi_heads": build_roi_heads(cfg, backbone.output_shape()), "input_format": cfg.INPUT.FORMAT, "vis_period": cfg.VIS_PERIOD, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, "cpg": True if "CSC" in cfg.MODEL.ROI_HEADS.NAME or "WSJDS" in cfg.MODEL.ROI_HEADS.NAME or "XROIHeads" in cfg.MODEL.ROI_HEADS.NAME # if "CSC" in cfg.MODEL.ROI_HEADS.NAME or "WSJDS" in cfg.MODEL.ROI_HEADS.NAME else False, }
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.cfg = cfg
    # fmt: off
    self.num_classes = cfg.MODEL.CENTERNET.NUM_CLASSES
    # Loss parameters:
    # Inference parameters:
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.upsample = CenternetDeconv(cfg)
    self.head = CenternetHead(cfg)

    self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def from_config(cls, cfg): backbone = build_backbone(cfg) return { "backbone": backbone, "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), "roi_heads": build_roi_heads(cfg, backbone.output_shape()), "input_format": cfg.INPUT.FORMAT, "vis_period": cfg.VIS_PERIOD, "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, "kd_args": cfg.KD, "teacher": build_teacher(cfg), "teacher_input_format": cfg.TEACHER.INPUT.FORMAT, "teacher_pixel_mean": cfg.TEACHER.MODEL.PIXEL_MEAN, "teacher_pixel_std": cfg.TEACHER.MODEL.PIXEL_STD, }
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    if cfg.MODEL.CONDINST.MASK_HEAD.USE_MULTI:
        from .dynamic_mask_head_multi import build_dynamic_mask_head
        self.mask_head = build_dynamic_mask_head(cfg)
    else:
        from .dynamic_mask_head_old import build_dynamic_mask_head
        self.mask_head = build_dynamic_mask_head(cfg)
    self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape())
    self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS

    # build top module
    in_channels = self.proposal_generator.in_channels_to_top_module
    self.controller = nn.Conv2d(
        in_channels, self.mask_head.num_gen_params,
        kernel_size=3, stride=1, padding=1)
    torch.nn.init.normal_(self.controller.weight, std=0.01)
    torch.nn.init.constant_(self.controller.bias, 0)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape())
    self.mask_pred = build_mask_pred(cfg)

    self.mask_out_stride = cfg.MODEL.EMBEDMASK.MASK_OUT_STRIDE
    self.max_proposals = cfg.MODEL.EMBEDMASK.MAX_PROPOSALS
    self.topk_proposals_per_im = cfg.MODEL.EMBEDMASK.TOPK_PROPOSALS_PER_IM
    self.mask_th = cfg.MODEL.EMBEDMASK.MASK_TH

    # build proposal head
    in_channels = self.proposal_generator.in_channels_to_top_module
    self.proposal_head = ProposalHead(cfg, in_channels)

    # build pixel head
    self.pixel_head = EmbedHead(cfg, cfg.MODEL.EMBEDMASK.MASK_BRANCH.OUT_CHANNELS)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.refinement_head = build_edge_det_head(cfg, self.backbone.output_shape())

    self.mask_result_src = cfg.MODEL.DANCE.MASK_IN
    self.semantic_filter = cfg.MODEL.DANCE.SEMANTIC_FILTER
    self.semantic_filter_th = cfg.MODEL.DANCE.SEMANTIC_FILTER_TH
    self.need_concave_hull = cfg.MODEL.SNAKE_HEAD.LOSS_TYPE == "chamfer"
    self.roi_size = cfg.MODEL.DANCE.ROI_SIZE
    self.re_compute_box = cfg.MODEL.DANCE.RE_COMP_BOX
    self.visualize_path = cfg.MODEL.SNAKE_HEAD.VIS_PATH

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(-1, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(-1, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg): super().__init__() self.backbone = build_backbone(cfg) self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
def setup(file):
    # get cfg
    cfg = get_cfg()
    cfg.merge_from_file(file)
    cfg.SOLVER.IMS_PER_BATCH = 2

    # get data loader iter
    data_loader = build_detection_train_loader(cfg)
    data_loader_iter = iter(data_loader)
    batched_inputs = next(data_loader_iter)

    # build anchors
    backbone = build_backbone(cfg).to(device)
    images = [x["image"].to(device) for x in batched_inputs]
    images = ImageList.from_tensors(images, backbone.size_divisibility)
    features = backbone(images.tensor.float())
    input_shape = backbone.output_shape()
    in_features = cfg.MODEL.RPN.IN_FEATURES
    anchor_generator = build_anchor_generator(cfg, [input_shape[f] for f in in_features])
    anchors = anchor_generator([features[f] for f in in_features])
    anchors = Boxes.cat(anchors).to(device)

    # build matcher
    raw_matcher = Matcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                          cfg.MODEL.RPN.IOU_LABELS,
                          allow_low_quality_matches=True)
    matcher = TopKMatcher(cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, 9)

    return cfg, data_loader_iter, anchors, matcher, raw_matcher
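# A hedged usage sketch (helper name is illustrative) for the objects returned by
# setup(): build an IoU matrix between ground-truth boxes and the precomputed
# anchors, then let a matcher assign each anchor to a ground-truth index.
# TopKMatcher is assumed to follow the same call convention as detectron2's Matcher.
from detectron2.structures import pairwise_iou


def match_one_image(gt_boxes, anchors, matcher):
    # rows index ground-truth boxes, columns index anchors
    match_quality_matrix = pairwise_iou(gt_boxes, anchors)
    matched_idxs, matched_labels = matcher(match_quality_matrix)
    return matched_idxs, matched_labels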
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # loss weight
    self.instance_loss_weight = cfg.MODEL.SOGNET.INSTANCE_LOSS_WEIGHT

    # options when combining instance & semantic outputs
    # TODO: build inference
    self.stuff_area_limit = cfg.MODEL.SOGNET.POSTPROCESS.STUFF_AREA_LIMIT
    self.stuff_num_classes = (cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES -
                              cfg.MODEL.ROI_HEADS.NUM_CLASSES)

    self.combine_on = cfg.MODEL.SOGNET.COMBINE.ENABLED
    if self.combine_on:
        self.combine_overlap_threshold = cfg.MODEL.SOGNET.COMBINE.OVERLAP_THRESH
        self.combine_stuff_area_limit = cfg.MODEL.SOGNET.COMBINE.STUFF_AREA_LIMIT
        self.combine_instances_confidence_threshold = (
            cfg.MODEL.SOGNET.COMBINE.INSTANCES_CONFIDENCE_THRESH)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
    self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape())
    self.panoptic_head = build_panoptic_head(cfg)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.cfg = cfg
    # fmt: off
    self.num_classes = cfg.MODEL.CENTERNET.NUM_CLASSES
    # Loss parameters:
    # Inference parameters:
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.upsample = build_upsample_layers(cfg)
    self.head = build_head(cfg)
    # self.cls_head = cfg.build_cls_head(cfg)
    # self.wh_head = cfg.build_width_height_head(cfg)
    # self.reg_head = cfg.build_center_reg_head(cfg)

    # backbone_shape = self.backbone.output_shape()
    # feature_shapes = [backbone_shape[f] for f in self.in_features]

    self.mean, self.std = cfg.MODEL.PIXEL_MEAN, cfg.MODEL.PIXEL_STD
    # pixel_mean = torch.Tensor(self.mean).to(self.device).view(3, 1, 1)
    # pixel_std = torch.Tensor(self.std).to(self.device).view(3, 1, 1)
    # self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())

    assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(num_channels, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    if cfg.MODEL.BACKBONE.FREEZE:
        for p in self.backbone.parameters():
            p.requires_grad = False
        print("froze backbone parameters")

    if cfg.MODEL.PROPOSAL_GENERATOR.FREEZE:
        for p in self.proposal_generator.parameters():
            p.requires_grad = False
        print("froze proposal generator parameters")

    if cfg.MODEL.ROI_HEADS.FREEZE_FEAT:
        for p in self.roi_heads.box_head.parameters():
            p.requires_grad = False
        print("froze roi_box_head parameters")
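# A small self-contained helper (illustrative, not part of the snippet above) for
# checking what the FREEZE flags actually left trainable after construction.
def count_trainable_parameters(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    return trainable, total


# Example: trainable, total = count_trainable_parameters(model)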
def __init__(self, cfg):
    super().__init__(cfg)

    self.in_features = cfg.MODEL.FCOS.IN_FEATURES
    # Loss parameters:
    # defined by method <get_ground_truth>
    self.num_points_per_level = None
    self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
    self.center_sampling_radius = cfg.MODEL.FCOS.CENTER_SAMPLING_RADIUS
    self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS
    self.focal_loss_alpha = cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA
    self.iou_loss_type = cfg.MODEL.FCOS.IOU_LOSS_TYPE

    # Inference parameters:
    self.score_thresh = 0.3
    self.pre_nms_thresh = cfg.MODEL.FCOS.INFERENCE_TH
    self.pre_nms_top_n = cfg.MODEL.FCOS.PRE_NMS_TOP_N
    self.nms_thresh = cfg.MODEL.FCOS.NMS_TH
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    self.min_size = 0
    self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = FCOSRetinaNetHead(cfg, feature_shapes)

    self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.mask_head = build_dynamic_mask_head(cfg)
    self.mask_branch = build_mask_branch(cfg, self.backbone.output_shape())
    self.iuv_head = build_iuv_head(cfg)

    self.iuv_fea_dim = cfg.MODEL.CONDINST.IUVHead.CHANNELS
    self.s_ins_fea_dim = cfg.MODEL.CONDINST.MASK_HEAD.CHANNELS
    assert self.iuv_fea_dim + self.s_ins_fea_dim == cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS

    self.mask_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
    self.max_proposals = cfg.MODEL.CONDINST.MAX_PROPOSALS

    # build top module
    in_channels = self.proposal_generator.in_channels_to_top_module
    self.controller = nn.Conv2d(
        in_channels, self.mask_head.num_gen_params,
        kernel_size=3, stride=1, padding=1
    )
    torch.nn.init.normal_(self.controller.weight, std=0.01)
    torch.nn.init.constant_(self.controller.bias, 0)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std

    self._init_densepose_head(cfg)
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()

    self.image_size = cfg.MODEL.SSD.IMAGE_SIZE
    self.num_classes = cfg.MODEL.SSD.NUM_CLASSES
    self.in_features = cfg.MODEL.SSD.IN_FEATURES
    self.extra_layer_arch = cfg.MODEL.SSD.EXTRA_LAYER_ARCH["SIZE{}".format(self.image_size)]
    self.l2norm_scale = cfg.MODEL.SSD.L2NORM_SCALE
    # Loss parameters:
    self.loss_alpha = cfg.MODEL.SSD.LOSS_ALPHA
    self.smooth_l1_loss_beta = cfg.MODEL.SSD.SMOOTH_L1_LOSS_BETA
    self.negative_positive_ratio = cfg.MODEL.SSD.NEGATIVE_POSITIVE_RATIO
    # Inference parameters:
    self.score_threshold = cfg.MODEL.SSD.SCORE_THRESH_TEST
    self.nms_threshold = cfg.MODEL.SSD.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # Vis parameters
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]

    # Build extra layers
    self.extra_layers = self._make_extra_layers(
        feature_shapes[-1].channels, self.extra_layer_arch)
    extra_layer_channels = [c for c in self.extra_layer_arch if isinstance(c, int)]
    feature_shapes += [ShapeSpec(channels=c) for c in extra_layer_channels[1::2]]

    # Head
    self.head = SSDHead(cfg, feature_shapes)
    self.l2norm = L2Norm(backbone_shape[self.in_features[0]].channels, self.l2norm_scale)
    self.default_box_generator = DefaultBox(cfg)
    self.default_boxes = self.default_box_generator()

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.SSD.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.SSD.IOU_THRESHOLDS,
        cfg.MODEL.SSD.IOU_LABELS,
        allow_low_quality_matches=False,
    )

    self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))

    # Initialization
    self._init_weights()
def from_config(cls, cfg):
    backbone = build_backbone(cfg)
    backbone_shape = backbone.output_shape()
    backbone_level = cfg.MODEL.YOLOF.ENCODER.BACKBONE_LEVEL
    feature_shapes = [backbone_shape[backbone_level]]
    encoder = DilatedEncoder(cfg, backbone_shape)
    decoder = Decoder(cfg)
    anchor_generator = build_anchor_generator(cfg, feature_shapes)
    return {
        "backbone": backbone,
        "encoder": encoder,
        "decoder": decoder,
        "anchor_generator": anchor_generator,
        "box2box_transform": YOLOFBox2BoxTransform(
            weights=cfg.MODEL.YOLOF.BOX_TRANSFORM.BBOX_REG_WEIGHTS,
            add_ctr_clamp=cfg.MODEL.YOLOF.BOX_TRANSFORM.ADD_CTR_CLAMP,
            ctr_clamp=cfg.MODEL.YOLOF.BOX_TRANSFORM.CTR_CLAMP),
        "anchor_matcher": UniformMatcher(cfg.MODEL.YOLOF.MATCHER.TOPK),
        "pixel_mean": cfg.MODEL.PIXEL_MEAN,
        "pixel_std": cfg.MODEL.PIXEL_STD,
        "num_classes": cfg.MODEL.YOLOF.DECODER.NUM_CLASSES,
        "backbone_level": backbone_level,
        # Ignore thresholds:
        "pos_ignore_thresh": cfg.MODEL.YOLOF.POS_IGNORE_THRESHOLD,
        "neg_ignore_thresh": cfg.MODEL.YOLOF.NEG_IGNORE_THRESHOLD,
        # Loss parameters:
        "focal_loss_alpha": cfg.MODEL.YOLOF.LOSSES.FOCAL_LOSS_ALPHA,
        "focal_loss_gamma": cfg.MODEL.YOLOF.LOSSES.FOCAL_LOSS_GAMMA,
        "box_reg_loss_type": cfg.MODEL.YOLOF.LOSSES.BBOX_REG_LOSS_TYPE,
        # Inference parameters:
        "test_score_thresh": cfg.MODEL.YOLOF.SCORE_THRESH_TEST,
        "test_topk_candidates": cfg.MODEL.YOLOF.TOPK_CANDIDATES_TEST,
        "test_nms_thresh": cfg.MODEL.YOLOF.NMS_THRESH_TEST,
        "max_detections_per_image": cfg.MODEL.YOLOF.DETECTIONS_PER_IMAGE,
        # Vis parameters
        "vis_period": cfg.VIS_PERIOD,
        "input_format": cfg.INPUT.FORMAT,
    }
def test_rpn(self):
    torch.manual_seed(121)
    cfg = get_cfg()
    backbone = build_backbone(cfg)
    proposal_generator = RPN(cfg, backbone.output_shape())
    num_images = 2
    images_tensor = torch.rand(num_images, 20, 30)
    image_sizes = [(10, 10), (20, 30)]
    images = ImageList(images_tensor, image_sizes)
    image_shape = (15, 15)
    num_channels = 1024
    features = {"res4": torch.rand(num_images, num_channels, 1, 2)}
    gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
    gt_instances = Instances(image_shape)
    gt_instances.gt_boxes = Boxes(gt_boxes)
    with EventStorage():  # capture events in a new storage to discard them
        proposals, proposal_losses = proposal_generator(
            images, features, [gt_instances[0], gt_instances[1]]
        )

    expected_losses = {
        "loss_rpn_cls": torch.tensor(0.0804563984),
        "loss_rpn_loc": torch.tensor(0.0990132466),
    }
    for name in expected_losses.keys():
        err_msg = "proposal_losses[{}] = {}, expected losses = {}".format(
            name, proposal_losses[name], expected_losses[name]
        )
        self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg)

    expected_proposal_boxes = [
        Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])),
        Boxes(
            torch.tensor(
                [
                    [0, 0, 30, 20],
                    [0, 0, 16.7862777710, 13.1362524033],
                    [0, 0, 30, 13.3173446655],
                    [0, 0, 10.8602609634, 20],
                    [7.7165775299, 0, 27.3875980377, 20],
                ]
            )
        ),
    ]

    expected_objectness_logits = [
        torch.tensor([0.1225359365, -0.0133192837]),
        torch.tensor([0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837]),
    ]

    for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip(
        proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits
    ):
        self.assertEqual(len(proposal), len(expected_proposal_box))
        self.assertEqual(proposal.image_size, im_size)
        self.assertTrue(
            torch.allclose(proposal.proposal_boxes.tensor, expected_proposal_box.tensor)
        )
        self.assertTrue(torch.allclose(proposal.objectness_logits, expected_objectness_logit))
def test_rpn(self): torch.manual_seed(121) cfg = get_cfg() cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1) backbone = build_backbone(cfg) proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) num_images = 2 images_tensor = torch.rand(num_images, 20, 30) image_sizes = [(10, 10), (20, 30)] images = ImageList(images_tensor, image_sizes) image_shape = (15, 15) num_channels = 1024 features = {"res4": torch.rand(num_images, num_channels, 1, 2)} gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) gt_instances = Instances(image_shape) gt_instances.gt_boxes = Boxes(gt_boxes) with EventStorage(): # capture events in a new storage to discard them proposals, proposal_losses = proposal_generator( images, features, gt_instances) expected_losses = { "loss_rpn_cls": torch.tensor(0.0804563984), "loss_rpn_loc": torch.tensor(0.0990132466), } for name in expected_losses.keys(): assert torch.allclose(proposal_losses[name], expected_losses[name]) expected_proposal_boxes = [ Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])), Boxes( torch.tensor([ [0, 0, 30, 20], [0, 0, 16.7862777710, 13.1362524033], [0, 0, 30, 13.3173446655], [0, 0, 10.8602609634, 20], [7.7165775299, 0, 27.3875980377, 20], ])), ] expected_objectness_logits = [ torch.tensor([0.1225359365, -0.0133192837]), torch.tensor([ 0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837 ]), ] for i in range(len(image_sizes)): assert len(proposals[i]) == len(expected_proposal_boxes[i]) assert proposals[i].image_size == (image_sizes[i][0], image_sizes[i][1]) assert torch.allclose(proposals[i].proposal_boxes.tensor, expected_proposal_boxes[i].tensor) assert torch.allclose(proposals[i].objectness_logits, expected_objectness_logits[i])
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.scale_ranges = cfg.MODEL.SOLOV2.FPN_SCALE_RANGES
    self.strides = cfg.MODEL.SOLOV2.FPN_INSTANCE_STRIDES
    self.sigma = cfg.MODEL.SOLOV2.SIGMA
    # Instance parameters.
    self.num_classes = cfg.MODEL.SOLOV2.NUM_CLASSES
    self.num_kernels = cfg.MODEL.SOLOV2.NUM_KERNELS
    self.num_grids = cfg.MODEL.SOLOV2.NUM_GRIDS
    self.instance_in_features = cfg.MODEL.SOLOV2.INSTANCE_IN_FEATURES
    self.instance_strides = cfg.MODEL.SOLOV2.FPN_INSTANCE_STRIDES
    self.instance_in_channels = cfg.MODEL.SOLOV2.INSTANCE_IN_CHANNELS  # = fpn.
    self.instance_channels = cfg.MODEL.SOLOV2.INSTANCE_CHANNELS

    # Mask parameters.
    self.mask_on = cfg.MODEL.MASK_ON
    self.mask_in_features = cfg.MODEL.SOLOV2.MASK_IN_FEATURES
    self.mask_in_channels = cfg.MODEL.SOLOV2.MASK_IN_CHANNELS
    self.mask_channels = cfg.MODEL.SOLOV2.MASK_CHANNELS
    self.num_masks = cfg.MODEL.SOLOV2.NUM_MASKS

    # Inference parameters.
    self.max_before_nms = cfg.MODEL.SOLOV2.NMS_PRE
    self.score_threshold = cfg.MODEL.SOLOV2.SCORE_THR
    self.update_threshold = cfg.MODEL.SOLOV2.UPDATE_THR
    self.mask_threshold = cfg.MODEL.SOLOV2.MASK_THR
    self.max_per_img = cfg.MODEL.SOLOV2.MAX_PER_IMG
    self.nms_kernel = cfg.MODEL.SOLOV2.NMS_KERNEL
    self.nms_sigma = cfg.MODEL.SOLOV2.NMS_SIGMA
    self.nms_type = cfg.MODEL.SOLOV2.NMS_TYPE

    # build the backbone.
    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()

    # build the ins head.
    instance_shapes = [backbone_shape[f] for f in self.instance_in_features]
    self.ins_head = SOLOv2InsHead(cfg, instance_shapes)

    # build the mask head.
    mask_shapes = [backbone_shape[f] for f in self.mask_in_features]
    self.mask_head = SOLOv2MaskHead(cfg, mask_shapes)

    # image transform
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def verify_rpn(self, conv_dims, expected_conv_dims):
    torch.manual_seed(121)
    cfg = get_cfg()
    cfg.MODEL.RPN.CONV_DIMS = conv_dims
    backbone = build_backbone(cfg)
    proposal_generator = RPN(cfg, backbone.output_shape())
    for k, conv in enumerate(proposal_generator.rpn_head.conv):
        self.assertEqual(expected_conv_dims[k], conv.out_channels)
    return proposal_generator
def __init__(self, cfg):
    super().__init__()

    # get the device of the model
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
    self.in_features = cfg.MODEL.TENSOR_MASK.IN_FEATURES
    self.anchor_sizes = cfg.MODEL.ANCHOR_GENERATOR.SIZES
    self.num_levels = len(cfg.MODEL.ANCHOR_GENERATOR.SIZES)
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST
    self.detections_im = cfg.TEST.DETECTIONS_PER_IMAGE
    # Mask parameters:
    self.mask_on = cfg.MODEL.MASK_ON
    self.mask_loss_weight = cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT
    self.mask_pos_weight = torch.tensor(
        cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT,
        dtype=torch.float32,
        device=self.device)
    self.bipyramid_on = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
    # fmt: on

    # build the backbone
    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    feature_strides = [x.stride for x in feature_shapes]

    # build anchors
    self.anchor_generator = TensorMaskAnchorGenerator(cfg, feature_shapes)
    self.num_anchors = self.anchor_generator.num_cell_anchors[0]
    anchors_min_level = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0]
    self.mask_sizes = [size // feature_strides[0] for size in anchors_min_level]
    self.min_anchor_size = min(anchors_min_level) - feature_strides[0]

    # head of the TensorMask
    self.head = TensorMaskHead(cfg, self.num_levels, self.num_anchors,
                               self.mask_sizes, feature_shapes)

    # box transform
    self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super(RetinaNet, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # Vis parameters
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT
    # fmt: on

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(num_channels, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    """
    In Detectron1, loss is normalized by number of foreground samples in the batch.
    When batch size is 1 per GPU, #foreground has a large variance and using it
    leads to lower performance. Here we maintain an EMA of #foreground to stabilize
    the normalizer.
    """
    self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
    self.loss_normalizer_momentum = 0.9
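# A minimal sketch (method name is illustrative, not taken from the snippet above)
# of how the EMA loss normalizer described in the docstring is typically maintained
# inside the loss computation: blend the running estimate with the current batch's
# foreground count, then divide the summed losses by the result.
def _update_loss_normalizer(self, num_foreground):
    m = self.loss_normalizer_momentum
    self.loss_normalizer = m * self.loss_normalizer + (1 - m) * num_foreground
    # guard against normalizing by a value close to zero early in training
    return max(self.loss_normalizer, 1.0)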
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(-1, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(-1, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()

    self.num_classes = cfg.MODEL.RETINAFACE.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINAFACE.IN_FEATURES
    # loss parameters
    self.focal_loss_alpha = cfg.MODEL.RETINAFACE.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINAFACE.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINAFACE.SMOOTH_L1_LOSS_BETA
    self.loc_weight = cfg.MODEL.RETINAFACE.LOC_WEIGHT
    # inference parameters
    self.score_threshold = cfg.MODEL.RETINAFACE.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINAFACE.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINAFACE.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # visualization parameters
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaFaceHead(cfg, feature_shapes)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RETINAFACE.BBOX_REG_WEIGHTS
    )
    self.landmark2landmark_transform = Landmark2LandmarkTransform(
        weights=cfg.MODEL.RETINAFACE.LANDMARK_REG_WEIGHTS
    )
    self.matcher = Matcher(
        cfg.MODEL.RETINAFACE.IOU_THRESHOLDS,
        cfg.MODEL.RETINAFACE.IOU_LABELS,
        allow_low_quality_matches=True
    )

    self.register_buffer(
        "pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)
    )
    self.register_buffer(
        "pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)
    )

    """
    In Detectron1, loss is normalized by number of foreground samples in the batch.
    When batch size is 1 per GPU, #foreground has a large variance and using it
    leads to lower performance. Here we maintain an EMA of #foreground to stabilize
    the normalizer.
    """
    # initialize with any reasonable #fg that's not too small
    self.loss_normalizer = 100
    self.loss_normalizer_momentum = 0.9
def __init__(self, cfg):
    super().__init__()
    self.num_classes = cfg.MODEL.CLSNET.NUM_CLASSES
    self.in_features = cfg.MODEL.CLSNET.IN_FEATURES
    self.bottom_up = build_backbone(cfg)
    self.criterion = nn.CrossEntropyLoss()
    self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
def __init__(self, cfg):
    super().__init__()
    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT

    assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
    self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
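# A minimal sketch (assumed, not taken from the snippet above) of the preprocessing
# step that consumes the registered pixel_mean / pixel_std buffers: normalize each
# input image and batch them into an ImageList padded to the backbone's size
# divisibility, mirroring detectron2's GeneralizedRCNN.preprocess_image.
from detectron2.structures import ImageList


def preprocess_image(self, batched_inputs):
    # move images to the buffers' device and normalize with the registered stats
    images = [x["image"].to(self.pixel_mean.device) for x in batched_inputs]
    images = [(x - self.pixel_mean) / self.pixel_std for x in images]
    return ImageList.from_tensors(images, self.backbone.size_divisibility)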
def test_rroi_heads(self):
    torch.manual_seed(121)
    cfg = get_cfg()
    cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN"
    cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
    cfg.MODEL.ROI_HEADS.NAME = "RROIHeads"
    cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead"
    cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2
    cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1)
    cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead"
    cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignRotated"
    cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1)
    backbone = build_backbone(cfg)
    num_images = 2
    images_tensor = torch.rand(num_images, 20, 30)
    image_sizes = [(10, 10), (20, 30)]
    images = ImageList(images_tensor, image_sizes)
    num_channels = 1024
    features = {"res4": torch.rand(num_images, num_channels, 1, 2)}

    image_shape = (15, 15)
    gt_boxes0 = torch.tensor([[2, 2, 2, 2, 30], [4, 4, 4, 4, 0]], dtype=torch.float32)
    gt_instance0 = Instances(image_shape)
    gt_instance0.gt_boxes = RotatedBoxes(gt_boxes0)
    gt_instance0.gt_classes = torch.tensor([2, 1])
    gt_boxes1 = torch.tensor([[1.5, 5.5, 1, 3, 0], [8.5, 4, 3, 2, -50]], dtype=torch.float32)
    gt_instance1 = Instances(image_shape)
    gt_instance1.gt_boxes = RotatedBoxes(gt_boxes1)
    gt_instance1.gt_classes = torch.tensor([1, 2])
    gt_instances = [gt_instance0, gt_instance1]

    proposal_generator = build_proposal_generator(cfg, backbone.output_shape())
    roi_heads = build_roi_heads(cfg, backbone.output_shape())

    with EventStorage():  # capture events in a new storage to discard them
        proposals, proposal_losses = proposal_generator(images, features, gt_instances)
        _, detector_losses = roi_heads(images, features, proposals, gt_instances)

    expected_losses = {
        "loss_cls": torch.tensor(4.381618499755859),
        "loss_box_reg": torch.tensor(0.0011829272843897343),
    }
    for name in expected_losses.keys():
        err_msg = "detector_losses[{}] = {}, expected losses = {}".format(
            name, detector_losses[name], expected_losses[name])
        self.assertTrue(
            torch.allclose(detector_losses[name], expected_losses[name]), err_msg)
def __init__(self, cfg):
    super().__init__()
    self.mask_on = cfg.MODEL.MASK_ON
    self.backbone = build_backbone(cfg)
    self.position_embedding = build_position_encoding(cfg)
    self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
def __init__(self, cfg):
    super().__init__()

    # fmt: off
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Mask parameters:
    self.discard_mask_area = cfg.MODEL.YOLACT.DISCARD_MASK_AREA
    self.num_masks = cfg.MODEL.YOLACT.NUM_MASKS
    # Loss parameters:
    self.sem_seg_alpha = cfg.MODEL.YOLACT.SEM_SEG_ALPHA
    self.mask_alpha = cfg.MODEL.YOLACT.MASK_ALPHA
    self.mask_reweight = cfg.MODEL.YOLACT.MASK_REWEIGHT
    self.maskiou_alpha = cfg.MODEL.YOLACT.MASKIOU_ALPHA
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    # retinanet_resnet_fpn_backbone
    self.backbone = build_backbone(cfg)

    # dict[str->ShapeSpec]
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    # base RetinaNet head with an added mask coefficient branch
    self.head = YolactHead(cfg, feature_shapes)
    # which backbone output goes to the protonet; see official yolact's cfg.proto_src.
    # The default there is `res2`, but here it is `res3`.
    self.protonet = ProtoNet(cfg, feature_shapes[0])
    # for mask scoring
    self.maskiou_net = MaskIouNet(cfg)
    # semantic segmentation to help training
    self.semantic_seg_conv = nn.Conv2d(feature_shapes[0].channels, self.num_classes, 1)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = build_backbone(cfg)
    self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape())
    self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape())
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT

    self.current_video = None
    self.frame_idx = 0

    if cfg.MODEL.SPATIOTEMPORAL.FREEZE_BACKBONE:
        self.freeze_component(self.backbone)
    if cfg.MODEL.SPATIOTEMPORAL.FREEZE_PROPOSAL_GENERATOR:
        self.freeze_component(self.proposal_generator)

    self.long_term = cfg.MODEL.SPATIOTEMPORAL.LONG_TERM
    self.temporal_dropout = cfg.MODEL.SPATIOTEMPORAL.TEMPORAL_DROPOUT
    self.num_frames = cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES
    self.num_keyframes = cfg.MODEL.SPATIOTEMPORAL.NUM_KEYFRAMES
    self.keyframe_interval = cfg.MODEL.SPATIOTEMPORAL.KEYFRAME_INTERVAL

    self.reference_frame_idx = -1
    if cfg.MODEL.SPATIOTEMPORAL.FORWARD_AGGREGATION:
        # (f_{t-NUM_FRAMES}, ..., f_{t-1}, f_t, f_{t+1}, ..., f_{t+NUM_FRAMES})
        self.num_frames = (2 * self.num_frames) + 1
        self.reference_frame_idx = cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES

    if self.temporal_dropout:
        assert cfg.MODEL.SPATIOTEMPORAL.FORWARD_AGGREGATION, \
            "Temporal dropout without forward aggregation."

    if self.temporal_dropout:
        self.reference_frame_idx = cfg.MODEL.SPATIOTEMPORAL.NUM_FRAMES
        self.train_reference_frame_idx = 1
    else:
        self.train_reference_frame_idx = self.reference_frame_idx

    self.short_term_feature_buffer = deque(maxlen=self.num_frames)
    self.long_term_feature_buffer = deque(maxlen=self.num_keyframes)
    self.long_term_roi_buffer = deque(maxlen=self.num_keyframes)

    # RPN buffers
    self.predict_proposals = None
    self.predict_objectness_logits = None

    assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(num_channels, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)