def __init__(self, cfg):
    super().__init__()

    # get the device of the model
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.TENSOR_MASK.NUM_CLASSES
    self.in_features = cfg.MODEL.TENSOR_MASK.IN_FEATURES
    self.anchor_sizes = cfg.MODEL.ANCHOR_GENERATOR.SIZES
    self.num_levels = len(cfg.MODEL.ANCHOR_GENERATOR.SIZES)
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.detections_im = cfg.TEST.DETECTIONS_PER_IMAGE
    # Mask parameters:
    self.mask_on = cfg.MODEL.MASK_ON
    self.mask_loss_weight = cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT
    self.mask_pos_weight = torch.tensor(
        cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT,
        dtype=torch.float32,
        device=self.device)
    self.bipyramid_on = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON
    # fmt: on

    # build the backbone
    self.backbone = cfg.build_backbone(cfg)

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    feature_strides = [x.stride for x in feature_shapes]

    # build anchors
    self.anchor_generator = TensorMaskAnchorGenerator(cfg, feature_shapes)
    self.num_anchors = self.anchor_generator.num_cell_anchors[0]
    anchors_min_level = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0]
    self.mask_sizes = [
        size // feature_strides[0] for size in anchors_min_level
    ]
    self.min_anchor_size = min(anchors_min_level) - feature_strides[0]

    # head of the TensorMask
    self.head = TensorMaskHead(cfg, self.num_levels, self.num_anchors,
                               self.mask_sizes, feature_shapes)
    # box transform
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
        3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
        3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

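# Worked example for the mask-size arithmetic above (illustrative numbers,
# not necessarily this config's values): with min-level anchor sizes
# (44, 60) and a first-level feature stride of 8,
# mask_sizes = [44 // 8, 60 // 8] = [5, 7] mask units, and
# min_anchor_size = 44 - 8 = 36.
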
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.image_size = cfg.MODEL.SSD.IMAGE_SIZE
    self.num_classes = cfg.MODEL.SSD.NUM_CLASSES
    self.in_features = cfg.MODEL.SSD.IN_FEATURES
    self.extra_layer_arch = cfg.MODEL.SSD.EXTRA_LAYER_ARCH[str(self.image_size)]
    self.l2norm_scale = cfg.MODEL.SSD.L2NORM_SCALE
    # Loss parameters:
    self.loss_alpha = cfg.MODEL.SSD.LOSS_ALPHA
    self.smooth_l1_loss_beta = cfg.MODEL.SSD.SMOOTH_L1_LOSS_BETA
    self.negative_positive_ratio = cfg.MODEL.SSD.NEGATIVE_POSITIVE_RATIO
    # Inference parameters:
    self.score_threshold = cfg.MODEL.SSD.SCORE_THRESH_TEST
    self.nms_threshold = cfg.MODEL.SSD.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]

    # build extra layers
    self.extra_layers = self._make_extra_layers(
        feature_shapes[-1].channels, self.extra_layer_arch)
    extra_layer_channels = [
        c for c in self.extra_layer_arch if isinstance(c, int)
    ]
    feature_shapes += [
        ShapeSpec(channels=c) for c in extra_layer_channels[1::2]
    ]

    # ssd head
    self.head = SSDHead(cfg, feature_shapes)
    self.l2norm = L2Norm(512, self.l2norm_scale)
    self.default_box_generator = cfg.build_default_box_generator(cfg)
    self.default_boxes = self.default_box_generator()

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.SSD.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.SSD.IOU_THRESHOLDS,
        cfg.MODEL.SSD.IOU_LABELS,
        allow_low_quality_matches=False,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
        3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
        3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    # Initialization
    self._init_weights()

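# Worked example for the [1::2] slice above (illustrative, assuming the
# common SSD300 extra-layer arch [256, 'S', 512, 128, 'S', 256, 128, 256,
# 128, 256]): the integer entries alternate intermediate and output widths,
# [256, 512, 128, 256, 128, 256, 128, 256], so every second integer gives
# the per-stage output channels [512, 256, 256, 256] that are appended as
# new feature shapes for the head.
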
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.num_classes = cfg.MODEL.EFFICIENTDET.NUM_CLASSES
    self.in_features = cfg.MODEL.EFFICIENTDET.IN_FEATURES
    self.freeze_bn = cfg.MODEL.EFFICIENTDET.FREEZE_BN
    self.freeze_backbone = cfg.MODEL.EFFICIENTDET.FREEZE_BACKBONE
    self.input_size = cfg.MODEL.BIFPN.INPUT_SIZE
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.EFFICIENTDET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.EFFICIENTDET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.EFFICIENTDET.SMOOTH_L1_LOSS_BETA
    self.box_loss_weight = cfg.MODEL.EFFICIENTDET.BOX_LOSS_WEIGHT
    self.regress_norm = cfg.MODEL.EFFICIENTDET.REG_NORM
    # Inference parameters:
    self.score_threshold = cfg.MODEL.EFFICIENTDET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.EFFICIENTDET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.EFFICIENTDET.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = EfficientDetHead(cfg, feature_shapes)
    self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.EFFICIENTDET.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.EFFICIENTDET.IOU_THRESHOLDS,
        cfg.MODEL.EFFICIENTDET.IOU_LABELS,
        allow_low_quality_matches=False,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
        3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
        3, 1, 1)
    # Unlike the other models here, inputs are first scaled from [0, 255]
    # to [0, 1] before being standardized.
    self.normalizer = lambda x: (x / 255. - pixel_mean) / pixel_std

    if self.freeze_bn:
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.eval()
    if self.freeze_backbone:
        for name, params in self.named_parameters():
            if name.startswith("backbone.bottom_up"):
                params.requires_grad = False

    self.to(self.device)

def get_outputs_converter(predict_net, init_net):
    self = types.SimpleNamespace()
    serialized_anchor_generator = io.BytesIO(
        get_pb_arg_vals(predict_net, "serialized_anchor_generator", None))
    self.anchor_generator = torch.load(serialized_anchor_generator)
    bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None)
    self.box2box_transform = Box2BoxTransform(
        weights=tuple(bbox_reg_weights))
    self.score_threshold = get_pb_arg_valf(predict_net, "score_threshold", None)
    self.topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None)
    self.nms_threshold = get_pb_arg_valf(predict_net, "nms_threshold", None)
    self.max_detections_per_image = get_pb_arg_vali(
        predict_net, "max_detections_per_image", None)

    # hack to reuse inference code from RetinaNet
    self.inference = functools.partial(meta_arch.RetinaNet.inference, self)
    self.inference_single_image = functools.partial(
        meta_arch.RetinaNet.inference_single_image, self)

    def f(batched_inputs, c2_inputs, c2_results):
        image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]]
        num_features = len(
            [x for x in c2_results.keys() if x.startswith("box_cls_")])
        box_cls = [
            c2_results["box_cls_{}".format(i)] for i in range(num_features)
        ]
        box_delta = [
            c2_results["box_delta_{}".format(i)] for i in range(num_features)
        ]

        # At each feature level, the dummy feature must have the same batch
        # size and spatial dimensions as box_cls and box_delta, so the anchor
        # generator produces anchors on the right grid.
        dummy_features = [
            box_delta[i].clone()[:, 0:0, :, :] for i in range(num_features)
        ]
        anchors = self.anchor_generator(dummy_features)

        # self.num_classes can be inferred from the head output shapes
        self.num_classes = box_cls[0].shape[1] // (box_delta[0].shape[1] // 4)

        results = self.inference(box_cls, box_delta, anchors, image_sizes)
        return meta_arch.GeneralizedRCNN._postprocess(
            results, batched_inputs, image_sizes)

    return f

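# Worked example for the num_classes inference above (illustrative numbers):
# with A = 9 anchors per location and K = 80 classes, box_cls has
# A * K = 720 channels and box_delta has A * 4 = 36 channels, so
# K = 720 // (36 // 4) = 80.
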
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
        3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
        3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    """
    In Detectron1, loss is normalized by the number of foreground samples in
    the batch. When batch size is 1 per GPU, #foreground has a large variance
    and using it leads to lower performance. Here we maintain an EMA of
    #foreground to stabilize the normalizer.
    """
    self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
    self.loss_normalizer_momentum = 0.9

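# Illustrative sketch, not code from this repo: how the EMA loss normalizer
# above is typically consumed during loss computation. The method name and
# `num_foreground` (the number of anchors matched to ground truth in the
# current batch) are hypothetical; the attribute names mirror __init__ above.
def _ema_loss_normalizer_sketch(self, num_foreground):
    # Blend the running estimate toward this batch's foreground count.
    self.loss_normalizer = (
        self.loss_normalizer_momentum * self.loss_normalizer
        + (1 - self.loss_normalizer_momentum) * num_foreground
    )
    # Normalize summed losses by the smoothed count, never below 1.
    return max(1.0, self.loss_normalizer)
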
def test_reconstruction(self):
    weights = (5, 5, 10, 10)
    b2b_tfm = Box2BoxTransform(weights=weights)
    src_boxes = random_boxes([10, 10, 20, 20], 1, 10)
    dst_boxes = random_boxes([10, 10, 20, 20], 1, 10)

    devices = [torch.device("cpu")]
    if torch.cuda.is_available():
        devices.append(torch.device("cuda"))
    for device in devices:
        src_boxes = src_boxes.to(device=device)
        dst_boxes = dst_boxes.to(device=device)
        deltas = b2b_tfm.get_deltas(src_boxes, dst_boxes)
        dst_boxes_reconstructed = b2b_tfm.apply_deltas(deltas, src_boxes)
        assert torch.allclose(dst_boxes, dst_boxes_reconstructed)

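# Illustrative sketch, not code from this repo: the standard Faster R-CNN
# delta parameterization that makes the round trip tested above exact up to
# floating-point error. Boxes are XYXY; `weights` scales each delta in the
# (wx, wy, ww, wh) convention. The function names are hypothetical stand-ins
# for get_deltas/apply_deltas.
import torch  # import shown for self-containment

def get_deltas_sketch(src, dst, weights=(5.0, 5.0, 10.0, 10.0)):
    wx, wy, ww, wh = weights
    src_w, src_h = src[:, 2] - src[:, 0], src[:, 3] - src[:, 1]
    src_cx, src_cy = src[:, 0] + 0.5 * src_w, src[:, 1] + 0.5 * src_h
    dst_w, dst_h = dst[:, 2] - dst[:, 0], dst[:, 3] - dst[:, 1]
    dst_cx, dst_cy = dst[:, 0] + 0.5 * dst_w, dst[:, 1] + 0.5 * dst_h
    # Center offsets are normalized by source size; scales are log-encoded.
    dx = wx * (dst_cx - src_cx) / src_w
    dy = wy * (dst_cy - src_cy) / src_h
    dw = ww * torch.log(dst_w / src_w)
    dh = wh * torch.log(dst_h / src_h)
    return torch.stack((dx, dy, dw, dh), dim=1)

def apply_deltas_sketch(deltas, src, weights=(5.0, 5.0, 10.0, 10.0)):
    wx, wy, ww, wh = weights
    src_w, src_h = src[:, 2] - src[:, 0], src[:, 3] - src[:, 1]
    src_cx, src_cy = src[:, 0] + 0.5 * src_w, src[:, 1] + 0.5 * src_h
    # Invert each encoding step above, then convert back to XYXY.
    cx = deltas[:, 0] / wx * src_w + src_cx
    cy = deltas[:, 1] / wy * src_h + src_cy
    w = torch.exp(deltas[:, 2] / ww) * src_w
    h = torch.exp(deltas[:, 3] / wh) * src_h
    return torch.stack(
        (cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h), dim=1)
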
def test_fast_rcnn_empty_batch(self, device="cpu"): cfg = RCNNConfig() cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) box2box_transform = Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) logits = torch.randn(0, 100, requires_grad=True, device=device) deltas = torch.randn(0, 4, requires_grad=True, device=device) smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA outputs = FastRCNNOutputs( box2box_transform, logits, deltas, [], smooth_l1_beta ) with EventStorage(): # capture events in a new storage to discard them losses = outputs.losses() for value in losses.values(): self.assertTrue(torch.allclose(value, torch.zeros_like(value)))
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.YOLOF.DECODER.NUM_CLASSES
    self.in_features = cfg.MODEL.YOLOF.ENCODER.IN_FEATURES
    self.pos_ignore_thresh = cfg.MODEL.YOLOF.POS_IGNORE_THRESHOLD
    self.neg_ignore_thresh = cfg.MODEL.YOLOF.NEG_IGNORE_THRESHOLD
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.YOLOF.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.YOLOF.FOCAL_LOSS_GAMMA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.YOLOF.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.YOLOF.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.YOLOF.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.encoder = cfg.build_encoder(cfg, backbone_shape)
    self.decoder = cfg.build_decoder(cfg)
    self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.YOLOF.BBOX_REG_WEIGHTS,
        add_ctr_clamp=cfg.MODEL.YOLOF.ADD_CTR_CLAMP,
        ctr_clamp=cfg.MODEL.YOLOF.CTR_CLAMP)
    self.matcher = UniformMatcher(cfg.MODEL.YOLOF.MATCHER_TOPK)

    self.register_buffer(
        "pixel_mean",
        torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1))
    self.register_buffer(
        "pixel_std",
        torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1))
    self.to(self.device)

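# Illustrative note, an assumption rather than code from this repo: because
# pixel statistics are registered as buffers here instead of being captured
# in a `normalizer` lambda (as the other models above do), normalization
# would typically happen inline in forward(), e.g.:
#
#     images = [(x - self.pixel_mean) / self.pixel_std for x in images]
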
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
        3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
        3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

def test_fast_rcnn(self):
    torch.manual_seed(132)
    cfg = RCNNConfig()
    cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5)
    box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)

    box_head_output_size = 8
    num_classes = 5
    cls_agnostic_bbox_reg = False

    box_predictor = FastRCNNOutputLayers(
        box_head_output_size, num_classes, cls_agnostic_bbox_reg, box_dim=4
    )
    feature_pooled = torch.rand(2, box_head_output_size)
    pred_class_logits, pred_proposal_deltas = box_predictor(feature_pooled)

    image_shape = (10, 10)
    proposal_boxes = torch.tensor(
        [[0.8, 1.1, 3.2, 2.8], [2.3, 2.5, 7, 8]], dtype=torch.float32)
    gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32)
    result = Instances(image_shape)
    result.proposal_boxes = Boxes(proposal_boxes)
    result.gt_boxes = Boxes(gt_boxes)
    result.gt_classes = torch.tensor([1, 2])
    proposals = []
    proposals.append(result)

    smooth_l1_beta = cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA
    outputs = FastRCNNOutputs(
        box2box_transform, pred_class_logits, pred_proposal_deltas,
        proposals, smooth_l1_beta
    )
    with EventStorage():  # capture events in a new storage to discard them
        losses = outputs.losses()

    expected_losses = {
        "loss_cls": torch.tensor(1.7951188087),
        "loss_box_reg": torch.tensor(4.0357131958),
    }
    for name in expected_losses.keys():
        assert torch.allclose(losses[name], expected_losses[name])