def setup(file):
    # get cfg
    cfg = get_cfg()
    cfg.merge_from_file(file)
    cfg.SOLVER.IMS_PER_BATCH = 2

    # get data loader iter
    data_loader = build_detection_train_loader(cfg)
    data_loader_iter = iter(data_loader)
    batched_inputs = next(data_loader_iter)

    # build anchors
    backbone = build_backbone(cfg).to(device)
    images = [x["image"].to(device) for x in batched_inputs]
    images = ImageList.from_tensors(images, backbone.size_divisibility)
    features = backbone(images.tensor.float())
    input_shape = backbone.output_shape()
    in_features = cfg.MODEL.RPN.IN_FEATURES
    anchor_generator = build_anchor_generator(
        cfg, [input_shape[f] for f in in_features])
    anchors = anchor_generator([features[f] for f in in_features])
    anchors = Boxes.cat(anchors).to(device)

    # build matcher
    raw_matcher = Matcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                          cfg.MODEL.RPN.IOU_LABELS,
                          allow_low_quality_matches=True)
    matcher = TopKMatcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                          cfg.MODEL.RPN.IOU_LABELS, 9)

    return cfg, data_loader_iter, anchors, matcher, raw_matcher
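# A minimal usage sketch for `setup` above, assuming the module-level
# `device` that `setup` reads, plus the Detectron2-style imports, are in
# place. The yaml path below is hypothetical.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cfg, data_loader_iter, anchors, matcher, raw_matcher = setup(
    "configs/rpn_R_50_FPN.yaml")  # hypothetical config file
batched_inputs = next(data_loader_iter)  # fetch the next training batch
print(anchors.tensor.shape)  # all anchors over the RPN input features, concatenated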
def __init__(self, cfg):
    super().__init__()

    self.in_features = cfg.MODEL.FCOS.IN_FEATURES
    # Loss parameters:
    # num_points_per_level is set later by get_ground_truth()
    self.num_points_per_level = None
    self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES
    self.center_sampling_radius = cfg.MODEL.FCOS.CENTER_SAMPLING_RADIUS
    self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS
    self.focal_loss_alpha = cfg.MODEL.FCOS.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.FCOS.FOCAL_LOSS_GAMMA
    self.iou_loss_type = cfg.MODEL.FCOS.IOU_LOSS_TYPE
    # Inference parameters:
    self.score_thresh = 0.3
    self.pre_nms_thresh = cfg.MODEL.FCOS.INFERENCE_TH
    self.pre_nms_top_n = cfg.MODEL.FCOS.PRE_NMS_TOP_N
    self.nms_thresh = cfg.MODEL.FCOS.NMS_TH
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    self.min_size = 0
    self.num_classes = cfg.MODEL.FCOS.NUM_CLASSES

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = FCOSAnchorHead(cfg, feature_shapes)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)
    self.num_anchors = self.anchor_generator.num_cell_anchors[0]

    self.register_buffer("pixel_mean",
                         torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std",
                         torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # Standard RPN is shared across levels:
    out_channels = cfg.MODEL.BUA.RPN.CONV_OUT_CHANNELS
    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, "Each level must have the same channel!"
    in_channels = in_channels[0]

    # RPNHead should take the same input as the anchor generator.
    # NOTE: it assumes that creating an anchor generator does not have unwanted side effects.
    anchor_generator = build_anchor_generator(cfg, input_shape)
    num_cell_anchors = anchor_generator.num_cell_anchors
    box_dim = anchor_generator.box_dim
    assert (
        len(set(num_cell_anchors)) == 1
    ), "Each level must have the same number of cell anchors"
    num_cell_anchors = num_cell_anchors[0]

    # 3x3 conv for the hidden representation
    self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
    # 1x1 conv for predicting objectness logits
    self.objectness_logits = nn.Conv2d(out_channels, num_cell_anchors * 2,
                                       kernel_size=1, stride=1)
    # 1x1 conv for predicting box2box transform deltas
    self.anchor_deltas = nn.Conv2d(
        out_channels, num_cell_anchors * box_dim, kernel_size=1, stride=1
    )

    for l in [self.conv, self.objectness_logits, self.anchor_deltas]:
        nn.init.normal_(l.weight, std=0.01)
        nn.init.constant_(l.bias, 0)
def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
    super().__init__()

    # fmt: off
    self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE
    self.in_features = cfg.MODEL.RPN.IN_FEATURES
    self.nms_thresh = cfg.MODEL.RPN.NMS_THRESH
    self.batch_size_per_image = cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE
    self.positive_fraction = cfg.MODEL.RPN.POSITIVE_FRACTION
    self.smooth_l1_beta = cfg.MODEL.RPN.SMOOTH_L1_BETA
    self.loss_weight = cfg.MODEL.RPN.LOSS_WEIGHT
    # fmt: on

    # Map from self.training state to train/test settings
    self.pre_nms_topk = {
        True: cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN,
        False: cfg.MODEL.RPN.PRE_NMS_TOPK_TEST,
    }
    self.post_nms_topk = {
        True: cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN,
        False: cfg.MODEL.RPN.POST_NMS_TOPK_TEST,
    }
    self.boundary_threshold = cfg.MODEL.RPN.BOUNDARY_THRESH

    self.anchor_generator = build_anchor_generator(
        cfg, [input_shape[f] for f in self.in_features]
    )
    self.box2box_transform = BUABox2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.anchor_matcher = Matcher(
        cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS,
        allow_low_quality_matches=True
    )
    self.rpn_head = build_rpn_head(cfg, [input_shape[f] for f in self.in_features])
def __init__(self, cfg, input_shape: List[ShapeSpec]): super().__init__() # fmt: off in_channels = input_shape[0].channels num_classes = cfg.MODEL.RETINANET.NUM_CLASSES num_convs = cfg.MODEL.RETINANET.NUM_CONVS prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors # fmt: on assert ( len(set(num_anchors)) == 1 ), "Using different number of anchors between levels is not currently supported!" num_anchors = num_anchors[0] self.ssh = [] for i in range(len(cfg.MODEL.RETINANET.IN_FEATURES)): ssh = SSH(cfg, in_channels, in_channels) name = "ssh" + str(i) self.add_module(name, ssh) self.ssh.append(ssh) cls_subnet = [] bbox_subnet = [] for _ in range(num_convs): cls_subnet.append( nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)) cls_subnet.append(nn.ReLU()) bbox_subnet.append( nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)) bbox_subnet.append(nn.ReLU()) self.cls_score = nn.Conv2d(in_channels, num_anchors * num_classes, kernel_size=1, stride=1, padding=0) self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1, padding=0) # Initialization for modules in [self.cls_score, self.bbox_pred]: for layer in modules.modules(): if isinstance(layer, nn.Conv2d): torch.nn.init.normal_(layer.weight, mean=0, std=0.01) torch.nn.init.constant_(layer.bias, 0) # Use prior in model initialization to improve stability bias_value = -math.log((1 - prior_prob) / prior_prob) torch.nn.init.constant_(self.cls_score.bias, bias_value)
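# A quick sanity check of the prior-probability bias initialization used
# above (and in the other heads below): with prior_prob = 0.01, every
# anchor starts out predicting "foreground" with probability ~0.01, which
# keeps the focal loss from being swamped by easy negatives early in
# training. Self-contained, no assumptions beyond the formula itself.
import math

prior_prob = 0.01
bias_value = -math.log((1 - prior_prob) / prior_prob)
print(bias_value)                        # ~ -4.595
print(1 / (1 + math.exp(-bias_value)))   # sigmoid(bias_value) == 0.01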
def from_config(cls, cfg):
    backbone = build_backbone(cfg)
    backbone_shape = backbone.output_shape()
    backbone_level = cfg.MODEL.YOLOF.ENCODER.BACKBONE_LEVEL
    feature_shapes = [backbone_shape[backbone_level]]
    encoder = DilatedEncoder(cfg, backbone_shape)
    decoder = Decoder(cfg)
    anchor_generator = build_anchor_generator(cfg, feature_shapes)
    return {
        "backbone": backbone,
        "encoder": encoder,
        "decoder": decoder,
        "anchor_generator": anchor_generator,
        "box2box_transform": YOLOFBox2BoxTransform(
            weights=cfg.MODEL.YOLOF.BOX_TRANSFORM.BBOX_REG_WEIGHTS,
            add_ctr_clamp=cfg.MODEL.YOLOF.BOX_TRANSFORM.ADD_CTR_CLAMP,
            ctr_clamp=cfg.MODEL.YOLOF.BOX_TRANSFORM.CTR_CLAMP),
        "anchor_matcher": UniformMatcher(cfg.MODEL.YOLOF.MATCHER.TOPK),
        "pixel_mean": cfg.MODEL.PIXEL_MEAN,
        "pixel_std": cfg.MODEL.PIXEL_STD,
        "num_classes": cfg.MODEL.YOLOF.DECODER.NUM_CLASSES,
        "backbone_level": backbone_level,
        # Ignore thresholds:
        "pos_ignore_thresh": cfg.MODEL.YOLOF.POS_IGNORE_THRESHOLD,
        "neg_ignore_thresh": cfg.MODEL.YOLOF.NEG_IGNORE_THRESHOLD,
        # Loss parameters:
        "focal_loss_alpha": cfg.MODEL.YOLOF.LOSSES.FOCAL_LOSS_ALPHA,
        "focal_loss_gamma": cfg.MODEL.YOLOF.LOSSES.FOCAL_LOSS_GAMMA,
        "box_reg_loss_type": cfg.MODEL.YOLOF.LOSSES.BBOX_REG_LOSS_TYPE,
        # Inference parameters:
        "test_score_thresh": cfg.MODEL.YOLOF.SCORE_THRESH_TEST,
        "test_topk_candidates": cfg.MODEL.YOLOF.TOPK_CANDIDATES_TEST,
        "test_nms_thresh": cfg.MODEL.YOLOF.NMS_THRESH_TEST,
        "max_detections_per_image": cfg.MODEL.YOLOF.DETECTIONS_PER_IMAGE,
        # Vis parameters
        "vis_period": cfg.VIS_PERIOD,
        "input_format": cfg.INPUT.FORMAT,
    }
def __init__(self, cfg, feature_shapes,
             weights=[1.0, 1.0, 1.0, 1.0],
             scale_clamp=_DEFAULT_SCALE_CLAMP):
    super().__init__()

    self.weights = weights
    self.scale_clamp = scale_clamp

    # Build heads.
    num_classes = cfg.MODEL.OneNet.NUM_CLASSES
    d_model = cfg.MODEL.FPN.OUT_CHANNELS
    activation = cfg.MODEL.OneNet.ACTIVATION
    num_conv = cfg.MODEL.OneNet.NUM_CONV
    conv_norm = cfg.MODEL.OneNet.CONV_NORM
    num_levels = len(cfg.MODEL.OneNet.IN_FEATURES)
    conv_channels = cfg.MODEL.OneNet.CONV_CHANNELS

    self.num_classes = num_classes
    self.d_model = d_model
    self.activation = _get_activation_fn(activation)
    self.features_stride = cfg.MODEL.OneNet.FEATURES_STRIDE

    cls_conv_module = list()
    for idx in range(num_conv):
        if idx == 0:
            cls_conv_module.append(nn.Conv2d(d_model, conv_channels, kernel_size=3,
                                             stride=1, padding=1, bias=False))
        else:
            cls_conv_module.append(nn.Conv2d(conv_channels, conv_channels, kernel_size=3,
                                             stride=1, padding=1, bias=False))
        cls_conv_module.append(nn.ReLU(inplace=True))
    self.cls_conv_module = nn.ModuleList(cls_conv_module)

    reg_conv_module = list()
    for idx in range(num_conv):
        if idx == 0:
            reg_conv_module.append(nn.Conv2d(d_model, conv_channels, kernel_size=3,
                                             stride=1, padding=1, bias=False))
        else:
            reg_conv_module.append(nn.Conv2d(conv_channels, conv_channels, kernel_size=3,
                                             stride=1, padding=1, bias=False))
        reg_conv_module.append(nn.ReLU(inplace=True))
    self.reg_conv_module = nn.ModuleList(reg_conv_module)

    anchor_generator = build_anchor_generator(cfg, feature_shapes)
    self.anchor_generator = anchor_generator
    num_anchors = anchor_generator.num_cell_anchors
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    self.num_anchors = num_anchors[0]

    self.cls_score = nn.Conv2d(conv_channels, self.num_anchors * num_classes,
                               kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(conv_channels, self.num_anchors * 4,
                               kernel_size=3, stride=1, padding=1)

    # Init parameters.
    prior_prob = cfg.MODEL.OneNet.PRIOR_PROB
    self.bias_value = -math.log((1 - prior_prob) / prior_prob)
    self._reset_parameters()
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    num_convs = cfg.MODEL.RETINANET.NUM_CONVS
    prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB
    norm = cfg.MODEL.RETINANET.NORM
    # Disabling shared norm causes backwards-compatibility issues,
    # so it is hardcoded to True for now.
    # shared_norm = cfg.MODEL.RETINANET.SHARED_NORM
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    if norm == "BN" or norm == "SyncBN":
        logger = logging.getLogger(__name__)
        logger.warning("Shared norm does not work well for BN or SyncBN; expect poor results")

    cls_subnet = []
    bbox_subnet = []
    for _ in range(num_convs):
        cls_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        )
        if norm:
            cls_subnet.append(get_norm(norm, in_channels))
        cls_subnet.append(nn.ReLU())
        bbox_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        )
        if norm:
            bbox_subnet.append(get_norm(norm, in_channels))
        bbox_subnet.append(nn.ReLU())

    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)
    self.cls_score = nn.Conv2d(
        in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
    )
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4,
                               kernel_size=3, stride=1, padding=1)

    # Initialization
    for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    # Use prior in model initialization to improve stability
    bias_value = -(math.log((1 - prior_prob) / prior_prob))
    torch.nn.init.constant_(self.cls_score.bias, bias_value)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # fmt: off
    tower_repeat = [3, 3, 3, 4, 4, 4, 5, 5]
    in_channels = input_shape[0].channels
    compound_coef = cfg.MODEL.EFFICIENTNET.COMPOUND_COEFFICIENT
    num_classes = cfg.MODEL.EFFICIENTDET.NUM_CLASSES
    num_convs = cfg.MODEL.EFFICIENTDET.NUM_CONVS
    num_convs = tower_repeat[compound_coef] if num_convs < 0 else num_convs
    prior_prob = cfg.MODEL.EFFICIENTDET.PRIOR_PROB
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    norm = cfg.MODEL.EFFICIENTDET.NORM
    export_onnx = cfg.MODEL.EXPORT_ONNX
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    cls_subnet = []
    bbox_subnet = []
    for _ in range(num_convs):
        cls_subnet.append(
            SeparableConvBlock(in_channels, norm=norm, activation=True,
                               onnx_export=export_onnx))
        bbox_subnet.append(
            SeparableConvBlock(in_channels, norm=norm, activation=True,
                               onnx_export=export_onnx))
    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)
    self.cls_score = SeparableConvBlock(in_channels, num_anchors * num_classes)
    self.bbox_pred = SeparableConvBlock(in_channels, num_anchors * 4)

    # Initialization
    for modules in [
            self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred
    ]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                if layer.bias is not None:
                    torch.nn.init.constant_(layer.bias, 0)

    # Use prior in model initialization to improve stability
    bias_value = -(math.log((1 - prior_prob) / prior_prob))
    torch.nn.init.constant_(self.cls_score.pointwise_conv.bias, bias_value)
def __init__(self, cfg) -> None:
    super().__init__()

    self.num_classes: int = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features: List[str] = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha: float = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma: float = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta: float = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold: float = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates: int = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold: float = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image: int = cfg.TEST.DETECTIONS_PER_IMAGE
    # Vis parameters
    self.vis_period: int = cfg.VIS_PERIOD
    self.input_format: str = cfg.INPUT.FORMAT

    self.fpn: FPN = build_fpn_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    backbone_fpn_output_shape: Dict[str, ShapeSpec] = self.fpn.output_shape()
    feature_shapes: List[ShapeSpec] = [
        backbone_fpn_output_shape[f] for f in self.in_features
    ]
    self.head: RetinaNetHead = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator: nn.Module = build_anchor_generator(
        cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform: Box2BoxTransform = Box2BoxTransform(
        weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.anchor_matcher: Matcher = Matcher(
        thresholds=cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        labels=cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True)

    self.register_buffer("pixel_mean",
                         torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
    self.register_buffer("pixel_std",
                         torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))

    # In Detectron1, loss is normalized by the number of foreground samples
    # in the batch. When the batch size is 1 per GPU, #foreground has a large
    # variance and using it leads to lower performance. Here we maintain an
    # EMA of #foreground to stabilize the normalizer.
    # Initialize with any reasonable #fg that's not too small.
    self.loss_normalizer: float = 100
    self.loss_normalizer_momentum: float = 0.9
def __init__(self, cfg):
    super(RetinaNet, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # Vis parameters
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT
    # fmt: on

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD)
    num_channels = len(cfg.MODEL.PIXEL_MEAN)
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
        num_channels, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
        num_channels, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    """
    In Detectron1, loss is normalized by the number of foreground samples in
    the batch. When the batch size is 1 per GPU, #foreground has a large
    variance and using it leads to lower performance. Here we maintain an
    EMA of #foreground to stabilize the normalizer.
    """
    self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
    self.loss_normalizer_momentum = 0.9
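# A minimal sketch of the EMA update for `loss_normalizer` described in the
# docstring above; this mirrors what Detectron2's RetinaNet does once per
# training step. The function and variable names here are illustrative.
def update_loss_normalizer(normalizer, num_foreground, momentum=0.9):
    # Exponential moving average of the per-batch foreground count.
    return momentum * normalizer + (1 - momentum) * max(num_foreground, 1)

normalizer = 100.0  # initialized to "any reasonable #fg"
for num_fg in [12, 250, 3, 90]:  # noisy per-batch foreground counts
    normalizer = update_loss_normalizer(normalizer, num_fg)
# Losses are then divided by `normalizer` instead of the raw, noisy num_fg.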
def __init__(self, cfg):
    super().__init__()

    self.num_classes = cfg.MODEL.RETINAFACE.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINAFACE.IN_FEATURES
    # loss parameters
    self.focal_loss_alpha = cfg.MODEL.RETINAFACE.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINAFACE.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINAFACE.SMOOTH_L1_LOSS_BETA
    self.loc_weight = cfg.MODEL.RETINAFACE.LOC_WEIGHT
    # inference parameters
    self.score_threshold = cfg.MODEL.RETINAFACE.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINAFACE.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINAFACE.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # visualization parameters
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaFaceHead(cfg, feature_shapes)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RETINAFACE.BBOX_REG_WEIGHTS
    )
    self.landmark2landmark_transform = Landmark2LandmarkTransform(
        weights=cfg.MODEL.RETINAFACE.LANDMARK_REG_WEIGHTS
    )
    self.matcher = Matcher(
        cfg.MODEL.RETINAFACE.IOU_THRESHOLDS,
        cfg.MODEL.RETINAFACE.IOU_LABELS,
        allow_low_quality_matches=True
    )

    self.register_buffer(
        "pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)
    )
    self.register_buffer(
        "pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)
    )

    """
    In Detectron1, loss is normalized by the number of foreground samples in
    the batch. When the batch size is 1 per GPU, #foreground has a large
    variance and using it leads to lower performance. Here we maintain an
    EMA of #foreground to stabilize the normalizer.
    """
    # initialize with any reasonable #fg that's not too small
    self.loss_normalizer = 100
    self.loss_normalizer_momentum = 0.9
def __init__(self, cfg):
    super().__init__()

    # fmt: off
    self.device = torch.device(cfg.MODEL.DEVICE)
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Mask parameters:
    self.discard_mask_area = cfg.MODEL.YOLACT.DISCARD_MASK_AREA
    self.num_masks = cfg.MODEL.YOLACT.NUM_MASKS
    # Loss parameters:
    self.sem_seg_alpha = cfg.MODEL.YOLACT.SEM_SEG_ALPHA
    self.mask_alpha = cfg.MODEL.YOLACT.MASK_ALPHA
    self.mask_reweight = cfg.MODEL.YOLACT.MASK_REWEIGHT
    self.maskiou_alpha = cfg.MODEL.YOLACT.MASKIOU_ALPHA
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    # retinanet_resnet_fpn_backbone
    self.backbone = build_backbone(cfg)
    # dict[str -> ShapeSpec]
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    # base RetinaNet head with an added mask-coefficient branch
    self.head = YolactHead(cfg, feature_shapes)
    # Which backbone level feeds the protonet; see the official YOLACT's
    # cfg.proto_src. The default there is `res2`, but here it is `res3`.
    self.protonet = ProtoNet(cfg, feature_shapes[0])
    # for mask scoring
    self.maskiou_net = MaskIouNet(cfg)
    # semantic segmentation head to help training
    self.semantic_seg_conv = nn.Conv2d(feature_shapes[0].channels,
                                       self.num_classes, 1)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.RETINAFACE.NUM_CLASSES
    prior_prob = cfg.MODEL.RETINAFACE.PRIOR_PROB
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    # Add SSH module
    self.ssh = SSH(cfg, input_shape)

    # Add heads
    cls_score = []
    bbox_pred = []
    # NOTE enable landmark
    landmark_pred = []
    for _ in range(len(input_shape)):
        cls_score.append(
            nn.Conv2d(in_channels, num_anchors * num_classes,
                      kernel_size=1, stride=1, padding=0))
        bbox_pred.append(
            nn.Conv2d(in_channels, num_anchors * 4,
                      kernel_size=1, stride=1, padding=0))
        landmark_pred.append(
            nn.Conv2d(in_channels, num_anchors * 10,
                      kernel_size=1, stride=1, padding=0))
    self.cls_score = nn.ModuleList(cls_score)
    self.bbox_pred = nn.ModuleList(bbox_pred)
    self.landmark_pred = nn.ModuleList(landmark_pred)

    # Initialization
    # Use prior in model initialization to improve stability
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    for layer in self.cls_score:
        torch.nn.init.constant_(layer.bias, bias_value)
def from_config(cls, cfg, input_shape):
    # Standard RPN is shared across levels:
    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, "Each level must have the same channel!"
    in_channels = in_channels[0]

    # RPNHead should take the same input as the anchor generator.
    # NOTE: it assumes that creating an anchor generator does not have unwanted side effects.
    anchor_generator = build_anchor_generator(cfg, input_shape)
    num_anchors = anchor_generator.num_anchors
    box_dim = anchor_generator.box_dim
    assert (
        len(set(num_anchors)) == 1
    ), "Each level must have the same number of anchors per spatial position"
    return {"in_channels": in_channels, "num_anchors": num_anchors[0], "box_dim": box_dim}
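# Context for the `from_config` pattern above: a minimal sketch of
# Detectron2's @configurable mechanism, where `from_config` translates a
# CfgNode into constructor kwargs so the class can be built either from a
# config or from explicit arguments. `TinyHead` and its fields are
# illustrative assumptions, not the actual RPNHead.
import torch.nn as nn
from detectron2.config import configurable
from detectron2.modeling import build_anchor_generator

class TinyHead(nn.Module):
    @configurable
    def __init__(self, *, in_channels, num_anchors, box_dim=4):
        super().__init__()
        self.objectness_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1)
        self.anchor_deltas = nn.Conv2d(in_channels, num_anchors * box_dim, kernel_size=1)

    @classmethod
    def from_config(cls, cfg, input_shape):
        anchor_generator = build_anchor_generator(cfg, input_shape)
        return {
            "in_channels": input_shape[0].channels,
            "num_anchors": anchor_generator.num_anchors[0],
            "box_dim": anchor_generator.box_dim,
        }

# Both construction paths are then valid:
#   TinyHead(cfg, input_shape)                # routed through from_config
#   TinyHead(in_channels=256, num_anchors=3)  # explicit kwargs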
def from_config(cls, cfg, input_shape: List[ShapeSpec]):
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]
    return {
        "input_shape": input_shape,
        "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
        "conv_dims": [input_shape[0].channels] * cfg.MODEL.RETINANET.NUM_CONVS,
        "prior_prob": cfg.MODEL.RETINANET.PRIOR_PROB,
        "norm": cfg.MODEL.RETINANET.NORM,
        "num_anchors": num_anchors,
    }
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # Standard RPN is shared across levels:
    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, "Each level must have the same channel!"
    in_channels = in_channels[0]
    dwexpand_factor = cfg.MODEL.RPN.DWEXPAND_FACTOR
    norm = cfg.MODEL.RPN.NORM

    # RPNHead should take the same input as the anchor generator.
    # NOTE: it assumes that creating an anchor generator does not have unwanted side effects.
    anchor_generator = build_anchor_generator(cfg, input_shape)
    num_cell_anchors = anchor_generator.num_cell_anchors
    box_dim = anchor_generator.box_dim
    assert (
        len(set(num_cell_anchors)) == 1
    ), "Each level must have the same number of cell anchors"
    num_cell_anchors = num_cell_anchors[0]

    # Hidden representation: a 1x1 expansion conv followed by a 5x5
    # depthwise conv.
    expand_channels = dwexpand_factor * in_channels
    conv = []
    conv.append(Conv2d(in_channels, expand_channels, kernel_size=1,
                       bias=not norm, norm=get_norm(norm, expand_channels),
                       activation=F.relu))
    conv.append(Conv2d(expand_channels, expand_channels, kernel_size=5,
                       padding=2, groups=expand_channels, bias=not norm,
                       norm=get_norm(norm, expand_channels), activation=F.relu))
    self.add_module('conv', nn.Sequential(*conv))
    # The predictors below consume the expanded representation produced by `conv`.
    in_channels = expand_channels

    # 1x1 conv for predicting objectness logits
    self.objectness_logits = nn.Conv2d(in_channels, num_cell_anchors,
                                       kernel_size=1, stride=1)
    # 1x1 conv for predicting box2box transform deltas
    self.anchor_deltas = nn.Conv2d(in_channels, num_cell_anchors * box_dim,
                                   kernel_size=1, stride=1)

    for l in [*self.conv, self.objectness_logits, self.anchor_deltas]:
        nn.init.normal_(l.weight, std=0.01)
        if l.bias is not None:
            nn.init.constant_(l.bias, 0)
def __init__(self, cfg):
    super().__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = build_backbone(cfg)
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransformRotated(
        weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
    in_features = cfg.MODEL.RPN.IN_FEATURES
    ret = {
        "in_features": in_features,
        "min_box_size": cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE,
        "nms_thresh": cfg.MODEL.RPN.NMS_THRESH,
        "batch_size_per_image": cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE,
        "positive_fraction": cfg.MODEL.RPN.POSITIVE_FRACTION,
        "loss_weight": {
            "loss_rpn_cls": cfg.MODEL.RPN.LOSS_WEIGHT,
            "loss_rpn_loc": cfg.MODEL.RPN.BBOX_REG_LOSS_WEIGHT * cfg.MODEL.RPN.LOSS_WEIGHT,
        },
        "anchor_boundary_thresh": cfg.MODEL.RPN.BOUNDARY_THRESH,
        "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS),
        "box_reg_loss_type": cfg.MODEL.RPN.BBOX_REG_LOSS_TYPE,
        "smooth_l1_beta": cfg.MODEL.RPN.SMOOTH_L1_BETA,
    }
    ret["pre_nms_topk"] = (cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN,
                           cfg.MODEL.RPN.PRE_NMS_TOPK_TEST)
    ret["post_nms_topk"] = (cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN,
                            cfg.MODEL.RPN.POST_NMS_TOPK_TEST)
    ret["anchor_generator"] = build_anchor_generator(
        cfg, [input_shape[f] for f in in_features])
    ret["anchor_matcher"] = Matcher(cfg.MODEL.RPN.IOU_THRESHOLDS,
                                    cfg.MODEL.RPN.IOU_LABELS,
                                    allow_low_quality_matches=True)
    ret["head"] = build_rpn_head(cfg, [input_shape[f] for f in in_features])
    return ret
def from_config(cls, cfg):
    backbone = build_backbone(cfg)
    backbone_shape = backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in cfg.MODEL.RETINANET.IN_FEATURES]
    anchor_generator = build_anchor_generator(cfg, feature_shapes)
    metadata = MetadataCatalog.get(
        cfg.DATASETS.TRAIN[0] if len(cfg.DATASETS.TRAIN) else "__unused"
    )
    return {
        "backbone": backbone,
        "head": RetinaFaceHead(cfg, feature_shapes),
        "anchor_generator": anchor_generator,
        "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS),
        "mark2mark_transform": Mark2MarkTransform(
            cfg.MODEL.RETINAFACE.NUM_LANDMARK,
            weights=cfg.MODEL.RETINAFACE.LANDMARK_REG_WEIGHTS),
        "anchor_matcher": Matcher(
            cfg.MODEL.RETINANET.IOU_THRESHOLDS,
            cfg.MODEL.RETINANET.IOU_LABELS,
            allow_low_quality_matches=True,
        ),
        "pixel_mean": cfg.MODEL.PIXEL_MEAN,
        "pixel_std": cfg.MODEL.PIXEL_STD,
        "num_classes": cfg.MODEL.RETINANET.NUM_CLASSES,
        "num_landmark": cfg.MODEL.RETINAFACE.NUM_LANDMARK,
        "head_in_features": cfg.MODEL.RETINANET.IN_FEATURES,
        # Loss parameters:
        "focal_loss_alpha": cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA,
        "focal_loss_gamma": cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA,
        "smooth_l1_beta": cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA,
        "box_reg_loss_type": cfg.MODEL.RETINANET.BBOX_REG_LOSS_TYPE,
        "loc_weight": cfg.MODEL.RETINAFACE.LOC_WEIGHT,
        # Inference parameters:
        "test_score_thresh": cfg.MODEL.RETINANET.SCORE_THRESH_TEST,
        "test_topk_candidates": cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST,
        "test_nms_thresh": cfg.MODEL.RETINANET.NMS_THRESH_TEST,
        "max_detections_per_image": cfg.TEST.DETECTIONS_PER_IMAGE,
        # Vis parameters
        "vis_period": cfg.VIS_PERIOD,
        "input_format": cfg.INPUT.FORMAT,
        "visualizer": TrainingVisualizer(detector_postprocess, metadata),
    }
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__(cfg, input_shape)

    head_params = cfg.MODEL.META_ARCH
    self.box_reg_loss_type = head_params.BBOX_REG_LOSS_TYPE
    self.anchor_generator = build_anchor_generator(cfg, input_shape)
    self.num_anchor = self.anchor_generator.num_cell_anchors[0]
    self.feat_adaptive = head_params.FEAT_ADAPTION

    # init bbox pred
    self.loc_init_conv = nn.Conv2d(self.feat_channels,
                                   self.loc_feat_channels, 3, 1, 1)
    self.loc_init_out = nn.Conv2d(self.loc_feat_channels, 4, 3, 1, 1)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=head_params.BBOX_REG_WEIGHTS)
    self.anchor_matcher = Matcher(
        head_params.IOU_THRESHOLDS,
        head_params.IOU_LABELS,
        allow_low_quality_matches=True,
    )
    self.strides = [i.stride for i in input_shape]
    self.matcher = nearest_point_match

    # make feature adaptive layer
    self.make_feature_adaptive_layers()

    self.cls_out = nn.Conv2d(self.feat_channels,
                             self.num_anchor * self.num_classes, 3, 1, 1)
    self.loc_refine_out = nn.Conv2d(self.loc_feat_channels,
                                    self.num_anchor * 4, 3, 1, 1)
    self._init_weights()

    self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
    self.loss_normalizer_momentum = 0.9

    grid = uniform_grid(2048)
    self.register_buffer("grid", grid)
def __init__(self, cfg, engine_path):
    super(TensorRTRetinaNet, self).__init__(engine_path)
    RetinaNetModel.__init__(self, cfg, self._engine)

    # preprocessing parameters
    ns = types.SimpleNamespace()
    ns.training = False
    ns.input = self._cfg.INPUT
    ns.dynamic = self._cfg.INPUT.DYNAMIC
    ns.device = torch.device(self._cfg.MODEL.DEVICE)
    ns.pixel_mean = torch.tensor(self._cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1).to(ns.device)
    ns.pixel_std = torch.tensor(self._cfg.MODEL.PIXEL_STD).view(-1, 1, 1).to(ns.device)
    ns.backbone = types.SimpleNamespace()
    ns.backbone.size_divisibility = 32

    # inference parameters
    ns.num_classes = self._cfg.MODEL.RETINANET.NUM_CLASSES
    ns.topk_candidates = self._cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    ns.score_threshold = self._cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    ns.nms_threshold = self._cfg.MODEL.RETINANET.NMS_THRESH_TEST
    ns.max_detections_per_image = self._cfg.TEST.DETECTIONS_PER_IMAGE

    # anchor generator
    feature_shapes = [ShapeSpec(stride=s) for s in (8, 16, 32, 64, 128)]
    self._anchor_generator = build_anchor_generator(self._cfg, feature_shapes)

    # Borrow RetinaNet's pre/post-processing by binding its unbound methods
    # to the namespace built above.
    ns.preprocess_image = functools.partial(
        meta_arch.RetinaNet.preprocess_image, ns)
    ns.inference = functools.partial(meta_arch.RetinaNet.inference, ns)
    ns.inference_single_image = functools.partial(
        meta_arch.RetinaNet.inference_single_image, ns)
    ns.box2box_transform = Box2BoxTransform(
        weights=self._cfg.MODEL.RPN.BBOX_REG_WEIGHTS)

    self._ns = ns
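# The SimpleNamespace + functools.partial trick above reuses RetinaNet's
# methods without instantiating the full model: each unbound method gets the
# namespace bound in place of `self`. A small self-contained demo of the
# same pattern (the class and attribute names here are illustrative):
import functools
import types

class Greeter:
    def greet(self):  # normally a bound method on a Greeter instance
        return f"hello from {self.name}"

ns = types.SimpleNamespace()
ns.name = "a plain namespace"
ns.greet = functools.partial(Greeter.greet, ns)  # bind `self` to ns
print(ns.greet())  # -> "hello from a plain namespace"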
def __init__(self, cfg):
    super().__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)
    self.backbone_level = cfg.MODEL.YOLOF.ENCODER.BACKBONE_LEVEL
    self.backbone = build_backbone(cfg)
    self.nums_classes = cfg.MODEL.YOLOF.DECODER.NUM_CLASSES

    # build anchor generator
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[self.backbone_level]]
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # build encoder and decoder
    self.encoder = DilatedEncoder(cfg, backbone_shape)
    self.decoder = Decoder(cfg)

    # prepare ground truth
    self.box2box_transform = YOLOFBox2BoxTransform(
        weights=cfg.MODEL.YOLOF.BOX_TRANSFORM.BBOX_REG_WEIGHTS,
        add_ctr_clamp=cfg.MODEL.YOLOF.BOX_TRANSFORM.ADD_CTR_CLAMP,
        ctr_clamp=cfg.MODEL.YOLOF.BOX_TRANSFORM.CTR_CLAMP)
    self.anchor_matcher = UniformMatcher(cfg.MODEL.YOLOF.MATCHER.TOPK)

    # inference parameters
    self.test_score_thresh = 0.05
    self.test_nms_thresh = 0.6
    self.test_topk_candidates = 1000
    self.max_detections_per_image = 100

    # build loss
    self.losses = Losses(cfg)

    # get normalizer
    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    norm = cfg.MODEL.RETINANET.NORM
    num_convs = cfg.MODEL.RETINANET.NUM_CONVS
    in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    # Depthwise/pointwise conv pairs, created once and reused below.
    cls_depthwise_convs = []
    cls_pointwise_convs = []
    bbox_depthwise_convs = []
    bbox_pointwise_convs = []
    for _ in range(num_convs):
        cls_depthwise_convs.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, groups=in_channels, bias=False))
        cls_pointwise_convs.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1,
                      padding=0, bias=norm == ''))
        bbox_depthwise_convs.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1,
                      padding=1, groups=in_channels, bias=False))
        bbox_pointwise_convs.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1,
                      padding=0, bias=norm == ''))

    # One subnet per feature level; the conv modules are shared across
    # levels, while each ResHead carries its own norm layers.
    self.cls_subnets = nn.ModuleList()
    self.bbox_subnets = nn.ModuleList()
    for _ in in_features:
        cls_subnet = []
        bbox_subnet = []
        for cls_depthwise, cls_pointwise, bbox_depthwise, bbox_pointwise in \
                zip(cls_depthwise_convs, cls_pointwise_convs,
                    bbox_depthwise_convs, bbox_pointwise_convs):
            cls_subnet.append(
                ResHead(cls_depthwise, cls_pointwise, in_channels, norm))
            bbox_subnet.append(
                ResHead(bbox_depthwise, bbox_pointwise, in_channels, norm))
        self.cls_subnets.append(nn.Sequential(*cls_subnet))
        self.bbox_subnets.append(nn.Sequential(*bbox_subnet))

    self.cls_score = nn.Conv2d(in_channels, num_anchors * num_classes,
                               kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4,
                               kernel_size=3, stride=1, padding=1)

    # Initialization
    for modules in [
            self.cls_subnets, self.bbox_subnets, self.cls_score, self.bbox_pred
    ]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                if layer.bias is not None:
                    torch.nn.init.constant_(layer.bias, 0)

    # Use prior in model initialization to improve stability
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_score.bias, bias_value)
def __init__(self, cfg):
    super().__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # Vis parameters
    self.vis_period = cfg.VIS_PERIOD
    self.input_format = cfg.INPUT.FORMAT
    # fmt: on

    # for ONNX model export
    self.export_onnx = cfg.MODEL.FASHIONNET.EXPORT_ONNX

    # for the classification task
    self.classification_tasks = cfg.MODEL.FASHIONNET.CLASSIFICATION_HEAD.TASK_NAMES
    self.classification_classes = cfg.MODEL.FASHIONNET.CLASSIFICATION_HEAD.NUM_CLASSES
    assert len(self.classification_classes) == len(self.classification_tasks)
    self.activation = cfg.MODEL.FASHIONNET.CLASSIFICATION_HEAD.ACTIVATION
    self.fashion_score_threshold = cfg.MODEL.FASHIONNET.CLASSIFICATION_HEAD.SCORE_THRESH

    self.backbone = build_backbone(cfg)
    self.size_divisibility = 32
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.cls_head = FashionClassificationHead(cfg, feature_shapes)
    # # multi-task learning with uncertainty
    # self.log_vars = nn.Parameter(torch.zeros(2), requires_grad=True)
    self.anchor_generator = build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    """
    In Detectron1, loss is normalized by the number of foreground samples in
    the batch. When the batch size is 1 per GPU, #foreground has a large
    variance and using it leads to lower performance. Here we maintain an
    EMA of #foreground to stabilize the normalizer.
    """
    self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
    self.loss_normalizer_momentum = 0.9
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # Same as RetinaNetHead, except that the cls_score net is replaced by a
    # logits net that utilizes deformable convolution.
    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    num_convs = cfg.MODEL.RETINANET.NUM_CONVS
    prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    cls_subnet = []
    bbox_subnet = []
    for _ in range(num_convs):
        cls_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        )
        cls_subnet.append(nn.ReLU())
        bbox_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
        )
        bbox_subnet.append(nn.ReLU())

    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)
    # self.cls_score = nn.Conv2d(
    #     in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1
    # )
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4,
                               kernel_size=3, stride=1, padding=1)

    # Initialization
    for modules in [self.cls_subnet, self.bbox_subnet, self.bbox_pred]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    # Deformable-conv block, added as a second-stage refinement.
    # The implementation follows RepPoints.
    self.dcn_kernel = 3
    self.dcn_pad = 1
    self.point_base_scale = 4
    self.gradient_mul = 0.1
    self.in_channels = in_channels
    self.num_anchors = num_anchors
    self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS)

    # Base sampling grid of a regular 3x3 kernel, stored as (y, x) offsets.
    dcn_base = np.arange(-self.dcn_pad, self.dcn_pad + 1).astype(np.float64)
    dcn_base_y = np.repeat(dcn_base, self.dcn_kernel)
    dcn_base_x = np.tile(dcn_base, self.dcn_kernel)
    dcn_base_offset = np.stack([dcn_base_y, dcn_base_x], axis=1).reshape((-1))
    dcn_base_offset = torch.tensor(dcn_base_offset, dtype=torch.float32).view(1, -1, 1, 1)
    self.register_buffer("dcn_base_offset", dcn_base_offset)

    self.deform_cls_conv = DeformConv(
        self.in_channels, self.in_channels, self.dcn_kernel, 1, self.dcn_pad)
    self.deform_reg_conv = DeformConv(
        self.in_channels, self.in_channels, self.dcn_kernel, 1, self.dcn_pad)
    self.offsets_refine = nn.Sequential(
        nn.ReLU(),
        nn.Conv2d(self.num_anchors * self.in_channels, num_anchors * 4, 1, 1, 0))
    self.logits = nn.Sequential(
        nn.ReLU(),
        nn.Conv2d(self.num_anchors * self.in_channels,
                  num_anchors * num_classes, 1, 1, 0))

    bias_init = float(-np.log((1 - 0.01) / 0.01))
    for modules in [self.offsets_refine, self.deform_cls_conv, self.deform_reg_conv]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)
    for module in self.logits.modules():
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.constant_(module.bias, bias_init)
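# What the `dcn_base_offset` buffer above actually contains for a 3x3
# kernel; self-contained numpy check of the same computation:
import numpy as np

dcn_pad, dcn_kernel = 1, 3
dcn_base = np.arange(-dcn_pad, dcn_pad + 1).astype(np.float64)  # [-1, 0, 1]
dcn_base_y = np.repeat(dcn_base, dcn_kernel)  # [-1,-1,-1, 0,0,0, 1,1,1]
dcn_base_x = np.tile(dcn_base, dcn_kernel)    # [-1, 0, 1,-1, 0, 1,-1, 0, 1]
print(np.stack([dcn_base_y, dcn_base_x], axis=1).reshape(-1))
# 9 interleaved (y, x) pairs: the regular 3x3 sampling grid. DeformConv
# adds its predicted offsets to this base grid at every spatial location.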
def __init__(self, cfg, use_dropout, dropout_rate, compute_cls_var,
             compute_bbox_cov, bbox_cov_dims, input_shape: List[ShapeSpec]):
    super().__init__(cfg, input_shape)

    # Extract config information
    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    num_convs = cfg.MODEL.RETINANET.NUM_CONVS
    prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    self.compute_cls_var = compute_cls_var
    self.compute_bbox_cov = compute_bbox_cov
    self.bbox_cov_dims = bbox_cov_dims

    # For consistency, all configs are grabbed from the original RetinaNet
    self.use_dropout = use_dropout
    self.dropout_rate = dropout_rate

    cls_subnet = []
    bbox_subnet = []
    for _ in range(num_convs):
        cls_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
        cls_subnet.append(nn.ReLU())
        bbox_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
        bbox_subnet.append(nn.ReLU())
        if self.use_dropout:
            cls_subnet.append(nn.Dropout(p=self.dropout_rate))
            bbox_subnet.append(nn.Dropout(p=self.dropout_rate))

    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)
    self.cls_score = nn.Conv2d(
        in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(
        in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1)

    for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    # Use prior in model initialization to improve stability
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_score.bias, bias_value)

    # Create subnet for classification variance estimation.
    if self.compute_cls_var:
        self.cls_var = nn.Conv2d(
            in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1)
        for layer in self.cls_var.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, -10.0)

    # Create subnet for bounding box covariance estimation.
    if self.compute_bbox_cov:
        self.bbox_cov = nn.Conv2d(
            in_channels, num_anchors * self.bbox_cov_dims,
            kernel_size=3, stride=1, padding=1)
        for layer in self.bbox_cov.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.0001)
                torch.nn.init.constant_(layer.bias, 0)
def __init__(self, cfg: CfgNode, input_shape: List[ShapeSpec]) -> None:
    super().__init__()

    in_channels = input_shape[0].channels
    num_classes: int = cfg.MODEL.RETINANET.NUM_CLASSES
    num_convs: int = cfg.MODEL.RETINANET.NUM_CONVS
    prior_prob: float = cfg.MODEL.RETINANET.PRIOR_PROB
    num_anchors: List[int] = build_anchor_generator(
        cfg, input_shape).num_cell_anchors
    assert len(set(num_anchors)) == 1, \
        "Using different number of anchors between levels is not currently supported!"
    num_anchors_int: int = num_anchors[0]

    cls_subnet: List[nn.Module] = []
    bbox_subnet: List[nn.Module] = []
    for _ in range(num_convs):
        cls_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
        cls_subnet.append(nn.ReLU())
        bbox_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
        bbox_subnet.append(nn.ReLU())

    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)
    self.cls_score = nn.Conv2d(in_channels, num_anchors_int * num_classes,
                               kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors_int * 4,
                               kernel_size=3, stride=1, padding=1)

    # Initialization
    for modules in [
            self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred
    ]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(tensor=layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(tensor=layer.bias, val=0)

    # Use prior in model initialization to improve stability
    bias_value: float = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_score.bias, bias_value)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__()

    # fmt: off
    num_conv = cfg.MODEL.HORPN.NUM_CONV
    conv_dim = cfg.MODEL.HORPN.CONV_DIM
    num_rn_fc = cfg.MODEL.HORPN.NUM_RN_FC
    rn_fc_dim = cfg.MODEL.HORPN.RN_FC_DIM
    self.topk = cfg.MODEL.HORPN.TOPK_PERSON_CELLS
    # fmt: on

    anchor_generator = build_anchor_generator(cfg, input_shape)
    box_dim = self.box_dim = anchor_generator.box_dim

    # standard HORPN head is shared across levels:
    in_channels = [s.channels for s in input_shape]
    assert len(set(in_channels)) == 1, "Each level must have the same channel!"
    in_channels = in_channels[0]

    # HORPN head should take the same input as the anchor generator
    num_cell_anchors = anchor_generator.num_cell_anchors
    assert len(set(num_cell_anchors)) == 1, \
        "Each level must have the same number of anchors"
    num_cell_anchors = self.num_cell_anchors = num_cell_anchors[0]

    # 3x3 convs for the person hidden features
    _p_out_dim = in_channels
    self.person_convs = []
    for k in range(num_conv):
        conv = nn.Conv2d(_p_out_dim, conv_dim, kernel_size=3, stride=1, padding=1)
        self.add_module("person_conv{}".format(k + 1), conv)
        self.person_convs.append(conv)
        _p_out_dim = conv_dim

    # 3x3 convs for the object hidden features
    _o_out_dim = in_channels
    self.object_convs = []
    for k in range(num_conv):
        conv = nn.Conv2d(_o_out_dim, conv_dim, kernel_size=3, stride=1, padding=1)
        self.add_module("object_conv{}".format(k + 1), conv)
        self.object_convs.append(conv)
        _o_out_dim = conv_dim

    # Relational networks for interactness logits prediction
    _out_dim = _o_out_dim + _p_out_dim
    self.rn_fcs = []
    for k in range(num_rn_fc):
        fc = nn.Linear(_out_dim, rn_fc_dim)
        self.add_module("rn_fc{}".format(k + 1), fc)
        self.rn_fcs.append(fc)
        _out_dim = rn_fc_dim

    # Proposal predictors
    self.person_logits = nn.Conv2d(_p_out_dim, num_cell_anchors, kernel_size=1)
    self.person_deltas = nn.Conv2d(_p_out_dim, num_cell_anchors * box_dim, kernel_size=1)
    self.object_logits = nn.Linear(_out_dim, num_cell_anchors)
    self.object_deltas = nn.Conv2d(_o_out_dim, num_cell_anchors * box_dim, kernel_size=1)

    # Weight initialization
    for layer in self.person_convs:
        weight_init.c2_msra_fill(layer)
    for layer in self.object_convs:
        weight_init.c2_msra_fill(layer)
    for layer in self.rn_fcs:
        weight_init.c2_xavier_fill(layer)
    for layer in [self.person_logits, self.person_deltas,
                  self.object_logits, self.object_deltas]:
        nn.init.normal_(layer.weight, std=0.01)
        nn.init.constant_(layer.bias, 0)
def __init__(self, cfg, input_shape: List[ShapeSpec]):
    super().__init__(cfg, input_shape)

    # fmt: off
    in_channels = input_shape[0].channels
    num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    num_convs = cfg.MODEL.RETINANET.NUM_CONVS
    prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB
    num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors
    # fmt: on
    assert (
        len(set(num_anchors)) == 1
    ), "Using different number of anchors between levels is not currently supported!"
    num_anchors = num_anchors[0]

    cls_subnet = []
    bbox_subnet = []
    for i in range(num_convs):
        # conv + ReLU for the cls tower, with dropout after the middle block
        cls_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
        cls_subnet.append(nn.ReLU())
        if i == int(round(num_convs / 2)):
            cls_subnet.append(nn.Dropout2d(0.5))
        # same for the bbox tower
        bbox_subnet.append(
            nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1))
        bbox_subnet.append(nn.ReLU())
        if i == int(round(num_convs / 2)):
            bbox_subnet.append(nn.Dropout2d(0.5))

    self.cls_subnet = nn.Sequential(*cls_subnet)
    self.bbox_subnet = nn.Sequential(*bbox_subnet)
    self.cls_score = nn.Conv2d(in_channels, num_anchors * num_classes,
                               kernel_size=3, stride=1, padding=1)
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4,
                               kernel_size=3, stride=1, padding=1)

    # Initialization
    for modules in [
            self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred
    ]:
        for layer in modules.modules():
            if isinstance(layer, nn.Conv2d):
                torch.nn.init.normal_(layer.weight, mean=0, std=0.01)
                torch.nn.init.constant_(layer.bias, 0)

    # Use prior in model initialization to improve stability
    bias_value = -math.log((1 - prior_prob) / prior_prob)
    torch.nn.init.constant_(self.cls_score.bias, bias_value)