def output_shape(self):
    return {
        name: ShapeSpec(
            channels=self._out_feature_channels[name],
            stride=self._out_feature_strides[name]
        ) if name != 'linear' else ShapeSpec(channels=self.num_classes)
        for name in self._out_features
    }
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.image_size = cfg.MODEL.SSD.IMAGE_SIZE
    self.num_classes = cfg.MODEL.SSD.NUM_CLASSES
    self.in_features = cfg.MODEL.SSD.IN_FEATURES
    self.extra_layer_arch = cfg.MODEL.SSD.EXTRA_LAYER_ARCH[str(self.image_size)]
    self.l2norm_scale = cfg.MODEL.SSD.L2NORM_SCALE
    # Loss parameters:
    self.loss_alpha = cfg.MODEL.SSD.LOSS_ALPHA
    self.smooth_l1_loss_beta = cfg.MODEL.SSD.SMOOTH_L1_LOSS_BETA
    self.negative_positive_ratio = cfg.MODEL.SSD.NEGATIVE_POSITIVE_RATIO
    # Inference parameters:
    self.score_threshold = cfg.MODEL.SSD.SCORE_THRESH_TEST
    self.nms_threshold = cfg.MODEL.SSD.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]

    # build extra layers
    self.extra_layers = self._make_extra_layers(
        feature_shapes[-1].channels, self.extra_layer_arch)
    extra_layer_channels = [c for c in self.extra_layer_arch if isinstance(c, int)]
    feature_shapes += [ShapeSpec(channels=c) for c in extra_layer_channels[1::2]]

    # ssd head
    self.head = SSDHead(cfg, feature_shapes)
    self.l2norm = L2Norm(512, self.l2norm_scale)
    self.default_box_generator = cfg.build_default_box_generator(cfg)
    self.default_boxes = self.default_box_generator()

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.SSD.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.SSD.IOU_THRESHOLDS,
        cfg.MODEL.SSD.IOU_LABELS,
        allow_low_quality_matches=False,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    # Initialization
    self._init_weights()
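# A hedged illustration of the `extra_layer_channels[1::2]` slice above (added
# here for clarity, not part of the model code; the arch list below is a
# made-up example, not the repository's actual EXTRA_LAYER_ARCH config).
# Each extra block is described by a pair of channel counts, optionally with
# an 'S' marker for a stride-2 conv, so taking every second integer keeps one
# output channel count per extra feature map:
example_arch = [256, 'S', 512, 128, 'S', 256, 128, 256]
ints_only = [c for c in example_arch if isinstance(c, int)]  # [256, 512, 128, 256, 128, 256]
out_channels = ints_only[1::2]                               # [512, 256, 256]
# Under this example, three extra ShapeSpec(channels=...) entries would be
# appended to feature_shapes, one per extra feature map fed to the SSD head.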
def __init__(self, cfg):
    """
    dim: feature dimension (default: 128)
    K: queue size; number of negative keys (default: 65536)
    m: moco momentum of updating key encoder (default: 0.999)
    T: softmax temperature (default: 0.07)
    """
    super(MoCo, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.dim = cfg.MODEL.MOCO.DIM
    self.K = cfg.MODEL.MOCO.K
    self.m = cfg.MODEL.MOCO.MOMENTUM
    self.T = cfg.MODEL.MOCO.TAU
    self.mlp = cfg.MODEL.MOCO.MLP

    # create the encoders
    # num_classes is the output fc dimension
    cfg.MODEL.RESNETS.NUM_CLASSES = self.dim

    self.encoder_q = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.encoder_k = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.size_divisibility = self.encoder_q.size_divisibility

    if self.mlp:  # hack: brute-force replacement
        dim_mlp = self.encoder_q.linear.weight.shape[1]
        self.encoder_q.linear = nn.Sequential(
            nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.linear)
        self.encoder_k.linear = nn.Sequential(
            nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.linear)

    for param_q, param_k in zip(self.encoder_q.parameters(),
                                self.encoder_k.parameters()):
        param_k.data.copy_(param_q.data)  # initialize
        param_k.requires_grad = False     # not update by gradient

    # create the queue
    self.register_buffer("queue", torch.randn(self.dim, self.K))
    self.queue = nn.functional.normalize(self.queue, dim=0)
    self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

    self.loss_evaluator = nn.CrossEntropyLoss()

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std

    self.to(self.device)
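# A minimal sketch (an assumption based on the reference MoCo implementation,
# not part of the snippet above) of how the pieces created in this __init__
# are typically used: `m` drives the momentum update of the key encoder, and
# `queue` / `queue_ptr` implement the FIFO of negative keys. Cross-GPU
# gathering of keys is omitted here for brevity.
@torch.no_grad()
def _momentum_update_key_encoder(self):
    # slowly move the key encoder towards the query encoder
    for param_q, param_k in zip(self.encoder_q.parameters(),
                                self.encoder_k.parameters()):
        param_k.data = param_k.data * self.m + param_q.data * (1. - self.m)

@torch.no_grad()
def _dequeue_and_enqueue(self, keys):
    # keys: (batch_size, dim) normalized key features
    batch_size = keys.shape[0]
    ptr = int(self.queue_ptr)
    assert self.K % batch_size == 0  # for simplicity
    self.queue[:, ptr:ptr + batch_size] = keys.T   # replace the oldest keys
    self.queue_ptr[0] = (ptr + batch_size) % self.K  # advance the pointer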
def test_default_anchor_generator(self):
    cfg = BaseDetectionConfig()
    cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
    cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]

    anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)])

    # only the last two dimensions of features matter here
    num_images = 2
    features = {"stage3": torch.rand(num_images, 96, 1, 2)}
    anchors = anchor_generator([features["stage3"]])
    expected_anchor_tensor = torch.tensor(
        [
            [-32.0, -8.0, 32.0, 8.0],
            [-16.0, -16.0, 16.0, 16.0],
            [-8.0, -32.0, 8.0, 32.0],
            [-64.0, -16.0, 64.0, 16.0],
            [-32.0, -32.0, 32.0, 32.0],
            [-16.0, -64.0, 16.0, 64.0],
            [-28.0, -8.0, 36.0, 8.0],  # -28.0 == -32.0 + STRIDE (4)
            [-12.0, -16.0, 20.0, 16.0],
            [-4.0, -32.0, 12.0, 32.0],
            [-60.0, -16.0, 68.0, 16.0],
            [-28.0, -32.0, 36.0, 32.0],
            [-12.0, -64.0, 20.0, 64.0],
        ]
    )

    for i in range(num_images):
        assert torch.allclose(anchors[i][0].tensor, expected_anchor_tensor)
def test_default_anchor_generator_centered(self):
    cfg = BaseDetectionConfig()
    cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
    cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]
    cfg.MODEL.ANCHOR_GENERATOR.OFFSET = 0.5

    anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)])

    # only the last two dimensions of features matter here
    num_images = 2
    features = {"stage3": torch.rand(num_images, 96, 1, 2)}
    anchors = anchor_generator([features["stage3"]])
    expected_anchor_tensor = torch.tensor(
        [
            [-30.0, -6.0, 34.0, 10.0],
            [-14.0, -14.0, 18.0, 18.0],
            [-6.0, -30.0, 10.0, 34.0],
            [-62.0, -14.0, 66.0, 18.0],
            [-30.0, -30.0, 34.0, 34.0],
            [-14.0, -62.0, 18.0, 66.0],
            [-26.0, -6.0, 38.0, 10.0],
            [-10.0, -14.0, 22.0, 18.0],
            [-2.0, -30.0, 14.0, 34.0],
            [-58.0, -14.0, 70.0, 18.0],
            [-26.0, -30.0, 38.0, 34.0],
            [-10.0, -62.0, 22.0, 66.0],
        ]
    )

    for i in range(num_images):
        assert torch.allclose(anchors[i][0].tensor, expected_anchor_tensor)
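# A hand-check sketch of where the expected coordinates in the two tests above
# come from (added for clarity, not part of the test file). It assumes the
# detectron2-style convention that an anchor of a given size and aspect ratio
# has area == size**2 and aspect_ratio == h / w, centered at
# ((x + OFFSET) * stride, (y + OFFSET) * stride) for grid cell (x, y).
import math

def reference_anchor(size, aspect_ratio, cx=0.0, cy=0.0):
    """Return (x0, y0, x1, y1) for one anchor centered at (cx, cy)."""
    h = size * math.sqrt(aspect_ratio)  # aspect_ratio interpreted as h / w
    w = size / math.sqrt(aspect_ratio)
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)

# reference_anchor(32, 0.25) -> (-32.0, -8.0, 32.0, 8.0), the first expected
# row of test_default_anchor_generator. With OFFSET = 0.5 and stride 4 the
# centers shift by 0.5 * 4 = 2 pixels, giving (-30.0, -6.0, 34.0, 10.0) as in
# the centered test.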
def test_rpn_scriptability(self):
    cfg = RCNNConfig()
    proposal_generator = RPN(cfg, {
        "res4": ShapeSpec(channels=1024, stride=16)
    }).eval()
    num_images = 2
    images_tensor = torch.rand(num_images, 30, 40)
    image_sizes = [(32, 32), (30, 40)]
    images = ImageList(images_tensor, image_sizes)
    features = {"res4": torch.rand(num_images, 1024, 1, 2)}

    fields = {"proposal_boxes": "Boxes", "objectness_logits": "Tensor"}
    proposal_generator_ts = export_torchscript_with_instances(
        proposal_generator, fields)  # noqa

    proposals, _ = proposal_generator(images, features)
    proposals_ts, _ = proposal_generator_ts(images, features)

    for proposal, proposal_ts in zip(proposals, proposals_ts):
        self.assertEqual(proposal.image_size, proposal_ts.image_size)
        self.assertTrue(
            torch.equal(proposal.proposal_boxes.tensor,
                        proposal_ts.proposal_boxes.tensor))
        self.assertTrue(
            torch.equal(proposal.objectness_logits,
                        proposal_ts.objectness_logits))
def output_shape(self):
    return {
        name: ShapeSpec(
            channels=self._out_feature_channels[name],
            stride=self._out_feature_strides[name]
        )
        for name in self._out_features
    }
def __init__(self, cfg):
    super().__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.instance_loss_weight = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT

    # options when combining instance & semantic outputs
    self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED
    self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
    self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
    self.combine_instances_confidence_threshold = (
        cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH)

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.proposal_generator = cfg.build_proposal_generator(
        cfg, self.backbone.output_shape())
    self.roi_heads = cfg.build_roi_heads(cfg, self.backbone.output_shape())
    self.sem_seg_head = cfg.build_sem_seg_head(
        cfg, self.backbone.output_shape())

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    super(Classification, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.network = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.network.stem = nn.Sequential(
        Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
               norm=get_norm("BN", 64)),
        nn.ReLU(),
    )

    self.loss_evaluator = nn.CrossEntropyLoss()

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)
def _init_mask_head(self, cfg):
    # fmt: off
    self.mask_on = cfg.MODEL.MASK_ON
    if not self.mask_on:
        return
    pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
    pooler_scales = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
    sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
    pooler_type = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
    # fmt: on

    in_channels = [self.feature_channels[f] for f in self.in_features][0]

    self.mask_pooler = ROIPooler(
        output_size=pooler_resolution,
        scales=pooler_scales,
        sampling_ratio=sampling_ratio,
        pooler_type=pooler_type,
    )
    self.mask_head = cfg.build_mask_head(
        cfg,
        ShapeSpec(channels=in_channels,
                  width=pooler_resolution,
                  height=pooler_resolution))
def __init__(self, cfg):
    super().__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.backbone.linear = nn.Identity()

    self.lambd = cfg.MODEL.BT.LAMBD
    self.scale_loss = cfg.MODEL.BT.SCALE_LOSS

    # projector
    sizes = [2048] + list(map(int, cfg.MODEL.BT.PROJECTOR.split('-')))
    layers = []
    for i in range(len(sizes) - 2):
        layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=False))
        layers.append(nn.BatchNorm1d(sizes[i + 1]))
        layers.append(nn.ReLU(inplace=True))
    layers.append(nn.Linear(sizes[-2], sizes[-1], bias=False))
    self.projector = nn.Sequential(*layers)

    # normalization layer for the representations z1 and z2
    self.bn = nn.BatchNorm1d(sizes[-1], affine=False)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std
    self.to(self.device)
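# A minimal sketch (an assumption following the reference Barlow Twins recipe,
# not part of the snippet above) of how the modules set up in this __init__
# are usually combined: z1 and z2 are projector(backbone(view)) outputs of the
# two augmented views, and the hypothetical helper below uses self.bn,
# self.lambd and self.scale_loss as defined above. Distributed all-reduce of
# the correlation matrix is omitted.
def _barlow_twins_loss(self, z1, z2):
    # empirical cross-correlation matrix of the batch-normalized embeddings
    c = self.bn(z1).T @ self.bn(z2)
    c.div_(z1.shape[0])  # normalize by the batch size

    on_diag = torch.diagonal(c).add_(-1).pow_(2).sum()          # invariance term
    off_diag = (c - torch.diag_embed(torch.diagonal(c))).pow(2).sum()  # redundancy term
    return self.scale_loss * (on_diag + self.lambd * off_diag)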
def build_dynamic_backbone(cfg, input_shape: ShapeSpec):
    """
    Create a Dynamic Backbone from config.

    Args:
        cfg: a dl_lib CfgNode

    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))

    backbone = DynamicNetwork(
        init_channel=cfg.MODEL.BACKBONE.INIT_CHANNEL,
        input_shape=input_shape,
        cell_num_list=cfg.MODEL.BACKBONE.CELL_NUM_LIST,
        layer_num=cfg.MODEL.BACKBONE.LAYER_NUM,
        norm=cfg.MODEL.BACKBONE.NORM,
        cal_flops=cfg.MODEL.CAL_FLOPS,
        cell_type=cfg.MODEL.BACKBONE.CELL_TYPE,
        max_stride=cfg.MODEL.BACKBONE.MAX_STRIDE,
        sep_stem=cfg.MODEL.BACKBONE.SEPT_STEM,
        using_gate=cfg.MODEL.GATE.GATE_ON,
        small_gate=cfg.MODEL.GATE.SMALL_GATE,
        gate_bias=cfg.MODEL.GATE.GATE_INIT_BIAS,
        drop_prob=cfg.MODEL.BACKBONE.DROP_PROB
    )

    return backbone
def _init_keypoint_head(self, cfg):
    # fmt: off
    self.keypoint_on = cfg.MODEL.KEYPOINT_ON
    if not self.keypoint_on:
        return
    pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
    pooler_scales = tuple(1.0 / self.feature_strides[k] for k in self.in_features)  # noqa
    sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
    pooler_type = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE
    self.normalize_loss_by_visible_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS  # noqa
    self.keypoint_loss_weight = cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT
    # fmt: on

    in_channels = [self.feature_channels[f] for f in self.in_features][0]

    self.keypoint_pooler = ROIPooler(
        output_size=pooler_resolution,
        scales=pooler_scales,
        sampling_ratio=sampling_ratio,
        pooler_type=pooler_type,
    )
    self.keypoint_head = cfg.build_keypoint_head(
        cfg,
        ShapeSpec(channels=in_channels,
                  width=pooler_resolution,
                  height=pooler_resolution))
def __init__(self, cfg):
    super(SimSiam, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
    self.pred_dim = cfg.MODEL.BYOL.PRED_DIM
    self.out_dim = cfg.MODEL.BYOL.OUT_DIM

    self.encoder_q = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    # Projection Head
    self.projector = nn.Sequential(
        nn.Linear(self.out_dim, self.proj_dim),
        nn.BatchNorm1d(self.proj_dim),
        nn.ReLU(),
        nn.Linear(self.proj_dim, self.proj_dim),
        nn.BatchNorm1d(self.proj_dim),
        nn.ReLU(),
        nn.Linear(self.proj_dim, self.proj_dim),
        nn.BatchNorm1d(self.proj_dim),
    )

    # Predictor
    self.predictor = nn.Sequential(
        nn.Linear(self.proj_dim, self.pred_dim),
        nn.BatchNorm1d(self.pred_dim),
        nn.ReLU(),
        nn.Linear(self.pred_dim, self.out_dim),
    )

    self.to(self.device)
def __init__(self, cfg, input_shape):
    super().__init__(cfg, input_shape)

    assert len(self.in_features) == 1

    # fmt: off
    pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
    pooler_scales = (1.0 / self.feature_strides[self.in_features[0]], )
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    self.mask_on = cfg.MODEL.MASK_ON
    # fmt: on
    assert not cfg.MODEL.KEYPOINT_ON

    self.pooler = ROIPooler(
        output_size=pooler_resolution,
        scales=pooler_scales,
        sampling_ratio=sampling_ratio,
        pooler_type=pooler_type,
    )

    self.res5, out_channels = self._build_res5_block(cfg)
    self.box_predictor = FastRCNNOutputLayers(out_channels, self.num_classes,
                                              self.cls_agnostic_bbox_reg)

    if self.mask_on:
        self.mask_head = cfg.build_mask_head(
            cfg,
            ShapeSpec(channels=out_channels,
                      width=pooler_resolution,
                      height=pooler_resolution),
        )
def __init__(self, cfg):
    super(Classification, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.network = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.network.stem = nn.Sequential(
        Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
               norm=get_norm(cfg.MODEL.RESNETS.NORM, 64)),
        nn.ReLU(),
    )
    self.freeze()
    self.network.eval()

    # init the fc layer
    self.network.linear.weight.data.normal_(mean=0.0, std=0.01)
    self.network.linear.bias.data.zero_()

    self.loss_evaluator = nn.CrossEntropyLoss()

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
    self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std
    self.to(self.device)
def _init_box_head(self, cfg):
    # fmt: off
    pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    pooler_scales = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
    self.train_on_pred_boxes = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
    # fmt: on

    # If StandardROIHeads is applied on multiple feature maps (as in FPN),
    # then we share the same predictors and therefore the channel counts must be the same
    in_channels = [self.feature_channels[f] for f in self.in_features]
    # Check all channel counts are equal
    assert len(set(in_channels)) == 1, in_channels
    in_channels = in_channels[0]

    self.box_pooler = ROIPooler(
        output_size=pooler_resolution,
        scales=pooler_scales,
        sampling_ratio=sampling_ratio,
        pooler_type=pooler_type,
    )
    # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
    # They are used together so the "box predictor" layers should be part of the "box head".
    # New subclasses of ROIHeads do not need "box predictor"s.
    self.box_head = cfg.build_box_head(
        cfg,
        ShapeSpec(channels=in_channels,
                  height=pooler_resolution,
                  width=pooler_resolution))
    self.box_predictor = FastRCNNOutputLayers(self.box_head.output_size,
                                              self.num_classes,
                                              self.cls_agnostic_bbox_reg)
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.num_classes = cfg.MODEL.EFFICIENTDET.NUM_CLASSES
    self.in_features = cfg.MODEL.EFFICIENTDET.IN_FEATURES
    self.freeze_bn = cfg.MODEL.EFFICIENTDET.FREEZE_BN
    self.freeze_backbone = cfg.MODEL.EFFICIENTDET.FREEZE_BACKBONE
    self.input_size = cfg.MODEL.BIFPN.INPUT_SIZE

    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.EFFICIENTDET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.EFFICIENTDET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.EFFICIENTDET.SMOOTH_L1_LOSS_BETA
    self.box_loss_weight = cfg.MODEL.EFFICIENTDET.BOX_LOSS_WEIGHT
    self.regress_norm = cfg.MODEL.EFFICIENTDET.REG_NORM

    # Inference parameters:
    self.score_threshold = cfg.MODEL.EFFICIENTDET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.EFFICIENTDET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.EFFICIENTDET.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = EfficientDetHead(cfg, feature_shapes)
    self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.EFFICIENTDET.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.EFFICIENTDET.IOU_THRESHOLDS,
        cfg.MODEL.EFFICIENTDET.IOU_LABELS,
        allow_low_quality_matches=False,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x / 255. - pixel_mean) / pixel_std

    if self.freeze_bn:
        for layer in self.modules():
            if isinstance(layer, nn.BatchNorm2d):
                layer.eval()
    if self.freeze_backbone:
        for name, params in self.named_parameters():
            if name.startswith("backbone.bottom_up"):
                params.requires_grad = False

    self.to(self.device)
def __init__(self, cfg):
    super(SimSiam, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
    self.pred_dim = cfg.MODEL.BYOL.PRED_DIM
    self.out_dim = cfg.MODEL.BYOL.OUT_DIM
    self.total_steps = cfg.SOLVER.LR_SCHEDULER.MAX_ITER * cfg.SOLVER.BATCH_SUBDIVISIONS

    # create the encoders
    # num_classes is the output fc dimension
    cfg.MODEL.RESNETS.NUM_CLASSES = self.out_dim

    self.encoder = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.encoder.stem = nn.Sequential(
        Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False,
               norm=get_norm(cfg.MODEL.RESNETS.NORM, 64)),
        nn.ReLU(),
    )
    self.size_divisibility = self.encoder.size_divisibility

    dim_mlp = self.encoder.linear.weight.shape[1]

    # Projection Head
    self.encoder.linear = nn.Sequential(
        nn.Linear(dim_mlp, self.proj_dim),
        nn.SyncBatchNorm(self.proj_dim),
        nn.ReLU(),
        nn.Linear(self.proj_dim, self.proj_dim),
        nn.SyncBatchNorm(self.proj_dim),
    )

    # Predictor
    self.predictor = nn.Sequential(
        nn.Linear(self.proj_dim, self.pred_dim),
        nn.SyncBatchNorm(self.pred_dim),
        nn.ReLU(),
        nn.Linear(self.pred_dim, self.out_dim),
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
    self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std
    self.to(self.device)
def __init__(self, cfg):
    """
    dim: feature dimension (default: 128)
    T: softmax temperature (default: 0.07)
    """
    super(SimCLR, self).__init__()

    self.device = torch.device(cfg.MODEL.DEVICE)

    self.dim = cfg.MODEL.CLR.DIM
    self.T = cfg.MODEL.CLR.TAU
    self.mlp = cfg.MODEL.CLR.MLP
    self.norm = cfg.MODEL.CLR.NORM

    # create the encoders
    # num_classes is the output fc dimension
    cfg.MODEL.RESNETS.NUM_CLASSES = self.dim

    self.network = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.size_divisibility = self.network.size_divisibility

    if self.mlp:  # hack: brute-force replacement
        dim_mlp = self.network.linear.weight.shape[1]
        if self.norm == "SyncBN":
            self.network.linear = nn.Sequential(
                nn.Linear(dim_mlp, dim_mlp, bias=False),
                NaiveSyncBatchNorm1d(dim_mlp),
                nn.ReLU(),
                nn.Linear(dim_mlp, self.dim, bias=False),
                NaiveSyncBatchNorm1d(self.dim)
            )
            nn.init.normal_(self.network.linear[0].weight, mean=0.0, std=0.01)  # linear weight
            nn.init.normal_(self.network.linear[3].weight, mean=0.0, std=0.01)  # linear weight
            nn.init.constant_(self.network.linear[1].weight, 1.0)  # bn gamma
            nn.init.constant_(self.network.linear[4].weight, 1.0)  # bn gamma
        else:
            self.network.linear = nn.Sequential(
                nn.Linear(dim_mlp, dim_mlp),
                nn.ReLU(),
                nn.Linear(dim_mlp, self.dim),
            )
            nn.init.normal_(self.network.linear[0].weight, mean=0.0, std=0.01)  # linear weight
            nn.init.normal_(self.network.linear[2].weight, mean=0.0, std=0.01)  # linear weight

    # self.loss_evaluator = NTXentLoss(self.device, cfg.SOLVER.IMS_PER_DEVICE, self.T, True)
    self.loss_evaluator = NT_Xent(cfg.SOLVER.IMS_PER_DEVICE, self.T, self.device)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
    self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std
    self.to(self.device)
def output_shape(self):
    """
    Returns:
        dict[str->ShapeSpec]
    """
    return {
        name: ShapeSpec(channels=self._out_feature_channels[name],
                        stride=self._out_feature_strides[name])
        for name in self._out_features
    }
def output_shape(self):
    return {
        name: ShapeSpec(
            channels=self._out_feature_channels[name],
            height=self._out_feature_resolution[name][0],
            width=self._out_feature_resolution[name][0],
            stride=self._out_feature_strides[name]
        )
        for name in self._out_features
    }
def __init__(self, cfg):
    super(YOLOv3, self).__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    self.num_classes = cfg.MODEL.YOLO.CLASSES

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    backbone_shape = self.backbone.output_shape
    self.in_features = cfg.MODEL.YOLO.IN_FEATURES

    # out 0
    out_filter_0 = len(cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES)
    self.out0 = self._make_embedding([512, 1024], backbone_shape[-1], out_filter_0)

    # out 1
    out_filter_1 = len(cfg.MODEL.YOLO.ANCHORS[1]) * (5 + cfg.MODEL.YOLO.CLASSES)
    self.out1_cbl = self._make_cbl(512, 256, 1)
    self.out1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
    self.out1 = self._make_embedding([256, 512], backbone_shape[-2] + 256, out_filter_1)

    # out 2
    out_filter_2 = len(cfg.MODEL.YOLO.ANCHORS[2]) * (5 + cfg.MODEL.YOLO.CLASSES)
    self.out2_cbl = self._make_cbl(256, 128, 1)
    self.out2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
    self.out2 = self._make_embedding([128, 256], backbone_shape[-3] + 128, out_filter_2)

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x / 255. - pixel_mean) / pixel_std

    self.loss_evaluators = [
        YOLOHead(cfg, anchor, level)
        for level, anchor in enumerate(cfg.MODEL.YOLO.ANCHORS)
    ]

    self.conf_threshold = cfg.MODEL.YOLO.CONF_THRESHOLD
    self.nms_threshold = cfg.MODEL.YOLO.NMS_THRESHOLD
    self.nms_type = cfg.MODEL.NMS_TYPE

    self.size = 512
    self.multi_size = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
    self.change_iter = 10
    self.iter = 0
    self.max_iter = cfg.SOLVER.LR_SCHEDULER.MAX_ITER
    self.to(self.device)
def output_shape(self):
    """
    Returns:
        dict[str->ShapeSpec]
    """
    # this is a backward-compatible default
    return {
        name: ShapeSpec(channels=self._out_feature_channels[name],
                        stride=self._out_feature_strides[name])
        for name in self._out_features
    }
def build_backbone(cfg, input_shape=None):
    """
    Build a backbone from `cfg.MODEL.BACKBONE.NAME`.

    Returns:
        an instance of :class:`Backbone`
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))

    backbone = build_resnet_backbone(cfg, input_shape)
    assert isinstance(backbone, Backbone)
    return backbone
def _init_box_head(self, cfg):
    # fmt: off
    pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
    pooler_scales = tuple(1.0 / self.feature_strides[k] for k in self.in_features)
    sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
    pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
    cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
    cascade_ious = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
    self.num_cascade_stages = len(cascade_ious)
    assert len(cascade_bbox_reg_weights) == self.num_cascade_stages
    assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, \
        "CascadeROIHeads only support class-agnostic regression now!"
    assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
    # fmt: on

    in_channels = [self.feature_channels[f] for f in self.in_features]
    # Check all channel counts are equal
    assert len(set(in_channels)) == 1, in_channels
    in_channels = in_channels[0]

    self.box_pooler = ROIPooler(
        output_size=pooler_resolution,
        scales=pooler_scales,
        sampling_ratio=sampling_ratio,
        pooler_type=pooler_type,
    )
    pooled_shape = ShapeSpec(channels=in_channels,
                             width=pooler_resolution,
                             height=pooler_resolution)

    self.box_head = nn.ModuleList()
    self.box_predictor = nn.ModuleList()
    self.box2box_transform = []
    self.proposal_matchers = []
    for k in range(self.num_cascade_stages):
        box_head = cfg.build_box_head(cfg, pooled_shape)
        self.box_head.append(box_head)
        self.box_predictor.append(
            FastRCNNOutputLayers(box_head.output_size,
                                 self.num_classes,
                                 cls_agnostic_bbox_reg=True))
        self.box2box_transform.append(
            Box2BoxTransform(weights=cascade_bbox_reg_weights[k]))

        if k == 0:
            # The first matching is done by the matcher of ROIHeads (self.proposal_matcher).
            self.proposal_matchers.append(None)
        else:
            self.proposal_matchers.append(
                Matcher([cascade_ious[k]], [0, 1],
                        allow_low_quality_matches=False))
def __init__(self, cfg):
    super().__init__()
    self.device = torch.device(cfg.MODEL.DEVICE)

    # fmt: off
    self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
    self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
    # Loss parameters:
    self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
    self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
    self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
    # Inference parameters:
    self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
    self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
    self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
    self.nms_type = cfg.MODEL.NMS_TYPE
    self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
    # fmt: on

    self.backbone = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]
    self.head = RetinaNetHead(cfg, feature_shapes)
    self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

    # Matching and loss
    self.box2box_transform = Box2BoxTransform(
        weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS)
    self.matcher = Matcher(
        cfg.MODEL.RETINANET.IOU_THRESHOLDS,
        cfg.MODEL.RETINANET.IOU_LABELS,
        allow_low_quality_matches=True,
    )

    pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
    pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
    self.normalizer = lambda x: (x - pixel_mean) / pixel_std
    self.to(self.device)

    """
    In Detectron1, loss is normalized by the number of foreground samples in the batch.
    When the batch size is 1 per GPU, #foreground has a large variance and using it
    leads to lower performance. Here we maintain an EMA of #foreground to stabilize
    the normalizer.
    """
    self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
    self.loss_normalizer_momentum = 0.9
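# A short sketch (an assumption following the detectron2-style RetinaNet loss,
# not shown in the snippet above) of how the EMA described in the docstring is
# typically maintained inside the loss computation, where `num_foreground` is
# the number of anchors matched to ground-truth boxes in the current batch:
#
#     self.loss_normalizer = (
#         self.loss_normalizer_momentum * self.loss_normalizer
#         + (1 - self.loss_normalizer_momentum) * num_foreground
#     )
#
# The classification and box-regression losses are then divided by
# max(1, self.loss_normalizer) instead of the raw per-batch foreground count.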
def __init__(self, cfg):
    super(EncoderWithProjection, self).__init__()

    self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
    self.out_dim = cfg.MODEL.BYOL.OUT_DIM

    self.encoder = cfg.build_backbone(
        cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
    self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    self.projector = nn.Sequential(
        nn.Linear(2048, self.proj_dim),
        nn.BatchNorm1d(self.proj_dim),
        nn.ReLU(),
        nn.Linear(self.proj_dim, self.out_dim, bias=False),
    )
def build_backbone(cfg, input_shape=None):
    """
    Build a backbone from `cfg.MODEL.BACKBONE.NAME`.

    Returns:
        an instance of :class:`Backbone`
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN),
                                height=cfg.INPUT.FIX_SIZE_FOR_FLOPS[0],
                                width=cfg.INPUT.FIX_SIZE_FOR_FLOPS[1])

    backbone = build_dynamic_backbone(cfg, input_shape)
    assert isinstance(backbone, Backbone)
    return backbone
def _init_point_head(self, cfg, input_shape: Dict[str, ShapeSpec]):
    # fmt: off
    assert cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES
    feature_channels = {k: v.channels for k, v in input_shape.items()}
    self.in_features = cfg.MODEL.POINT_HEAD.IN_FEATURES
    self.train_num_points = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS
    self.oversample_ratio = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO
    self.importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO
    self.subdivision_steps = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS
    self.subdivision_num_points = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS
    # fmt: on

    in_channels = np.sum([feature_channels[f] for f in self.in_features])
    self.point_head = cfg.build_point_head(
        cfg, ShapeSpec(channels=in_channels, width=1, height=1))