Example 1
 def output_shape(self):
     return {
         name: ShapeSpec(
             channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
         ) if name != 'linear' else
         ShapeSpec(channels=self.num_classes)
         for name in self._out_features
     }
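For context, ShapeSpec as used throughout these examples is a small value container from detectron2-style codebases (cvpods included). Its exact definition is not part of these snippets; a minimal dataclass-style sketch, assuming the usual optional channels/height/width/stride fields, would be:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ShapeSpec:
    # all fields optional: callers fill in only what they know (e.g. channels, stride)
    channels: Optional[int] = None
    height: Optional[int] = None
    width: Optional[int] = None
    stride: Optional[int] = None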
Example 2
    def __init__(self, cfg):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        # fmt: off
        self.image_size = cfg.MODEL.SSD.IMAGE_SIZE
        self.num_classes = cfg.MODEL.SSD.NUM_CLASSES
        self.in_features = cfg.MODEL.SSD.IN_FEATURES
        self.extra_layer_arch = cfg.MODEL.SSD.EXTRA_LAYER_ARCH[str(self.image_size)]
        self.l2norm_scale = cfg.MODEL.SSD.L2NORM_SCALE
        # Loss parameters:
        self.loss_alpha = cfg.MODEL.SSD.LOSS_ALPHA
        self.smooth_l1_loss_beta = cfg.MODEL.SSD.SMOOTH_L1_LOSS_BETA
        self.negative_positive_ratio = cfg.MODEL.SSD.NEGATIVE_POSITIVE_RATIO
        # Inference parameters:
        self.score_threshold = cfg.MODEL.SSD.SCORE_THRESH_TEST
        self.nms_threshold = cfg.MODEL.SSD.NMS_THRESH_TEST
        self.nms_type = cfg.MODEL.NMS_TYPE
        self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
        # fmt: on

        self.backbone = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        backbone_shape = self.backbone.output_shape()
        feature_shapes = [backbone_shape[f] for f in self.in_features]

        # build extra layers
        self.extra_layers = self._make_extra_layers(
            feature_shapes[-1].channels, self.extra_layer_arch)
        extra_layer_channels = [c for c in self.extra_layer_arch if isinstance(c, int)]
        feature_shapes += [ShapeSpec(channels=c) for c in extra_layer_channels[1::2]]

        # ssd head
        self.head = SSDHead(cfg, feature_shapes)
        self.l2norm = L2Norm(512, self.l2norm_scale)
        self.default_box_generator = cfg.build_default_box_generator(cfg)
        self.default_boxes = self.default_box_generator()

        # Matching and loss
        self.box2box_transform = Box2BoxTransform(
            weights=cfg.MODEL.SSD.BBOX_REG_WEIGHTS)
        self.matcher = Matcher(
            cfg.MODEL.SSD.IOU_THRESHOLDS,
            cfg.MODEL.SSD.IOU_LABELS,
            allow_low_quality_matches=False,
        )

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            3, 1, 1)
        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
        self.to(self.device)

        # Initialization
        self._init_weights()
Example 3
    def __init__(self, cfg):
        """
        dim: feature dimension (default: 128)
        K: queue size; number of negative keys (default: 65536)
        m: moco momentum of updating key encoder (default: 0.999)
        T: softmax temperature (default: 0.07)
        """
        super(MoCo, self).__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.dim = cfg.MODEL.MOCO.DIM
        self.K = cfg.MODEL.MOCO.K
        self.m = cfg.MODEL.MOCO.MOMENTUM
        self.T = cfg.MODEL.MOCO.TAU
        self.mlp = cfg.MODEL.MOCO.MLP

        # create the encoders
        # num_classes is the output fc dimension
        cfg.MODEL.RESNETS.NUM_CLASSES = self.dim

        self.encoder_q = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self.encoder_k = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

        self.size_divisibility = self.encoder_q.size_divisibility

        if self.mlp:  # hack: brute-force replacement
            dim_mlp = self.encoder_q.linear.weight.shape[1]
            self.encoder_q.linear = nn.Sequential(nn.Linear(dim_mlp, dim_mlp),
                                                  nn.ReLU(),
                                                  self.encoder_q.linear)
            self.encoder_k.linear = nn.Sequential(nn.Linear(dim_mlp, dim_mlp),
                                                  nn.ReLU(),
                                                  self.encoder_k.linear)

        for param_q, param_k in zip(self.encoder_q.parameters(),
                                    self.encoder_k.parameters()):
            param_k.data.copy_(param_q.data)  # initialize
            param_k.requires_grad = False  # not update by gradient

        # create the queue
        self.register_buffer("queue", torch.randn(self.dim, self.K))
        self.queue = nn.functional.normalize(self.queue, dim=0)

        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        self.loss_evaluator = nn.CrossEntropyLoss()

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            1, 3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            1, 3, 1, 1)
        self.normalizer = lambda x: (x - pixel_mean) / pixel_std

        self.to(self.device)
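The m (momentum) and K (queue size) documented above are consumed outside this constructor. A minimal sketch of the standard MoCo bookkeeping, following the public MoCo reference implementation rather than anything shown in this snippet, with the encoders, queue, and queue_ptr passed in explicitly:

import torch

@torch.no_grad()
def momentum_update(encoder_q, encoder_k, m):
    # the key encoder tracks the query encoder via an exponential moving average
    for p_q, p_k in zip(encoder_q.parameters(), encoder_k.parameters()):
        p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)

@torch.no_grad()
def dequeue_and_enqueue(queue, queue_ptr, keys, K):
    # overwrite the oldest slice of the (dim x K) queue with the new keys;
    # assumes K is divisible by the batch size, as in the reference code
    batch_size = keys.shape[0]
    ptr = int(queue_ptr)
    queue[:, ptr:ptr + batch_size] = keys.T
    queue_ptr[0] = (ptr + batch_size) % K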
Example 4
    def test_default_anchor_generator(self):
        cfg = BaseDetectionConfig()
        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]

        anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)])

        # only the last two dimensions of features matter here
        num_images = 2
        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
        anchors = anchor_generator([features["stage3"]])
        expected_anchor_tensor = torch.tensor(
            [
                [-32.0, -8.0, 32.0, 8.0],
                [-16.0, -16.0, 16.0, 16.0],
                [-8.0, -32.0, 8.0, 32.0],
                [-64.0, -16.0, 64.0, 16.0],
                [-32.0, -32.0, 32.0, 32.0],
                [-16.0, -64.0, 16.0, 64.0],
                [-28.0, -8.0, 36.0, 8.0],  # -28.0 == -32.0 + STRIDE (4)
                [-12.0, -16.0, 20.0, 16.0],
                [-4.0, -32.0, 12.0, 32.0],
                [-60.0, -16.0, 68.0, 16.0],
                [-28.0, -32.0, 36.0, 32.0],
                [-12.0, -64.0, 20.0, 64.0],
            ]
        )

        for i in range(num_images):
            assert torch.allclose(anchors[i][0].tensor, expected_anchor_tensor)
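For reference (not part of the test), the expected tensor can be reconstructed by hand: for an anchor size s and aspect ratio a, the generator emits a box of width s / sqrt(a) and height s * sqrt(a), centered on a grid whose spacing equals the stride (offset 0 here, so the first center sits at the origin). A quick sketch:

import math

boxes = []
for cx in (0.0, 4.0):                 # the 1x2 feature map gives two columns at stride 4
    for size in (32, 64):
        for ar in (0.25, 1, 4):
            w, h = size / math.sqrt(ar), size * math.sqrt(ar)
            boxes.append([cx - w / 2, -h / 2, cx + w / 2, h / 2])
# boxes now matches expected_anchor_tensor row for row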
Example 5
    def test_default_anchor_generator_centered(self):
        cfg = BaseDetectionConfig()
        cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]]
        cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]]
        cfg.MODEL.ANCHOR_GENERATOR.OFFSET = 0.5
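        # with OFFSET = 0.5, every anchor center shifts by 0.5 * stride = 2 px in both
        # x and y relative to the offset-0 case above (compare the expected tensors)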

        anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)])

        # only the last two dimensions of features matter here
        num_images = 2
        features = {"stage3": torch.rand(num_images, 96, 1, 2)}
        anchors = anchor_generator([features["stage3"]])
        expected_anchor_tensor = torch.tensor(
            [
                [-30.0, -6.0, 34.0, 10.0],
                [-14.0, -14.0, 18.0, 18.0],
                [-6.0, -30.0, 10.0, 34.0],
                [-62.0, -14.0, 66.0, 18.0],
                [-30.0, -30.0, 34.0, 34.0],
                [-14.0, -62.0, 18.0, 66.0],
                [-26.0, -6.0, 38.0, 10.0],
                [-10.0, -14.0, 22.0, 18.0],
                [-2.0, -30.0, 14.0, 34.0],
                [-58.0, -14.0, 70.0, 18.0],
                [-26.0, -30.0, 38.0, 34.0],
                [-10.0, -62.0, 22.0, 66.0],
            ]
        )

        for i in range(num_images):
            assert torch.allclose(anchors[i][0].tensor, expected_anchor_tensor)
Example 6
    def test_rpn_scriptability(self):
        cfg = RCNNConfig()
        proposal_generator = RPN(cfg, {
            "res4": ShapeSpec(channels=1024, stride=16)
        }).eval()
        num_images = 2
        images_tensor = torch.rand(num_images, 30, 40)
        image_sizes = [(32, 32), (30, 40)]
        images = ImageList(images_tensor, image_sizes)
        features = {"res4": torch.rand(num_images, 1024, 1, 2)}

        fields = {"proposal_boxes": "Boxes", "objectness_logits": "Tensor"}
        proposal_generator_ts = export_torchscript_with_instances(
            proposal_generator, fields)  # noqa

        proposals, _ = proposal_generator(images, features)
        proposals_ts, _ = proposal_generator_ts(images, features)

        for proposal, proposal_ts in zip(proposals, proposals_ts):
            self.assertEqual(proposal.image_size, proposal_ts.image_size)
            self.assertTrue(
                torch.equal(proposal.proposal_boxes.tensor,
                            proposal_ts.proposal_boxes.tensor))
            self.assertTrue(
                torch.equal(proposal.objectness_logits,
                            proposal_ts.objectness_logits))
Example 7
 def output_shape(self):
     return {
         name: ShapeSpec(
             channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
         )
         for name in self._out_features
     }
Example 8
    def __init__(self, cfg):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.instance_loss_weight = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT

        # options when combining instance & semantic outputs
        self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED
        self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH
        self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT
        self.combine_instances_confidence_threshold = (
            cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH)

        self.backbone = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self.proposal_generator = cfg.build_proposal_generator(
            cfg, self.backbone.output_shape())
        self.roi_heads = cfg.build_roi_heads(cfg, self.backbone.output_shape())
        self.sem_seg_head = cfg.build_sem_seg_head(
            cfg, self.backbone.output_shape())

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            3, 1, 1)
        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
        self.to(self.device)
Example 9
    def __init__(self, cfg):
        super(Classification, self).__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.network = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

        self.network.stem = nn.Sequential(
            Conv2d(3,
                   64,
                   kernel_size=3,
                   stride=1,
                   padding=1,
                   bias=False,
                   norm=get_norm("BN", 64)),
            nn.ReLU(),
        )

        self.loss_evaluator = nn.CrossEntropyLoss()

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            3, 1, 1)
        self.normalizer = lambda x: (x - pixel_mean) / pixel_std

        self.to(self.device)
Example 10
    def _init_mask_head(self, cfg):
        # fmt: off
        self.mask_on = cfg.MODEL.MASK_ON
        if not self.mask_on:
            return
        pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION
        pooler_scales = tuple(1.0 / self.feature_strides[k]
                              for k in self.in_features)
        sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO
        pooler_type = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE
        # fmt: on

        in_channels = [self.feature_channels[f] for f in self.in_features][0]

        self.mask_pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        self.mask_head = cfg.build_mask_head(
            cfg,
            ShapeSpec(channels=in_channels,
                      width=pooler_resolution,
                      height=pooler_resolution))
Example 11
    def __init__(self, cfg):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.backbone = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self.backbone.linear = nn.Identity()

        self.lambd = cfg.MODEL.BT.LAMBD
        self.scale_loss = cfg.MODEL.BT.SCALE_LOSS

        # projector
        sizes = [2048] + list(map(int, cfg.MODEL.BT.PROJECTOR.split('-')))
        layers = []
        for i in range(len(sizes) - 2):
            layers.append(nn.Linear(sizes[i], sizes[i + 1], bias=False))
            layers.append(nn.BatchNorm1d(sizes[i + 1]))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(sizes[-2], sizes[-1], bias=False))
        self.projector = nn.Sequential(*layers)

        # normalization layer for the representations z1 and z2
        self.bn = nn.BatchNorm1d(sizes[-1], affine=False)

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(3, 1, 1)
        self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std

        self.to(self.device)
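This constructor only builds the projector and the affine-free BatchNorm used to whiten the two representations; lambd and scale_loss are consumed in the loss, which is not shown here. A minimal sketch of the Barlow Twins objective, following the public reference implementation rather than this snippet (the exact role of scale_loss is omitted):

import torch

def off_diagonal(x):
    # return a flattened view of all off-diagonal elements of a square matrix
    n, m = x.shape
    assert n == m
    return x.flatten()[:-1].view(n - 1, n + 1)[:, 1:].flatten()

def barlow_twins_loss(z1_bn, z2_bn, lambd):
    # z1_bn, z2_bn: batch-normalized embeddings of the two views, shape (N, D)
    batch_size = z1_bn.shape[0]
    c = z1_bn.T @ z2_bn / batch_size          # empirical cross-correlation matrix (D x D)
    on_diag = torch.diagonal(c).add_(-1).pow_(2).sum()
    off_diag = off_diagonal(c).pow_(2).sum()
    return on_diag + lambd * off_diag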
Example 12
def build_dynamic_backbone(cfg, input_shape: ShapeSpec):
    """
    Create a Dynamic Backbone from config.
    Args:
        cfg: a dl_lib CfgNode
        input_shape (ShapeSpec): the expected input shape; if None, it is
            inferred from ``len(cfg.MODEL.PIXEL_MEAN)``.
    Returns:
        backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))
    backbone = DynamicNetwork(
        init_channel=cfg.MODEL.BACKBONE.INIT_CHANNEL,
        input_shape=input_shape,
        cell_num_list=cfg.MODEL.BACKBONE.CELL_NUM_LIST,
        layer_num=cfg.MODEL.BACKBONE.LAYER_NUM,
        norm=cfg.MODEL.BACKBONE.NORM,
        cal_flops=cfg.MODEL.CAL_FLOPS,
        cell_type=cfg.MODEL.BACKBONE.CELL_TYPE,
        max_stride=cfg.MODEL.BACKBONE.MAX_STRIDE,
        sep_stem=cfg.MODEL.BACKBONE.SEPT_STEM,
        using_gate=cfg.MODEL.GATE.GATE_ON,
        small_gate=cfg.MODEL.GATE.SMALL_GATE,
        gate_bias=cfg.MODEL.GATE.GATE_INIT_BIAS,
        drop_prob=cfg.MODEL.BACKBONE.DROP_PROB
    )

    return backbone
Example 13
    def _init_keypoint_head(self, cfg):
        # fmt: off
        self.keypoint_on = cfg.MODEL.KEYPOINT_ON
        if not self.keypoint_on:
            return
        pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION
        pooler_scales = tuple(1.0 / self.feature_strides[k]
                              for k in self.in_features)  # noqa
        sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO
        pooler_type = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE
        self.normalize_loss_by_visible_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS  # noqa
        self.keypoint_loss_weight = cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT
        # fmt: on

        in_channels = [self.feature_channels[f] for f in self.in_features][0]

        self.keypoint_pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        self.keypoint_head = cfg.build_keypoint_head(
            cfg,
            ShapeSpec(channels=in_channels,
                      width=pooler_resolution,
                      height=pooler_resolution))
Example 14
    def __init__(self, cfg):
        super(SimSiam, self).__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
        self.pred_dim = cfg.MODEL.BYOL.PRED_DIM
        self.out_dim = cfg.MODEL.BYOL.OUT_DIM

        self.encoder_q = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Projection Head
        self.projector = nn.Sequential(
            nn.Linear(self.out_dim, self.proj_dim),
            nn.BatchNorm1d(self.proj_dim),
            nn.ReLU(),
            nn.Linear(self.proj_dim, self.proj_dim),
            nn.BatchNorm1d(self.proj_dim),
            nn.ReLU(),
            nn.Linear(self.proj_dim, self.proj_dim),
            nn.BatchNorm1d(self.proj_dim),
        )

        # Predictor
        self.predictor = nn.Sequential(
            nn.Linear(self.proj_dim, self.pred_dim),
            nn.BatchNorm1d(self.pred_dim),
            nn.ReLU(),
            nn.Linear(self.pred_dim, self.out_dim),
        )

        self.to(self.device)
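The projector/predictor pair above is trained with the symmetric negative-cosine loss with stop-gradient from the SimSiam paper; that part is not in this snippet, but a minimal standalone sketch would be:

import torch.nn.functional as F

def simsiam_loss(p1, p2, z1, z2):
    # p*: predictor outputs, z*: projector outputs of the two augmented views;
    # the stop-gradient is the detach() on the projector side
    def d(p, z):
        return -F.cosine_similarity(p, z.detach(), dim=-1).mean()
    return 0.5 * d(p1, z2) + 0.5 * d(p2, z1)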
Example 15
    def __init__(self, cfg, input_shape):
        super().__init__(cfg, input_shape)

        assert len(self.in_features) == 1

        # fmt: off
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        pooler_scales = (1.0 / self.feature_strides[self.in_features[0]], )
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        self.mask_on = cfg.MODEL.MASK_ON
        # fmt: on
        assert not cfg.MODEL.KEYPOINT_ON

        self.pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )

        self.res5, out_channels = self._build_res5_block(cfg)
        self.box_predictor = FastRCNNOutputLayers(out_channels,
                                                  self.num_classes,
                                                  self.cls_agnostic_bbox_reg)

        if self.mask_on:
            self.mask_head = cfg.build_mask_head(
                cfg,
                ShapeSpec(channels=out_channels,
                          width=pooler_resolution,
                          height=pooler_resolution),
            )
Example 16
    def __init__(self, cfg):
        super(Classification, self).__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.network = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self.network.stem = nn.Sequential(
            Conv2d(3,
                   64,
                   kernel_size=3,
                   stride=1,
                   padding=1,
                   bias=False,
                   norm=get_norm(cfg.MODEL.RESNETS.NORM, 64)),
            nn.ReLU(),
        )

        self.freeze()
        self.network.eval()

        # init the fc layer
        self.network.linear.weight.data.normal_(mean=0.0, std=0.01)
        self.network.linear.bias.data.zero_()

        self.loss_evaluator = nn.CrossEntropyLoss()

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            1, 3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            1, 3, 1, 1)
        self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std

        self.to(self.device)
Example 17
    def _init_box_head(self, cfg):
        # fmt: off
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_scales = tuple(1.0 / self.feature_strides[k]
                              for k in self.in_features)
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        self.train_on_pred_boxes = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
        # fmt: on

        # If StandardROIHeads is applied on multiple feature maps (as in FPN),
        # then we share the same predictors and therefore the channel counts must be the same
        in_channels = [self.feature_channels[f] for f in self.in_features]
        # Check all channel counts are equal
        assert len(set(in_channels)) == 1, in_channels
        in_channels = in_channels[0]

        self.box_pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        # Here we split "box head" and "box predictor", which is mainly due to historical reasons.
        # They are used together so the "box predictor" layers should be part of the "box head".
        # New subclasses of ROIHeads do not need "box predictor"s.
        self.box_head = cfg.build_box_head(
            cfg,
            ShapeSpec(channels=in_channels,
                      height=pooler_resolution,
                      width=pooler_resolution))
        self.box_predictor = FastRCNNOutputLayers(self.box_head.output_size,
                                                  self.num_classes,
                                                  self.cls_agnostic_bbox_reg)
Example 18
    def __init__(self, cfg):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.num_classes = cfg.MODEL.EFFICIENTDET.NUM_CLASSES
        self.in_features = cfg.MODEL.EFFICIENTDET.IN_FEATURES
        self.freeze_bn = cfg.MODEL.EFFICIENTDET.FREEZE_BN
        self.freeze_backbone = cfg.MODEL.EFFICIENTDET.FREEZE_BACKBONE
        self.input_size = cfg.MODEL.BIFPN.INPUT_SIZE
        # Loss parameters:
        self.focal_loss_alpha = cfg.MODEL.EFFICIENTDET.FOCAL_LOSS_ALPHA
        self.focal_loss_gamma = cfg.MODEL.EFFICIENTDET.FOCAL_LOSS_GAMMA
        self.smooth_l1_loss_beta = cfg.MODEL.EFFICIENTDET.SMOOTH_L1_LOSS_BETA
        self.box_loss_weight = cfg.MODEL.EFFICIENTDET.BOX_LOSS_WEIGHT
        self.regress_norm = cfg.MODEL.EFFICIENTDET.REG_NORM
        # Inference parameters:
        self.score_threshold = cfg.MODEL.EFFICIENTDET.SCORE_THRESH_TEST
        self.topk_candidates = cfg.MODEL.EFFICIENTDET.TOPK_CANDIDATES_TEST
        self.nms_threshold = cfg.MODEL.EFFICIENTDET.NMS_THRESH_TEST
        self.nms_type = cfg.MODEL.NMS_TYPE
        self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE

        self.backbone = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

        backbone_shape = self.backbone.output_shape()
        feature_shapes = [backbone_shape[f] for f in self.in_features]
        self.head = EfficientDetHead(cfg, feature_shapes)
        self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

        # Matching and loss
        self.box2box_transform = Box2BoxTransform(
            weights=cfg.MODEL.EFFICIENTDET.BBOX_REG_WEIGHTS)
        self.matcher = Matcher(
            cfg.MODEL.EFFICIENTDET.IOU_THRESHOLDS,
            cfg.MODEL.EFFICIENTDET.IOU_LABELS,
            allow_low_quality_matches=False,
        )

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            3, 1, 1)
        self.normalizer = lambda x: (x / 255. - pixel_mean) / pixel_std

        if self.freeze_bn:
            for layer in self.modules():
                if isinstance(layer, nn.BatchNorm2d):
                    layer.eval()

        if self.freeze_backbone:
            for name, params in self.named_parameters():
                if name.startswith("backbone.bottom_up"):
                    params.requires_grad = False

        self.to(self.device)
Example 19
    def __init__(self, cfg):
        super(SimSiam, self).__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
        self.pred_dim = cfg.MODEL.BYOL.PRED_DIM
        self.out_dim = cfg.MODEL.BYOL.OUT_DIM

        self.total_steps = cfg.SOLVER.LR_SCHEDULER.MAX_ITER * cfg.SOLVER.BATCH_SUBDIVISIONS

        # create the encoders
        # num_classes is the output fc dimension
        cfg.MODEL.RESNETS.NUM_CLASSES = self.out_dim

        self.encoder = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self.encoder.stem = nn.Sequential(
            Conv2d(3,
                   64,
                   kernel_size=3,
                   stride=1,
                   padding=1,
                   bias=False,
                   norm=get_norm(cfg.MODEL.RESNETS.NORM, 64)),
            nn.ReLU(),
        )

        self.size_divisibility = self.encoder.size_divisibility

        dim_mlp = self.encoder.linear.weight.shape[1]

        # Projection Head
        self.encoder.linear = nn.Sequential(
            nn.Linear(dim_mlp, self.proj_dim),
            nn.SyncBatchNorm(self.proj_dim),
            nn.ReLU(),
            nn.Linear(self.proj_dim, self.proj_dim),
            nn.SyncBatchNorm(self.proj_dim),
        )

        # Predictor
        self.predictor = nn.Sequential(
            nn.Linear(self.proj_dim, self.pred_dim),
            nn.SyncBatchNorm(self.pred_dim),
            nn.ReLU(),
            nn.Linear(self.pred_dim, self.out_dim),
        )

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            1, 3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            1, 3, 1, 1)
        self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std

        self.to(self.device)
Example 20
    def __init__(self, cfg):
        """
        dim: feature dimension (default: 128)
        T: softmax temperature (default: 0.07)
        """
        super(SimCLR, self).__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        self.dim = cfg.MODEL.CLR.DIM
        self.T = cfg.MODEL.CLR.TAU
        self.mlp = cfg.MODEL.CLR.MLP
        self.norm = cfg.MODEL.CLR.NORM

        # create the encoders
        # num_classes is the output fc dimension
        cfg.MODEL.RESNETS.NUM_CLASSES = self.dim

        self.network = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

        self.size_divisibility = self.network.size_divisibility

        if self.mlp:  # hack: brute-force replacement
            dim_mlp = self.network.linear.weight.shape[1]
            if self.norm == "SyncBN":
                self.network.linear = nn.Sequential(
                    nn.Linear(dim_mlp, dim_mlp, bias=False),
                    NaiveSyncBatchNorm1d(dim_mlp),
                    nn.ReLU(),
                    nn.Linear(dim_mlp, self.dim, bias=False),
                    NaiveSyncBatchNorm1d(self.dim)
                )
                nn.init.normal_(self.network.linear[0].weight, mean=0.0, std=0.01)  # linear weight
                nn.init.normal_(self.network.linear[3].weight, mean=0.0, std=0.01)  # linear weight
                nn.init.constant_(self.network.linear[1].weight, 1.0)  # bn gamma
                nn.init.constant_(self.network.linear[4].weight, 1.0)  # bn gamma
            else:
                self.network.linear = nn.Sequential(
                    nn.Linear(dim_mlp, dim_mlp),
                    nn.ReLU(),
                    nn.Linear(dim_mlp, self.dim),
                )
                nn.init.normal_(self.network.linear[0].weight, mean=0.0, std=0.01)  # linear weight
                nn.init.normal_(self.network.linear[2].weight, mean=0.0, std=0.01)  # linear weight

        # self.loss_evaluator = NTXentLoss(self.device, cfg.SOLVER.IMS_PER_DEVICE, self.T, True)
        self.loss_evaluator = NT_Xent(cfg.SOLVER.IMS_PER_DEVICE, self.T, self.device)

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(1, 3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(1, 3, 1, 1)
        self.normalizer = lambda x: (x / 255.0 - pixel_mean) / pixel_std

        self.to(self.device)
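The NT_Xent object built above is not shown in these examples. A minimal standalone sketch of the standard NT-Xent (SimCLR) loss it presumably implements, assuming one positive pair per sample and every other sample in the batch as a negative:

import torch
import torch.nn.functional as F

def nt_xent_loss(z1, z2, temperature):
    # z1, z2: (N, dim) embeddings of two augmented views of the same N images
    n = z1.shape[0]
    z = F.normalize(torch.cat([z1, z2], dim=0), dim=1)   # (2N, dim)
    sim = z @ z.T / temperature                           # cosine similarities / T
    sim.fill_diagonal_(float("-inf"))                     # a sample is never its own positive
    # the positive for row i is the other view of the same image
    targets = torch.cat([torch.arange(n, 2 * n), torch.arange(0, n)])
    return F.cross_entropy(sim, targets)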
Example 21
 def output_shape(self):
     """
     Returns:
         dict[str->ShapeSpec]
     """
     return {
         name: ShapeSpec(channels=self._out_feature_channels[name],
                         stride=self._out_feature_strides[name])
         for name in self._out_features
     }
Example 22
 def output_shape(self):
     return {
         name: ShapeSpec(
             channels=self._out_feature_channels[name],
             height=self._out_feature_resolution[name][0],
             width=self._out_feature_resolution[name][0],
             stride=self._out_feature_strides[name]
         )
         for name in self._out_features
     }
Example 23
    def __init__(self, cfg):
        super(YOLOv3, self).__init__()
        self.device = torch.device(cfg.MODEL.DEVICE)

        self.num_classes = cfg.MODEL.YOLO.CLASSES

        self.backbone = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        backbone_shape = self.backbone.output_shape
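        # NOTE: unlike the dict returned by output_shape() in the other examples, this
        # backbone is assumed to expose its per-stage output channel counts as a plain
        # sequence (it is indexed with -1/-2/-3 and added to ints below)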
        self.in_features = cfg.MODEL.YOLO.IN_FEATURES

        # out 0
        out_filter_0 = len(
            cfg.MODEL.YOLO.ANCHORS[0]) * (5 + cfg.MODEL.YOLO.CLASSES)
        self.out0 = self._make_embedding([512, 1024], backbone_shape[-1],
                                         out_filter_0)

        # out 1
        out_filter_1 = len(
            cfg.MODEL.YOLO.ANCHORS[1]) * (5 + cfg.MODEL.YOLO.CLASSES)
        self.out1_cbl = self._make_cbl(512, 256, 1)
        self.out1_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.out1 = self._make_embedding([256, 512], backbone_shape[-2] + 256,
                                         out_filter_1)

        # out 2
        out_filter_2 = len(
            cfg.MODEL.YOLO.ANCHORS[2]) * (5 + cfg.MODEL.YOLO.CLASSES)
        self.out2_cbl = self._make_cbl(256, 128, 1)
        self.out2_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.out2 = self._make_embedding([128, 256], backbone_shape[-3] + 128,
                                         out_filter_2)

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            3, 1, 1)
        self.normalizer = lambda x: (x / 255. - pixel_mean) / pixel_std

        self.loss_evaluators = [
            YOLOHead(cfg, anchor, level)
            for level, anchor in enumerate(cfg.MODEL.YOLO.ANCHORS)
        ]

        self.conf_threshold = cfg.MODEL.YOLO.CONF_THRESHOLD
        self.nms_threshold = cfg.MODEL.YOLO.NMS_THRESHOLD
        self.nms_type = cfg.MODEL.NMS_TYPE

        self.size = 512
        self.multi_size = [320, 352, 384, 416, 448, 480, 512, 544, 576, 608]
        self.change_iter = 10
        self.iter = 0
        self.max_iter = cfg.SOLVER.LR_SCHEDULER.MAX_ITER

        self.to(self.device)
Example 24
 def output_shape(self):
     """
     Returns:
         dict[str->ShapeSpec]
     """
     # this is a backward-compatible default
     return {
         name: ShapeSpec(channels=self._out_feature_channels[name],
                         stride=self._out_feature_strides[name])
         for name in self._out_features
     }
Example 25
def build_backbone(cfg, input_shape=None):
    """
    Build a backbone from `cfg.MODEL.BACKBONE.NAME`.
    Returns:
        an instance of :class:`Backbone`
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN))

    backbone = build_resnet_backbone(cfg, input_shape)
    assert isinstance(backbone, Backbone)
    return backbone
Example 26
    def _init_box_head(self, cfg):
        # fmt: off
        pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        pooler_scales = tuple(1.0 / self.feature_strides[k]
                              for k in self.in_features)
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE
        cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS
        cascade_ious = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS
        self.num_cascade_stages = len(cascade_ious)
        assert len(cascade_bbox_reg_weights) == self.num_cascade_stages
        assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG,  \
            "CascadeROIHeads only support class-agnostic regression now!"
        assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0]
        # fmt: on

        in_channels = [self.feature_channels[f] for f in self.in_features]
        # Check all channel counts are equal
        assert len(set(in_channels)) == 1, in_channels
        in_channels = in_channels[0]

        self.box_pooler = ROIPooler(
            output_size=pooler_resolution,
            scales=pooler_scales,
            sampling_ratio=sampling_ratio,
            pooler_type=pooler_type,
        )
        pooled_shape = ShapeSpec(channels=in_channels,
                                 width=pooler_resolution,
                                 height=pooler_resolution)

        self.box_head = nn.ModuleList()
        self.box_predictor = nn.ModuleList()
        self.box2box_transform = []
        self.proposal_matchers = []
        for k in range(self.num_cascade_stages):
            box_head = cfg.build_box_head(cfg, pooled_shape)
            self.box_head.append(box_head)
            self.box_predictor.append(
                FastRCNNOutputLayers(box_head.output_size,
                                     self.num_classes,
                                     cls_agnostic_bbox_reg=True))
            self.box2box_transform.append(
                Box2BoxTransform(weights=cascade_bbox_reg_weights[k]))

            if k == 0:
                # The first matching is done by the matcher of ROIHeads (self.proposal_matcher).
                self.proposal_matchers.append(None)
            else:
                self.proposal_matchers.append(
                    Matcher([cascade_ious[k]], [0, 1],
                            allow_low_quality_matches=False))
Example 27
    def __init__(self, cfg):
        super().__init__()

        self.device = torch.device(cfg.MODEL.DEVICE)

        # fmt: off
        self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES
        self.in_features = cfg.MODEL.RETINANET.IN_FEATURES
        # Loss parameters:
        self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA
        self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA
        self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA
        # Inference parameters:
        self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST
        self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST
        self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST
        self.nms_type = cfg.MODEL.NMS_TYPE
        self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE
        # fmt: on

        self.backbone = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))

        backbone_shape = self.backbone.output_shape()
        feature_shapes = [backbone_shape[f] for f in self.in_features]
        self.head = RetinaNetHead(cfg, feature_shapes)
        self.anchor_generator = cfg.build_anchor_generator(cfg, feature_shapes)

        # Matching and loss
        self.box2box_transform = Box2BoxTransform(
            weights=cfg.MODEL.RETINANET.BBOX_REG_WEIGHTS)
        self.matcher = Matcher(
            cfg.MODEL.RETINANET.IOU_THRESHOLDS,
            cfg.MODEL.RETINANET.IOU_LABELS,
            allow_low_quality_matches=True,
        )

        pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(self.device).view(
            3, 1, 1)
        pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(self.device).view(
            3, 1, 1)
        self.normalizer = lambda x: (x - pixel_mean) / pixel_std
        self.to(self.device)
        """
        In Detectron1, loss is normalized by number of foreground samples in the batch.
        When batch size is 1 per GPU, #foreground has a large variance and
        using it leads to lower performance. Here we maintain an EMA of #foreground to
        stabilize the normalizer.
        """
        self.loss_normalizer = 100  # initialize with any reasonable #fg that's not too small
        self.loss_normalizer_momentum = 0.9
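The EMA described in the docstring is maintained inside the loss computation, which is not part of this snippet; under the usual detectron2-style convention, the per-iteration update it refers to amounts to:

def update_loss_normalizer(prev, num_foreground, momentum=0.9):
    # exponential moving average of the per-batch foreground count
    return momentum * prev + (1 - momentum) * num_foreground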
Example 28
    def __init__(self, cfg):
        super(EncoderWithProjection, self).__init__()
        self.proj_dim = cfg.MODEL.BYOL.PROJ_DIM
        self.out_dim = cfg.MODEL.BYOL.OUT_DIM

        self.encoder = cfg.build_backbone(
            cfg, input_shape=ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)))
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        self.projector = nn.Sequential(
            nn.Linear(2048, self.proj_dim),
            nn.BatchNorm1d(self.proj_dim),
            nn.ReLU(),
            nn.Linear(self.proj_dim, self.out_dim, bias=False),
        )
Example 29
def build_backbone(cfg, input_shape=None):
    """
    Build a backbone from `cfg.MODEL.BACKBONE.NAME`.

    Returns:
        an instance of :class:`Backbone`
    """
    if input_shape is None:
        input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN),
                                height=cfg.INPUT.FIX_SIZE_FOR_FLOPS[0],
                                width=cfg.INPUT.FIX_SIZE_FOR_FLOPS[1])

    backbone = build_dynamic_backbone(cfg, input_shape)
    assert isinstance(backbone, Backbone)
    return backbone
Example 30
    def _init_point_head(self, cfg, input_shape: Dict[str, ShapeSpec]):
        # fmt: off
        assert cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES
        feature_channels = {k: v.channels for k, v in input_shape.items()}
        self.in_features = cfg.MODEL.POINT_HEAD.IN_FEATURES
        self.train_num_points = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS
        self.oversample_ratio = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO
        self.importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO
        self.subdivision_steps = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS
        self.subdivision_num_points = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS
        # fmt: on

        in_channels = np.sum([feature_channels[f] for f in self.in_features])
        self.point_head = cfg.build_point_head(
            cfg, ShapeSpec(channels=in_channels, width=1, height=1))