Example 1
    def __init__(
        self,
        pretrained="coco",  # not used here for proper signature
        encoder_name="resnet50",
        encoder_weights="imagenet",
        pyramid_channels=256,
        num_classes=80,
        # drop_connect_rate=0, # TODO: add
        encoder_norm_layer="abn",
        encoder_norm_act="relu",
        decoder_norm_layer="none",  # None by default to match detectron & mmdet versions
        decoder_norm_act="relu",
        **encoder_params,
    ):
        super().__init__()
        self.encoder = get_encoder(
            encoder_name,
            norm_layer=encoder_norm_layer,
            norm_act=encoder_norm_act,
            encoder_weights=encoder_weights,
            **encoder_params,
        )
        norm_layer = bn_from_name(decoder_norm_layer)
        self.pyramid6 = nn.Sequential(
            conv3x3(self.encoder.out_shapes[0], pyramid_channels, 2,
                    bias=True),
            norm_layer(pyramid_channels, activation="identity"),
        )
        self.pyramid7 = nn.Sequential(
            conv3x3(pyramid_channels, pyramid_channels, 2, bias=True),
            norm_layer(pyramid_channels, activation="identity"),
        )
        self.fpn = FPN(self.encoder.out_shapes[:-2],
                       pyramid_channels=pyramid_channels)

        def make_final_convs():
            layers = []
            for _ in range(4):
                layers += [
                    conv3x3(pyramid_channels, pyramid_channels, bias=True)
                ]
                # A norm shared inside these convs is fine for GroupNorm, but BatchNorm
                # should be kept per-level as in EffDet (see Example 5); this may need
                # reworking to align with that implementation.
                layers += [
                    norm_layer(pyramid_channels, activation=decoder_norm_act)
                ]
            return nn.Sequential(*layers)

        anchors_per_location = 9
        self.cls_convs = make_final_convs()
        self.cls_head_conv = conv3x3(pyramid_channels,
                                     num_classes * anchors_per_location,
                                     bias=True)
        self.box_convs = make_final_convs()
        self.box_head_conv = conv3x3(pyramid_channels,
                                     4 * anchors_per_location,
                                     bias=True)
        self.num_classes = num_classes
        self._initialize_weights()
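
A quick smoke test for the constructor above (a hedged sketch: the enclosing class name RetinaNet and the (class, box) output layout are assumptions, since the forward pass is not part of this snippet):

import torch

model = RetinaNet(encoder_name="resnet50", num_classes=80)  # hypothetical class name
model.eval()
with torch.no_grad():
    # assumed output layout: per-anchor class logits and box regressions
    cls_out, box_out = model(torch.randn(1, 3, 512, 512))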
Example 2
    def __init__(
        self,
        encoder_name="resnet34",
        encoder_weights="imagenet",
        pyramid_channels=256,
        num_fpn_layers=1,
        segmentation_channels=128,
        num_classes=1,
        merge_policy="add",
        last_upsample=True,
        output_stride=32,
        drop_rate=0,
        norm_layer="abn",
        norm_act="relu",
        **encoder_params,
    ):
        super().__init__()
        if output_stride != 32:
            encoder_params["output_stride"] = output_stride
        self.encoder = get_encoder(
            encoder_name,
            norm_layer=norm_layer,
            norm_act=norm_act,
            encoder_weights=encoder_weights,
            **encoder_params,
        )

        bn_args = {
            "norm_layer": bn_from_name(norm_layer),
            "norm_act": norm_act
        }

        self.fpn = self.__class__.FEATURE_PYRAMID(
            self.encoder.out_shapes[:-1],  # only want features from 1/4 to 1/32
            pyramid_channels=pyramid_channels,
            num_layers=num_fpn_layers,
            output_stride=output_stride,
            **bn_args,
        )

        self.decoder = PanopticDecoder(
            pyramid_channels=pyramid_channels,
            segmentation_channels=segmentation_channels,
            merge_policy=merge_policy,
            upsamples=[2, 2, 1, 0] if output_stride == 16 else [3, 2, 1, 0],
            **bn_args,
        )
        if merge_policy == "cat":
            segmentation_channels *= 4

        self.dropout = nn.Dropout2d(drop_rate, inplace=True)
        self.segm_head = conv1x1(segmentation_channels, num_classes)
        self.upsample = nn.Upsample(
            scale_factor=4,
            mode="bilinear") if last_upsample else nn.Identity()
        self.name = f"segm-fpn-{encoder_name}"
Example 3
    def __init__(
        self,
        encoder_name="efficientnet_b0",
        encoder_weights="imagenet",
        pyramid_channels=128,
        head_channels=256,
        num_classes=1,
        last_upsample=True,
        encoder_norm_layer="abn",
        encoder_norm_act="swish",
        decoder_norm_layer="abn",
        decoder_norm_act="swish",
        **encoder_params,
    ):
        super().__init__()
        self.encoder = get_encoder(
            encoder_name,
            norm_layer=encoder_norm_layer,
            norm_act=encoder_norm_act,
            encoder_weights=encoder_weights,
            **encoder_params,
        )
        norm_layer = bn_from_name(decoder_norm_layer)
        bn_args = dict(norm_layer=norm_layer, norm_act=decoder_norm_act)

        self.bifpn = BiFPN(
            encoder_channels=self.encoder.out_shapes[:-1],  # pass P2-P5 (strides 4 to 32)
            pyramid_channels=pyramid_channels,
            num_layers=3,  # number of BiFPN layers is hardcoded to 3
            **bn_args,
        )

        self.cls_head_conv = nn.Sequential(
            DepthwiseSeparableConv(pyramid_channels, head_channels, **bn_args),
            DepthwiseSeparableConv(head_channels, head_channels, **bn_args),
            DepthwiseSeparableConv(head_channels, num_classes, use_norm=False),
        )

        self.upsample = nn.Upsample(
            scale_factor=4,
            mode="bilinear") if last_upsample else nn.Identity()

        self.num_classes = num_classes

        patch_bn_mom(self, 0.01)
        # init the last layer bias so the head starts with a 0.01 foreground prior,
        # which speeds up convergence with a sigmoid (focal) loss:
        # -log((1 - 0.01) / 0.01) ≈ -4.59
        nn.init.constant_(self.cls_head_conv[-1][1].bias, -4.59)
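
The bias constant above is the focal-loss prior-probability trick: for a desired initial foreground probability p, the logit bias is b = -log((1 - p) / p), so that sigmoid(b) = p at initialization. A minimal sketch generalizing the hardcoded value:

import math

def prior_bias(prior_prob=0.01):
    # bias b such that sigmoid(b) == prior_prob at init
    return -math.log((1 - prior_prob) / prior_prob)

assert abs(prior_bias(0.01) - (-4.595)) < 1e-2  # matches the hardcoded -4.59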
Example 4
    def __init__(
        self,
        encoder_name="resnet34",
        encoder_weights="imagenet",
        pyramid_channels=256,
        num_classes=80,
        norm_layer="abn",
        norm_act="relu",
        **encoder_params,
    ):
        super().__init__()
        self.encoder = get_encoder(
            encoder_name,
            norm_layer=norm_layer,
            norm_act=norm_act,
            encoder_weights=encoder_weights,
            **encoder_params,
        )
        norm_layer = bn_from_name(norm_layer)
        self.pyramid6 = conv3x3(pyramid_channels, pyramid_channels, 2, bias=True)
        self.pyramid7 = conv3x3(pyramid_channels, pyramid_channels, 2, bias=True)
        self.fpn = FPN(
            self.encoder.out_shapes[:-2],
            pyramid_channels=pyramid_channels,
        )

        def make_head(out_size):
            layers = []
            for _ in range(4):
                # some implementations don't use a norm here, but it seems needed
                # TODO: test how it affects results
                layers += [
                    nn.Conv2d(pyramid_channels, pyramid_channels, 3, padding=1),
                    norm_layer(pyramid_channels, activation=norm_act),
                ]
                # alternative without a norm:
                # layers += [nn.Conv2d(pyramid_channels, pyramid_channels, 3, padding=1), nn.ReLU()]

            layers += [nn.Conv2d(pyramid_channels, out_size, 3, padding=1)]
            return nn.Sequential(*layers)

        self.ratios = [1.0, 2.0, 0.5]
        self.scales = [4 * 2**(i / 3) for i in range(3)]
        anchors = len(self.ratios) * len(self.scales)  # 9

        self.cls_head = make_head(num_classes * anchors)
        self.box_head = make_head(4 * anchors)
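
The anchor arithmetic above: three octave scales spaced by a factor of 2**(1/3) crossed with three aspect ratios give 9 anchors per spatial location, so the heads output num_classes * 9 and 4 * 9 channels. A quick check:

ratios = [1.0, 2.0, 0.5]
scales = [4 * 2 ** (i / 3) for i in range(3)]  # ~ [4.00, 5.04, 6.35]
assert len(ratios) * len(scales) == 9  # anchors per location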
Example 5
    def __init__(
        self,
        pretrained="coco",  # Not used. here for proper signature
        encoder_name="efficientnet_b0",
        encoder_weights="imagenet",
        pyramid_channels=64,
        num_fpn_layers=3,
        num_head_repeats=3,
        num_classes=90,
        encoder_norm_layer="frozenabn",
        encoder_norm_act="swish",
        decoder_norm_layer="abn",
        decoder_norm_act="swish",
        match_tf_same_padding=False,
        anchors_per_location=9,
        **encoder_params,
    ):
        super().__init__()
        self.encoder = get_encoder(
            encoder_name,
            norm_layer=encoder_norm_layer,
            norm_act=encoder_norm_act,
            encoder_weights=encoder_weights,
            **encoder_params,
        )
        norm_layer = bn_from_name(decoder_norm_layer)
        bn_args = dict(norm_layer=norm_layer, norm_act=decoder_norm_act)
        self.pyramid6 = nn.Sequential(
            conv1x1(self.encoder.out_shapes[0], pyramid_channels, bias=True),
            norm_layer(pyramid_channels, activation="identity"),
            nn.MaxPool2d(3, stride=2, padding=1),
        )
        self.pyramid7 = nn.MaxPool2d(
            3, stride=2, padding=1)  # in EffDet it's a simple maxpool

        self.bifpn = BiFPN(
            # two extra coarse levels (P6, P7) plus the deepest encoder features
            encoder_channels=(pyramid_channels,) * 2 + self.encoder.out_shapes[:-2],
            pyramid_channels=pyramid_channels,
            num_layers=num_fpn_layers,
            **bn_args,
        )

        def make_head():
            # these convs are shared across all levels; the final projection is
            # a separate conv below, so no output size is needed here
            layers = []
            for _ in range(num_head_repeats):
                layers += [
                    DepthwiseSeparableConv(pyramid_channels,
                                           pyramid_channels,
                                           use_norm=False)
                ]
            return nn.ModuleList(layers)

        # The convolution layers in the head are shared among all levels, but
        # each level has its batch normalization to capture the statistical
        # difference among different levels.
        def make_head_norm():
            return nn.ModuleList([
                nn.ModuleList([
                    norm_layer(pyramid_channels, activation=decoder_norm_act)
                    for _ in range(num_head_repeats)
                ]) for _ in range(5)  # one set of norms per pyramid level (P3-P7)
            ])

        self.cls_convs = make_head()
        self.cls_head_conv = DepthwiseSeparableConv(pyramid_channels,
                                                    num_classes *
                                                    anchors_per_location,
                                                    use_norm=False)
        self.cls_norms = make_head_norm()

        self.box_convs = make_head()
        self.box_head_conv = DepthwiseSeparableConv(pyramid_channels,
                                                    4 * anchors_per_location,
                                                    use_norm=False)
        self.box_norms = make_head_norm()

        self.num_classes = num_classes
        self.num_head_repeats = num_head_repeats

        patch_bn_tf(self)  # switch BN defaults to TF-style momentum/eps
        self._initialize_weights()
        if match_tf_same_padding:
            conv_to_same_conv(self)
            maxpool_to_same_maxpool(self)
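
To make the shared-conv / per-level-norm comment above concrete, here is a hedged sketch of how the classification head might be applied at inference time (the features list, its P3-P7 ordering, and this exact loop are assumptions; the real forward pass is not part of this snippet):

# hypothetical fragment of a forward pass, with features = [P3, P4, P5, P6, P7]
cls_outputs = []
for level, feat in enumerate(features):
    for conv, norm in zip(self.cls_convs, self.cls_norms[level]):
        feat = norm(conv(feat))  # shared conv weights, level-specific norm
    cls_outputs.append(self.cls_head_conv(feat))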