Example #1
    def __init__(self, cfg, in_channels):
        super(ResNetConv52MLPFeatureExtractor, self).__init__()

        stage = resnet.StageSpec(index=4, block_count=3, return_features=False)
        head = resnet.ResNetHead(
            block_module=cfg.MODEL.RESNETS.TRANS_FUNC,
            stages=(stage, ),
            num_groups=cfg.MODEL.RESNETS.NUM_GROUPS,
            width_per_group=cfg.MODEL.RESNETS.WIDTH_PER_GROUP,
            stride_in_1x1=cfg.MODEL.RESNETS.STRIDE_IN_1X1,
            stride_init=1,
            res2_out_channels=cfg.MODEL.RESNETS.RES2_OUT_CHANNELS,
            dilation=cfg.MODEL.RESNETS.RES5_DILATION,
        )

        in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2**(stage.index - 1)
        if cfg.MODEL.VID.ROI_BOX_HEAD.REDUCE_CHANNEL:
            new_conv = nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)
            nn.init.kaiming_uniform_(new_conv.weight, a=1)
            nn.init.constant_(new_conv.bias, 0)
            output_channel = 256
        else:
            new_conv = None
            output_channel = in_channels

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )

        self.head = head
        self.conv = new_conv
        self.pooler = pooler

        input_size = output_channel * resolution**2
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
        self.fc6 = make_fc(input_size, representation_size, use_gn)
        self.fc7 = make_fc(representation_size, representation_size, use_gn)

        self.out_channels = representation_size
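
With the default RES2_OUT_CHANNELS = 256 and stage.index = 4, the recomputed in_channels is 256 * 2**3 = 2048 (the usual conv5 output width), which the optional 1x1 conv reduces to 256 before the MLP. The forward pass is not part of this listing; the sketch below shows how the modules built above are typically chained in maskrcnn-benchmark-style extractors (it assumes torch.nn.functional is imported as F, and is an illustration rather than the class's actual method):

    def forward(self, x, proposals):
        x = self.pooler(x, proposals)   # ROI-align proposals out of the feature maps
        x = self.head(x)                # run pooled features through the conv5 stage
        if self.conv is not None:
            x = self.conv(x)            # optional 1x1 channel reduction to 256
        x = x.view(x.size(0), -1)       # flatten to (num_rois, C * resolution**2)
        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))
        return x
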
Example #2
    def __init__(self, cfg, in_channels):
        super(FPN2MLPFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        input_size = in_channels * resolution**2
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
        self.pooler = pooler
        self.fc6 = make_fc(input_size, representation_size, use_gn)
        self.fc7 = make_fc(representation_size, representation_size, use_gn)
        self.out_channels = representation_size
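
With typical FPN settings (in_channels = 256, POOLER_RESOLUTION = 7, MLP_HEAD_DIM = 1024), input_size = 256 * 7**2 = 12544, so fc6 maps 12544 to 1024 and fc7 maps 1024 to 1024. A standalone check of that shape math, using plain nn.Linear in place of make_fc and hypothetical config values:

    import torch
    import torch.nn as nn

    in_channels, resolution, rep_size = 256, 7, 1024  # assumed typical values
    fc6 = nn.Linear(in_channels * resolution ** 2, rep_size)
    fc7 = nn.Linear(rep_size, rep_size)

    rois = torch.randn(8, in_channels, resolution, resolution)  # 8 pooled ROIs
    out = fc7(torch.relu(fc6(rois.view(rois.size(0), -1))))
    print(out.shape)  # torch.Size([8, 1024])
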
Example #3
    def __init__(self, cfg, in_channels):
        super(FPNXconv1fcFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        self.pooler = pooler

        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
        conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM
        num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS
        dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION

        xconvs = []
        for ix in range(num_stacked_convs):
            xconvs.append(
                nn.Conv2d(in_channels,
                          conv_head_dim,
                          kernel_size=3,
                          stride=1,
                          padding=dilation,
                          dilation=dilation,
                          bias=not use_gn))
            in_channels = conv_head_dim
            if use_gn:
                xconvs.append(group_norm(in_channels))
            xconvs.append(nn.ReLU(inplace=True))

        self.add_module("xconvs", nn.Sequential(*xconvs))
        for l in self.xconvs.modules():
            if isinstance(l, nn.Conv2d):
                torch.nn.init.normal_(l.weight, std=0.01)
                if not use_gn:
                    torch.nn.init.constant_(l.bias, 0)

        input_size = conv_head_dim * resolution**2
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
        self.fc6 = make_fc(input_size, representation_size, use_gn=False)
        self.out_channels = representation_size
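
Note that each 3x3 conv uses padding equal to its dilation, so the spatial size of the pooled feature is preserved through the whole conv stack; that is what lets input_size still be conv_head_dim * resolution**2. A small standalone check (the dilation value is hypothetical; any positive value behaves the same):

    import torch
    import torch.nn as nn

    dilation = 2  # hypothetical value for illustration
    conv = nn.Conv2d(256, 256, kernel_size=3, stride=1,
                     padding=dilation, dilation=dilation)
    x = torch.randn(1, 256, 7, 7)
    print(conv(x).shape)  # torch.Size([1, 256, 7, 7]): spatial size unchanged
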
Example #4
    def __init__(self, cfg, in_channels):
        super(MEGAFeatureExtractor, self).__init__(cfg, in_channels)

        stage = resnet.StageSpec(index=4, block_count=3, return_features=False)
        head = resnet.ResNetHead(
            block_module=cfg.MODEL.RESNETS.TRANS_FUNC,
            stages=(stage, ),
            num_groups=cfg.MODEL.RESNETS.NUM_GROUPS,
            width_per_group=cfg.MODEL.RESNETS.WIDTH_PER_GROUP,
            stride_in_1x1=cfg.MODEL.RESNETS.STRIDE_IN_1X1,
            stride_init=1,
            res2_out_channels=cfg.MODEL.RESNETS.RES2_OUT_CHANNELS,
            dilation=cfg.MODEL.RESNETS.RES5_DILATION,
        )

        in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2**(stage.index - 1)
        if cfg.MODEL.VID.ROI_BOX_HEAD.REDUCE_CHANNEL:
            new_conv = nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)
            nn.init.kaiming_uniform_(new_conv.weight, a=1)
            nn.init.constant_(new_conv.bias, 0)
            output_channel = 256
        else:
            new_conv = None
            output_channel = in_channels

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )

        self.head = head
        self.conv = new_conv
        self.pooler = pooler

        input_size = output_channel * resolution**2
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN

        self.all_frame_interval = cfg.MODEL.VID.MEGA.ALL_FRAME_INTERVAL

        if cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.ENABLE:
            self.embed_dim = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.EMBED_DIM
            self.groups = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.GROUP
            self.feat_dim = representation_size

            self.stage = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.STAGE

            self.base_num = cfg.MODEL.VID.RPN.REF_POST_NMS_TOP_N
            self.advanced_num = int(self.base_num * cfg.MODEL.VID.MEGA.RATIO)

            fcs, Wgs, Wqs, Wks, Wvs, us = [], [], [], [], [], []

            for i in range(self.stage):
                r_size = input_size if i == 0 else representation_size
                fcs.append(make_fc(r_size, representation_size, use_gn))
                Wgs.append(
                    Conv2d(self.embed_dim,
                           self.groups,
                           kernel_size=1,
                           stride=1,
                           padding=0))
                Wqs.append(make_fc(self.feat_dim, self.feat_dim))
                Wks.append(make_fc(self.feat_dim, self.feat_dim))
                Wvs.append(
                    Conv2d(self.feat_dim * self.groups,
                           self.feat_dim,
                           kernel_size=1,
                           stride=1,
                           padding=0,
                           groups=self.groups))
                us.append(
                    nn.Parameter(torch.Tensor(self.groups, 1, self.embed_dim)))
                for l in [Wgs[i], Wvs[i]]:
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)
                torch.nn.init.normal_(us[i], std=0.01)

            self.l_fcs = nn.ModuleList(fcs)
            self.l_Wgs = nn.ModuleList(Wgs)
            self.l_Wqs = nn.ModuleList(Wqs)
            self.l_Wks = nn.ModuleList(Wks)
            self.l_Wvs = nn.ModuleList(Wvs)
            self.l_us = nn.ParameterList(us)

        # Long Range Memory
        self.memory_enable = cfg.MODEL.VID.MEGA.MEMORY.ENABLE
        if self.memory_enable:
            self.memory_size = cfg.MODEL.VID.MEGA.MEMORY.SIZE

        # Global Aggregation Stage
        self.global_enable = cfg.MODEL.VID.MEGA.GLOBAL.ENABLE
        if self.global_enable:
            self.global_size = cfg.MODEL.VID.MEGA.GLOBAL.SIZE
            self.global_res_stage = cfg.MODEL.VID.MEGA.GLOBAL.RES_STAGE

            Wqs, Wks, Wvs, us = [], [], [], []

            for i in range(self.global_res_stage + 1):
                Wqs.append(make_fc(self.feat_dim, self.feat_dim))
                Wks.append(make_fc(self.feat_dim, self.feat_dim))
                Wvs.append(
                    Conv2d(self.feat_dim * self.groups,
                           self.feat_dim,
                           kernel_size=1,
                           stride=1,
                           padding=0,
                           groups=self.groups))
                us.append(
                    nn.Parameter(torch.Tensor(self.groups, 1, self.embed_dim)))
                torch.nn.init.normal_(Wvs[i].weight, std=0.01)
                torch.nn.init.constant_(Wvs[i].bias, 0)
                torch.nn.init.normal_(us[i], std=0.01)

            self.g_Wqs = nn.ModuleList(Wqs)
            self.g_Wks = nn.ModuleList(Wks)
            self.g_Wvs = nn.ModuleList(Wvs)
            self.g_us = nn.ParameterList(us)

        self.out_channels = representation_size
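
One detail worth flagging: torch.Tensor(self.groups, 1, self.embed_dim) allocates uninitialized memory, so the normal_ call that follows is what actually gives each u its values; without it the parameter would hold garbage. A standalone illustration of the same pattern (the sizes are hypothetical):

    import torch
    import torch.nn as nn

    groups, embed_dim = 16, 64                             # hypothetical sizes
    u = nn.Parameter(torch.Tensor(groups, 1, embed_dim))   # uninitialized storage
    torch.nn.init.normal_(u, std=0.01)                     # must run before use
    print(round(u.std().item(), 3))                        # roughly 0.01
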
Example #5
    def __init__(self, cfg, in_channels):
        super(RDNFeatureExtractor, self).__init__(cfg, in_channels)

        stage = resnet.StageSpec(index=4, block_count=3, return_features=False)
        head = resnet.ResNetHead(
            block_module=cfg.MODEL.RESNETS.TRANS_FUNC,
            stages=(stage, ),
            num_groups=cfg.MODEL.RESNETS.NUM_GROUPS,
            width_per_group=cfg.MODEL.RESNETS.WIDTH_PER_GROUP,
            stride_in_1x1=cfg.MODEL.RESNETS.STRIDE_IN_1X1,
            stride_init=1,
            res2_out_channels=cfg.MODEL.RESNETS.RES2_OUT_CHANNELS,
            dilation=cfg.MODEL.RESNETS.RES5_DILATION,
        )

        in_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * 2**(stage.index - 1)
        if cfg.MODEL.VID.ROI_BOX_HEAD.REDUCE_CHANNEL:
            new_conv = nn.Conv2d(in_channels, 256, kernel_size=1, stride=1)
            nn.init.kaiming_uniform_(new_conv.weight, a=1)
            nn.init.constant_(new_conv.bias, 0)
            output_channel = 256
        else:
            new_conv = None
            output_channel = in_channels

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )

        self.head = head
        self.conv = new_conv
        self.pooler = pooler

        input_size = output_channel * resolution**2
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN

        if cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.ENABLE:
            self.embed_dim = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.EMBED_DIM
            self.groups = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.GROUP
            self.feat_dim = representation_size

            self.base_stage = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.STAGE
            self.advanced_stage = cfg.MODEL.VID.ROI_BOX_HEAD.ATTENTION.ADVANCED_STAGE

            self.base_num = cfg.MODEL.VID.RPN.REF_POST_NMS_TOP_N
            self.advanced_num = int(self.base_num * cfg.MODEL.VID.RDN.RATIO)

            fcs, Wgs, Wqs, Wks, Wvs = [], [], [], [], []

            for i in range(self.base_stage + self.advanced_stage + 1):
                r_size = input_size if i == 0 else representation_size

                if i == self.base_stage and self.advanced_stage == 0:
                    break

                if i != self.base_stage + self.advanced_stage:
                    fcs.append(make_fc(r_size, representation_size, use_gn))
                Wgs.append(
                    Conv2d(self.embed_dim,
                           self.groups,
                           kernel_size=1,
                           stride=1,
                           padding=0))
                Wqs.append(make_fc(self.feat_dim, self.feat_dim))
                Wks.append(make_fc(self.feat_dim, self.feat_dim))
                Wvs.append(
                    Conv2d(self.feat_dim * self.groups,
                           self.feat_dim,
                           kernel_size=1,
                           stride=1,
                           padding=0,
                           groups=self.groups))
                for l in [Wgs[i], Wvs[i]]:
                    torch.nn.init.normal_(l.weight, std=0.01)
                    torch.nn.init.constant_(l.bias, 0)
            self.fcs = nn.ModuleList(fcs)
            self.Wgs = nn.ModuleList(Wgs)
            self.Wqs = nn.ModuleList(Wqs)
            self.Wks = nn.ModuleList(Wks)
            self.Wvs = nn.ModuleList(Wvs)

        self.out_channels = representation_size
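
The loop builds one attention stage per iteration: every stage gets its Wg/Wq/Wk/Wv weights, the last (advanced) stage skips its fc, and when advanced_stage == 0 the loop breaks before building that extra stage at all. A standalone trace of that control flow with hypothetical stage counts:

    base_stage, advanced_stage = 2, 1  # hypothetical config values
    n_fcs = n_attn = 0
    for i in range(base_stage + advanced_stage + 1):
        if i == base_stage and advanced_stage == 0:
            break
        if i != base_stage + advanced_stage:
            n_fcs += 1
        n_attn += 1
    print(n_fcs, n_attn)  # 3 4: three fc layers feed four attention stages
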