Example #1
    def __init__(self, level, norm, dims=[512, 256, 256], rfb=False):
        super(ASFF, self).__init__()
        conv_bn_relu = conv_with_kaiming_uniform(norm, activation=True)
        self.level = level
        # Channels of the three input feature levels; adjust to match the actual backbone.
        self.dim = dims
        self.inter_dim = self.dim[self.level]
        # At each level, the three inputs must be projected to the same number of output channels.
        if level == 0:
            self.stride_level_1 = conv_bn_relu(self.dim[1], self.inter_dim, 3,
                                               2)
            self.stride_level_2 = conv_bn_relu(self.dim[2], self.inter_dim, 3,
                                               2)
            self.expand = conv_bn_relu(self.inter_dim, 1024, 3, 1)
        elif level == 1:
            self.compress_level_0 = conv_bn_relu(self.dim[0], self.inter_dim,
                                                 1, 1)
            self.stride_level_2 = conv_bn_relu(self.dim[2], self.inter_dim, 3,
                                               2)
            self.expand = conv_bn_relu(self.inter_dim, 512, 3, 1)
        elif level == 2:
            self.compress_level_0 = conv_bn_relu(self.dim[0], self.inter_dim,
                                                 1, 1)
            if self.dim[1] != self.dim[2]:
                self.compress_level_1 = conv_bn_relu(self.dim[1],
                                                     self.inter_dim, 1, 1)
            self.expand = conv_bn_relu(self.inter_dim, 256, 3, 1)
        compress_c = 8 if rfb else 16
        self.weight_level_0 = conv_bn_relu(self.inter_dim, compress_c, 1, 1)
        self.weight_level_1 = conv_bn_relu(self.inter_dim, compress_c, 1, 1)
        self.weight_level_2 = conv_bn_relu(self.inter_dim, compress_c, 1, 1)

        self.weight_levels = nn.Conv2d(compress_c * 3, 3, 1, 1, 0)
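For reference, a minimal sketch of the fusion step this constructor sets up, assuming the standard ASFF forward pass (resize the three levels to the target resolution, compute per-pixel softmax weights, and fuse). Only the level == 0 branch is shown; `torch` and `F = torch.nn.functional` are assumed to be imported as in the original repository, and the exact resize logic here may differ.

    def forward(self, x_level_0, x_level_1, x_level_2):
        # Level-0 branch only; levels 1 and 2 resize their inputs analogously.
        level_0_resized = x_level_0
        level_1_resized = self.stride_level_1(x_level_1)
        level_2_resized = self.stride_level_2(
            F.max_pool2d(x_level_2, 3, stride=2, padding=1))

        # Compress each level to `compress_c` channels, concatenate, and turn the
        # result into three per-pixel fusion weights with a 1x1 conv + softmax.
        w_0 = self.weight_level_0(level_0_resized)
        w_1 = self.weight_level_1(level_1_resized)
        w_2 = self.weight_level_2(level_2_resized)
        weights = F.softmax(
            self.weight_levels(torch.cat([w_0, w_1, w_2], dim=1)), dim=1)

        fused = (level_0_resized * weights[:, 0:1] +
                 level_1_resized * weights[:, 1:2] +
                 level_2_resized * weights[:, 2:3])
        return self.expand(fused)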
Example #2
    def __init__(self, cfg, disable_rel_coords=False):
        super().__init__()
        self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.IUVHead.NORM
        num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
        channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
        soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
        self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
        self.in_channels = channels + 2
        self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.disable_rel_coords = disable_rel_coords

        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        tower = []
        tower.append(conv_block(
            self.in_channels, channels, 3, 1
        ))
        for i in range(1, num_convs):
            tower.append(conv_block(
                channels, channels, 3, 1
            ))
        tower.append(nn.Conv2d(
            channels, max(self.num_outputs, 1), 1
        ))
        self.add_module('tower', nn.Sequential(*tower))
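A minimal sketch of how a tower like this is typically applied in CondInst-style heads, given that self.in_channels = channels + 2: two relative-coordinate channels are concatenated to the features before the tower. `mask_feats` and `rel_coords` are hypothetical names, not necessarily the repository's actual forward signature.

    def forward(self, mask_feats, rel_coords):
        # rel_coords: (N, 2, H, W) instance-relative coordinates;
        # mask_feats: (N, channels, H, W) features from the mask branch.
        x = torch.cat([rel_coords, mask_feats], dim=1)  # (N, channels + 2, H, W)
        return self.tower(x)                            # (N, num_outputs, H, W)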
Example #3
    def __init__(self, cfg):
        super().__init__()
        self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.IUVHead.NORM
        num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
        channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
        self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE

        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        tower = []
        for i in range(num_convs):
            tower.append(conv_block(channels, channels, 3, 1))
        tower.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
        self.add_module('tower', nn.Sequential(*tower))
Example #4
    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super().__init__()
        self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
        self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
        self.num_outputs = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
        num_convs = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS
        channels = cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS
        self.out_stride = input_shape[self.in_features[0]].stride

        feature_channels = {k: v.channels for k, v in input_shape.items()}

        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        self.refine = nn.ModuleList()
        for in_feature in self.in_features:
            self.refine.append(conv_block(
                feature_channels[in_feature],
                channels, 3, 1
            ))

        tower = []
        for i in range(num_convs):
            tower.append(conv_block(
                channels, channels, 3, 1
            ))
        tower.append(nn.Conv2d(
            channels, max(self.num_outputs, 1), 1
        ))
        self.add_module('tower', nn.Sequential(*tower))

        # pdb.set_trace()
        if self.sem_loss_on:
            num_classes = cfg.MODEL.FCOS.NUM_CLASSES
            self.focal_loss_alpha = cfg.MODEL.FCOS.LOSS_ALPHA
            self.focal_loss_gamma = cfg.MODEL.FCOS.LOSS_GAMMA

            in_channels = feature_channels[self.in_features[0]]
            self.seg_head = nn.Sequential(
                conv_block(in_channels, channels, kernel_size=3, stride=1),
                conv_block(channels, channels, kernel_size=3, stride=1)
            )

            self.logits = nn.Conv2d(channels, num_classes, kernel_size=1, stride=1)

            prior_prob = cfg.MODEL.FCOS.PRIOR_PROB
            bias_value = -math.log((1 - prior_prob) / prior_prob)
            torch.nn.init.constant_(self.logits.bias, bias_value)
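The refine list plus tower layout above follows the CondInst mask-branch pattern: each input level is projected to a common channel count, upsampled to the resolution of the first level, summed, and passed through the tower. A minimal sketch under that assumption; the repository likely uses its own aligned upsampling helper, and `features` is a hypothetical dict of FPN outputs keyed by feature name.

    def forward(self, features):
        # Project each selected level to `channels`, upsample to the first
        # level's resolution, and accumulate.
        for i, f in enumerate(self.in_features):
            p = self.refine[i](features[f])
            if i == 0:
                x = p
            else:
                x = x + F.interpolate(p, size=x.shape[-2:], mode="bilinear",
                                      align_corners=False)
        return self.tower(x)  # (N, num_outputs, H, W) at self.out_stride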
Example #5
    def __init__(self, cfg, use_rel_coords=True):
        super().__init__()
        self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.IUVHead.NORM
        num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
        num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
        lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
        num_dcn_layer = cfg.MODEL.CONDINST.IUVHead.NUM_DCN_LAYER
        assert num_lambda_layer <= num_convs

        agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
        channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
        self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
        soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
        self.register_buffer("sizes_of_interest", torch.tensor(soi + [soi[-1] * 2]))
        self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
        self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
        self.use_down_up_sampling = cfg.MODEL.CONDINST.IUVHead.DOWN_UP_SAMPLING
        self.use_partial_conv = cfg.MODEL.CONDINST.IUVHead.PARTIAL_CONV
        self.use_partial_norm = cfg.MODEL.CONDINST.IUVHead.PARTIAL_NORM
        # pdb.set_trace()
        # if self.use_rel_coords:
        #     self.in_channels = channels + 2
        # else:
        self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
        self.use_pos_emb = self.pos_emb_num_freqs > 0
        if self.use_pos_emb:
            self.position_embedder, self.position_emb_dim = get_embedder(multires=self.pos_emb_num_freqs, input_dims=2)
            self.in_channels = agg_channels + self.position_emb_dim
        else:
            self.in_channels = agg_channels + 2

        if self.use_abs_coords:
            if self.use_pos_emb:
                self.in_channels += self.position_emb_dim
            else:
                self.in_channels += 2


        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        partial_conv_block = conv_with_kaiming_uniform(norm, activation=True, use_partial_conv=True)
        deform_conv_block = conv_with_kaiming_uniform(norm, activation=True, use_deformable=True)

        tower = []
        if self.use_partial_conv:
            # pdb.set_trace()
            layer = partial_conv_block(self.in_channels, channels, 3, 1)
            tower.append(layer)
            self.in_channels = channels

        if num_lambda_layer > 0:
            layer = LambdaLayer(
                dim=self.in_channels,
                dim_out=channels,
                r=lambda_layer_r,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=16,
                heads=4,
                dim_u=4
            )
            tower.append(layer)
        else:
            tower.append(conv_block(
                self.in_channels, channels, 3, 1
            ))
        if num_dcn_layer > 0:
            tower.append(deform_conv_block(
                channels, channels, 3, 1
            ))

        if self.use_down_up_sampling:
            for i in range(1, num_convs):
                if i == 1:
                    tower.append(conv_block(
                        channels, channels * 2, 3, 2
                    ))
                else:
                    tower.append(conv_block(
                        channels * 2, channels * 2, 3, 1
                    ))

            tower.append(ConvTranspose2d(
                channels * 2, self.num_outputs, 4, stride=2, padding=1
            ))
        else:
            for i in range(1, num_convs):
                tower.append(conv_block(
                    channels, channels, 3, 1
                ))
            tower.append(nn.Conv2d(
                channels, max(self.num_outputs, 1), 1
            ))

        self.add_module('tower', nn.Sequential(*tower))
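The channel arithmetic above (in_channels = agg_channels + position_emb_dim) assumes get_embedder returns a NeRF-style frequency embedding of the 2-D coordinates together with its output dimensionality. A self-contained sketch of such an embedder follows; the repository's actual helper may differ in details (e.g. whether the raw input or a pi factor is included).

import math
import torch

def get_embedder(multires, input_dims=2):
    # Frequencies 2^0 ... 2^(multires-1); each contributes a sin and a cos term.
    freqs = [2.0 ** k for k in range(multires)]
    out_dim = input_dims * (1 + 2 * multires)  # identity + sin/cos per frequency

    def embed(coords):  # coords: (..., input_dims)
        parts = [coords]
        for f in freqs:
            parts.append(torch.sin(coords * f * math.pi))
            parts.append(torch.cos(coords * f * math.pi))
        return torch.cat(parts, dim=-1)  # (..., out_dim)

    return embed, out_dim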
Example #6
    def __init__(self, cfg, use_rel_coords=True):
        super().__init__()
        self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.IUVHead.NORM
        num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
        num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
        lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
        assert num_lambda_layer <= num_convs

        agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
        channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
        self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
        soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
        self.register_buffer("sizes_of_interest",
                             torch.tensor(soi + [soi[-1] * 2]))
        self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
        self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
        # pdb.set_trace()
        # if self.use_rel_coords:
        #     self.in_channels = channels + 2
        # else:
        self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
        self.use_pos_emb = self.pos_emb_num_freqs > 0
        extra_channels = 0
        if self.use_pos_emb:
            self.position_embedder, self.position_emb_dim = get_embedder(
                multires=self.pos_emb_num_freqs, input_dims=2)
            extra_channels += self.position_emb_dim
        else:
            extra_channels += 2

        if self.use_abs_coords:
            if self.use_pos_emb:
                extra_channels += self.position_emb_dim
            else:
                extra_channels += 2

        # pdb.set_trace()
        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        cnt = 0
        self.layers = []
        if num_lambda_layer > 0:
            layer = LambdaLayer(
                dim=agg_channels + extra_channels,
                dim_out=channels,
                r=lambda_layer_r,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=16,
                heads=4,
                dim_u=4)
        else:
            # First layer consumes the aggregated mask-branch features plus the coordinate channels.
            layer = conv_block(agg_channels + extra_channels, channels, 3, 1)
        setattr(self, 'layer_{}'.format(cnt), layer)
        self.layers.append(layer)
        cnt += 1

        for i in range(1, num_convs):
            if i < num_lambda_layer:
                layer = LambdaLayer(
                    dim=channels + extra_channels,
                    dim_out=channels,
                    r=lambda_layer_r,  # the receptive field for relative positional encoding (23 x 23)
                    dim_k=16,
                    heads=4,
                    dim_u=4)
            else:
                layer = conv_block(channels + extra_channels, channels, 3, 1)
            setattr(self, 'layer_{}'.format(cnt), layer)
            self.layers.append(layer)
            cnt += 1

        layer = nn.Conv2d(channels + extra_channels, max(self.num_outputs, 1),
                          1)
        setattr(self, 'layer_{}'.format(cnt), layer)
        self.layers.append(layer)
Example #7
    def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]):
        super().__init__()
        self.in_features = cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES
        self.sem_loss_on = cfg.MODEL.CONDINST.MASK_BRANCH.SEMANTIC_LOSS_ON
        self.num_outputs = cfg.MODEL.CONDINST.MASK_BRANCH.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.MASK_BRANCH.NORM
        num_convs = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_CONVS
        agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
        channels = cfg.MODEL.CONDINST.MASK_BRANCH.CHANNELS
        self.out_stride = input_shape[
            cfg.MODEL.CONDINST.MASK_BRANCH.IN_FEATURES[0]].stride
        # pdb.set_trace()
        # self.num_lambda_layer = cfg.MODEL.CONDINST.MASK_BRANCH.NUM_LAMBDA_LAYER
        self.use_aspp = cfg.MODEL.CONDINST.MASK_BRANCH.USE_ASPP
        self.use_san = cfg.MODEL.CONDINST.MASK_BRANCH.USE_SAN
        self.san_type = cfg.MODEL.CONDINST.SAN_TYPE
        self.use_attn = cfg.MODEL.CONDINST.MASK_BRANCH.USE_ATTN
        self.attn_type = cfg.MODEL.CONDINST.ATTN_TYPE
        # lambda_layer_r = cfg.MODEL.CONDINST.MASK_BRANCH.LAMBDA_LAYER_R
        self.checkpoint_grad_num = cfg.MODEL.CONDINST.CHECKPOINT_GRAD_NUM
        self.v2 = cfg.MODEL.CONDINST.v2
        self.use_res_input = cfg.MODEL.CONDINST.MASK_BRANCH.RESIDUAL_INPUT
        self.use_res_after_relu = cfg.MODEL.CONDINST.MASK_BRANCH.RESIDUAL_SKIP_AFTER_RELU

        self.use_agg_feat = cfg.MODEL.CONDINST.IUVHead.USE_AGG_FEATURES
        if self.use_agg_feat:
            self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES

        self.use_weight_std = cfg.MODEL.CONDINST.IUVHead.WEIGHT_STANDARDIZATION
        self.use_eca = cfg.MODEL.CONDINST.IUVHead.Efficient_Channel_Attention
        # self.use_tree_filter = cfg.MODEL.CONDINST.MASK_BRANCH.TREE_FILTER
        self.tf_embed_dim = cfg.MODEL.CONDINST.MASK_BRANCH.TREE_FILTER_EMBED_DIM
        self.tf_group_num = cfg.MODEL.CONDINST.MASK_BRANCH.TREE_FILTER_GROUP_NUM

        self.add_skeleton_feat = cfg.MODEL.CONDINST.IUVHead.SKELETON_FEATURES

        feature_channels = {k: v.channels for k, v in input_shape.items()}

        conv_block_no_act = conv_with_kaiming_uniform(
            norm, activation=False, use_weight_std=self.use_weight_std)
        conv_block = conv_with_kaiming_uniform(
            norm, activation=True, use_weight_std=self.use_weight_std)

        self.use_decoder = False
        # self.use_decoder           = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON
        # if self.use_decoder:
        #     self.decoder = Decoder(cfg, input_shape, self.in_features)
        #     assert agg_channels==cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS
        # else:
        self.refine = nn.ModuleList()
        self.tf = nn.ModuleList()
        for idx, in_feature in enumerate(self.in_features):

            # if num_lambda_layer>=len(self.in_features)-idx:
            #     layer = LambdaLayer(
            #         dim = feature_channels[in_feature],
            #         dim_out = agg_channels,
            #         r = lambda_layer_r,         # the receptive field for relative positional encoding (23 x 23)
            #         dim_k = 16,
            #         heads = 4,
            #         dim_u = 4
            #     )
            #     self.refine.append(layer)
            # else:

            # pdb.set_trace()
            # self.ASFF = ASFF(level=2, norm=norm, dims=[256, 256, 256], rfb=False)
            # self.ASFF(x_level_0, x_level_1, x_level_2):

            # if self.v2 and idx>0 and in_feature not in ["p6","p7"]:
            if idx > 0 and in_feature not in ["p6", "p7"]:
                if self.add_skeleton_feat:
                    self.refine.append(
                        nn.Sequential(*[
                            conv_block_no_act(feature_channels[in_feature],
                                              agg_channels, 3, 1),
                            nn.Upsample(scale_factor=2**idx)
                        ]))
                else:
                    self.refine.append(
                        nn.Sequential(*[
                            conv_block(feature_channels[in_feature],
                                       agg_channels, 3, 1),
                            nn.Upsample(scale_factor=2**idx)
                        ]))

                    # aligned_bilinear_layer(
                    #     factor=2**idx
                    # ),
                # if self.use_tree_filter:
                #     self.tf.append(TreeFilterV2_layer(agg_channels,
                #                                     feature_channels[self.in_features[0]],
                #                                     embed_dim=self.tf_embed_dim,
                #                                     num_groups=self.tf_group_num))
            else:
                self.refine.append(
                    conv_block(feature_channels[in_feature], agg_channels, 3,
                               1))

        if self.add_skeleton_feat:
            self.conv_skeleton = conv_block(agg_channels + 55, agg_channels, 3,
                                            1)

        if self.use_eca:
            self.eca = eca_layer(agg_channels, k_size=3)

        if self.use_aspp:
            # self.ASPP = ASPP_share(agg_channels, [1,2,3], agg_channels)  # 6, 12, 56
            self.ASPP = ASPP_share_attn(agg_channels, [1, 2, 3],
                                        agg_channels)  # 6, 12, 56

            self.add_module("ASPP", self.ASPP)

        # if self.num_lambda_layer>0:
        #     self.lambda_layer = LambdaLayer(
        #         dim = agg_channels,
        #         dim_out = agg_channels,
        #         r = lambda_layer_r,         # the receptive field for relative positional encoding (23 x 23)
        #         dim_k = 16,
        #         heads = 4,
        #         dim_u = 4
        #     )

        if self.use_san:
            sa_type = 1  # 0: pairwise; 1: patchwise
            if self.san_type == "SAN_BottleneckGN":
                san_func = SAN_BottleneckGN
            elif self.san_type == "SAN_BottleneckGN_GatedEarly":
                san_func = SAN_BottleneckGN_GatedEarly
            elif self.san_type == "SAN_BottleneckGN_Gated":
                SAN_BottleneckGN_Gated
            self.san_blks = []
            for idx in range(len(self.in_features)):
                san_blk = san_func(sa_type,
                                   agg_channels,
                                   agg_channels // 16,
                                   agg_channels // 4,
                                   agg_channels,
                                   8,
                                   kernel_size=7,
                                   stride=1)
                self.add_module("san_blk_{}".format(idx), san_blk)
                self.san_blks.append(san_blk)

        if self.use_attn:
            ks = 7
            if self.attn_type == "Spatial_Attn":  # SpatialMaxAvg_Attn, SpatialMaxAvg_ChannelMaxAvg_Attn
                ch_in = sum([feature_channels[k] for k in self.in_features])
                ch_out = len(self.in_features)
                self.attn_blk = nn.Sequential(*[
                    nn.Conv2d(ch_in,
                              ch_out,
                              kernel_size=ks,
                              stride=1,
                              padding=ks // 2,
                              bias=False),
                    nn.Softmax(dim=1)
                ])
            elif self.attn_type == "SpatialMaxAvg_Attn":
                ch_in = len(self.in_features) * 2
                ch_out = len(self.in_features)
                self.attn_blk = nn.Sequential(*[
                    nn.Conv2d(ch_in,
                              ch_out,
                              kernel_size=ks,
                              stride=1,
                              padding=ks // 2,
                              bias=False),
                    nn.Softmax(dim=1)
                ])
            elif self.attn_type == "SpatialMaxAvg_ChannelMaxAvg_Attn":
                ch_in = len(self.in_features) * 2
                ch_out = len(self.in_features)
                self.attn_blk = nn.Sequential(*[
                    nn.Conv2d(ch_in,
                              ch_out,
                              kernel_size=ks,
                              stride=1,
                              padding=ks // 2,
                              bias=False),
                    nn.Softmax(dim=1)
                ])
                "todo channel attn"
                self.ch_attn_max_list = []
                self.ch_attn_avg_list = []
                reduct_ratio = 16
                for idx, key in enumerate(self.in_features):
                    ch_attn_max = nn.Sequential(*[
                        nn.Linear(feature_channels[key],
                                  feature_channels[key] // 16),
                        nn.ReLU(inplace=True),
                        nn.Linear(feature_channels[key] //
                                  16, feature_channels[key]),
                    ])
                    self.add_module("ch_attn_max_{}".format(idx), ch_attn_max)
                    self.ch_attn_max_list.append(ch_attn_max)
                    #
                    ch_attn_avg = nn.Sequential(*[
                        nn.Linear(feature_channels[key],
                                  feature_channels[key] // 16),
                        nn.ReLU(inplace=True),
                        nn.Linear(feature_channels[key] //
                                  16, feature_channels[key]),
                    ])
                    self.add_module("ch_attn_avg_{}".format(idx), ch_attn_avg)
                    self.ch_attn_avg_list.append(ch_attn_avg)

            # agg_channels = channels
        # if "p1" == self.in_features[0]:
        #     self.down_conv = conv_block(
        #         channels, channels, 3, 2, 1
        #     )
        if "p2" == self.in_features[0]:
            # if self.v2:
            # if self.add_skeleton_feat:
            #     tower = [conv_block(
            #             agg_channels+55, channels, 3, 2, 1
            #         )]
            # else:
            tower = [conv_block(agg_channels, channels, 3, 2, 1)]
            # else:
            #     self.down_conv = conv_block(
            #         agg_channels, channels, 3, 2, 1
            #     )
            #     tower = [conv_block(
            #             channels, channels, 3, 1
            #         )]
        else:
            tower = [conv_block(agg_channels, channels, 3, 1)]
        for i in range(1, num_convs):
            tower.append(conv_block(channels, channels, 3, 1))
        tower.append(nn.Conv2d(channels, max(self.num_outputs, 1), 1))
        if self.use_res_input or self.use_res_after_relu:
            for idx, layer in enumerate(tower):
                self.add_module('tower_layer{}'.format(idx), layer)
            self.tower = tower
        else:
            self.add_module('tower', nn.Sequential(*tower))
Example #8
    def __init__(self, cfg, use_rel_coords=True):
        super().__init__()
        self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.IUVHead.NORM
        num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
        num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
        lambda_layer_r = cfg.MODEL.CONDINST.IUVHead.LAMBDA_LAYER_R
        assert num_lambda_layer <= num_convs

        agg_channels = cfg.MODEL.CONDINST.MASK_BRANCH.AGG_CHANNELS
        channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
        self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
        soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
        self.register_buffer("sizes_of_interest",
                             torch.tensor(soi + [soi[-1] * 2]))
        self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
        self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
        self.use_partial_conv = cfg.MODEL.CONDINST.IUVHead.PARTIAL_CONV
        self.use_partial_norm = cfg.MODEL.CONDINST.IUVHead.PARTIAL_NORM
        # pdb.set_trace()
        # if self.use_rel_coords:
        #     self.in_channels = channels + 2
        # else:
        self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
        self.use_pos_emb = self.pos_emb_num_freqs > 0

        if self.use_pos_emb:
            self.position_embedder, self.position_emb_dim = get_embedder(
                multires=self.pos_emb_num_freqs, input_dims=2)
            self.in_channels = agg_channels + self.position_emb_dim
        else:
            self.in_channels = agg_channels + 2

        if self.use_abs_coords:
            if self.use_pos_emb:
                self.in_channels += self.position_emb_dim
            else:
                self.in_channels += 2

        if self.use_partial_conv:
            conv_block = conv_with_kaiming_uniform(norm,
                                                   activation=True,
                                                   use_partial_conv=True)
        else:
            conv_block = conv_with_kaiming_uniform(norm, activation=True)
            # pdb.set_trace()
        conv_block_bn = conv_with_kaiming_uniform("BN", activation=True)

        # tower_attn = []
        # tower_attn.append(conv_block_bn(
        #     self.position_emb_dim, 32, 3, 1
        # ))
        # tower_attn.append(nn.Conv2d(
        #     32, 3, 3, stride=1, padding=1
        # ))
        # self.add_module('tower_attn', nn.Sequential(*tower_attn))

        num_layer = 3

        tower0 = []
        if num_lambda_layer > 0:
            layer = LambdaLayer(
                dim=self.in_channels,
                dim_out=channels,
                r=lambda_layer_r,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=8,
                heads=4,
                dim_u=4)
            tower0.append(layer)
        else:
            tower0.append(conv_block(self.in_channels, channels, 3, 1))
        for i in range(num_layer):
            tower0.append(conv_block(channels, channels, 3, 1))
        self.add_module('tower0', nn.Sequential(*tower0))

        tower1 = []
        if num_lambda_layer > 0:
            layer = LambdaLayer(
                dim=self.in_channels,
                dim_out=channels,
                r=lambda_layer_r,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=8,
                heads=4,
                dim_u=4)
            tower1.append(layer)
        else:
            tower1.append(conv_block(self.in_channels, channels, 3, 1))
        for i in range(num_layer):
            tower1.append(conv_block(channels, channels, 3, 1))
        self.add_module('tower1', nn.Sequential(*tower1))

        tower2 = []
        if num_lambda_layer > 0:
            layer = LambdaLayer(
                dim=self.in_channels,
                dim_out=channels,
                r=lambda_layer_r,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=8,
                heads=4,
                dim_u=4)
            tower2.append(layer)
        else:
            tower2.append(conv_block(self.in_channels, channels, 3, 1))
        for i in range(num_layer):
            tower2.append(conv_block(channels, channels, 3, 1))
        self.add_module('tower2', nn.Sequential(*tower2))

        tower_out = []
        for i in range(num_convs - num_layer - 1):
            if i == 0:
                tower_out.append(conv_block(channels * 3, channels, 1, 1))
            else:
                tower_out.append(conv_block(channels, channels, 3, 1))
        self.add_module('tower_out', nn.Sequential(*tower_out))
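Since tower_out opens with a 1x1 conv over channels * 3, the three parallel towers are presumably run on the same assembled input and their outputs concatenated along the channel dimension before fusion. A minimal sketch under that assumption; `x` stands for the per-pixel input built from the aggregated features plus coordinate/positional channels, and any final prediction head is outside this constructor.

    def forward(self, x):
        # Run the three parallel towers on the same input and fuse with tower_out.
        y = torch.cat([self.tower0(x), self.tower1(x), self.tower2(x)], dim=1)
        return self.tower_out(y)  # (N, channels, H, W); a prediction layer would follow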
Example #9
    def __init__(self, cfg, use_rel_coords=True):
        super().__init__()
        self.num_outputs = cfg.MODEL.CONDINST.IUVHead.OUT_CHANNELS
        norm = cfg.MODEL.CONDINST.IUVHead.NORM
        num_convs = cfg.MODEL.CONDINST.IUVHead.NUM_CONVS
        num_lambda_layer = cfg.MODEL.CONDINST.IUVHead.NUM_LAMBDA_LAYER
        assert num_lambda_layer <= num_convs
        channels = cfg.MODEL.CONDINST.IUVHead.CHANNELS
        self.norm_feat = cfg.MODEL.CONDINST.IUVHead.NORM_FEATURES
        soi = cfg.MODEL.FCOS.SIZES_OF_INTEREST
        self.register_buffer("sizes_of_interest",
                             torch.tensor(soi + [soi[-1] * 2]))
        self.iuv_out_stride = cfg.MODEL.CONDINST.MASK_OUT_STRIDE
        self.use_rel_coords = cfg.MODEL.CONDINST.IUVHead.REL_COORDS
        self.use_abs_coords = cfg.MODEL.CONDINST.IUVHead.ABS_COORDS
        # pdb.set_trace()
        # if self.use_rel_coords:
        #     self.in_channels = channels + 2
        # else:
        self.pos_emb_num_freqs = cfg.MODEL.CONDINST.IUVHead.POSE_EMBEDDING_NUM_FREQS
        self.use_pos_emb = self.pos_emb_num_freqs > 0
        if self.use_pos_emb:
            self.position_embedder, self.position_emb_dim = get_embedder(
                multires=self.pos_emb_num_freqs, input_dims=2)
            self.in_channels = channels + self.position_emb_dim
        else:
            self.in_channels = channels + 2

        if self.use_abs_coords:
            if self.use_pos_emb:
                self.in_channels += self.position_emb_dim
            else:
                self.in_channels += 2

        conv_block = conv_with_kaiming_uniform(norm, activation=True)

        tower = []
        if num_lambda_layer > 0:
            layer = LambdaLayer(
                dim=self.in_channels,
                dim_out=channels,
                r=23,  # the receptive field for relative positional encoding (23 x 23)
                dim_k=16,
                heads=4,
                dim_u=4)
            tower.append(layer)
        else:
            tower.append(conv_block(self.in_channels, channels, 3, 1))

        for i in range(1, num_convs - 1):
            if i < num_lambda_layer:
                layer = LambdaLayer(
                    dim=channels,
                    dim_out=channels,
                    r=23,  # the receptive field for relative positional encoding (23 x 23)
                    dim_k=16,
                    heads=4,
                    dim_u=4)
                tower.append(layer)
            else:
                tower.append(conv_block(channels, channels, 3, 1))

        self.add_module('tower', nn.Sequential(*tower))

        self.mid_res_conv = conv_block(channels, channels, 3, 1)
        self.mid_res_out = nn.Conv2d(channels, self.num_outputs, 1)

        self.low_res_conv = conv_block(channels, channels, 3, 2)
        self.low_res_out = nn.Conv2d(channels, self.num_outputs, 1)

        deconv_block = conv_with_kaiming_uniform(norm,
                                                 activation=True,
                                                 use_deconv=True)
        self.high_res_conv = deconv_block(channels, channels, 3, 2)
        self.high_res_out = nn.Conv2d(channels, self.num_outputs, 1)
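A minimal sketch of the three-resolution prediction this constructor suggests: the shared tower produces features that are decoded at the input stride, at half resolution via the strided conv, and at double resolution via the deconv block. How the repository actually combines or supervises these outputs is not visible here.

    def forward(self, x):
        feats = self.tower(x)
        mid = self.mid_res_out(self.mid_res_conv(feats))     # same resolution as x
        low = self.low_res_out(self.low_res_conv(feats))     # stride-2 conv: 1/2 resolution
        high = self.high_res_out(self.high_res_conv(feats))  # deconv: 2x resolution
        return high, mid, low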