def __init__(self,
             num_classes,
             in_channels,
             bbox_coder,
             num_decoder_layers,
             transformerlayers,
             decoder_self_posembeds=dict(
                 type='ConvBNPositionalEncoding',
                 input_channel=6,
                 num_pos_feats=288),
             decoder_cross_posembeds=dict(
                 type='ConvBNPositionalEncoding',
                 input_channel=3,
                 num_pos_feats=288),
             train_cfg=None,
             test_cfg=None,
             num_proposal=128,
             pred_layer_cfg=None,
             size_cls_agnostic=True,
             gt_per_seed=3,
             sampling_objectness_loss=None,
             objectness_loss=None,
             center_loss=None,
             dir_class_loss=None,
             dir_res_loss=None,
             size_class_loss=None,
             size_res_loss=None,
             size_reg_loss=None,
             semantic_loss=None,
             init_cfg=None):
    """Build the GroupFree3D bbox head.

    Sets up, in order: plain config attributes, the stack of
    ``num_decoder_layers`` transformer decoder layers, the bbox coder,
    the initial candidate-sampling modules, the per-layer query/key
    positional embeddings and prediction heads, and the loss functions.

    Note:
        ``decoder_self_posembeds['num_pos_feats']`` and
        ``decoder_cross_posembeds['num_pos_feats']`` must both equal the
        decoder layers' ``embed_dims`` (asserted below).
    """
    super(GroupFree3DHead, self).__init__(init_cfg=init_cfg)
    self.num_classes = num_classes
    self.train_cfg = train_cfg
    self.test_cfg = test_cfg
    self.num_proposal = num_proposal
    self.in_channels = in_channels
    self.num_decoder_layers = num_decoder_layers
    self.size_cls_agnostic = size_cls_agnostic
    self.gt_per_seed = gt_per_seed

    # Transformer decoder layers: a single ConfigDict is replicated once
    # per layer; a list must already have exactly one cfg per layer.
    if isinstance(transformerlayers, ConfigDict):
        transformerlayers = [
            copy.deepcopy(transformerlayers)
            for _ in range(num_decoder_layers)
        ]
    else:
        assert isinstance(transformerlayers, list) and \
            len(transformerlayers) == num_decoder_layers
    self.decoder_layers = nn.ModuleList(
        build_transformer_layer(layer_cfg)
        for layer_cfg in transformerlayers)
    self.embed_dims = self.decoder_layers[0].embed_dims
    # Positional-embedding widths must match the decoder feature width.
    assert self.embed_dims == decoder_self_posembeds['num_pos_feats']
    assert self.embed_dims == decoder_cross_posembeds['num_pos_feats']

    # bbox_coder
    self.bbox_coder = build_bbox_coder(bbox_coder)
    self.num_sizes = self.bbox_coder.num_sizes
    self.num_dir_bins = self.bbox_coder.num_dir_bins

    # Initial object candidate sampling.
    self.gsample_module = GeneralSamplingModule()
    self.fps_module = Points_Sampler([self.num_proposal])
    self.points_obj_cls = PointsObjClsModule(self.in_channels)
    self.fp16_enabled = False

    # Initial candidate prediction head.
    self.conv_pred = BaseConvBboxHead(
        **pred_layer_cfg,
        num_cls_out_channels=self._get_cls_out_channels(),
        num_reg_out_channels=self._get_reg_out_channels())

    # 1x1 conv projections for decoder queries and keys.
    self.decoder_query_proj = nn.Conv1d(
        self.embed_dims, self.embed_dims, kernel_size=1)
    self.decoder_key_proj = nn.Conv1d(
        self.embed_dims, self.embed_dims, kernel_size=1)

    # One query (self) position embedding per decoder layer.
    self.decoder_self_posembeds = nn.ModuleList(
        build_positional_encoding(decoder_self_posembeds)
        for _ in range(self.num_decoder_layers))

    # One key (cross) position embedding per decoder layer.
    self.decoder_cross_posembeds = nn.ModuleList(
        build_positional_encoding(decoder_cross_posembeds)
        for _ in range(self.num_decoder_layers))

    # One bbox prediction head per decoder layer.
    self.prediction_heads = nn.ModuleList(
        BaseConvBboxHead(
            **pred_layer_cfg,
            num_cls_out_channels=self._get_cls_out_channels(),
            num_reg_out_channels=self._get_reg_out_channels())
        for _ in range(self.num_decoder_layers))

    # Loss functions.
    self.sampling_objectness_loss = build_loss(sampling_objectness_loss)
    self.objectness_loss = build_loss(objectness_loss)
    self.center_loss = build_loss(center_loss)
    self.dir_res_loss = build_loss(dir_res_loss)
    self.dir_class_loss = build_loss(dir_class_loss)
    self.semantic_loss = build_loss(semantic_loss)
    # Size losses: a single regression loss when class-agnostic,
    # otherwise a per-class residual loss plus a size-class loss.
    if self.size_cls_agnostic:
        self.size_reg_loss = build_loss(size_reg_loss)
    else:
        self.size_res_loss = build_loss(size_res_loss)
        self.size_class_loss = build_loss(size_class_loss)
def __init__(self,
             num_classes=150,
             num_ffn_fcs=2,
             num_heads=8,
             num_mask_fcs=3,
             feedforward_channels=2048,
             in_channels=256,
             out_channels=256,
             dropout=0.0,
             act_cfg=dict(type='ReLU', inplace=True),
             ffn_act_cfg=dict(type='ReLU', inplace=True),
             conv_kernel_size=1,
             feat_transform_cfg=None,
             kernel_init=False,
             with_ffn=True,
             feat_gather_stride=1,
             mask_transform_stride=1,
             kernel_updator_cfg=dict(
                 type='DynamicConv',
                 in_channels=256,
                 feat_channels=64,
                 out_channels=256,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'))):
    """Build the KernelUpdateHead.

    Sets up the multi-head attention over flattened kernels, the kernel
    updator, an optional feature-transform conv, an optional FFN, the
    stack of mask FC layers and the final mask projection.

    Note:
        When ``feat_transform_cfg`` is given, its ``'kernel_size'`` key
        is popped (the dict is mutated in place) before the remaining
        keys are forwarded to ``ConvModule``.
    """
    super(KernelUpdateHead, self).__init__()
    self.num_classes = num_classes
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.fp16_enabled = False
    self.dropout = dropout
    self.num_heads = num_heads
    self.kernel_init = kernel_init
    self.with_ffn = with_ffn
    self.conv_kernel_size = conv_kernel_size
    self.feat_gather_stride = feat_gather_stride
    self.mask_transform_stride = mask_transform_stride

    # Attention operates on kernels flattened to
    # in_channels * conv_kernel_size**2 features.
    attn_dims = in_channels * conv_kernel_size**2
    self.attention = MultiheadAttention(attn_dims, num_heads, dropout)
    self.attention_norm = build_norm_layer(dict(type='LN'), attn_dims)[1]
    self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

    # Optional conv that transforms the gathered features; the stride
    # also downsamples feature gathering.
    if feat_transform_cfg is not None:
        # NOTE: pop() mutates the caller's cfg dict.
        feat_kernel_size = feat_transform_cfg.pop('kernel_size', 1)
        self.feat_transform = ConvModule(
            in_channels,
            in_channels,
            feat_kernel_size,
            stride=feat_gather_stride,
            padding=int(feat_gather_stride // 2),
            **feat_transform_cfg)
    else:
        self.feat_transform = None

    # Optional feed-forward network with its own LayerNorm.
    if self.with_ffn:
        self.ffn = FFN(
            in_channels,
            feedforward_channels,
            num_ffn_fcs,
            act_cfg=ffn_act_cfg,
            dropout=dropout)
        self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

    # num_mask_fcs repetitions of Linear -> LayerNorm -> activation.
    self.mask_fcs = nn.ModuleList()
    for _ in range(num_mask_fcs):
        self.mask_fcs.extend([
            nn.Linear(in_channels, in_channels, bias=False),
            build_norm_layer(dict(type='LN'), in_channels)[1],
            build_activation_layer(act_cfg),
        ])

    # Final projection to the mask output channels.
    self.fc_mask = nn.Linear(in_channels, out_channels)