def __init__(self,
             embed_dims,
             num_heads,
             ffn_ratio=4,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             num_fcs=2,
             qkv_bias=False,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             batch_first=True,
             init_cfg=None):
    super(TransformerBlock, self).__init__(init_cfg=init_cfg)
    self.norm_attn = build_norm_layer(norm_cfg, embed_dims)[1]
    self.attn = MultiheadAttention(
        embed_dims=embed_dims,
        num_heads=num_heads,
        attn_drop=attn_drop_rate,
        proj_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        batch_first=batch_first)
    self.norm_ffn = build_norm_layer(norm_cfg, embed_dims)[1]
    self.ffn = FFN(
        embed_dims=embed_dims,
        feedforward_channels=embed_dims * ffn_ratio,
        num_fcs=num_fcs,
        ffn_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        act_cfg=act_cfg)
    if not qkv_bias:
        # mmcv's MultiheadAttention wraps nn.MultiheadAttention as `self.attn`;
        # dropping the in-projection bias disables the qkv bias.
        self.attn.attn.in_proj_bias = None
def __init__(self,
             embed_dims,
             num_heads,
             feedforward_channels,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             num_fcs=2,
             qkv_bias=True,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             batch_first=True):
    super(TransformerEncoderLayer, self).__init__()

    self.norm1_name, norm1 = build_norm_layer(
        norm_cfg, embed_dims, postfix=1)
    self.add_module(self.norm1_name, norm1)

    self.attn = MultiheadAttention(
        embed_dims=embed_dims,
        num_heads=num_heads,
        attn_drop=attn_drop_rate,
        proj_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        batch_first=batch_first,
        bias=qkv_bias)

    self.norm2_name, norm2 = build_norm_layer(
        norm_cfg, embed_dims, postfix=2)
    self.add_module(self.norm2_name, norm2)

    self.ffn = FFN(
        embed_dims=embed_dims,
        feedforward_channels=feedforward_channels,
        num_fcs=num_fcs,
        ffn_drop=drop_rate,
        dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
        act_cfg=act_cfg)
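# Hedged usage sketch (not part of the layer above): a pre-norm encoder layer
# typically chains the modules built in the __init__ above via the `identity`
# argument of mmcv's MultiheadAttention/FFN bricks, which add the residual
# internally. The dimensions below are illustrative assumptions, not defaults
# from the snippet, and the actual forward method is not shown here.
import torch
from mmcv.cnn import build_norm_layer
from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention

embed_dims, num_heads = 64, 4
norm1 = build_norm_layer(dict(type='LN'), embed_dims)[1]
attn = MultiheadAttention(embed_dims, num_heads, batch_first=True)
norm2 = build_norm_layer(dict(type='LN'), embed_dims)[1]
ffn = FFN(embed_dims=embed_dims, feedforward_channels=embed_dims * 4)

x = torch.rand(2, 100, embed_dims)  # (batch, num_query, embed_dims)
x = attn(norm1(x), identity=x)      # pre-norm self-attention + residual
x = ffn(norm2(x), identity=x)       # pre-norm FFN + residual
assert x.shape == (2, 100, embed_dims)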
def __init__(self,
             num_classes=80,
             num_ffn_fcs=2,
             num_heads=8,
             num_cls_fcs=1,
             num_reg_fcs=3,
             feedforward_channels=2048,
             in_channels=256,
             dropout=0.0,
             ffn_act_cfg=dict(type='ReLU', inplace=True),
             dynamic_conv_cfg=dict(
                 type='DynamicConv',
                 in_channels=256,
                 feat_channels=64,
                 out_channels=256,
                 input_feat_shape=7,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN')),
             loss_iou=dict(type='GIoULoss', loss_weight=2.0),
             init_cfg=None,
             **kwargs):
    assert init_cfg is None, 'To prevent abnormal initialization ' \
        'behavior, init_cfg is not allowed to be set'
    super(DIIHead, self).__init__(
        num_classes=num_classes,
        reg_decoded_bbox=True,
        reg_class_agnostic=True,
        init_cfg=init_cfg,
        **kwargs)
    self.loss_iou = build_loss(loss_iou)
    self.in_channels = in_channels
    self.fp16_enabled = False
    self.attention = MultiheadAttention(in_channels, num_heads, dropout)
    self.attention_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

    self.instance_interactive_conv = build_transformer(dynamic_conv_cfg)
    self.instance_interactive_conv_dropout = nn.Dropout(dropout)
    self.instance_interactive_conv_norm = build_norm_layer(
        dict(type='LN'), in_channels)[1]

    self.ffn = FFN(
        in_channels,
        feedforward_channels,
        num_ffn_fcs,
        act_cfg=ffn_act_cfg,
        dropout=dropout)
    self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

    self.cls_fcs = nn.ModuleList()
    for _ in range(num_cls_fcs):
        self.cls_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.cls_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.cls_fcs.append(
            build_activation_layer(dict(type='ReLU', inplace=True)))

    # overload the self.fc_cls in BBoxHead
    if self.loss_cls.use_sigmoid:
        self.fc_cls = nn.Linear(in_channels, self.num_classes)
    else:
        self.fc_cls = nn.Linear(in_channels, self.num_classes + 1)

    self.reg_fcs = nn.ModuleList()
    for _ in range(num_reg_fcs):
        self.reg_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.reg_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.reg_fcs.append(
            build_activation_layer(dict(type='ReLU', inplace=True)))

    # overload the self.fc_reg in BBoxHead
    self.fc_reg = nn.Linear(in_channels, 4)

    assert self.reg_class_agnostic, 'DIIHead only ' \
        'supports `reg_class_agnostic=True`'
    assert self.reg_decoded_bbox, 'DIIHead only ' \
        'supports `reg_decoded_bbox=True`'
def test_multiheadattention():
    MultiheadAttention(
        embed_dims=5,
        num_heads=5,
        attn_drop=0,
        proj_drop=0,
        dropout_layer=dict(type='Dropout', drop_prob=0.),
        batch_first=True)
    batch_dim = 2
    embed_dim = 5
    num_query = 100
    attn_batch_first = MultiheadAttention(
        embed_dims=5,
        num_heads=5,
        attn_drop=0,
        proj_drop=0,
        dropout_layer=dict(type='DropPath', drop_prob=0.),
        batch_first=True)

    attn_query_first = MultiheadAttention(
        embed_dims=5,
        num_heads=5,
        attn_drop=0,
        proj_drop=0,
        dropout_layer=dict(type='DropPath', drop_prob=0.),
        batch_first=False)

    # share weights so the two layouts produce identical outputs
    param_dict = dict(attn_query_first.named_parameters())
    for n, v in attn_batch_first.named_parameters():
        param_dict[n].data = v.data

    input_batch_first = torch.rand(batch_dim, num_query, embed_dim)
    input_query_first = input_batch_first.transpose(0, 1)

    assert torch.allclose(
        attn_query_first(input_query_first).sum(),
        attn_batch_first(input_batch_first).sum())

    key_batch_first = torch.rand(batch_dim, num_query, embed_dim)
    key_query_first = key_batch_first.transpose(0, 1)

    assert torch.allclose(
        attn_query_first(input_query_first, key_query_first).sum(),
        attn_batch_first(input_batch_first, key_batch_first).sum())

    identity = torch.ones_like(input_query_first)

    # check deprecated arguments can be used normally
    assert torch.allclose(
        attn_query_first(
            input_query_first, key_query_first, residual=identity).sum(),
        attn_batch_first(input_batch_first, key_batch_first).sum() +
        identity.sum() - input_batch_first.sum())

    assert torch.allclose(
        attn_query_first(
            input_query_first, key_query_first, identity=identity).sum(),
        attn_batch_first(input_batch_first, key_batch_first).sum() +
        identity.sum() - input_batch_first.sum())
def __init__(self,
             num_classes=150,
             num_ffn_fcs=2,
             num_heads=8,
             num_mask_fcs=3,
             feedforward_channels=2048,
             in_channels=256,
             out_channels=256,
             dropout=0.0,
             act_cfg=dict(type='ReLU', inplace=True),
             ffn_act_cfg=dict(type='ReLU', inplace=True),
             conv_kernel_size=1,
             feat_transform_cfg=None,
             kernel_init=False,
             with_ffn=True,
             feat_gather_stride=1,
             mask_transform_stride=1,
             kernel_updator_cfg=dict(
                 type='DynamicConv',
                 in_channels=256,
                 feat_channels=64,
                 out_channels=256,
                 act_cfg=dict(type='ReLU', inplace=True),
                 norm_cfg=dict(type='LN'))):
    super(KernelUpdateHead, self).__init__()
    self.num_classes = num_classes
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.fp16_enabled = False
    self.dropout = dropout
    self.num_heads = num_heads
    self.kernel_init = kernel_init
    self.with_ffn = with_ffn
    self.conv_kernel_size = conv_kernel_size
    self.feat_gather_stride = feat_gather_stride
    self.mask_transform_stride = mask_transform_stride

    self.attention = MultiheadAttention(in_channels * conv_kernel_size**2,
                                        num_heads, dropout)
    self.attention_norm = build_norm_layer(
        dict(type='LN'), in_channels * conv_kernel_size**2)[1]
    self.kernel_update_conv = build_transformer_layer(kernel_updator_cfg)

    if feat_transform_cfg is not None:
        kernel_size = feat_transform_cfg.pop('kernel_size', 1)
        transform_channels = in_channels
        self.feat_transform = ConvModule(
            transform_channels,
            in_channels,
            kernel_size,
            stride=feat_gather_stride,
            padding=int(feat_gather_stride // 2),
            **feat_transform_cfg)
    else:
        self.feat_transform = None

    if self.with_ffn:
        self.ffn = FFN(
            in_channels,
            feedforward_channels,
            num_ffn_fcs,
            act_cfg=ffn_act_cfg,
            dropout=dropout)
        self.ffn_norm = build_norm_layer(dict(type='LN'), in_channels)[1]

    self.mask_fcs = nn.ModuleList()
    for _ in range(num_mask_fcs):
        self.mask_fcs.append(
            nn.Linear(in_channels, in_channels, bias=False))
        self.mask_fcs.append(
            build_norm_layer(dict(type='LN'), in_channels)[1])
        self.mask_fcs.append(build_activation_layer(act_cfg))

    self.fc_mask = nn.Linear(in_channels, out_channels)
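# Hedged shape note for the attention built above: its embedding dimension is
# in_channels * conv_kernel_size**2, i.e. it appears to run over kernel weights
# flattened along their spatial extent (our reading of the snippet; the forward
# method is not shown). The numbers below are illustrative assumptions.
in_channels, conv_kernel_size = 256, 3
attn_embed_dims = in_channels * conv_kernel_size**2
print(attn_embed_dims)  # 2304; with the default 1x1 kernel it stays 256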
def build_attn(self, attn_cfg):
    self.attn = MultiheadAttention(**attn_cfg)
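# Hedged example of a config that build_attn above would accept; the keys mirror
# the MultiheadAttention constructor arguments used elsewhere in this file, and
# the values are illustrative assumptions rather than project defaults.
attn_cfg = dict(
    embed_dims=256,
    num_heads=8,
    attn_drop=0.1,
    proj_drop=0.1,
    dropout_layer=dict(type='DropPath', drop_prob=0.1),
    batch_first=True)
# self.build_attn(attn_cfg)  # typically called from the owning module's __init__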