def __init__(self, in_channels, feat_channels, out_channels, norm_cfg=dict(type='GN', num_groups=32), act_cfg=dict(type='ReLU'), encoder=None, positional_encoding=dict(type='SinePositionalEncoding', num_feats=128, normalize=True), init_cfg=None): super(TransformerEncoderPixelDecoder, self).__init__(in_channels, feat_channels, out_channels, norm_cfg, act_cfg, init_cfg=init_cfg) self.last_feat_conv = None self.encoder = build_transformer_layer_sequence(encoder) self.encoder_embed_dims = self.encoder.embed_dims assert self.encoder_embed_dims == feat_channels, 'embed_dims({}) of ' \ 'tranformer encoder must equal to feat_channels({})'.format( feat_channels, self.encoder_embed_dims) self.positional_encoding = build_positional_encoding( positional_encoding) self.encoder_in_proj = Conv2d(in_channels[-1], feat_channels, kernel_size=1) self.encoder_out_proj = ConvModule(feat_channels, feat_channels, kernel_size=3, stride=1, padding=1, bias=self.use_bias, norm_cfg=norm_cfg, act_cfg=act_cfg)
def __init__(self, in_channels, feat_channels, out_channels, num_things_classes=80, num_stuff_classes=53, num_queries=100, pixel_decoder=None, enforce_decoder_input_project=False, transformer_decoder=None, positional_encoding=None, loss_cls=dict(type='CrossEntropyLoss', bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), loss_mask=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=20.0), loss_dice=dict(type='DiceLoss', use_sigmoid=True, activate=True, naive_dice=True, loss_weight=1.0), train_cfg=None, test_cfg=None, init_cfg=None, **kwargs): super(AnchorFreeHead, self).__init__(init_cfg) self.num_things_classes = num_things_classes self.num_stuff_classes = num_stuff_classes self.num_classes = self.num_things_classes + self.num_stuff_classes self.num_queries = num_queries pixel_decoder.update(in_channels=in_channels, feat_channels=feat_channels, out_channels=out_channels) self.pixel_decoder = build_plugin_layer(pixel_decoder)[1] self.transformer_decoder = build_transformer_layer_sequence( transformer_decoder) self.decoder_embed_dims = self.transformer_decoder.embed_dims pixel_decoder_type = pixel_decoder.get('type') if pixel_decoder_type == 'PixelDecoder' and ( self.decoder_embed_dims != in_channels[-1] or enforce_decoder_input_project): self.decoder_input_proj = Conv2d(in_channels[-1], self.decoder_embed_dims, kernel_size=1) else: self.decoder_input_proj = nn.Identity() self.decoder_pe = build_positional_encoding(positional_encoding) self.query_embed = nn.Embedding(self.num_queries, out_channels) self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) self.mask_embed = nn.Sequential( nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, out_channels)) self.test_cfg = test_cfg self.train_cfg = train_cfg if train_cfg: assert 'assigner' in train_cfg, 'assigner should be provided '\ 'when train_cfg is set.' assigner = train_cfg['assigner'] self.assigner = build_assigner(assigner) sampler_cfg = dict(type='MaskPseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.bg_cls_weight = 0 class_weight = loss_cls.get('class_weight', None) if class_weight is not None and (self.__class__ is MaskFormerHead): assert isinstance(class_weight, float), 'Expected ' \ 'class_weight to have type float. Found ' \ f'{type(class_weight)}.' # NOTE following the official MaskFormerHead repo, bg_cls_weight # means relative classification weight of the VOID class. bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) assert isinstance(bg_cls_weight, float), 'Expected ' \ 'bg_cls_weight to have type float. Found ' \ f'{type(bg_cls_weight)}.' class_weight = torch.ones(self.num_classes + 1) * class_weight # set VOID class as the last indice class_weight[self.num_classes] = bg_cls_weight loss_cls.update({'class_weight': class_weight}) if 'bg_cls_weight' in loss_cls: loss_cls.pop('bg_cls_weight') self.bg_cls_weight = bg_cls_weight self.loss_cls = build_loss(loss_cls) self.loss_mask = build_loss(loss_mask) self.loss_dice = build_loss(loss_dice)
def __init__(self, encoder=None, decoder=None, init_cfg=None): super(Transformer, self).__init__(init_cfg=init_cfg) self.encoder = build_transformer_layer_sequence(encoder) self.decoder = build_transformer_layer_sequence(decoder) self.embed_dims = self.encoder.embed_dims
def __init__(self, in_channels, feat_channels, out_channels, num_things_classes=80, num_stuff_classes=53, num_queries=100, num_transformer_feat_level=3, pixel_decoder=None, enforce_decoder_input_project=False, transformer_decoder=None, positional_encoding=None, loss_cls=None, loss_mask=None, loss_dice=None, train_cfg=None, test_cfg=None, init_cfg=None, **kwargs): super(AnchorFreeHead, self).__init__(init_cfg) self.num_things_classes = num_things_classes self.num_stuff_classes = num_stuff_classes self.num_classes = self.num_things_classes + self.num_stuff_classes self.num_queries = num_queries self.num_transformer_feat_level = num_transformer_feat_level self.num_heads = transformer_decoder.transformerlayers.\ attn_cfgs.num_heads self.num_transformer_decoder_layers = transformer_decoder.num_layers assert pixel_decoder.encoder.transformerlayers.\ attn_cfgs.num_levels == num_transformer_feat_level pixel_decoder_ = copy.deepcopy(pixel_decoder) pixel_decoder_.update(in_channels=in_channels, feat_channels=feat_channels, out_channels=out_channels) self.pixel_decoder = build_plugin_layer(pixel_decoder_)[1] self.transformer_decoder = build_transformer_layer_sequence( transformer_decoder) self.decoder_embed_dims = self.transformer_decoder.embed_dims self.decoder_input_projs = ModuleList() # from low resolution to high resolution for _ in range(num_transformer_feat_level): if (self.decoder_embed_dims != feat_channels or enforce_decoder_input_project): self.decoder_input_projs.append( Conv2d(feat_channels, self.decoder_embed_dims, kernel_size=1)) else: self.decoder_input_projs.append(nn.Identity()) self.decoder_positional_encoding = build_positional_encoding( positional_encoding) self.query_embed = nn.Embedding(self.num_queries, feat_channels) self.query_feat = nn.Embedding(self.num_queries, feat_channels) # from low resolution to high resolution self.level_embed = nn.Embedding(self.num_transformer_feat_level, feat_channels) self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) self.mask_embed = nn.Sequential( nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, out_channels)) self.test_cfg = test_cfg self.train_cfg = train_cfg if train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) self.sampler = build_sampler(self.train_cfg.sampler, context=self) self.num_points = self.train_cfg.get('num_points', 12544) self.oversample_ratio = self.train_cfg.get('oversample_ratio', 3.0) self.importance_sample_ratio = self.train_cfg.get( 'importance_sample_ratio', 0.75) self.class_weight = loss_cls.class_weight self.loss_cls = build_loss(loss_cls) self.loss_mask = build_loss(loss_mask) self.loss_dice = build_loss(loss_dice)
def __init__(self, num_frames, img_size, patch_size, pretrained=None, embed_dims=768, num_heads=12, num_transformer_layers=12, in_channels=3, dropout_ratio=0., transformer_layers=None, attention_type='divided_space_time', norm_cfg=dict(type='LN', eps=1e-6), **kwargs): super().__init__(**kwargs) assert attention_type in self.supported_attention_types, ( f'Unsupported Attention Type {attention_type}!') assert transformer_layers is None or isinstance( transformer_layers, (dict, list)) self.num_frames = num_frames self.pretrained = pretrained self.embed_dims = embed_dims self.num_transformer_layers = num_transformer_layers self.attention_type = attention_type self.patch_embed = PatchEmbed( img_size=img_size, patch_size=patch_size, in_channels=in_channels, embed_dims=embed_dims) num_patches = self.patch_embed.num_patches self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims)) self.pos_embed = nn.Parameter( torch.zeros(1, num_patches + 1, embed_dims)) self.drop_after_pos = nn.Dropout(p=dropout_ratio) if self.attention_type != 'space_only': self.time_embed = nn.Parameter( torch.zeros(1, num_frames, embed_dims)) self.drop_after_time = nn.Dropout(p=dropout_ratio) self.norm = build_norm_layer(norm_cfg, embed_dims)[1] if transformer_layers is None: # stochastic depth decay rule dpr = np.linspace(0, 0.1, num_transformer_layers) if self.attention_type == 'divided_space_time': _transformerlayers_cfg = [ dict( type='BaseTransformerLayer', attn_cfgs=[ dict( type='DividedTemporalAttentionWithNorm', embed_dims=embed_dims, num_heads=num_heads, num_frames=num_frames, dropout_layer=dict( type='DropPath', drop_prob=dpr[i]), norm_cfg=dict(type='LN', eps=1e-6)), dict( type='DividedSpatialAttentionWithNorm', embed_dims=embed_dims, num_heads=num_heads, num_frames=num_frames, dropout_layer=dict( type='DropPath', drop_prob=dpr[i]), norm_cfg=dict(type='LN', eps=1e-6)) ], ffn_cfgs=dict( type='FFNWithNorm', embed_dims=embed_dims, feedforward_channels=embed_dims * 4, num_fcs=2, act_cfg=dict(type='GELU'), dropout_layer=dict( type='DropPath', drop_prob=dpr[i]), norm_cfg=dict(type='LN', eps=1e-6)), operation_order=('self_attn', 'self_attn', 'ffn')) for i in range(num_transformer_layers) ] else: # Sapce Only & Joint Space Time _transformerlayers_cfg = [ dict( type='BaseTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=num_heads, batch_first=True, dropout_layer=dict( type='DropPath', drop_prob=dpr[i])) ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims * 4, num_fcs=2, act_cfg=dict(type='GELU'), dropout_layer=dict( type='DropPath', drop_prob=dpr[i])), operation_order=('norm', 'self_attn', 'norm', 'ffn'), norm_cfg=dict(type='LN', eps=1e-6), batch_first=True) for i in range(num_transformer_layers) ] transformer_layers = ConfigDict( dict( type='TransformerLayerSequence', transformerlayers=_transformerlayers_cfg, num_layers=num_transformer_layers)) self.transformer_layers = build_transformer_layer_sequence( transformer_layers)
def __init__(self, in_channels, feat_channels, out_channels, num_things_classes=80, num_stuff_classes=53, num_queries=100, pixel_decoder=None, enforce_decoder_input_project=False, transformer_decoder=None, positional_encoding=None, loss_cls=dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, class_weight=[1.0] * 133 + [0.1]), loss_mask=dict(type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=20.0), loss_dice=dict(type='DiceLoss', use_sigmoid=True, activate=True, naive_dice=True, loss_weight=1.0), train_cfg=None, test_cfg=None, init_cfg=None, **kwargs): super(AnchorFreeHead, self).__init__(init_cfg) self.num_things_classes = num_things_classes self.num_stuff_classes = num_stuff_classes self.num_classes = self.num_things_classes + self.num_stuff_classes self.num_queries = num_queries pixel_decoder.update(in_channels=in_channels, feat_channels=feat_channels, out_channels=out_channels) self.pixel_decoder = build_plugin_layer(pixel_decoder)[1] self.transformer_decoder = build_transformer_layer_sequence( transformer_decoder) self.decoder_embed_dims = self.transformer_decoder.embed_dims pixel_decoder_type = pixel_decoder.get('type') if pixel_decoder_type == 'PixelDecoder' and ( self.decoder_embed_dims != in_channels[-1] or enforce_decoder_input_project): self.decoder_input_proj = Conv2d(in_channels[-1], self.decoder_embed_dims, kernel_size=1) else: self.decoder_input_proj = nn.Identity() self.decoder_pe = build_positional_encoding(positional_encoding) self.query_embed = nn.Embedding(self.num_queries, out_channels) self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) self.mask_embed = nn.Sequential( nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, feat_channels), nn.ReLU(inplace=True), nn.Linear(feat_channels, out_channels)) self.test_cfg = test_cfg self.train_cfg = train_cfg if train_cfg: self.assigner = build_assigner(train_cfg.assigner) self.sampler = build_sampler(train_cfg.sampler, context=self) self.class_weight = loss_cls.class_weight self.loss_cls = build_loss(loss_cls) self.loss_mask = build_loss(loss_mask) self.loss_dice = build_loss(loss_dice)