def _init_weights(self, m): if isinstance(m, nn.Linear): trunc_normal_(m.weight, std=.02) if isinstance(m, nn.Linear) and m.bias is not None: nn.init.constant_(m.bias, 0) elif isinstance(m, nn.LayerNorm): nn.init.constant_(m.bias, 0) nn.init.constant_(m.weight, 1.0)
def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512], num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], alpha=1): super().__init__() self.num_classes = num_classes self.depths = depths self.alpha = alpha # patch_embed self.patch_embed1 = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dims[0]) # pos_embed self.pos_embed1 = nn.Parameter(torch.zeros(1, self.patch_embed1.num_patches, embed_dims[0])) self.pos_drop1 = nn.Dropout(p=drop_rate) self.pos_embed2 = nn.Parameter(torch.zeros(1, self.patch_embed1.num_patches, embed_dims[1])) self.pos_embed3 = nn.Parameter(torch.zeros(1, self.patch_embed1.num_patches, embed_dims[2])) self.pos_embed4 = nn.Parameter(torch.zeros(1, self.patch_embed1.num_patches, embed_dims[3])) self.pos_size = [img_size // patch_size, img_size // patch_size] # transformer encoder dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule sample_num = self.patch_embed1.num_patches cur = 0 # stage 1 self.block1 = nn.ModuleList([Block( dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[0]) # , alpha=alpha) for i in range(depths[0])]) cur += depths[0] # stage 2 sample_num = sample_num // 4 self.down_layers1 = DownLayer(sample_num=sample_num, embed_dim=embed_dims[0], drop_rate=drop_rate, down_block=MyBlock( dim=embed_dims[0], dim_out=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur], norm_layer=norm_layer, sr_ratio=sr_ratios[0], alpha=alpha)) self.block2 = nn.ModuleList([MyBlock( dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[1], alpha=alpha) for i in range(1, depths[1])]) cur += depths[1] # stage 3 sample_num = sample_num // 4 self.down_layers2 = DownLayer(sample_num=sample_num, embed_dim=embed_dims[1], drop_rate=drop_rate, down_block=MyBlock( dim=embed_dims[1], dim_out=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur], norm_layer=norm_layer, sr_ratio=sr_ratios[1], alpha=alpha)) self.block3 = nn.ModuleList([MyBlock( dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[2], alpha=alpha) for i in range(1, depths[2])]) cur += depths[2] # stage 4 sample_num = sample_num // 4 self.down_layers3 = DownLayer(sample_num=sample_num, embed_dim=embed_dims[2], drop_rate=drop_rate, down_block=MyBlock( dim=embed_dims[2], dim_out=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur], norm_layer=norm_layer, sr_ratio=sr_ratios[2], alpha=alpha)) self.block4 = nn.ModuleList([MyBlock( dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[3], alpha=alpha) for i in range(1, depths[3])]) self.norm = norm_layer(embed_dims[3]) # cls_token self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims[3])) # classification head self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity() # init weights trunc_normal_(self.pos_embed1, std=.02) trunc_normal_(self.pos_embed2, std=.02) trunc_normal_(self.pos_embed3, std=.02) trunc_normal_(self.pos_embed4, std=.02) trunc_normal_(self.cls_token, std=.02) self.apply(self._init_weights)