def __init__(self, dim, in_dim, head_cnt=1, kernel_ratio=0.5, dp1=0.1, dp2=0.1):
    super().__init__()
    self.emb = in_dim * head_cnt  # we use head_cnt=1, so this is kept only for clarity
    self.kqv = nn.Linear(dim, 3 * self.emb)
    self.dp = nn.Dropout(dp1)
    self.proj = nn.Linear(self.emb, self.emb)
    self.head_cnt = head_cnt
    self.norm1 = nn.LayerNorm(dim)
    self.norm2 = nn.LayerNorm(self.emb)
    self.epsilon = 1e-8  # for numerical stability in division
    self.mlp = nn.Sequential(
        nn.Linear(self.emb, 1 * self.emb),
        nn.GELU(),
        nn.Linear(1 * self.emb, self.emb),
        nn.Dropout(dp2),
    )

    self.m = int(self.emb * kernel_ratio)
    self.w = paddle.randn((self.m, self.emb))
    self.w = add_parameter(self, orthogonal_(self.w) * math.sqrt(self.m))
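# Illustrative sketch (not part of the original class). A minimal, assumed
# version of the Performer-style positive random-feature map that `self.w`,
# `self.m` and `self.epsilon` above are prepared for:
#   phi(x) = exp(w . x - |x|^2 / 2) / sqrt(m)
# The function name and standalone form are hypothetical; the real forward
# lives in the class this __init__ belongs to. Uses the `paddle` and `math`
# imports already present in this module.
def prm_exp_sketch(x, w):
    # x: (B, T, emb) token features; w: (m, emb) orthogonal random projections
    m = w.shape[0]
    xd = (x * x).sum(axis=-1, keepdim=True) / 2     # |x|^2 / 2, shape (B, T, 1)
    wtx = paddle.matmul(x, w, transpose_y=True)     # w . x, shape (B, T, m)
    return paddle.exp(wtx - xd) / math.sqrt(m)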
def __init__(self, in_planes, out_channels, groups=1, bias=True):
    super(GroupLinear, self).__init__()
    assert in_planes % groups == 0
    assert out_channels % groups == 0

    self.in_dim = in_planes
    self.out_dim = out_channels
    self.groups = groups
    self.group_in_dim = int(self.in_dim / self.groups)
    self.group_out_dim = int(self.out_dim / self.groups)

    self.group_weight = add_parameter(
        self,
        paddle.zeros((self.groups, self.group_in_dim, self.group_out_dim)))
    if bias is True:
        self.group_bias = add_parameter(self,
                                        paddle.zeros((self.out_dim, )))
    else:
        self.group_bias = None
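# Illustrative sketch (not part of the original class). An assumed view of how
# the grouped weights above are typically applied: split the last dimension
# into `groups` chunks and multiply each chunk by its own
# (group_in_dim, group_out_dim) matrix. Function name is hypothetical.
def group_linear_sketch(x, group_weight, group_bias=None):
    # x: (B, N, in_dim); group_weight: (groups, group_in_dim, group_out_dim)
    B, N, _ = x.shape
    groups, gin, gout = group_weight.shape
    x = x.reshape((B * N, groups, gin)).transpose((1, 0, 2))  # (groups, B*N, gin)
    out = paddle.bmm(x, group_weight)                         # (groups, B*N, gout)
    out = out.transpose((1, 0, 2)).reshape((B, N, groups * gout))
    if group_bias is not None:
        out = out + group_bias
    return out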
def __init__(self, *args, **kwargs):
    super(DistilledPoolingTransformer, self).__init__(*args, **kwargs)
    self.cls_token = add_parameter(
        self, paddle.randn((1, 2, self.base_dims[0] * self.heads[0])))

    if self.class_dim > 0:
        self.head_dist = nn.Linear(self.base_dims[-1] * self.heads[-1],
                                   self.class_dim)
        self.head_dist.apply(self._init_weights)

    trunc_normal_(self.cls_token)
def __init__(
    self,
    dim,
    num_heads,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    Attention_block=Attention_talking_head,
    Mlp_block=Mlp,
    init_values=1e-4,
):
    super().__init__()
    self.norm1 = norm_layer(dim, epsilon=epsilon)
    self.attn = Attention_block(
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(dim, epsilon=epsilon)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp_block(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )
    self.gamma_1 = add_parameter(self, init_values * paddle.ones((dim, )))
    self.gamma_2 = add_parameter(self, init_values * paddle.ones((dim, )))
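# Illustrative sketch (not part of the original class). An assumed view of how
# gamma_1 / gamma_2 are consumed in this block's forward pass: each residual
# branch is scaled per channel, and init_values keeps the scale small so the
# block starts close to an identity mapping. Function name is hypothetical.
def layerscale_block_forward_sketch(block, x):
    # `block` is assumed to expose norm1/attn/norm2/mlp/gamma_1/gamma_2/drop_path
    x = x + block.drop_path(block.gamma_1 * block.attn(block.norm1(x)))
    x = x + block.drop_path(block.gamma_2 * block.mlp(block.norm2(x)))
    return x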
def __init__(self,
             img_size=224,
             patch_size=16,
             embed_dim=768,
             depth=12,
             num_heads=12,
             mlp_ratio=4,
             qkv_bias=False,
             norm_layer=nn.LayerNorm,
             epsilon=1e-5,
             class_dim=1000,
             **kwargs):
    super().__init__(
        img_size=img_size,
        patch_size=patch_size,
        class_dim=class_dim,
        embed_dim=embed_dim,
        depth=depth,
        num_heads=num_heads,
        mlp_ratio=mlp_ratio,
        qkv_bias=qkv_bias,
        norm_layer=norm_layer,
        epsilon=epsilon,
        **kwargs)

    self.pos_embed = add_parameter(
        self,
        paddle.zeros((1, self.patch_embed.num_patches + 2, self.embed_dim)))
    self.dist_token = add_parameter(self,
                                    paddle.zeros((1, 1, self.embed_dim)))

    if class_dim > 0:
        self.head_dist = nn.Linear(self.embed_dim, self.class_dim)
        self.head_dist.apply(self._init_weights)

    trunc_normal_(self.dist_token)
    trunc_normal_(self.pos_embed)
def __init__(
    self,
    dim,
    window_size,
    num_heads,
    qkv_bias=True,
    qk_scale=None,
    attn_drop=0.0,
    proj_drop=0.0,
):
    super().__init__()
    self.dim = dim
    self.window_size = window_size  # Wh, Ww
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim ** -0.5

    # define a parameter table of relative position bias
    self.relative_position_bias_table = add_parameter(
        self,
        paddle.zeros(
            ((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        ),
    )

    # get pair-wise relative position index for each token inside the window
    coords_h = paddle.arange(self.window_size[0])
    coords_w = paddle.arange(self.window_size[1])
    coords = paddle.stack(paddle.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
    coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = coords_flatten.unsqueeze(-1) - coords_flatten.unsqueeze(
        1
    )  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.transpose((1, 2, 0))  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += self.window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    self.register_buffer("relative_position_index", relative_position_index)

    self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)

    trunc_normal_(self.relative_position_bias_table)
    self.softmax = nn.Softmax(axis=-1)
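# Illustrative sketch (not part of the original class). An assumed view of how
# the relative position bias built above is looked up at attention time:
# flatten the (Wh*Ww, Wh*Ww) index, gather rows from the
# ((2*Wh-1)*(2*Ww-1), num_heads) table, and add the result to the attention
# logits before the softmax. Function name is hypothetical.
def relative_position_bias_sketch(table, index, window_size):
    n = window_size[0] * window_size[1]
    bias = paddle.index_select(table, index.reshape((-1, )), axis=0)  # (n*n, num_heads)
    bias = bias.reshape((n, n, -1)).transpose((2, 0, 1))              # (num_heads, n, n)
    return bias.unsqueeze(0)  # broadcast over the batch of windows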
def __init__(
    self,
    img_size=224,
    tokens_type="performer",
    in_chans=3,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    norm_layer=nn.LayerNorm,
    token_dim=64,
    class_dim=1000,
):
    super().__init__()
    self.class_dim = class_dim
    # num_features for consistency with other models
    self.num_features = self.embed_dim = embed_dim

    self.tokens_to_token = T2T_Layer(
        img_size=img_size,
        tokens_type=tokens_type,
        in_chans=in_chans,
        embed_dim=embed_dim,
        token_dim=token_dim,
    )
    num_patches = self.tokens_to_token.num_patches

    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_embed = add_parameter(
        self,
        get_sinusoid_encoding(n_position=num_patches + 1, d_hid=embed_dim))
    self.pos_drop = nn.Dropout(p=drop_rate)

    dpr = np.linspace(0, drop_path_rate, depth)  # stochastic depth decay rule
    self.blocks = nn.LayerList([
        Block(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
        ) for i in range(depth)
    ])
    self.norm = norm_layer(embed_dim)

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
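# Illustrative sketch (not part of the original class). An assumed numpy
# version of the fixed sin/cos table in the standard formulation that
# get_sinusoid_encoding follows; the actual helper is provided elsewhere in
# this codebase and is what the constructor calls. Function name is hypothetical.
def sinusoid_table_sketch(n_position, d_hid):
    pos = np.arange(n_position)[:, None].astype("float64")  # (n_position, 1)
    i = np.arange(d_hid)[None, :]                            # (1, d_hid)
    table = pos / np.power(10000, 2 * (i // 2) / d_hid)
    table[:, 0::2] = np.sin(table[:, 0::2])                  # even dims: sin
    table[:, 1::2] = np.cos(table[:, 1::2])                  # odd dims: cos
    return table[None, ...]                                  # (1, n_position, d_hid)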
def __init__(
    self,
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    block_layers=LayerScale_Block,
    block_layers_token=LayerScale_Block_CA,
    Patch_layer=PatchEmbed,
    act_layer=nn.GELU,
    Attention_block=Attention_talking_head,
    Mlp_block=Mlp,
    init_scale=1e-4,
    Attention_block_token_only=Class_Attention,
    Mlp_block_token_only=Mlp,
    depth_token_only=2,
    mlp_ratio_clstk=4.0,
    class_dim=1000,
):
    super().__init__()
    self.class_dim = class_dim
    self.num_features = self.embed_dim = embed_dim

    self.patch_embed = Patch_layer(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
    )
    num_patches = self.patch_embed.num_patches

    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_embed = add_parameter(
        self, paddle.zeros((1, num_patches, embed_dim)))
    self.pos_drop = nn.Dropout(p=drop_rate)

    dpr = [drop_path_rate for i in range(depth)]
    self.blocks = nn.LayerList([
        block_layers(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            act_layer=act_layer,
            Attention_block=Attention_block,
            Mlp_block=Mlp_block,
            init_values=init_scale,
        ) for i in range(depth)
    ])
    self.blocks_token_only = nn.LayerList([
        block_layers_token(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio_clstk,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=0.0,
            attn_drop=0.0,
            drop_path=0.0,
            norm_layer=norm_layer,
            epsilon=epsilon,
            act_layer=act_layer,
            Attention_block=Attention_block_token_only,
            Mlp_block=Mlp_block_token_only,
            init_values=init_scale,
        ) for i in range(depth_token_only)
    ])
    self.norm = norm_layer(embed_dim, epsilon=epsilon)

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    trunc_normal_(self.pos_embed)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
def __init__(
    self,
    img_size=224,
    patch_size=4,
    in_chans=3,
    embed_dim=96,
    depths=[2, 2, 6, 2],
    num_heads=[3, 6, 12, 24],
    window_size=7,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.1,
    norm_layer=nn.LayerNorm,
    ape=False,
    patch_norm=True,
    class_dim=1000,
    with_pool=True,
    **kwargs,
):
    super().__init__()
    self.class_dim = class_dim
    self.with_pool = with_pool
    self.num_layers = len(depths)
    self.embed_dim = embed_dim
    self.ape = ape
    self.patch_norm = patch_norm
    self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
    self.mlp_ratio = mlp_ratio

    # split image into non-overlapping patches
    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
        norm_layer=norm_layer if self.patch_norm else None,
    )
    num_patches = self.patch_embed.num_patches
    patches_resolution = self.patch_embed.patches_resolution
    self.patches_resolution = patches_resolution

    # absolute position embedding
    if self.ape:
        self.absolute_pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches, embed_dim))
        )
        trunc_normal_(self.absolute_pos_embed)

    self.pos_drop = nn.Dropout(p=drop_rate)

    # stochastic depth
    dpr = np.linspace(0, drop_path_rate, sum(depths))

    # build layers
    self.layers = nn.LayerList()
    for i_layer in range(self.num_layers):
        layer = BasicLayer(
            dim=int(embed_dim * 2 ** i_layer),
            input_resolution=(
                patches_resolution[0] // (2 ** i_layer),
                patches_resolution[1] // (2 ** i_layer),
            ),
            depth=depths[i_layer],
            num_heads=num_heads[i_layer],
            window_size=window_size,
            mlp_ratio=self.mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
            norm_layer=norm_layer,
            downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
        )
        self.layers.append(layer)

    self.norm = norm_layer(self.num_features)

    if with_pool:
        self.avgpool = nn.AdaptiveAvgPool1D(1)

    if class_dim > 0:
        self.head = nn.Linear(self.num_features, class_dim)

    self.apply(self._init_weights)
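# Illustrative sketch (not part of the original class). A small, assumed
# illustration of how the stochastic-depth rates above are split across
# stages: `dpr` is one linear ramp over all blocks, and each BasicLayer
# receives its own contiguous slice of it. Function name is hypothetical.
def dpr_slices_sketch(depths=[2, 2, 6, 2], drop_path_rate=0.1):
    dpr = np.linspace(0, drop_path_rate, sum(depths))
    return [
        dpr[sum(depths[:i]):sum(depths[:i + 1])] for i in range(len(depths))
    ]  # e.g. stage 0 gets 2 rates, stage 2 gets 6 rates, all increasing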
def __init__(
    self,
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4,
    qkv_bias=False,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    norm_layer=nn.LayerNorm,
    epsilon=1e-5,
    class_dim=1000,
):
    super().__init__()
    self.class_dim = class_dim
    self.num_features = self.embed_dim = embed_dim

    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim,
    )
    num_patches = self.patch_embed.num_patches

    self.pos_embed = add_parameter(
        self, paddle.zeros((1, num_patches + 1, embed_dim))
    )
    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_drop = nn.Dropout(p=drop_rate)

    dpr = np.linspace(0, drop_path_rate, depth)
    self.blocks = nn.LayerList([
        Block(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
            epsilon=epsilon,
        ) for i in range(depth)
    ])
    self.norm = norm_layer(embed_dim, epsilon=epsilon)

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    if paddle.in_dynamic_mode():
        trunc_normal_(self.pos_embed)
        trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
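# Illustrative sketch (not part of the original class). An assumed view of how
# the cls_token and pos_embed prepared above are combined with the patch
# tokens at the start of the forward pass. Function name is hypothetical.
def prepend_cls_and_add_pos_sketch(x, cls_token, pos_embed, pos_drop):
    # x: (B, num_patches, embed_dim) patch tokens
    cls = cls_token.expand((x.shape[0], -1, -1))  # (B, 1, embed_dim)
    x = paddle.concat([cls, x], axis=1)           # (B, num_patches + 1, embed_dim)
    return pos_drop(x + pos_embed)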
def __init__(
    self,
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dims=[0, 0, 0, 0],
    serial_depths=[0, 0, 0, 0],
    parallel_depth=0,
    num_heads=0,
    mlp_ratios=[0, 0, 0, 0],
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    return_interm_layers=False,
    out_features=None,
    crpe_window={3: 2,
                 5: 3,
                 7: 3},
    class_dim=1000,
    **kwargs,
):
    super().__init__()
    self.return_interm_layers = return_interm_layers
    self.out_features = out_features
    self.class_dim = class_dim

    # Patch embeddings.
    self.patch_embed1 = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dims[0],
    )
    self.patch_embed2 = PatchEmbed(
        img_size=img_size // 4,
        patch_size=2,
        in_chans=embed_dims[0],
        embed_dim=embed_dims[1],
    )
    self.patch_embed3 = PatchEmbed(
        img_size=img_size // 8,
        patch_size=2,
        in_chans=embed_dims[1],
        embed_dim=embed_dims[2],
    )
    self.patch_embed4 = PatchEmbed(
        img_size=img_size // 16,
        patch_size=2,
        in_chans=embed_dims[2],
        embed_dim=embed_dims[3],
    )

    # Class tokens.
    self.cls_token1 = add_parameter(self, paddle.zeros((1, 1, embed_dims[0])))
    self.cls_token2 = add_parameter(self, paddle.zeros((1, 1, embed_dims[1])))
    self.cls_token3 = add_parameter(self, paddle.zeros((1, 1, embed_dims[2])))
    self.cls_token4 = add_parameter(self, paddle.zeros((1, 1, embed_dims[3])))

    # Convolutional position encodings.
    self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3)
    self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3)
    self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3)
    self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3)

    # Convolutional relative position encodings.
    self.crpe1 = ConvRelPosEnc(
        Ch=embed_dims[0] // num_heads, h=num_heads, window=crpe_window)
    self.crpe2 = ConvRelPosEnc(
        Ch=embed_dims[1] // num_heads, h=num_heads, window=crpe_window)
    self.crpe3 = ConvRelPosEnc(
        Ch=embed_dims[2] // num_heads, h=num_heads, window=crpe_window)
    self.crpe4 = ConvRelPosEnc(
        Ch=embed_dims[3] // num_heads, h=num_heads, window=crpe_window)

    # Disable stochastic depth.
    dpr = drop_path_rate
    assert dpr == 0.0

    # Serial blocks 1.
    self.serial_blocks1 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[0],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[0],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe1,
            shared_crpe=self.crpe1,
        ) for _ in range(serial_depths[0])
    ])

    # Serial blocks 2.
    self.serial_blocks2 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[1],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[1],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe2,
            shared_crpe=self.crpe2,
        ) for _ in range(serial_depths[1])
    ])

    # Serial blocks 3.
    self.serial_blocks3 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[2],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[2],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe3,
            shared_crpe=self.crpe3,
        ) for _ in range(serial_depths[2])
    ])

    # Serial blocks 4.
    self.serial_blocks4 = nn.LayerList([
        SerialBlock(
            dim=embed_dims[3],
            num_heads=num_heads,
            mlp_ratio=mlp_ratios[3],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
            shared_cpe=self.cpe4,
            shared_crpe=self.crpe4,
        ) for _ in range(serial_depths[3])
    ])

    # Parallel blocks.
    self.parallel_depth = parallel_depth
    if self.parallel_depth > 0:
        self.parallel_blocks = nn.LayerList([
            ParallelBlock(
                dims=embed_dims,
                num_heads=num_heads,
                mlp_ratios=mlp_ratios,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr,
                norm_layer=norm_layer,
                epsilon=epsilon,
                shared_cpes=[self.cpe1, self.cpe2, self.cpe3, self.cpe4],
                shared_crpes=[
                    self.crpe1, self.crpe2, self.crpe3, self.crpe4
                ],
            ) for _ in range(parallel_depth)
        ])

    # Classification head(s).
    if not self.return_interm_layers:
        self.norm1 = norm_layer(embed_dims[0], epsilon=epsilon)
        self.norm2 = norm_layer(embed_dims[1], epsilon=epsilon)
        self.norm3 = norm_layer(embed_dims[2], epsilon=epsilon)
        self.norm4 = norm_layer(embed_dims[3], epsilon=epsilon)

        if self.parallel_depth > 0:
            # CoaT series: aggregate features of the last three scales for classification.
            assert embed_dims[1] == embed_dims[2] == embed_dims[3]
            self.aggregate = nn.Conv1D(
                in_channels=3, out_channels=1, kernel_size=1)
            self.head = nn.Linear(embed_dims[3], class_dim)
        else:
            # CoaT-Lite series: use the feature of the last scale for classification.
            self.head = nn.Linear(embed_dims[3], class_dim)

    # Initialize weights.
    trunc_normal_(self.cls_token1)
    trunc_normal_(self.cls_token2)
    trunc_normal_(self.cls_token3)
    trunc_normal_(self.cls_token4)
    self.apply(self._init_weights)
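# Illustrative sketch (not part of the original class). An assumed view of how
# the Conv1D `aggregate` above is used in the parallel (CoaT) variant: the CLS
# tokens of the last three scales are stacked along a length-3 axis and
# reduced to a single feature vector for the linear head. Function name is
# hypothetical.
def aggregate_cls_sketch(aggregate, cls2, cls3, cls4):
    # cls2 / cls3 / cls4: (B, embed_dim) CLS tokens from scales 2-4
    stacked = paddle.stack([cls2, cls3, cls4], axis=1)  # (B, 3, embed_dim)
    return aggregate(stacked).squeeze(1)                # (B, embed_dim)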
def __init__(
    self,
    image_size,
    patch_size,
    stride,
    base_dims,
    depth,
    heads,
    mlp_ratio,
    in_chans=3,
    attn_drop_rate=0.0,
    drop_rate=0.0,
    drop_path_rate=0.0,
    class_dim=1000,
):
    super(PoolingTransformer, self).__init__()

    total_block = sum(depth)
    padding = 0
    block_idx = 0
    width = math.floor((image_size + 2 * padding - patch_size) / stride + 1)

    self.base_dims = base_dims
    self.heads = heads
    self.class_dim = class_dim
    self.patch_size = patch_size

    self.pos_embed = add_parameter(
        self, paddle.randn((1, base_dims[0] * heads[0], width, width)))
    self.patch_embed = conv_embedding(in_chans, base_dims[0] * heads[0],
                                      patch_size, stride, padding)
    self.cls_token = add_parameter(
        self, paddle.randn((1, 1, base_dims[0] * heads[0])))
    self.pos_drop = nn.Dropout(p=drop_rate)

    self.transformers = nn.LayerList([])
    self.pools = nn.LayerList([])

    for stage in range(len(depth)):
        drop_path_prob = [
            drop_path_rate * i / total_block
            for i in range(block_idx, block_idx + depth[stage])
        ]
        block_idx += depth[stage]

        self.transformers.append(
            Transformer(
                base_dims[stage],
                depth[stage],
                heads[stage],
                mlp_ratio,
                drop_rate,
                attn_drop_rate,
                drop_path_prob,
            ))
        if stage < len(heads) - 1:
            self.pools.append(
                conv_head_pooling(
                    base_dims[stage] * heads[stage],
                    base_dims[stage + 1] * heads[stage + 1],
                    stride=2,
                ))

    self.norm = nn.LayerNorm(base_dims[-1] * heads[-1], epsilon=1e-6)
    self.embed_dim = base_dims[-1] * heads[-1]

    # Classifier head
    if class_dim > 0:
        self.head = nn.Linear(base_dims[-1] * heads[-1], class_dim)

    trunc_normal_(self.pos_embed)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
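# Illustrative note (not part of the original class). A quick numeric check of
# the `width` formula above for an assumed PiT-style configuration
# (image_size=224, patch_size=16, stride=8, padding=0):
#   width = floor((224 + 2 * 0 - 16) / 8 + 1) = 27
# so pos_embed would have shape (1, base_dims[0] * heads[0], 27, 27).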
def __init__(
    self,
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=3.0,
    qkv_bias=False,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    drop_path_decay="linear",
    hybrid_backbone=None,
    norm_layer=nn.LayerNorm,
    p_emb="4_2",
    head_dim=None,
    skip_lam=1.0,
    order=None,
    mix_token=True,
    return_dense=True,
    class_dim=1000,
):
    super().__init__()
    self.class_dim = class_dim
    # num_features for consistency with other models
    self.num_features = self.embed_dim = embed_dim
    self.output_dim = embed_dim if class_dim == 0 else class_dim

    if hybrid_backbone is not None:
        self.patch_embed = HybridEmbed(
            hybrid_backbone,
            img_size=img_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
    else:
        if p_emb == "4_2":
            patch_embed_fn = PatchEmbed4_2
        elif p_emb == "4_2_128":
            patch_embed_fn = PatchEmbed4_2_128
        else:
            patch_embed_fn = PatchEmbedNaive
        self.patch_embed = patch_embed_fn(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
    num_patches = self.patch_embed.num_patches

    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
    self.pos_embed = add_parameter(
        self, paddle.zeros((1, num_patches + 1, embed_dim)))
    self.pos_drop = nn.Dropout(p=drop_rate)

    if order is None:
        dpr = get_dpr(drop_path_rate, depth, drop_path_decay)
        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                head_dim=head_dim,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                skip_lam=skip_lam,
            ) for i in range(depth)
        ])
    else:
        # use the given order to sequentially generate modules
        dpr = get_dpr(drop_path_rate, len(order), drop_path_decay)
        self.blocks = nn.LayerList([
            get_block(
                order[i],
                dim=embed_dim,
                num_heads=num_heads,
                head_dim=head_dim,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                skip_lam=skip_lam,
            ) for i in range(len(order))
        ])

    self.norm = norm_layer(embed_dim)

    if class_dim > 0:
        self.head = nn.Linear(embed_dim, class_dim)

    self.return_dense = return_dense
    self.mix_token = mix_token

    if return_dense and class_dim > 0:
        self.aux_head = nn.Linear(embed_dim, class_dim)
    if mix_token:
        self.beta = 1.0
        assert return_dense, "always return all features when mix_token is enabled"

    trunc_normal_(self.pos_embed)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)
def __init__(
    self,
    img_size=224,
    patch_size=4,
    in_chans=3,
    embed_dims=[64, 128, 320, 512],
    num_heads=[1, 2, 5, 8],
    mlp_ratios=[8, 8, 4, 4],
    qkv_bias=True,
    qk_scale=None,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    depths=[3, 4, 6, 3],
    sr_ratios=[8, 4, 2, 1],
    class_dim=1000,
):
    super().__init__()
    self.class_dim = class_dim
    self.depths = depths

    # patch_embed
    self.patch_embed1 = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dims[0],
    )
    self.patch_embed2 = PatchEmbed(
        img_size=img_size // 4,
        patch_size=2,
        in_chans=embed_dims[0],
        embed_dim=embed_dims[1],
    )
    self.patch_embed3 = PatchEmbed(
        img_size=img_size // 8,
        patch_size=2,
        in_chans=embed_dims[1],
        embed_dim=embed_dims[2],
    )
    self.patch_embed4 = PatchEmbed(
        img_size=img_size // 16,
        patch_size=2,
        in_chans=embed_dims[2],
        embed_dim=embed_dims[3],
    )

    # pos_embed
    self.pos_embed1 = add_parameter(
        self, paddle.zeros((1, self.patch_embed1.num_patches, embed_dims[0])))
    self.pos_drop1 = nn.Dropout(p=drop_rate)
    self.pos_embed2 = add_parameter(
        self, paddle.zeros((1, self.patch_embed2.num_patches, embed_dims[1])))
    self.pos_drop2 = nn.Dropout(p=drop_rate)
    self.pos_embed3 = add_parameter(
        self, paddle.zeros((1, self.patch_embed3.num_patches, embed_dims[2])))
    self.pos_drop3 = nn.Dropout(p=drop_rate)
    self.pos_embed4 = add_parameter(
        self,
        paddle.zeros((1, self.patch_embed4.num_patches + 1, embed_dims[3])))
    self.pos_drop4 = nn.Dropout(p=drop_rate)

    # transformer encoder
    dpr = np.linspace(0, drop_path_rate, sum(depths))
    cur = 0
    self.block1 = nn.LayerList([
        Block(
            dim=embed_dims[0],
            num_heads=num_heads[0],
            mlp_ratio=mlp_ratios[0],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[0],
        ) for i in range(depths[0])
    ])

    cur += depths[0]
    self.block2 = nn.LayerList([
        Block(
            dim=embed_dims[1],
            num_heads=num_heads[1],
            mlp_ratio=mlp_ratios[1],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[1],
        ) for i in range(depths[1])
    ])

    cur += depths[1]
    self.block3 = nn.LayerList([
        Block(
            dim=embed_dims[2],
            num_heads=num_heads[2],
            mlp_ratio=mlp_ratios[2],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[2],
        ) for i in range(depths[2])
    ])

    cur += depths[2]
    self.block4 = nn.LayerList([
        Block(
            dim=embed_dims[3],
            num_heads=num_heads[3],
            mlp_ratio=mlp_ratios[3],
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[cur + i],
            norm_layer=norm_layer,
            epsilon=epsilon,
            sr_ratio=sr_ratios[3],
        ) for i in range(depths[3])
    ])

    self.norm = norm_layer(embed_dims[3], epsilon=epsilon)

    # cls_token
    self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dims[3])))

    # classification head
    if class_dim > 0:
        self.head = nn.Linear(embed_dims[3], class_dim)

    # init weights
    trunc_normal_(self.pos_embed1)
    trunc_normal_(self.pos_embed2)
    trunc_normal_(self.pos_embed3)
    trunc_normal_(self.pos_embed4)
    trunc_normal_(self.cls_token)
    self.apply(self._init_weights)