Beispiel #1
0
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        block_layers=LayerScale_Block,
        block_layers_token=LayerScale_Block_CA,
        Patch_layer=PatchEmbed,
        act_layer=nn.GELU,
        Attention_block=Attention_talking_head,
        Mlp_block=Mlp,
        init_scale=1e-4,
        Attention_block_token_only=Class_Attention,
        Mlp_block_token_only=Mlp,
        depth_token_only=2,
        mlp_ratio_clstk=4.0,
        class_dim=1000,
    ):
        """Build the patch embedding, two block stacks, final norm and head.

        Args:
            img_size, patch_size, in_chans: Forwarded to ``Patch_layer``.
            embed_dim: Width used for the tokens, every block, the norm and
                the input side of the head.
            depth: Number of ``block_layers`` instances in ``self.blocks``.
            num_heads, qkv_bias, qk_scale: Forwarded to every block.
            mlp_ratio: ``mlp_ratio`` for the blocks in ``self.blocks``.
            drop_rate: Dropout probability for ``self.pos_drop`` and the
                ``drop`` argument of the blocks in ``self.blocks``.
            attn_drop_rate: ``attn_drop`` for the blocks in ``self.blocks``.
            drop_path_rate: ``drop_path`` for the blocks in ``self.blocks``;
                every block receives this same value.
            norm_layer, epsilon, act_layer: Forwarded to every block;
                ``norm_layer(embed_dim, epsilon=epsilon)`` is also the final
                norm.
            block_layers, Attention_block, Mlp_block: Block class and its
                sub-layer classes for ``self.blocks``.
            block_layers_token, Attention_block_token_only,
            Mlp_block_token_only: Same, for ``self.blocks_token_only``.
            init_scale: Passed to every block as ``init_values``.
            depth_token_only: Number of blocks in ``self.blocks_token_only``.
            mlp_ratio_clstk: ``mlp_ratio`` for ``self.blocks_token_only``.
            class_dim: Output size of ``self.head``; the head is only created
                when this is greater than zero.
        """
        super().__init__()

        self.class_dim = class_dim
        self.num_features = embed_dim
        self.embed_dim = embed_dim

        self.patch_embed = Patch_layer(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches

        # Learnable tokens start at zero and are re-initialised further down.
        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
        self.pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches, embed_dim)))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Settings that both block stacks have in common.
        shared_cfg = dict(
            dim=embed_dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            norm_layer=norm_layer,
            epsilon=epsilon,
            act_layer=act_layer,
            init_values=init_scale,
        )

        # First stack: every block gets the same drop-path value.
        main_stack = []
        for _ in range(depth):
            main_stack.append(
                block_layers(
                    mlp_ratio=mlp_ratio,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=drop_path_rate,
                    Attention_block=Attention_block,
                    Mlp_block=Mlp_block,
                    **shared_cfg,
                ))
        self.blocks = nn.LayerList(main_stack)

        # Second stack: all dropout-style regularisation is switched off.
        token_stack = []
        for _ in range(depth_token_only):
            token_stack.append(
                block_layers_token(
                    mlp_ratio=mlp_ratio_clstk,
                    drop=0.0,
                    attn_drop=0.0,
                    drop_path=0.0,
                    Attention_block=Attention_block_token_only,
                    Mlp_block=Mlp_block_token_only,
                    **shared_cfg,
                ))
        self.blocks_token_only = nn.LayerList(token_stack)

        self.norm = norm_layer(embed_dim, epsilon=epsilon)

        # Classifier head (skipped when no classes are requested).
        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        for token in (self.pos_embed, self.cls_token):
            trunc_normal_(token)
        self.apply(self._init_weights)
Beispiel #2
0
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        in_dim=48,
        depth=12,
        num_heads=12,
        in_num_head=4,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        first_stride=4,
        class_dim=1000,
    ):
        """Build the pixel embedding, projection, position tables and blocks.

        Args:
            img_size, patch_size, in_chans: Forwarded to ``PixelEmbed``.
            embed_dim: Width of the projected tokens, the class token, the
                patch position table, the final norm and the head input.
            in_dim: Passed to ``PixelEmbed`` and every ``Block``; also the
                channel count of ``pixel_pos``.
            depth: Number of ``Block`` instances.
            num_heads, in_num_head, mlp_ratio, qkv_bias: Forwarded to every
                ``Block``.
            drop_rate: Dropout probability for ``self.pos_drop`` and the
                ``drop`` argument of every block.
            attn_drop_rate: ``attn_drop`` of every block.
            drop_path_rate: Upper end of the per-block drop-path schedule,
                which is ``np.linspace(0, drop_path_rate, depth)``.
            norm_layer: Normalisation class used for the projection norms,
                the blocks and the final norm.
            first_stride: Passed to ``PixelEmbed`` as ``stride``.
            class_dim: Output size of ``self.head``; the head is only created
                when this is greater than zero.
        """
        super().__init__()
        self.class_dim = class_dim
        # num_features is kept for consistency with other models.
        self.num_features = embed_dim
        self.embed_dim = embed_dim

        self.pixel_embed = PixelEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            in_dim=in_dim,
            stride=first_stride,
        )
        num_patches = self.pixel_embed.num_patches
        self.num_patches = num_patches
        new_patch_size = self.pixel_embed.new_patch_size
        num_pixel = new_patch_size ** 2
        flat_pixel_dim = num_pixel * in_dim

        self.norm1_proj = norm_layer(flat_pixel_dim)
        self.proj = nn.Linear(flat_pixel_dim, embed_dim)
        self.norm2_proj = norm_layer(embed_dim)

        # Each parameter is created zero-filled, stored on the layer and then
        # registered explicitly under its own name.
        cls_token = self.create_parameter(
            shape=(1, 1, embed_dim), default_initializer=zeros_)
        self.cls_token = cls_token
        self.add_parameter("cls_token", self.cls_token)

        patch_pos = self.create_parameter(
            shape=(1, num_patches + 1, embed_dim), default_initializer=zeros_)
        self.patch_pos = patch_pos
        self.add_parameter("patch_pos", self.patch_pos)

        pixel_pos = self.create_parameter(
            shape=(1, in_dim, new_patch_size, new_patch_size),
            default_initializer=zeros_)
        self.pixel_pos = pixel_pos
        self.add_parameter("pixel_pos", self.pixel_pos)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # Stochastic depth decay rule: one rate per block, evenly spaced.
        dpr = np.linspace(0, drop_path_rate, depth)

        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                in_dim=in_dim,
                num_pixel=num_pixel,
                num_heads=num_heads,
                in_num_head=in_num_head,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[idx],
                norm_layer=norm_layer,
            ) for idx in range(depth)
        ])
        self.norm = norm_layer(embed_dim)

        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        for param in (self.cls_token, self.patch_pos, self.pixel_pos):
            trunc_normal_(param)
        self.apply(self._init_weights)
Beispiel #3
0
    def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.1,
        norm_layer=nn.LayerNorm,
        ape=False,
        patch_norm=True,
        class_dim=1000,
        with_pool=True,
        **kwargs,
    ):
        """Build the patch embedding, the staged layers, the norm and the head.

        Args:
            img_size, patch_size, in_chans: Forwarded to ``PatchEmbed``.
            embed_dim: Width of the first stage; stage ``i`` uses
                ``int(embed_dim * 2 ** i)``.
            depths: Per-stage block counts; its length sets the stage count.
            num_heads: Per-stage head counts, indexed like ``depths``.
            window_size, mlp_ratio, qkv_bias, qk_scale: Forwarded to every
                ``BasicLayer``.
            drop_rate: Dropout probability for ``self.pos_drop`` and the
                ``drop`` argument of every stage.
            attn_drop_rate: ``attn_drop`` of every stage.
            drop_path_rate: Upper end of the drop-path schedule
                ``np.linspace(0, drop_path_rate, sum(depths))``; each stage
                receives its contiguous slice of that schedule.
            norm_layer: Normalisation class for the stages and final norm,
                and for the patch embedding when ``patch_norm`` is true.
            ape: When true, a learnable absolute position embedding is added.
            patch_norm: When false, ``PatchEmbed`` gets ``norm_layer=None``.
            class_dim: Output size of ``self.head``; the head is only created
                when this is greater than zero.
            with_pool: When true, ``self.avgpool`` is created.
            **kwargs: Accepted and not used by this constructor.
        """
        super().__init__()
        self.class_dim = class_dim
        self.with_pool = with_pool

        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
        self.mlp_ratio = mlp_ratio

        # Split the image into non-overlapping patches.
        embed_norm = norm_layer if self.patch_norm else None
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=embed_norm,
        )
        num_patches = self.patch_embed.num_patches
        patches_resolution = self.patch_embed.patches_resolution
        self.patches_resolution = patches_resolution

        # Optional absolute position embedding.
        if self.ape:
            self.absolute_pos_embed = add_parameter(
                self, paddle.zeros((1, num_patches, embed_dim)))
            trunc_normal_(self.absolute_pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # Stochastic depth: one rate per block across all stages.
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        # Build the stages; ``stage_start`` tracks where each stage's slice of
        # the drop-path schedule begins.
        self.layers = nn.LayerList()
        stage_start = 0
        for i_layer, stage_depth in enumerate(depths):
            stage_stop = stage_start + stage_depth
            scale = 2 ** i_layer

            # Every stage except the last one ends with a PatchMerging step.
            if i_layer < self.num_layers - 1:
                downsample = PatchMerging
            else:
                downsample = None

            self.layers.append(
                BasicLayer(
                    dim=int(embed_dim * scale),
                    input_resolution=(
                        patches_resolution[0] // scale,
                        patches_resolution[1] // scale,
                    ),
                    depth=stage_depth,
                    num_heads=num_heads[i_layer],
                    window_size=window_size,
                    mlp_ratio=self.mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[stage_start:stage_stop],
                    norm_layer=norm_layer,
                    downsample=downsample,
                ))
            stage_start = stage_stop

        self.norm = norm_layer(self.num_features)

        if with_pool:
            self.avgpool = nn.AdaptiveAvgPool1D(1)

        if class_dim > 0:
            self.head = nn.Linear(self.num_features, class_dim)

        self.apply(self._init_weights)
Beispiel #4
0
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-5,
        class_dim=1000,
    ):
        """Build the patch embedding, tokens, block stack, norm and head.

        Args:
            img_size, patch_size, in_chans: Forwarded to ``PatchEmbed``.
            embed_dim: Width of the tokens, blocks, final norm and head input.
            depth: Number of ``Block`` instances.
            num_heads, mlp_ratio, qkv_bias, qk_scale: Forwarded to every
                block.
            drop_rate: Dropout probability for ``self.pos_drop`` and the
                ``drop`` argument of every block.
            attn_drop_rate: ``attn_drop`` of every block.
            drop_path_rate: Upper end of the per-block drop-path schedule
                ``np.linspace(0, drop_path_rate, depth)``.
            norm_layer, epsilon: Forwarded to every block and used for the
                final norm.
            class_dim: Output size of ``self.head``; the head is only created
                when this is greater than zero.
        """
        super().__init__()
        self.class_dim = class_dim
        self.num_features = embed_dim
        self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches

        # The position table has one extra slot beyond the patch count.
        self.pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches + 1, embed_dim)))
        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))

        self.pos_drop = nn.Dropout(p=drop_rate)

        # One drop-path rate per block, evenly spaced from 0 upwards.
        dpr = np.linspace(0, drop_path_rate, depth)

        block_cfg = dict(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            norm_layer=norm_layer,
            epsilon=epsilon,
        )
        stack = []
        for idx in range(depth):
            stack.append(Block(drop_path=dpr[idx], **block_cfg))
        self.blocks = nn.LayerList(stack)

        self.norm = norm_layer(embed_dim, epsilon=epsilon)

        # Classifier head (skipped when no classes are requested).
        if class_dim > 0:
            self.head = nn.Linear(embed_dim, class_dim)

        # Custom initialisation is only applied in dynamic mode.
        if paddle.in_dynamic_mode():
            for token in (self.pos_embed, self.cls_token):
                trunc_normal_(token)
            self.apply(self._init_weights)
Beispiel #5
0
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dims=[0, 0, 0, 0],
        serial_depths=[0, 0, 0, 0],
        parallel_depth=0,
        num_heads=0,
        mlp_ratios=[0, 0, 0, 0],
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        return_interm_layers=False,
        out_features=None,
        crpe_window={
            3: 2,
            5: 3,
            7: 3
        },
        class_dim=1000,
        **kwargs,
    ):
        """Build four-scale embeddings, position encoders, blocks and the head.

        Args:
            img_size, patch_size, in_chans: Forwarded to the first
                ``PatchEmbed``; later embeddings use ``img_size // 4``,
                ``// 8`` and ``// 16`` with ``patch_size=2``.
            embed_dims: Four per-scale widths.
            serial_depths: Four per-scale ``SerialBlock`` counts.
            parallel_depth: Number of ``ParallelBlock`` instances; none are
                created when it is zero.
            num_heads: Head count for all blocks; also divides each width to
                obtain ``Ch`` for ``ConvRelPosEnc``.
            mlp_ratios: Four per-scale MLP ratios.
            qkv_bias, qk_scale, norm_layer, epsilon: Forwarded to all blocks.
            drop_rate: ``drop`` of all blocks.
            attn_drop_rate: ``attn_drop`` of all blocks.
            drop_path_rate: Must equal ``0.0`` (checked with ``assert``).
            return_interm_layers: When true, no norms or head are created.
            out_features: Stored on the instance as-is.
            crpe_window: Passed to every ``ConvRelPosEnc`` as ``window``.
            class_dim: Output size of ``self.head``.
            **kwargs: Accepted and not used by this constructor.
        """
        super().__init__()
        self.return_interm_layers = return_interm_layers
        self.out_features = out_features
        self.class_dim = class_dim

        # Patch embeddings: each scale consumes the previous scale's width.
        self.patch_embed1 = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
        )
        self.patch_embed2 = PatchEmbed(
            img_size=img_size // 4,
            patch_size=2,
            in_chans=embed_dims[0],
            embed_dim=embed_dims[1],
        )
        self.patch_embed3 = PatchEmbed(
            img_size=img_size // 8,
            patch_size=2,
            in_chans=embed_dims[1],
            embed_dim=embed_dims[2],
        )
        self.patch_embed4 = PatchEmbed(
            img_size=img_size // 16,
            patch_size=2,
            in_chans=embed_dims[2],
            embed_dim=embed_dims[3],
        )

        # Class tokens, one per scale.
        def zero_token(width):
            return add_parameter(self, paddle.zeros((1, 1, width)))

        self.cls_token1 = zero_token(embed_dims[0])
        self.cls_token2 = zero_token(embed_dims[1])
        self.cls_token3 = zero_token(embed_dims[2])
        self.cls_token4 = zero_token(embed_dims[3])

        # Convolutional position encodings.
        self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3)
        self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3)
        self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3)
        self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3)

        # Convolutional relative position encodings.
        def rel_pos_enc(width):
            return ConvRelPosEnc(Ch=width // num_heads,
                                 h=num_heads,
                                 window=crpe_window)

        self.crpe1 = rel_pos_enc(embed_dims[0])
        self.crpe2 = rel_pos_enc(embed_dims[1])
        self.crpe3 = rel_pos_enc(embed_dims[2])
        self.crpe4 = rel_pos_enc(embed_dims[3])

        # Disable stochastic depth.
        dpr = drop_path_rate
        assert dpr == 0.0

        # Settings shared by serial and parallel blocks alike.
        block_cfg = dict(
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr,
            norm_layer=norm_layer,
            epsilon=epsilon,
        )

        def serial_stage(scale, cpe, crpe):
            # Every block of one scale shares that scale's position encoders.
            return nn.LayerList([
                SerialBlock(
                    dim=embed_dims[scale],
                    mlp_ratio=mlp_ratios[scale],
                    shared_cpe=cpe,
                    shared_crpe=crpe,
                    **block_cfg,
                ) for _ in range(serial_depths[scale])
            ])

        # Serial blocks, scales 1 to 4.
        self.serial_blocks1 = serial_stage(0, self.cpe1, self.crpe1)
        self.serial_blocks2 = serial_stage(1, self.cpe2, self.crpe2)
        self.serial_blocks3 = serial_stage(2, self.cpe3, self.crpe3)
        self.serial_blocks4 = serial_stage(3, self.cpe4, self.crpe4)

        # Parallel blocks.
        self.parallel_depth = parallel_depth
        if self.parallel_depth > 0:
            parallel_stack = []
            for _ in range(parallel_depth):
                parallel_stack.append(
                    ParallelBlock(
                        dims=embed_dims,
                        mlp_ratios=mlp_ratios,
                        shared_cpes=[
                            self.cpe1, self.cpe2, self.cpe3, self.cpe4
                        ],
                        shared_crpes=[
                            self.crpe1, self.crpe2, self.crpe3, self.crpe4
                        ],
                        **block_cfg,
                    ))
            self.parallel_blocks = nn.LayerList(parallel_stack)

        # Classification head(s).
        if not self.return_interm_layers:
            self.norm1 = norm_layer(embed_dims[0], epsilon=epsilon)
            self.norm2 = norm_layer(embed_dims[1], epsilon=epsilon)
            self.norm3 = norm_layer(embed_dims[2], epsilon=epsilon)
            self.norm4 = norm_layer(embed_dims[3], epsilon=epsilon)

            # CoaT series: aggregate features of the last three scales for
            # classification. CoaT-Lite series (no parallel blocks): use the
            # feature of the last scale only, so no aggregation layer.
            if self.parallel_depth > 0:
                assert embed_dims[1] == embed_dims[2] == embed_dims[3]
                self.aggregate = nn.Conv1D(in_channels=3,
                                           out_channels=1,
                                           kernel_size=1)
            self.head = nn.Linear(embed_dims[3], class_dim)

        # Initialize weights.
        for token in (self.cls_token1, self.cls_token2, self.cls_token3,
                      self.cls_token4):
            trunc_normal_(token)
        self.apply(self._init_weights)
Beispiel #6
0
    def __init__(
        self,
        image_size,
        patch_size,
        stride,
        base_dims,
        depth,
        heads,
        mlp_ratio,
        in_chans=3,
        attn_drop_rate=0.0,
        drop_rate=0.0,
        drop_path_rate=0.0,
        class_dim=1000,
    ):
        """Build the embedding, per-stage transformers, pooling layers and head.

        Args:
            image_size, patch_size, stride: Determine the side length of the
                position table, ``floor((image_size - patch_size) / stride
                + 1)``; ``patch_size`` and ``stride`` also go to
                ``conv_embedding``.
            base_dims: Per-stage base widths; a stage's channel count is
                ``base_dims[i] * heads[i]``.
            depth: Per-stage block counts.
            heads: Per-stage head counts.
            mlp_ratio: Forwarded to every ``Transformer``.
            in_chans: Input channel count for ``conv_embedding``.
            attn_drop_rate, drop_rate: Forwarded to every ``Transformer``;
                ``drop_rate`` is also the probability of ``self.pos_drop``.
            drop_path_rate: Scales the per-block drop-path values, which are
                ``drop_path_rate * i / sum(depth)`` for global block index
                ``i``.
            class_dim: Output size of ``self.head``; the head is only created
                when this is greater than zero.
        """
        super(PoolingTransformer, self).__init__()

        total_block = sum(depth)
        padding = 0

        width = math.floor(
            (image_size + 2 * padding - patch_size) / stride + 1)

        self.base_dims = base_dims
        self.heads = heads
        self.class_dim = class_dim

        self.patch_size = patch_size

        # Channel count of the first stage.
        stem_dim = base_dims[0] * heads[0]

        self.pos_embed = add_parameter(
            self, paddle.randn((1, stem_dim, width, width)))

        self.patch_embed = conv_embedding(in_chans, stem_dim, patch_size,
                                          stride, padding)

        self.cls_token = add_parameter(self, paddle.randn((1, 1, stem_dim)))

        self.pos_drop = nn.Dropout(p=drop_rate)

        self.transformers = nn.LayerList([])
        self.pools = nn.LayerList([])

        # ``first_block`` is the global index of the current stage's first
        # block, so the drop-path values keep growing across stages.
        first_block = 0
        for stage, stage_depth in enumerate(depth):
            next_first = first_block + stage_depth
            drop_path_prob = [
                drop_path_rate * i / total_block
                for i in range(first_block, next_first)
            ]
            first_block = next_first

            self.transformers.append(
                Transformer(
                    base_dims[stage],
                    stage_depth,
                    heads[stage],
                    mlp_ratio,
                    drop_rate,
                    attn_drop_rate,
                    drop_path_prob,
                ))

            # A pooling layer follows every stage except the last one.
            if stage < len(heads) - 1:
                self.pools.append(
                    conv_head_pooling(
                        base_dims[stage] * heads[stage],
                        base_dims[stage + 1] * heads[stage + 1],
                        stride=2,
                    ))

        # Channel count of the last stage.
        final_dim = base_dims[-1] * heads[-1]
        self.norm = nn.LayerNorm(final_dim, epsilon=1e-6)
        self.embed_dim = final_dim

        # Classifier head (skipped when no classes are requested).
        if class_dim > 0:
            self.head = nn.Linear(final_dim, class_dim)

        for param in (self.pos_embed, self.cls_token):
            trunc_normal_(param)
        self.apply(self._init_weights)
Beispiel #7
0
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=3.0,
        qkv_bias=False,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        drop_path_decay="linear",
        hybrid_backbone=None,
        norm_layer=nn.LayerNorm,
        p_emb="4_2",
        head_dim=None,
        skip_lam=1.0,
        order=None,
        mix_token=True,
        return_dense=True,
        class_dim=1000,
    ):
        """Build the patch embedding, block stack, norm and classifier heads.

        Args:
            img_size, patch_size, in_chans: Forwarded to the patch embedding
                (``patch_size`` is not passed to ``HybridEmbed``).
            embed_dim: Width of the tokens, blocks, norm and head inputs.
            depth: Number of ``Block`` instances when ``order`` is ``None``.
            num_heads, head_dim, mlp_ratio, qkv_bias, qk_scale, skip_lam:
                Forwarded to every block.
            drop_rate: Dropout probability for ``self.pos_drop`` and the
                ``drop`` argument of every block.
            attn_drop_rate: ``attn_drop`` of every block.
            drop_path_rate, drop_path_decay: Passed to ``get_dpr`` to obtain
                the per-block drop-path values.
            hybrid_backbone: When not ``None``, ``HybridEmbed`` wraps it and
                ``p_emb`` is ignored.
            norm_layer: Normalisation class for the blocks and final norm.
            p_emb: ``"4_2"`` selects ``PatchEmbed4_2``, ``"4_2_128"`` selects
                ``PatchEmbed4_2_128``, anything else ``PatchEmbedNaive``.
            order: Optional sequence; when given, one block per entry is
                created through ``get_block`` instead of ``Block``.
            mix_token: Stored on the instance; when true ``self.beta`` is set
                to ``1.0`` and ``return_dense`` must also be true.
            return_dense: Stored on the instance; together with
                ``class_dim > 0`` it enables ``self.aux_head``.
            class_dim: Output size of ``self.head`` and ``self.aux_head``;
                neither is created unless it is greater than zero.
        """
        super().__init__()
        self.class_dim = class_dim
        # num_features is kept for consistency with other models.
        self.num_features = embed_dim
        self.embed_dim = embed_dim
        if class_dim == 0:
            self.output_dim = embed_dim
        else:
            self.output_dim = class_dim

        if hybrid_backbone is None:
            # Pick the convolutional stem variant by name.
            if p_emb == "4_2":
                patch_embed_fn = PatchEmbed4_2
            elif p_emb == "4_2_128":
                patch_embed_fn = PatchEmbed4_2_128
            else:
                patch_embed_fn = PatchEmbedNaive

            self.patch_embed = patch_embed_fn(
                img_size=img_size,
                patch_size=patch_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
            )
        else:
            self.patch_embed = HybridEmbed(
                hybrid_backbone,
                img_size=img_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
            )

        num_patches = self.patch_embed.num_patches

        self.cls_token = add_parameter(self, paddle.zeros((1, 1, embed_dim)))
        self.pos_embed = add_parameter(
            self, paddle.zeros((1, num_patches + 1, embed_dim)))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Everything a block needs apart from its own drop-path value.
        block_cfg = dict(
            dim=embed_dim,
            num_heads=num_heads,
            head_dim=head_dim,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            norm_layer=norm_layer,
            skip_lam=skip_lam,
        )

        if order is None:
            dpr = get_dpr(drop_path_rate, depth, drop_path_decay)
            stack = [
                Block(drop_path=dpr[idx], **block_cfg)
                for idx in range(depth)
            ]
        else:
            # Use the given order to sequentially generate modules.
            dpr = get_dpr(drop_path_rate, len(order), drop_path_decay)
            stack = [
                get_block(order[idx], drop_path=dpr[idx], **block_cfg)
                for idx in range(len(order))
            ]
        self.blocks = nn.LayerList(stack)

        self.norm = norm_layer(embed_dim)

        has_classes = class_dim > 0
        if has_classes:
            self.head = nn.Linear(embed_dim, class_dim)

        self.return_dense = return_dense
        self.mix_token = mix_token

        # A second linear head is added when return_dense is enabled.
        if return_dense and has_classes:
            self.aux_head = nn.Linear(embed_dim, class_dim)

        if mix_token:
            self.beta = 1.0
            assert return_dense, "always return all features when mixtoken is enabled"

        for param in (self.pos_embed, self.cls_token):
            trunc_normal_(param)
        self.apply(self._init_weights)
Beispiel #8
0
    def __init__(
        self,
        img_size=224,
        patch_size=4,
        in_chans=3,
        embed_dims=[64, 128, 320, 512],
        num_heads=[1, 2, 5, 8],
        mlp_ratios=[8, 8, 4, 4],
        qkv_bias=True,
        qk_scale=None,
        drop_rate=0.0,
        attn_drop_rate=0.0,
        drop_path_rate=0.0,
        norm_layer=nn.LayerNorm,
        epsilon=1e-6,
        depths=[3, 4, 6, 3],
        sr_ratios=[8, 4, 2, 1],
        class_dim=1000,
    ):
        """Build four-stage embeddings, position tables, blocks and the head.

        Args:
            img_size, patch_size, in_chans: Forwarded to the first
                ``PatchEmbed``; later embeddings use ``img_size // 4``,
                ``// 8`` and ``// 16`` with ``patch_size=2``.
            embed_dims: Four per-stage widths.
            num_heads: Four per-stage head counts.
            mlp_ratios: Four per-stage MLP ratios.
            qkv_bias, qk_scale, norm_layer, epsilon: Forwarded to every
                ``Block``; ``norm_layer``/``epsilon`` also build the final
                norm.
            drop_rate: Dropout probability for the four ``pos_drop`` layers
                and the ``drop`` argument of every block.
            attn_drop_rate: ``attn_drop`` of every block.
            drop_path_rate: Upper end of the drop-path schedule
                ``np.linspace(0, drop_path_rate, sum(depths))``, consumed
                stage after stage.
            depths: Four per-stage block counts. A copy is stored as
                ``self.depths``.
            sr_ratios: Four per-stage ``sr_ratio`` values for ``Block``.
            class_dim: Output size of ``self.head``; the head is only created
                when this is greater than zero.
        """
        super().__init__()
        self.class_dim = class_dim
        # Store a copy: ``depths`` may be the shared default list object, and
        # keeping a reference would let an in-place edit of ``self.depths``
        # leak into every later construction that relies on the default.
        self.depths = list(depths)

        # patch_embed: each stage consumes the previous stage's width.
        self.patch_embed1 = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
        )
        self.patch_embed2 = PatchEmbed(
            img_size=img_size // 4,
            patch_size=2,
            in_chans=embed_dims[0],
            embed_dim=embed_dims[1],
        )
        self.patch_embed3 = PatchEmbed(
            img_size=img_size // 8,
            patch_size=2,
            in_chans=embed_dims[1],
            embed_dim=embed_dims[2],
        )
        self.patch_embed4 = PatchEmbed(
            img_size=img_size // 16,
            patch_size=2,
            in_chans=embed_dims[2],
            embed_dim=embed_dims[3],
        )

        # pos_embed: one zero-filled table per stage, each with its dropout.
        self.pos_embed1 = add_parameter(
            self,
            paddle.zeros((1, self.patch_embed1.num_patches, embed_dims[0])))
        self.pos_drop1 = nn.Dropout(p=drop_rate)

        self.pos_embed2 = add_parameter(
            self,
            paddle.zeros((1, self.patch_embed2.num_patches, embed_dims[1])))
        self.pos_drop2 = nn.Dropout(p=drop_rate)

        self.pos_embed3 = add_parameter(
            self,
            paddle.zeros((1, self.patch_embed3.num_patches, embed_dims[2])))
        self.pos_drop3 = nn.Dropout(p=drop_rate)

        # The last table has one extra slot beyond the patch count.
        self.pos_embed4 = add_parameter(
            self,
            paddle.zeros(
                (1, self.patch_embed4.num_patches + 1, embed_dims[3])))
        self.pos_drop4 = nn.Dropout(p=drop_rate)

        # transformer encoder: one drop-path rate per block over all stages.
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        def build_stage(stage, offset):
            # ``offset`` is the index in ``dpr`` of this stage's first block.
            return nn.LayerList([
                Block(
                    dim=embed_dims[stage],
                    num_heads=num_heads[stage],
                    mlp_ratio=mlp_ratios[stage],
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[offset + i],
                    norm_layer=norm_layer,
                    epsilon=epsilon,
                    sr_ratio=sr_ratios[stage],
                ) for i in range(depths[stage])
            ])

        cur = 0
        self.block1 = build_stage(0, cur)

        cur += depths[0]
        self.block2 = build_stage(1, cur)

        cur += depths[1]
        self.block3 = build_stage(2, cur)

        cur += depths[2]
        self.block4 = build_stage(3, cur)
        self.norm = norm_layer(embed_dims[3], epsilon=epsilon)

        # cls_token
        self.cls_token = add_parameter(self, paddle.zeros(
            (1, 1, embed_dims[3])))

        # classification head (skipped when no classes are requested)
        if class_dim > 0:
            self.head = nn.Linear(embed_dims[3], class_dim)

        # init weights
        trunc_normal_(self.pos_embed1)
        trunc_normal_(self.pos_embed2)
        trunc_normal_(self.pos_embed3)
        trunc_normal_(self.pos_embed4)
        trunc_normal_(self.cls_token)
        self.apply(self._init_weights)