Example 1
    def __init__(
        self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None
    ):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
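        # grid of patches along (height, width) after non-overlapping splitting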
        patches_resolution = [
            img_size[0] // patch_size[0],
            img_size[1] // patch_size[1],
        ]
        self.img_size = img_size
        self.patch_size = patch_size
        self.patches_resolution = patches_resolution
        self.num_patches = patches_resolution[0] * patches_resolution[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
        )
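        # optional normalization applied to the patch embeddings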
        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None
Example 2
    def __init__(self,
                 backbone,
                 img_size=224,
                 feature_size=None,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Layer)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            # infer the backbone's output feature map size by running a
            # dummy zero image through it
            with paddle.no_grad():
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(
                    paddle.zeros((1, in_chans, img_size[0], img_size[1])))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                # paddle's Layer.train() takes no arguments, so restore the
                # original mode explicitly
                if training:
                    backbone.train()
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        # 1x1 conv projects the backbone feature map to the embedding dimension
        self.proj = nn.Conv2D(feature_dim, embed_dim, kernel_size=1)
Example 3
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
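        # total number of non-overlapping patches covering the image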
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size
        )
Example 4
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()

        new_patch_size = to_2tuple(patch_size // 2)
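        # the stride-2 stem below already halves the resolution, so the final
        # projection only needs half of the nominal patch size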

        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] //
                                                        patch_size[0])
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.embed_dim = embed_dim

        self.conv1 = nn.Conv2D(in_chans,
                               128,
                               kernel_size=7,
                               stride=2,
                               padding=3,
                               bias_attr=False)  # 112x112
        self.bn1 = nn.BatchNorm2D(128)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2D(128,
                               128,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias_attr=False)  # 112x112
        self.bn2 = nn.BatchNorm2D(128)
        self.conv3 = nn.Conv2D(128,
                               128,
                               kernel_size=3,
                               stride=1,
                               padding=1,
                               bias_attr=False)
        self.bn3 = nn.BatchNorm2D(128)

        self.proj = nn.Conv2D(128,
                              embed_dim,
                              kernel_size=new_patch_size,
                              stride=new_patch_size)
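        # proj turns the 128-channel stem output into embed_dim-dimensional patch tokens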
Example 5
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.img_size = img_size
        self.patch_size = patch_size
        assert (
            img_size[0] % patch_size[0] == 0
            and img_size[1] % patch_size[1] == 0
        ), f"img_size {img_size} should be divisible by patch_size {patch_size}."
        # Note: self.H, self.W and self.num_patches are not used,
        # since the image size may change on the fly.
        self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2D(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=patch_size)
        self.norm = nn.LayerNorm(embed_dim)
Example 6
    def __init__(
        self,
        dim,
        input_resolution,
        num_heads,
        window_size=7,
        shift_size=0,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert (
            0 <= self.shift_size < self.window_size
        ), "shift_size must be in the range [0, window_size)"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
        )

        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            H, W = self.input_resolution
            img_mask = paddle.zeros((1, H, W, 1))  # 1 H W 1

            h_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            w_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
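            # give each (h, w) region a distinct index; tokens with different
            # indices share a window only because of the cyclic shift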
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            # nW, window_size, window_size, 1
            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.reshape(
                (-1, self.window_size * self.window_size)
            )
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)

            _h = paddle.full_like(attn_mask, -100.0, dtype="float32")
            _z = paddle.full_like(attn_mask, 0.0, dtype="float32")
            attn_mask = paddle.where(attn_mask != 0, _h, _z)
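            # equivalent to masked_fill: pairs from different regions receive a
            # large negative bias (-100.0) so attention ignores them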

        else:
            attn_mask = None

        self.register_buffer("attn_mask", attn_mask)