def __init__(
    self,
    dim,
    num_heads,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    epsilon=1e-5,
):
    super().__init__()
    self.norm1 = norm_layer(dim, epsilon=epsilon)
    self.attn = Attention(
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(dim, epsilon=epsilon)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )
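# Illustrative sketch (assumption, not from this file): a pre-norm block wired as
# above is usually applied with two stochastic-depth residual branches. The forward
# below only shows the assumed roles of norm1/attn and norm2/mlp.
def forward(self, x):
    # Attention branch with residual connection.
    x = x + self.drop_path(self.attn(self.norm1(x)))
    # MLP branch with residual connection.
    x = x + self.drop_path(self.mlp(self.norm2(x)))
    return x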
def __init__(
    self,
    dim,
    num_heads,
    head_dim=None,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    group=1,
    skip_lam=1.0,
):
    super().__init__()
    self.skip_lam = skip_lam
    self.dim = dim
    self.mlp_hidden_dim = int(dim * mlp_ratio)
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(dim)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=self.mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
        group=group,
    )
def __init__(
    self,
    dim,
    num_heads,
    head_dim=None,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    group=1,
    skip_lam=1.0,
):
    super().__init__()
    self.dim = dim
    self.norm1 = norm_layer(dim)
    self.skip_lam = skip_lam
    self.attn = Attention(
        dim,
        num_heads=num_heads,
        head_dim=head_dim,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
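# Illustrative sketch (assumption, not from this file): blocks configured with
# skip_lam typically damp each residual branch by 1 / skip_lam. An assumed forward
# for the attention-only block above; the FFN-only block mirrors it with
# self.mlp(self.norm2(x)).
def forward(self, x):
    # Residual attention branch, scaled down by skip_lam.
    x = x + self.drop_path(self.attn(self.norm1(x))) / self.skip_lam
    return x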
def __init__(
    self,
    dim,
    in_dim,
    num_pixel,
    num_heads=12,
    in_num_head=4,
    mlp_ratio=4.0,
    qkv_bias=False,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
):
    super().__init__()
    # Inner transformer
    self.norm_in = norm_layer(in_dim)
    self.attn_in = Attention(
        in_dim,
        in_dim,
        num_heads=in_num_head,
        qkv_bias=qkv_bias,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.norm_mlp_in = norm_layer(in_dim)
    self.mlp_in = Mlp(
        in_features=in_dim,
        hidden_features=int(in_dim * 4),
        out_features=in_dim,
        act_layer=act_layer,
        drop=drop,
    )
    self.norm1_proj = norm_layer(in_dim)
    self.proj = nn.Linear(in_dim * num_pixel, dim)
    # Outer transformer
    self.norm_out = norm_layer(dim)
    self.attn_out = Attention(
        dim,
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm_mlp = norm_layer(dim)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=int(dim * mlp_ratio),
        out_features=dim,
        act_layer=act_layer,
        drop=drop,
    )
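# Illustrative sketch (assumption, not from this file): a TNT-style block first runs
# the inner transformer over pixel embeddings, folds them into the patch embeddings
# via norm1_proj/proj (leaving the class token untouched), then runs the outer
# transformer over the patch embeddings.
def forward(self, pixel_embed, patch_embed):
    # Inner transformer over pixel embeddings.
    pixel_embed = pixel_embed + self.drop_path(
        self.attn_in(self.norm_in(pixel_embed)))
    pixel_embed = pixel_embed + self.drop_path(
        self.mlp_in(self.norm_mlp_in(pixel_embed)))
    # Project pixel embeddings into the patch embeddings (class token kept as-is).
    B, N, C = patch_embed.shape
    pixel_proj = self.norm1_proj(pixel_embed).reshape((B, N - 1, -1))
    patch_embed = paddle.concat(
        [patch_embed[:, 0:1], patch_embed[:, 1:] + self.proj(pixel_proj)],
        axis=1)
    # Outer transformer over patch embeddings.
    patch_embed = patch_embed + self.drop_path(
        self.attn_out(self.norm_out(patch_embed)))
    patch_embed = patch_embed + self.drop_path(
        self.mlp(self.norm_mlp(patch_embed)))
    return pixel_embed, patch_embed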
def __init__(
    self,
    dim,
    num_heads,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    shared_cpe=None,
    shared_crpe=None,
):
    super().__init__()
    # Conv-Attention.
    self.cpe = shared_cpe
    self.norm1 = norm_layer(dim, epsilon=epsilon)
    self.factoratt_crpe = FactorAtt_ConvRelPosEnc(
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
        shared_crpe=shared_crpe,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    # MLP.
    self.norm2 = norm_layer(dim, epsilon=epsilon)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )
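# Illustrative sketch (assumption, not from this file): a serial block like the one
# above is usually driven with the spatial size alongside the tokens, applying the
# shared convolutional position encoding before the factorized attention.
def forward(self, x, size):
    # Conv position encoding, then factorized attention with conv relative position encoding.
    x = self.cpe(x, size)
    cur = self.norm1(x)
    cur = self.factoratt_crpe(cur, size)
    x = x + self.drop_path(cur)
    # MLP branch.
    cur = self.norm2(x)
    cur = self.mlp(cur)
    x = x + self.drop_path(cur)
    return x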
def __init__(
    self,
    dim,
    in_dim,
    num_heads,
    mlp_ratio=1.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
):
    super().__init__()
    self.norm1 = norm_layer(dim)
    self.attn = Attention(
        dim,
        in_dim=in_dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(in_dim)
    self.mlp = Mlp(
        in_features=in_dim,
        hidden_features=int(in_dim * mlp_ratio),
        out_features=in_dim,
        act_layer=act_layer,
        drop=drop,
    )
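# Illustrative sketch (assumption, not from this file): because the attention here
# maps `dim` to `in_dim`, its branch cannot take a plain residual with the input; in
# typical T2T-style implementations the skip is handled inside Attention and only
# the MLP branch keeps an outer residual, which is why norm2 operates on in_dim.
def forward(self, x):
    x = self.attn(self.norm1(x))
    x = x + self.drop_path(self.mlp(self.norm2(x)))
    return x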
def __init__(
    self,
    dim,
    num_heads,
    mlp_ratio=4.0,
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    Attention_block=Attention_talking_head,
    Mlp_block=Mlp,
    init_values=1e-4,
):
    super().__init__()
    self.norm1 = norm_layer(dim, epsilon=epsilon)
    self.attn = Attention_block(
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(dim, epsilon=epsilon)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp_block(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )
    self.gamma_1 = add_parameter(self, init_values * paddle.ones((dim, )))
    self.gamma_2 = add_parameter(self, init_values * paddle.ones((dim, )))
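# Illustrative sketch (assumption, not from this file): gamma_1 and gamma_2 act as
# LayerScale factors, i.e. small learnable per-channel weights applied to each
# residual branch before it is added back to the input.
def forward(self, x):
    x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x)))
    x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
    return x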
def __init__(
    self,
    levels,
    block,
    in_channels,
    out_channels,
    stride=1,
    dilation=1,
    cardinality=1,
    base_width=64,
    level_root=False,
    root_dim=0,
    root_kernel_size=1,
    root_residual=False,
):
    super(DlaTree, self).__init__()
    if root_dim == 0:
        root_dim = 2 * out_channels
    if level_root:
        root_dim += in_channels
    self.downsample = (
        nn.MaxPool2D(stride, stride=stride) if stride > 1 else Identity())
    self.project = Identity()
    cargs = dict(
        dilation=dilation, cardinality=cardinality, base_width=base_width)
    if levels == 1:
        self.tree1 = block(in_channels, out_channels, stride, **cargs)
        self.tree2 = block(out_channels, out_channels, 1, **cargs)
        if in_channels != out_channels:
            self.project = nn.Sequential(
                nn.Conv2D(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=1,
                    bias_attr=False,
                ),
                nn.BatchNorm2D(out_channels),
            )
    else:
        cargs.update(
            dict(
                root_kernel_size=root_kernel_size,
                root_residual=root_residual))
        self.tree1 = DlaTree(
            levels - 1,
            block,
            in_channels,
            out_channels,
            stride,
            root_dim=0,
            **cargs)
        self.tree2 = DlaTree(
            levels - 1,
            block,
            out_channels,
            out_channels,
            root_dim=root_dim + out_channels,
            **cargs)
    if levels == 1:
        self.root = DlaRoot(root_dim, out_channels, root_kernel_size,
                            root_residual)
    self.level_root = level_root
    self.root_dim = root_dim
    self.levels = levels
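# Illustrative sketch (assumption, not from this file): a DLA tree typically
# downsamples and projects the input for the residual, recurses through tree1/tree2,
# and aggregates the leaf outputs (plus any level-root children) in the root node.
def forward(self, x, residual=None, children=None):
    children = [] if children is None else children
    bottom = self.downsample(x)
    residual = self.project(bottom)
    if self.level_root:
        children.append(bottom)
    x1 = self.tree1(x, residual)
    if self.levels == 1:
        # Leaf level: fuse both branches (and collected children) in the root.
        x2 = self.tree2(x1)
        x = self.root(x2, x1, *children)
    else:
        # Recursive level: pass accumulated children down to the next tree.
        children.append(x1)
        x = self.tree2(x1, children=children)
    return x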
def __init__(
    self,
    dim,
    input_resolution,
    num_heads,
    window_size=7,
    shift_size=0,
    mlp_ratio=4.0,
    qkv_bias=True,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
):
    super().__init__()
    self.dim = dim
    self.input_resolution = input_resolution
    self.num_heads = num_heads
    self.window_size = window_size
    self.shift_size = shift_size
    self.mlp_ratio = mlp_ratio
    if min(self.input_resolution) <= self.window_size:
        # if window size is larger than input resolution, we don't partition windows
        self.shift_size = 0
        self.window_size = min(self.input_resolution)
    assert (0 <= self.shift_size < self.window_size
            ), "shift_size must be in 0-window_size"

    self.norm1 = norm_layer(dim)
    self.attn = WindowAttention(
        dim,
        window_size=to_2tuple(self.window_size),
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    self.norm2 = norm_layer(dim)
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = Mlp(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )

    if self.shift_size > 0:
        # calculate attention mask for SW-MSA
        H, W = self.input_resolution
        img_mask = paddle.zeros((1, H, W, 1))  # 1 H W 1
        h_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        w_slices = (
            slice(0, -self.window_size),
            slice(-self.window_size, -self.shift_size),
            slice(-self.shift_size, None),
        )
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        # nW, window_size, window_size, 1
        mask_windows = window_partition(img_mask, self.window_size)
        mask_windows = mask_windows.reshape(
            (-1, self.window_size * self.window_size))
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        _h = paddle.full_like(attn_mask, -100.0, dtype="float32")
        _z = paddle.full_like(attn_mask, 0.0, dtype="float32")
        attn_mask = paddle.where(attn_mask != 0, _h, _z)
    else:
        attn_mask = None

    self.register_buffer("attn_mask", attn_mask)
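# Illustrative sketch (assumption, not from this file): the mask construction above
# relies on a window_partition helper; a definition consistent with that usage cuts a
# (B, H, W, C) feature map into non-overlapping window_size x window_size tiles.
def window_partition(x, window_size):
    B, H, W, C = x.shape
    x = x.reshape(
        (B, H // window_size, window_size, W // window_size, window_size, C))
    # (num_windows * B, window_size, window_size, C)
    windows = x.transpose((0, 1, 3, 2, 4, 5)).reshape(
        (-1, window_size, window_size, C))
    return windows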
def __init__(
    self,
    dims,
    num_heads,
    mlp_ratios=[],
    qkv_bias=False,
    qk_scale=None,
    drop=0.0,
    attn_drop=0.0,
    drop_path=0.0,
    act_layer=nn.GELU,
    norm_layer=nn.LayerNorm,
    epsilon=1e-6,
    shared_cpes=None,
    shared_crpes=None,
):
    super().__init__()
    # Conv-Attention.
    self.cpes = shared_cpes
    self.norm12 = norm_layer(dims[1], epsilon=epsilon)
    self.norm13 = norm_layer(dims[2], epsilon=epsilon)
    self.norm14 = norm_layer(dims[3], epsilon=epsilon)
    self.factoratt_crpe2 = FactorAtt_ConvRelPosEnc(
        dims[1],
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
        shared_crpe=shared_crpes[1],
    )
    self.factoratt_crpe3 = FactorAtt_ConvRelPosEnc(
        dims[2],
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
        shared_crpe=shared_crpes[2],
    )
    self.factoratt_crpe4 = FactorAtt_ConvRelPosEnc(
        dims[3],
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
        shared_crpe=shared_crpes[3],
    )
    self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
    # MLP.
    self.norm22 = norm_layer(dims[1], epsilon=epsilon)
    self.norm23 = norm_layer(dims[2], epsilon=epsilon)
    self.norm24 = norm_layer(dims[3], epsilon=epsilon)
    # In the parallel block, we assume the dimensions are the same and share the linear transformation.
    assert dims[1] == dims[2] == dims[3]
    assert mlp_ratios[1] == mlp_ratios[2] == mlp_ratios[3]
    mlp_hidden_dim = int(dims[1] * mlp_ratios[1])
    self.mlp2 = self.mlp3 = self.mlp4 = Mlp(
        in_features=dims[1],
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop,
    )