def __init__(self,
             in_channels=3,
             embed_dims=768,
             conv_type=None,
             kernel_size=16,
             stride=16,
             padding=0,
             dilation=1,
             pad_to_patch_size=True,
             norm_cfg=None,
             init_cfg=None):
    """Create the convolutional patch projection and its optional norm.

    Args:
        in_channels: Input channel count passed to the projection conv.
        embed_dims: Output channel count of the projection; also the
            feature size handed to the norm layer.
        conv_type: ``type`` entry of the conv config. Any falsy value
            selects ``'Conv2d'``.
        kernel_size: Kernel size of the projection. It is also recorded
            as ``self.patch_size`` (expanded to a pair when it is an int
            or a 1-tuple).
        stride: Stride of the projection; ``None`` means "same as
            ``kernel_size``".
        padding: Forwarded unchanged to the projection conv.
        dilation: Forwarded unchanged to the projection conv.
        pad_to_patch_size: Only stored on the instance here.
        norm_cfg: Config for the norm layer; ``None`` leaves
            ``self.norm`` as ``None``.
        init_cfg: Only stored on the instance here.

    Raises:
        AssertionError: If ``kernel_size`` is a tuple whose length is
            neither 1 nor 2.
    """
    super(PatchEmbed, self).__init__()

    self.init_cfg = init_cfg
    self.embed_dims = embed_dims
    self.pad_to_patch_size = pad_to_patch_size

    # The patch size is taken from the kernel size. Only ints and tuples
    # are normalised; any other type is stored untouched.
    if isinstance(kernel_size, int):
        patch_size = to_2tuple(kernel_size)
    elif isinstance(kernel_size, tuple):
        if len(kernel_size) == 1:
            patch_size = to_2tuple(kernel_size[0])
        else:
            patch_size = kernel_size
        assert len(patch_size) == 2, \
            f'The size of patch should have length 1 or 2, ' \
            f'but got {len(patch_size)}'
    else:
        patch_size = kernel_size
    self.patch_size = patch_size

    # The embedding itself is a single conv layer.
    projection_cfg = dict(type=conv_type or 'Conv2d')
    effective_stride = kernel_size if stride is None else stride
    self.projection = build_conv_layer(
        projection_cfg,
        in_channels=in_channels,
        out_channels=embed_dims,
        kernel_size=kernel_size,
        stride=effective_stride,
        padding=padding,
        dilation=dilation)

    # build_norm_layer returns a pair; the layer is its second element.
    if norm_cfg is None:
        self.norm = None
    else:
        self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
def __init__(self,
             img_size=224,
             patch_size=16,
             in_channels=3,
             embed_dim=768,
             norm_cfg=None,
             conv_cfg=None):
    """Create a patch embedding with a fixed, precomputed patch grid.

    Args:
        img_size: Input size as an int or an indexable pair. An int is
            expanded to a pair so that the documented default (224)
            works; previously an int raised ``TypeError`` when indexed.
        patch_size: Patch size; expanded to a pair for the grid maths
            and passed unmodified as the conv kernel size and stride.
        in_channels: Input channel count of the projection conv.
        embed_dim: Output channel count of the projection conv and
            feature size of the norm layer.
        norm_cfg: Config for the norm layer; ``None`` leaves
            ``self.norm`` as ``None``.
        conv_cfg: Config passed as-is to ``build_conv_layer``.
    """
    super(PatchEmbed, self).__init__()

    # The grid computation below indexes img_size, so a bare int (which
    # is what the default is) has to become a pair first.
    if isinstance(img_size, int):
        img_size = to_2tuple(img_size)
    self.img_size = img_size
    self.patch_size = to_2tuple(patch_size)

    # Number of whole patches that fit along each of the two axes.
    patches_resolution = [
        img_size[0] // self.patch_size[0],
        img_size[1] // self.patch_size[1],
    ]
    self.patches_resolution = patches_resolution
    self.num_patches = patches_resolution[0] * patches_resolution[1]

    # Non-overlapping conv: kernel size and stride are both patch_size.
    self.projection = build_conv_layer(
        conv_cfg,
        in_channels,
        embed_dim,
        kernel_size=patch_size,
        stride=patch_size)

    # build_norm_layer returns a pair; the layer is its second element.
    if norm_cfg is not None:
        self.norm = build_norm_layer(norm_cfg, embed_dim)[1]
    else:
        self.norm = None
def __init__(self,
             embed_dims,
             num_heads,
             window_size,
             shift_size=0,
             qkv_bias=True,
             qk_scale=None,
             attn_drop_rate=0,
             proj_drop_rate=0,
             dropout_layer=dict(type='DropPath', drop_prob=0.),
             init_cfg=None):
    """Set up the window attention module and the dropout that follows it.

    Args:
        embed_dims: Forwarded to ``WindowMSA``.
        num_heads: Forwarded to ``WindowMSA``.
        window_size: Stored on the instance and passed to ``WindowMSA``
            after expansion with ``to_2tuple``.
        shift_size: Stored on the instance; must satisfy
            ``0 <= shift_size < window_size``.
        qkv_bias: Forwarded to ``WindowMSA``.
        qk_scale: Forwarded to ``WindowMSA``.
        attn_drop_rate: Forwarded to ``WindowMSA``.
        proj_drop_rate: Forwarded to ``WindowMSA``.
        dropout_layer: Config handed to ``build_dropout``.
        init_cfg: Passed positionally to the parent initialiser.

    Raises:
        AssertionError: If ``shift_size`` is outside
            ``[0, window_size)``.
    """
    super().__init__(init_cfg)

    self.window_size = window_size
    self.shift_size = shift_size
    assert 0 <= shift_size < window_size

    # The inner attention module is always built with init_cfg=None.
    attention_kwargs = dict(
        embed_dims=embed_dims,
        num_heads=num_heads,
        window_size=to_2tuple(window_size),
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop_rate=attn_drop_rate,
        proj_drop_rate=proj_drop_rate,
        init_cfg=None,
    )
    self.w_msa = WindowMSA(**attention_kwargs)

    self.drop = build_dropout(dropout_layer)
def __init__(self, pos_shape, pos_dim, drop_rate=0., init_cfg=None):
    """Allocate a zero-initialised position embedding and its dropout.

    Args:
        pos_shape: Grid shape of the embedding. An int or a 1-tuple is
            expanded to a pair with ``to_2tuple``; other types are used
            as given and must support indexing with 0 and 1.
        pos_dim: Size of the last dimension of the embedding.
        drop_rate: Probability given to ``nn.Dropout``.
        init_cfg: Forwarded to the parent initialiser.

    Raises:
        AssertionError: If ``pos_shape`` is a tuple whose length is
            neither 1 nor 2.
    """
    super().__init__(init_cfg=init_cfg)

    if isinstance(pos_shape, int):
        grid = to_2tuple(pos_shape)
    elif isinstance(pos_shape, tuple):
        grid = to_2tuple(pos_shape[0]) if len(pos_shape) == 1 else pos_shape
        assert len(grid) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(grid)}'
    else:
        grid = pos_shape

    self.pos_shape = grid
    self.pos_dim = pos_dim

    # One embedding vector per grid cell, with a leading axis of size 1.
    num_positions = grid[0] * grid[1]
    self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, pos_dim))
    self.drop = nn.Dropout(p=drop_rate)
def __init__(self,
             img_size=224,
             patch_size=16,
             in_channels=3,
             embed_dims=768,
             num_layers=12,
             num_heads=12,
             mlp_ratio=4,
             out_indices=-1,
             qkv_bias=True,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             with_cls_token=True,
             output_cls_token=False,
             norm_cfg=dict(type='LN'),
             act_cfg=dict(type='GELU'),
             patch_norm=False,
             final_norm=False,
             interpolate_mode='bicubic',
             num_fcs=2,
             norm_eval=False,
             with_cp=False,
             pretrain_style='timm',
             pretrained=None,
             init_cfg=None):
    """Build the patch embedding, tokens, encoder layers and final norm.

    Args:
        img_size: Input size as an int or tuple; an int or 1-tuple is
            expanded to a pair with ``to_2tuple``.
        patch_size: Kernel size and stride of the patch embedding. It
            is also used as an integer divisor of ``img_size`` to size
            the position embedding.
        in_channels: Input channel count of the patch embedding.
        embed_dims: Feature size used throughout the encoder.
        num_layers: Number of ``TransformerEncoderLayer`` instances.
        num_heads: Forwarded to every encoder layer.
        mlp_ratio: Each layer gets
            ``feedforward_channels = mlp_ratio * embed_dims``.
        out_indices: An int, list or tuple. The int ``-1`` is replaced
            by ``num_layers - 1``; an int is stored as a one-element
            list.
        qkv_bias: Forwarded to every encoder layer.
        drop_rate: Dropout probability after the position embedding;
            also forwarded to every encoder layer.
        attn_drop_rate: Forwarded to every encoder layer.
        drop_path_rate: Upper end of the per-layer drop-path schedule,
            which runs linearly from 0 across the layers.
        with_cls_token: Stored on the instance. The class token
            parameter is created regardless of this flag.
        output_cls_token: Stored on the instance; requires
            ``with_cls_token`` to be ``True``.
        norm_cfg: Norm config for the encoder layers, the final norm
            and (when ``patch_norm`` is set) the patch embedding.
        act_cfg: Forwarded to every encoder layer.
        patch_norm: Whether the patch embedding receives ``norm_cfg``.
        final_norm: Whether to register an extra norm layer built with
            ``postfix=1``.
        interpolate_mode: Only stored on the instance here.
        num_fcs: Forwarded to every encoder layer.
        norm_eval: Only stored on the instance here.
        with_cp: Only stored on the instance here.
        pretrain_style: Either ``'timm'`` or ``'mmcls'``; stored.
        pretrained: Deprecated checkpoint path (``str``) or ``None``;
            stored on the instance.
        init_cfg: Only stored on the instance here.

    Raises:
        AssertionError: On an ``img_size`` tuple of invalid length, an
            unknown ``pretrain_style``, or ``output_cls_token`` without
            ``with_cls_token``.
        TypeError: If ``pretrained`` is neither ``str`` nor ``None``,
            or ``out_indices`` is not an int, list or tuple.
    """
    super(VisionTransformer, self).__init__()

    if isinstance(img_size, int):
        img_size = to_2tuple(img_size)
    elif isinstance(img_size, tuple):
        if len(img_size) == 1:
            img_size = to_2tuple(img_size[0])
        assert len(img_size) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(img_size)}'

    assert pretrain_style in ['timm', 'mmcls']

    if output_cls_token:
        assert with_cls_token is True, \
            f'with_cls_token must be True if ' \
            f'set output_cls_token to True, but got {with_cls_token}'

    # Warn only when the deprecated argument is actually supplied; the
    # default of None is not a use of the deprecated path.
    if isinstance(pretrained, str):
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
    elif pretrained is not None:
        raise TypeError('pretrained must be a str or None')

    self.img_size = img_size
    self.patch_size = patch_size
    self.interpolate_mode = interpolate_mode
    self.norm_eval = norm_eval
    self.with_cp = with_cp
    self.pretrain_style = pretrain_style
    self.pretrained = pretrained
    self.init_cfg = init_cfg

    self.patch_embed = PatchEmbed(
        in_channels=in_channels,
        embed_dims=embed_dims,
        conv_type='Conv2d',
        kernel_size=patch_size,
        stride=patch_size,
        pad_to_patch_size=True,
        norm_cfg=norm_cfg if patch_norm else None,
        init_cfg=None,
    )

    num_patches = (img_size[0] // patch_size) * \
        (img_size[1] // patch_size)

    self.with_cls_token = with_cls_token
    self.output_cls_token = output_cls_token
    self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))
    # One extra position beyond the patch grid, matching the extra
    # class token created above.
    self.pos_embed = nn.Parameter(
        torch.zeros(1, num_patches + 1, embed_dims))
    self.drop_after_pos = nn.Dropout(p=drop_rate)

    if isinstance(out_indices, int):
        if out_indices == -1:
            out_indices = num_layers - 1
        self.out_indices = [out_indices]
    elif isinstance(out_indices, (list, tuple)):
        self.out_indices = out_indices
    else:
        raise TypeError('out_indices must be type of int, list or tuple')

    # Stochastic depth decay rule: drop-path rises linearly per layer.
    dpr = [
        x.item() for x in torch.linspace(0, drop_path_rate, num_layers)
    ]

    self.layers = ModuleList()
    for i in range(num_layers):
        self.layers.append(
            TransformerEncoderLayer(
                embed_dims=embed_dims,
                num_heads=num_heads,
                feedforward_channels=mlp_ratio * embed_dims,
                attn_drop_rate=attn_drop_rate,
                drop_rate=drop_rate,
                drop_path_rate=dpr[i],
                num_fcs=num_fcs,
                qkv_bias=qkv_bias,
                act_cfg=act_cfg,
                norm_cfg=norm_cfg,
                batch_first=True))

    self.final_norm = final_norm
    if final_norm:
        # build_norm_layer returns (name, layer); register under that
        # name so the layer can be looked up through norm1_name.
        self.norm1_name, norm1 = build_norm_layer(
            norm_cfg, embed_dims, postfix=1)
        self.add_module(self.norm1_name, norm1)
def __init__(self,
             pretrain_img_size=224,
             in_channels=3,
             embed_dims=64,
             num_stages=4,
             num_layers=[3, 4, 6, 3],
             num_heads=[1, 2, 5, 8],
             patch_sizes=[4, 2, 2, 2],
             strides=[4, 2, 2, 2],
             paddings=[0, 0, 0, 0],
             sr_ratios=[8, 4, 2, 1],
             out_indices=(0, 1, 2, 3),
             mlp_ratios=[8, 8, 4, 4],
             qkv_bias=True,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.1,
             use_abs_pos_embed=True,
             norm_after_stage=False,
             use_conv_ffn=False,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN', eps=1e-6),
             pretrained=None,
             convert_weights=True,
             init_cfg=None):
    """Assemble the per-stage patch embeddings, encoder layers and norms.

    Each stage is registered in ``self.layers`` as a ``ModuleList`` of
    ``[patch_embed, layers, norm]``.

    Args:
        pretrain_img_size: Int or tuple; an int or 1-tuple is expanded
            to a pair with ``to_2tuple``. Used to size the absolute
            position embeddings.
        in_channels: Input channels of the first patch embedding; later
            stages take the previous stage's width.
        embed_dims: Base width; stage ``i`` uses
            ``embed_dims * num_heads[i]``.
        num_stages: Number of stages; must equal the length of
            ``num_layers``, ``num_heads``, ``patch_sizes``, ``strides``
            and ``sr_ratios``.
        num_layers: Encoder layers per stage.
        num_heads: Attention heads per stage.
        patch_sizes: Patch-embedding kernel size per stage.
        strides: Patch-embedding stride per stage.
        paddings: Patch-embedding padding per stage.
        sr_ratios: ``sr_ratio`` passed to the encoder layers per stage.
        out_indices: Stage indices; the largest must be below
            ``num_stages``.
        mlp_ratios: Per stage, ``feedforward_channels`` is
            ``mlp_ratios[i]`` times the stage width.
        qkv_bias: Forwarded to every encoder layer.
        drop_rate: Forwarded to the position embeddings and encoder
            layers.
        attn_drop_rate: Forwarded to every encoder layer.
        drop_path_rate: Upper end of the drop-path schedule, spread
            linearly over all encoder layers of all stages.
        use_abs_pos_embed: Whether each stage starts with an
            ``AbsolutePositionEmbedding``.
        norm_after_stage: Whether each stage ends with a norm layer
            (otherwise ``nn.Identity``).
        use_conv_ffn: Forwarded to every encoder layer.
        act_cfg: Forwarded to every encoder layer.
        norm_cfg: Norm config for patch embeddings, encoder layers and
            stage norms.
        pretrained: Deprecated checkpoint path; when a ``str`` it is
            turned into a ``Pretrained`` ``init_cfg``.
        convert_weights: Only stored on the instance here.
        init_cfg: Forwarded to the parent initialiser; may not be
            combined with ``pretrained``.

    Raises:
        AssertionError: On an invalid ``pretrain_img_size`` tuple, when
            both ``init_cfg`` and ``pretrained`` are given, on
            mismatched per-stage list lengths, or on an out-of-range
            ``out_indices`` entry.
        TypeError: If ``pretrained`` is neither ``str`` nor ``None``.
    """
    super().__init__(init_cfg=init_cfg)

    self.convert_weights = convert_weights

    if isinstance(pretrain_img_size, int):
        pretrain_img_size = to_2tuple(pretrain_img_size)
    elif isinstance(pretrain_img_size, tuple):
        if len(pretrain_img_size) == 1:
            pretrain_img_size = to_2tuple(pretrain_img_size[0])
        assert len(pretrain_img_size) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(pretrain_img_size)}'

    assert not (init_cfg and pretrained), \
        'init_cfg and pretrained cannot be setting at the same time'
    if pretrained is None:
        self.init_cfg = init_cfg
    elif isinstance(pretrained, str):
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
        self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
    else:
        raise TypeError('pretrained must be a str or None')

    self.embed_dims = embed_dims
    self.num_stages = num_stages
    self.num_layers = num_layers
    self.num_heads = num_heads
    self.patch_sizes = patch_sizes
    self.strides = strides
    self.sr_ratios = sr_ratios
    assert num_stages == len(num_layers) == len(num_heads) \
        == len(patch_sizes) == len(strides) == len(sr_ratios)

    self.out_indices = out_indices
    assert max(out_indices) < self.num_stages
    self.pretrained = pretrained

    # One drop-path rate per encoder layer, increasing linearly across
    # the whole network rather than restarting in each stage.
    total_blocks = sum(num_layers)
    dpr = [
        x.item() for x in torch.linspace(0, drop_path_rate, total_blocks)
    ]

    self.layers = ModuleList()
    block_offset = 0
    for i, num_layer in enumerate(num_layers):
        stage_dims = embed_dims * num_heads[i]

        patch_embed = PatchEmbed(
            in_channels=in_channels,
            embed_dims=stage_dims,
            kernel_size=patch_sizes[i],
            stride=strides[i],
            padding=paddings[i],
            bias=True,
            norm_cfg=norm_cfg)

        stage_modules = ModuleList()
        if use_abs_pos_embed:
            # NOTE(review): pretrain_img_size is a tuple when it was
            # given as an int, so this floor-division appears to rely on
            # NumPy's reflected operator - confirm that is intended.
            pos_shape = pretrain_img_size // np.prod(patch_sizes[:i + 1])
            stage_modules.append(
                AbsolutePositionEmbedding(
                    pos_shape=pos_shape,
                    pos_dim=stage_dims,
                    drop_rate=drop_rate))

        # This stage's slice of the global drop-path schedule.
        for rate in dpr[block_offset:block_offset + num_layer]:
            stage_modules.append(
                PVTEncoderLayer(
                    embed_dims=stage_dims,
                    num_heads=num_heads[i],
                    feedforward_channels=mlp_ratios[i] * stage_dims,
                    drop_rate=drop_rate,
                    attn_drop_rate=attn_drop_rate,
                    drop_path_rate=rate,
                    qkv_bias=qkv_bias,
                    act_cfg=act_cfg,
                    norm_cfg=norm_cfg,
                    sr_ratio=sr_ratios[i],
                    use_conv_ffn=use_conv_ffn))

        # The next stage consumes this stage's output width.
        in_channels = stage_dims

        # build_norm_layer returns (name, layer); only the layer is kept.
        if norm_after_stage:
            stage_norm = build_norm_layer(norm_cfg, stage_dims)[1]
        else:
            stage_norm = nn.Identity()

        self.layers.append(
            ModuleList([patch_embed, stage_modules, stage_norm]))
        block_offset += num_layer
def __init__(self,
             pretrain_img_size=224,
             in_channels=3,
             embed_dims=96,
             patch_size=4,
             window_size=7,
             mlp_ratio=4,
             depths=(2, 2, 6, 2),
             num_heads=(3, 6, 12, 24),
             strides=(4, 2, 2, 2),
             out_indices=(0, 1, 2, 3),
             qkv_bias=True,
             qk_scale=None,
             patch_norm=True,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.1,
             use_abs_pos_embed=False,
             act_cfg=dict(type='GELU'),
             norm_cfg=dict(type='LN'),
             pretrain_style='official',
             pretrained=None,
             init_cfg=None):
    """Build the patch embedding, the staged blocks and the output norms.

    Args:
        pretrain_img_size: Int or tuple; an int or 1-tuple is expanded
            to a pair with ``to_2tuple``. Used only to size the absolute
            position embedding.
        in_channels: Input channel count of the patch embedding.
        embed_dims: Width of the first stage; each later stage doubles
            the previous one through ``PatchMerging``.
        patch_size: Kernel size of the patch embedding; must equal
            ``strides[0]``.
        window_size: Forwarded to every ``SwinBlockSequence``.
        mlp_ratio: Each stage gets ``feedforward_channels`` equal to
            ``mlp_ratio`` times its width.
        depths: Block count per stage; its length sets the number of
            stages.
        num_heads: Attention heads per stage.
        strides: ``strides[0]`` is the patch-embedding stride and
            ``strides[i + 1]`` the stride of the ``PatchMerging``
            attached to stage ``i``.
        out_indices: Stage indices for which a ``norm{i}`` layer is
            registered.
        qkv_bias: Forwarded to every stage.
        qk_scale: Forwarded to every stage.
        patch_norm: Whether the patch embedding and the patch-merging
            layers receive ``norm_cfg``.
        drop_rate: Dropout probability after the position embedding;
            also forwarded to every stage.
        attn_drop_rate: Forwarded to every stage.
        drop_path_rate: Upper end of the drop-path schedule, spread
            linearly over all blocks of all stages.
        use_abs_pos_embed: Whether to create the
            ``absolute_pos_embed`` parameter.
        act_cfg: Forwarded to every stage.
        norm_cfg: Norm config for stages, output norms and (with
            ``patch_norm``) embedding and merging layers.
        pretrain_style: Either ``'official'`` or ``'mmcls'``; stored.
        pretrained: Deprecated checkpoint path (``str``) or ``None``;
            stored on the instance.
        init_cfg: Only stored on the instance here.

    Raises:
        AssertionError: On an invalid ``pretrain_img_size`` tuple, an
            unknown ``pretrain_style``, or ``strides[0] != patch_size``.
        TypeError: If ``pretrained`` is neither ``str`` nor ``None``.
    """
    super(SwinTransformer, self).__init__()

    if isinstance(pretrain_img_size, int):
        pretrain_img_size = to_2tuple(pretrain_img_size)
    elif isinstance(pretrain_img_size, tuple):
        if len(pretrain_img_size) == 1:
            pretrain_img_size = to_2tuple(pretrain_img_size[0])
        assert len(pretrain_img_size) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(pretrain_img_size)}'

    assert pretrain_style in ['official', 'mmcls'], \
        'We only support load official ckpt and mmcls ckpt.'

    # Warn only when the deprecated argument is actually supplied; the
    # default of None is not a use of the deprecated path.
    if isinstance(pretrained, str):
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
    elif pretrained is not None:
        raise TypeError('pretrained must be a str or None')

    num_layers = len(depths)
    self.out_indices = out_indices
    self.use_abs_pos_embed = use_abs_pos_embed
    self.pretrain_style = pretrain_style
    self.pretrained = pretrained
    self.init_cfg = init_cfg

    assert strides[0] == patch_size, 'Use non-overlapping patch embed.'

    self.patch_embed = PatchEmbed(
        in_channels=in_channels,
        embed_dims=embed_dims,
        conv_type='Conv2d',
        kernel_size=patch_size,
        stride=strides[0],
        pad_to_patch_size=True,
        norm_cfg=norm_cfg if patch_norm else None,
        init_cfg=None)

    if self.use_abs_pos_embed:
        patch_row = pretrain_img_size[0] // patch_size
        patch_col = pretrain_img_size[1] // patch_size
        num_patches = patch_row * patch_col
        self.absolute_pos_embed = nn.Parameter(
            torch.zeros((1, num_patches, embed_dims)))

    self.drop_after_pos = nn.Dropout(p=drop_rate)

    # Stochastic depth decay rule: one rate per block, rising linearly
    # over the whole network.
    total_depth = sum(depths)
    dpr = [
        x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
    ]

    self.stages = ModuleList()
    in_channels = embed_dims
    for i in range(num_layers):
        # Every stage except the last ends with a width-doubling
        # PatchMerging layer.
        if i < num_layers - 1:
            downsample = PatchMerging(
                in_channels=in_channels,
                out_channels=2 * in_channels,
                stride=strides[i + 1],
                norm_cfg=norm_cfg if patch_norm else None,
                init_cfg=None)
        else:
            downsample = None

        stage = SwinBlockSequence(
            embed_dims=in_channels,
            num_heads=num_heads[i],
            feedforward_channels=mlp_ratio * in_channels,
            depth=depths[i],
            window_size=window_size,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop_rate=drop_rate,
            attn_drop_rate=attn_drop_rate,
            drop_path_rate=dpr[:depths[i]],
            downsample=downsample,
            act_cfg=act_cfg,
            norm_cfg=norm_cfg,
            init_cfg=None)
        self.stages.append(stage)

        # Consume this stage's share of the schedule.
        dpr = dpr[depths[i]:]
        if downsample is not None:
            in_channels = downsample.out_channels

    self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]

    # Add a norm layer for each output.
    for i in out_indices:
        layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
        layer_name = f'norm{i}'
        self.add_module(layer_name, layer)
def __init__(self,
             img_size=224,
             patch_size=16,
             in_channels=3,
             embed_dims=768,
             num_layers=12,
             num_heads=12,
             mlp_ratio=4,
             out_indices=-1,
             qv_bias=True,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             norm_cfg=dict(type='LN'),
             act_cfg=dict(type='GELU'),
             patch_norm=False,
             final_norm=False,
             num_fcs=2,
             norm_eval=False,
             pretrained=None,
             init_values=0.1,
             init_cfg=None):
    """Record the configuration, then build embedding, layers and norm.

    Most arguments are only stored on the instance here; the
    ``_build_patch_embedding`` and ``_build_layers`` helpers called
    below are defined elsewhere and presumably consume them.

    Args:
        img_size: Int or tuple; an int or 1-tuple is expanded to a pair
            with ``to_2tuple``.
        patch_size: Stored, and used as an integer divisor of
            ``img_size`` to derive ``window_size``/``patch_shape``.
        in_channels: Stored on the instance.
        embed_dims: Stored; also sizes the class token and the final
            norm.
        num_layers: Stored; also resolves ``out_indices == -1``.
        num_heads: Stored on the instance.
        mlp_ratio: Stored on the instance.
        out_indices: An int, list or tuple. The int ``-1`` is replaced
            by ``num_layers - 1``; an int is stored as a one-element
            list.
        qv_bias: Stored on the instance.
        attn_drop_rate: Stored on the instance.
        drop_path_rate: Stored on the instance.
        norm_cfg: Stored; also used to build the final norm.
        act_cfg: Stored on the instance.
        patch_norm: Stored on the instance.
        final_norm: Whether to register an extra norm layer built with
            ``postfix=1``.
        num_fcs: Stored on the instance.
        norm_eval: Stored on the instance.
        pretrained: Deprecated checkpoint path; when a ``str`` it is
            turned into a ``Pretrained`` ``init_cfg``.
        init_values: Stored on the instance.
        init_cfg: Forwarded to the parent initialiser; may not be
            combined with ``pretrained``.

    Raises:
        AssertionError: On an ``img_size`` tuple of invalid length, or
            when both ``init_cfg`` and ``pretrained`` are given.
        TypeError: If ``pretrained`` is neither ``str`` nor ``None``,
            or ``out_indices`` is not an int, list or tuple.
    """
    super(BEiT, self).__init__(init_cfg=init_cfg)

    if isinstance(img_size, int):
        img_size = to_2tuple(img_size)
    elif isinstance(img_size, tuple):
        if len(img_size) == 1:
            img_size = to_2tuple(img_size[0])
        assert len(img_size) == 2, \
            f'The size of image should have length 1 or 2, ' \
            f'but got {len(img_size)}'

    assert not (init_cfg and pretrained), \
        'init_cfg and pretrained cannot be set at the same time'
    if isinstance(pretrained, str):
        warnings.warn('DeprecationWarning: pretrained is deprecated, '
                      'please use "init_cfg" instead')
        self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
    elif pretrained is not None:
        raise TypeError('pretrained must be a str or None')

    # Input geometry.
    self.in_channels = in_channels
    self.img_size = img_size
    self.patch_size = patch_size

    # Encoder hyper-parameters.
    self.num_layers = num_layers
    self.embed_dims = embed_dims
    self.num_heads = num_heads
    self.mlp_ratio = mlp_ratio
    self.attn_drop_rate = attn_drop_rate
    self.drop_path_rate = drop_path_rate
    self.num_fcs = num_fcs
    self.qv_bias = qv_bias
    self.init_values = init_values

    # Layer configs and flags.
    self.act_cfg = act_cfg
    self.norm_cfg = norm_cfg
    self.patch_norm = patch_norm
    self.norm_eval = norm_eval
    self.pretrained = pretrained

    # Patches per axis; the same pair is exposed under both names.
    patches_per_axis = (img_size[0] // patch_size,
                        img_size[1] // patch_size)
    self.window_size = patches_per_axis
    self.patch_shape = self.window_size

    self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims))

    # Everything above must be set before the builders run.
    self._build_patch_embedding()
    self._build_layers()

    if isinstance(out_indices, int):
        last_layer = num_layers - 1
        self.out_indices = [
            last_layer if out_indices == -1 else out_indices
        ]
    elif isinstance(out_indices, (list, tuple)):
        self.out_indices = out_indices
    else:
        raise TypeError('out_indices must be type of int, list or tuple')

    self.final_norm = final_norm
    if final_norm:
        # build_norm_layer returns (name, layer); register the layer
        # under that name and remember the name for later lookup.
        norm_name, norm_layer = build_norm_layer(
            norm_cfg, embed_dims, postfix=1)
        self.norm1_name = norm_name
        self.add_module(norm_name, norm_layer)