def __init__(self,
             embed_dims,
             num_heads,
             num_frames,
             attn_drop=0.,
             proj_drop=0.,
             dropout_layer=dict(type='DropPath', drop_prob=0.1),
             norm_cfg=dict(type='LN'),
             init_cfg=None,
             **kwargs):
    super().__init__(init_cfg)
    self.embed_dims = embed_dims
    self.num_heads = num_heads
    self.num_frames = num_frames
    self.norm = build_norm_layer(norm_cfg, self.embed_dims)[1]

    # `batch_first` is only accepted by nn.MultiheadAttention since
    # PyTorch 1.9, so drop it for older versions.
    if digit_version(torch.__version__) < digit_version('1.9.0'):
        kwargs.pop('batch_first', None)
    self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                      **kwargs)
    self.proj_drop = nn.Dropout(proj_drop)
    self.dropout_layer = build_dropout(
        dropout_layer) if dropout_layer else nn.Identity()
    self.temporal_fc = nn.Linear(self.embed_dims, self.embed_dims)

    self.init_weights()

def __init__(self,
             embed_dims,
             num_heads,
             window_size,
             shift_size=0,
             qkv_bias=True,
             qk_scale=None,
             attn_drop_rate=0,
             proj_drop_rate=0,
             dropout_layer=dict(type='DropPath', drop_prob=0.),
             init_cfg=None):
    super().__init__(init_cfg)

    self.window_size = window_size
    self.shift_size = shift_size
    assert 0 <= self.shift_size < self.window_size

    self.w_msa = WindowMSA(
        embed_dims=embed_dims,
        num_heads=num_heads,
        window_size=to_2tuple(window_size),
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop_rate=attn_drop_rate,
        proj_drop_rate=proj_drop_rate,
        init_cfg=None)

    self.drop = build_dropout(dropout_layer)

def __init__(self,
             dim,
             num_heads,
             mlp_ratio=4.,
             qkv_bias=False,
             qk_scale=None,
             drop=0.,
             attn_drop=0.,
             drop_path=0.,
             act_layer=nn.GELU,
             norm_cfg=dict(type='LN'),
             sr_ratio=1,
             use_sr_conv=True):
    super(TCFormerRegularBlock, self).__init__()
    self.norm1 = build_norm_layer(norm_cfg, dim)[1]

    self.attn = TCFormerDynamicAttention(
        dim,
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=drop,
        sr_ratio=sr_ratio,
        use_sr_conv=use_sr_conv)
    self.drop_path = build_dropout(
        dict(type='DropPath', drop_prob=drop_path))

    self.norm2 = build_norm_layer(norm_cfg, dim)[1]
    mlp_hidden_dim = int(dim * mlp_ratio)
    self.mlp = TCMLP(
        in_features=dim,
        hidden_features=mlp_hidden_dim,
        act_layer=act_layer,
        drop=drop)

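# ---------------------------------------------------------------------------
# For orientation only: a minimal sketch of the standard pre-norm residual
# pattern that the pieces built above (norm1 / attn / drop_path / norm2 / mlp)
# are typically wired into. This is an assumption-laden illustration on plain
# tensors; the real TCFormer block forwards token dictionaries and is not
# reproduced here.
# ---------------------------------------------------------------------------
import torch.nn as nn


class PreNormBlockSketch(nn.Module):
    """Generic pre-norm transformer block: x + f(norm(x)), applied twice."""

    def __init__(self, dim, attn, mlp, drop_path=None):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = attn  # any module mapping (B, N, dim) -> (B, N, dim)
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = mlp    # any module mapping (B, N, dim) -> (B, N, dim)
        self.drop_path = drop_path if drop_path is not None else nn.Identity()

    def forward(self, x):
        # Attention and MLP each see a normalized input and are added back
        # through a (possibly stochastic-depth) residual connection.
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x
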
def __init__(self,
             embed_dims,
             num_heads,
             window_size,
             shift_size=0,
             qkv_bias=True,
             qk_scale=None,
             attn_drop=0,
             proj_drop=0,
             dropout_layer=dict(type='DropPath', drop_prob=0.),
             pad_small_map=False,
             input_resolution=None,
             auto_pad=None,
             init_cfg=None):
    super().__init__(init_cfg)

    if input_resolution is not None or auto_pad is not None:
        warnings.warn(
            'ShiftWindowMSA now supports auto padding and dynamic input '
            'shapes in all conditions, so the arguments `auto_pad` and '
            '`input_resolution` have been deprecated.', DeprecationWarning)

    self.shift_size = shift_size
    self.window_size = window_size
    assert 0 <= self.shift_size < self.window_size

    self.w_msa = WindowMSA(
        embed_dims=embed_dims,
        window_size=to_2tuple(self.window_size),
        num_heads=num_heads,
        qkv_bias=qkv_bias,
        qk_scale=qk_scale,
        attn_drop=attn_drop,
        proj_drop=proj_drop,
    )

    self.drop = build_dropout(dropout_layer)
    self.pad_small_map = pad_small_map

def __init__(self,
             embed_dims,
             input_resolution,
             num_heads,
             window_size,
             shift_size=0,
             qkv_bias=True,
             qk_scale=None,
             attn_drop=0,
             proj_drop=0,
             dropout_layer=dict(type='DropPath', drop_prob=0.),
             auto_pad=False,
             init_cfg=None):
    super().__init__(init_cfg)

    self.embed_dims = embed_dims
    self.input_resolution = input_resolution
    self.shift_size = shift_size
    self.window_size = window_size
    if min(self.input_resolution) <= self.window_size:
        # if the window size is larger than the input resolution,
        # don't partition windows and don't shift
        self.shift_size = 0
        self.window_size = min(self.input_resolution)

    self.w_msa = WindowMSA(embed_dims, to_2tuple(self.window_size),
                           num_heads, qkv_bias, qk_scale, attn_drop,
                           proj_drop)

    self.drop = build_dropout(dropout_layer)

    H, W = self.input_resolution
    # Handle auto padding
    self.auto_pad = auto_pad
    if self.auto_pad:
        self.pad_r = (self.window_size -
                      W % self.window_size) % self.window_size
        self.pad_b = (self.window_size -
                      H % self.window_size) % self.window_size
        self.H_pad = H + self.pad_b
        self.W_pad = W + self.pad_r
    else:
        H_pad, W_pad = self.input_resolution
        assert H_pad % self.window_size + W_pad % self.window_size == 0, \
            f'input_resolution({self.input_resolution}) is not divisible ' \
            f'by window_size({self.window_size}). Please check feature ' \
            f'map shape or set `auto_pad=True`.'
        self.H_pad, self.W_pad = H_pad, W_pad
        self.pad_r, self.pad_b = 0, 0

    if self.shift_size > 0:
        # calculate the attention mask for SW-MSA
        img_mask = torch.zeros((1, self.H_pad, self.W_pad, 1))  # 1 H W 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        # nW, window_size, window_size, 1
        mask_windows = self.window_partition(img_mask)
        mask_windows = mask_windows.view(
            -1, self.window_size * self.window_size)
        # token pairs coming from different regions of the cyclically
        # shifted map must not attend to each other, so they receive a
        # large negative bias; pairs from the same region get 0
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0,
                                          float(-100.0)).masked_fill(
                                              attn_mask == 0, float(0.0))
    else:
        attn_mask = None

    self.register_buffer('attn_mask', attn_mask)

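# ---------------------------------------------------------------------------
# Minimal standalone sketch (not part of the modules above) of how the SW-MSA
# attention mask built in the constructor behaves. `_toy_window_partition` is
# a simplified stand-in for the `window_partition` method the class calls, and
# the toy sizes (H=W=4, window_size=2, shift_size=1) are illustrative
# assumptions, not library defaults.
# ---------------------------------------------------------------------------
import torch


def _toy_window_partition(x, window_size):
    """Split a (1, H, W, 1) map into (num_windows, window_size, window_size, 1)."""
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size,
               window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size, window_size, C)


def _toy_sw_msa_mask(H, W, window_size, shift_size):
    img_mask = torch.zeros((1, H, W, 1))
    slices = (slice(0, -window_size),
              slice(-window_size, -shift_size),
              slice(-shift_size, None))
    cnt = 0
    for h in slices:
        for w in slices:
            img_mask[:, h, w, :] = cnt  # label each contiguous region
            cnt += 1
    mask_windows = _toy_window_partition(img_mask, window_size)
    mask_windows = mask_windows.reshape(-1, window_size * window_size)
    # Token pairs from different regions of the cyclically shifted map get a
    # large negative bias; pairs from the same region get 0.
    attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
    return attn_mask.masked_fill(attn_mask != 0,
                                 -100.0).masked_fill(attn_mask == 0, 0.0)


# e.g. _toy_sw_msa_mask(4, 4, window_size=2, shift_size=1).shape
# -> torch.Size([4, 4, 4]): num_windows x tokens x tokens
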
def build_drop_path(drop_path_rate):
    """Build drop path layer."""
    return build_dropout(dict(type='DropPath', drop_prob=drop_path_rate))
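
# ---------------------------------------------------------------------------
# Hedged usage sketch for `build_drop_path`: assuming mmcv's `build_dropout`
# with a registered `DropPath` (as the wrapper above implies), a per-block
# stochastic-depth schedule is typically assembled like this. The linearly
# increasing rates are an illustrative convention, not defined in this file.
# ---------------------------------------------------------------------------
num_blocks = 4
max_drop_path_rate = 0.1
drop_path_rates = [
    max_drop_path_rate * i / max(num_blocks - 1, 1) for i in range(num_blocks)
]
# Each call returns an nn.Module; a rate of 0. passes inputs through unchanged.
drop_paths = [build_drop_path(rate) for rate in drop_path_rates]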