def tnt_b_patch16_224(pretrained=False, **kwargs):
    patch_size = 16
    inner_stride = 4
    outer_dim = 640
    inner_dim = 40
    outer_num_heads = 10
    inner_num_heads = 4
    outer_dim = make_divisible(outer_dim, outer_num_heads)
    inner_dim = make_divisible(inner_dim, inner_num_heads)
    model = TNT(img_size=224, patch_size=patch_size, outer_dim=outer_dim, inner_dim=inner_dim, depth=12,
                outer_num_heads=outer_num_heads, inner_num_heads=inner_num_heads, qkv_bias=False,
                inner_stride=inner_stride, **kwargs)
    model.default_cfg = default_cfgs['tnt_b_patch16_224']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
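# `make_divisible` is not defined in this section. A minimal sketch, assuming
# the common MobileNet-style rounding (round `v` to the nearest multiple of
# `divisor`, never dropping more than 10% of the original value); the helper
# in the source repo may differ:
def make_divisible(v, divisor=8, min_value=None):
    min_value = min_value or divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v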
def _create_vision_transformer(variant, pretrained=False, distilled=False, **kwargs):
    default_cfg = default_cfgs[variant]
    default_num_classes = default_cfg['num_classes']
    default_img_size = default_cfg['input_size'][-1]

    num_classes = kwargs.pop('num_classes', default_num_classes)
    img_size = kwargs.pop('img_size', default_img_size)
    repr_size = kwargs.pop('representation_size', None)
    if repr_size is not None and num_classes != default_num_classes:
        # Remove representation layer if fine-tuning. This may not always be the desired action,
        # but it feels better than doing nothing by default for fine-tuning. Perhaps a better interface?
        _logger.warning("Removing representation layer for fine-tuning.")
        repr_size = None

    # model_cls = DistilledVisionTransformer if distilled else VisionTransformer
    model_cls = VisionTransformer
    # model = model_cls(img_size=img_size, num_classes=num_classes, representation_size=repr_size, **kwargs)
    model = model_cls(img_size=img_size, num_classes=num_classes, **kwargs)
    model.default_cfg = default_cfg

    if pretrained:
        load_pretrained(
            model, num_classes=num_classes, in_chans=kwargs.get('in_chans', 3),
            filter_fn=partial(checkpoint_filter_fn, model=model))
    return model
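# Usage sketch: a typical timm-style variant wrapper built on top of
# `_create_vision_transformer`. The variant key and dimensions below are
# illustrative assumptions, not taken from this section:
def vit_base_patch16_224(pretrained=False, **kwargs):
    model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
    return _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **model_kwargs)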
def tnt_b_patch16_224(pretrained=False, **kwargs):
    model = TNT(patch_size=16, embed_dim=640, in_dim=40, depth=12, num_heads=10, in_num_head=4,
                qkv_bias=False, **kwargs)
    model.default_cfg = default_cfgs['tnt_b_patch16_224']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def T2t_vit_16_ghost(pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 384 ** -0.5)
    model = T2T_ViT_Ghost(tokens_type='performer', embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_16_ghost']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def T2t_vit_19(pretrained=False, **kwargs):  # adopt performer for tokens to token
    if pretrained:
        kwargs.setdefault('qk_scale', 448 ** -0.5)
    model = T2T_ViT(tokens_type='performer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_19']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def T2t_vit_t_24(pretrained=False, **kwargs):  # adopt transformer for tokens to token
    if pretrained:
        kwargs.setdefault('qk_scale', 512 ** -0.5)
    model = T2T_ViT(tokens_type='transformer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_t_24']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def T2t_vit_14_wide(pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 512 ** -0.5)
    model = T2T_ViT(tokens_type='performer', embed_dim=768, depth=4, num_heads=12, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_14_wide']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
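# Note on the repeated `kwargs.setdefault('qk_scale', dim ** -0.5)` pattern in
# the T2T builders above: the override is applied only when loading pretrained
# weights, presumably so the attention scaling matches what the released
# checkpoints were trained with; `setdefault` preserves any caller-supplied
# value. For example, 448 ** -0.5 ≈ 0.04725 for T2t_vit_19 and
# 512 ** -0.5 ≈ 0.04419 for T2t_vit_t_24.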
def deepvit_patch16_224_re_attn_32b(pretrained=False, **kwargs):
    apply_transform = [False] * 0 + [True] * 32  # re-attention enabled in all 32 blocks
    model = DeepVisionTransformer(
        patch_size=16, embed_dim=384, depth=[False] * 32, apply_transform=apply_transform,
        num_heads=12, mlp_ratio=3, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs)  # we follow the same settings as the original ViT
    model.default_cfg = default_cfgs['Deepvit_base_patch16_224_32B']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
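# The `[False] * k + [True] * (depth - k)` idiom above builds a per-block flag
# list: re-attention is applied only in blocks whose flag is True (here, all
# 32). `depth` is likewise passed as a list whose length sets the number of
# blocks. A variant enabling re-attention only in the last 12 of 32 blocks
# looks like this (as in deepvit_L_384 below):
# apply_transform = [False] * 20 + [True] * 12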
def create_vit(variant, mode, img_size, pretrained, patch_size, **kwargs):
    model = ViT(mode=mode, img_size=img_size, **kwargs)
    model.default_cfg = ViTcls.default_cfgs[variant]
    gs_new = (int(img_size[0] / patch_size), int(img_size[1] / patch_size))
    if pretrained:
        load_pretrained(model, filter_fn=partial(checkpoint_filter_fn, model=model, gs_new=gs_new))
    return model
def deepvit_L_384(pretrained=False, **kwargs):
    apply_transform = [False] * 20 + [True] * 12  # re-attention only in the last 12 of 32 blocks
    model = DeepVisionTransformer(
        img_size=384, patch_size=16, embed_dim=420, depth=[False] * 32, apply_transform=apply_transform,
        num_heads=12, mlp_ratio=3, qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6),
        use_cnn_embed=True, scale_adjustment=0.5, **kwargs)  # we follow the same settings as the original ViT
    model.default_cfg = default_cfgs['Deepvit_L_384']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
def build_T2T_backbond(cfg, pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 384 ** -0.5)
    body = T2T_ViT(tokens_type='transformer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
    body.default_cfg = default_cfgs['T2t_vit_t_14']
    model = nn.Sequential(OrderedDict([("body", body)]))
    model.out_channels = cfg.MODEL.BACKBONE.BACKBONE_OUT_CHANNELS
    if pretrained:
        # Load weights into the T2T_ViT body: the nn.Sequential wrapper has no
        # num_classes or default_cfg of its own.
        load_pretrained(body, num_classes=body.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
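# Usage sketch (hypothetical cfg object; the real project presumably passes a
# yacs-style config with MODEL.BACKBONE.BACKBONE_OUT_CHANNELS already set):
from types import SimpleNamespace
_cfg = SimpleNamespace(MODEL=SimpleNamespace(BACKBONE=SimpleNamespace(BACKBONE_OUT_CHANNELS=384)))
backbone = build_T2T_backbond(_cfg, pretrained=False)
print(backbone.out_channels)  # 384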
def T2t_vit_dense(pretrained=False, **kwargs):
    model = T2T_ViT_Dense(growth_rate=64, block_config=(3, 6, 6, 4), embed_dim=128, num_heads=8,
                          mlp_ratio=2., **kwargs)
    model.default_cfg = default_cfgs['t2t_vit_dense']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def coat_lite_mini(pretrained=False, **kwargs):
    model = CoaT(
        patch_size=4, embed_dims=[64, 128, 320, 512], serial_depths=[2, 2, 2, 2], parallel_depth=0,
        num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs)  # FIXME: use builder
    model.default_cfg = default_cfgs['coat_lite_mini']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def T2t_vit_nys_14_resnext(pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 384 ** -0.5)
    model = T2T_ViT(tokens_type='nystromformer', embed_dim=384, depth=14, num_heads=32, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_14_resnext']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def muxnet_l(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """ MUXNet-l """
    default_cfg = default_cfgs['msunet_s']  # FIXME: cfg key looks like a copy/paste slip for a MUXNet-l builder
    # NOTE: for training, drop_rate should be 0.2
    kwargs['drop_connect_rate'] = 0.15  # set when training, TODO: add as cmd arg
    model = _gen_muxnet_l(
        channel_multiplier=1.0, depth_multiplier=1.0, num_classes=num_classes, in_chans=in_chans, **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
def localvit_tnt_s_patch16_224(pretrained=False, **kwargs):
    model = LocalViT_TNT(patch_size=16, embed_dim=384, in_dim=24, depth=12, num_heads=6, in_num_head=4,
                         qkv_bias=False, **kwargs)
    model.default_cfg = default_cfgs['tnt_s_conv_patch16_224']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def T2t_vit_nys_12(pretrained=False, **kwargs):  # adopt nystromformer for tokens to token
    if pretrained:
        kwargs.setdefault('qk_scale', 256 ** -0.5)
    model = T2T_ViT(tokens_type='nystromformer', embed_dim=256, depth=12, num_heads=4, mlp_ratio=2., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_12']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def tnt_s_patch16_224(pretrained=False, **kwargs):
    model = TNT(patch_size=16, embed_dim=384, in_dim=24, depth=12, num_heads=6, in_num_head=4,
                qkv_bias=False, **kwargs)
    model.default_cfg = default_cfgs['tnt_s_patch16_224']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3),
            filter_fn=checkpoint_filter_fn)
    return model
def localvit_T2t_conv7(pretrained=False, **kwargs):  # adopt performer for tokens to token
    if pretrained:
        kwargs.setdefault('qk_scale', 256 ** -0.5)
    model = LocalViT_T2T(tokens_type='performer', embed_dim=256, depth=7, num_heads=4, mlp_ratio=2.,
                         reduction=128, **kwargs)
    model.default_cfg = default_cfgs['localvit_T2t_conv7']
    if pretrained:
        load_pretrained(model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
def ptnt_b_patch16_256(pretrained=False, **kwargs):
    outer_dim = 256
    inner_dim = 16
    outer_head = 4
    inner_head = 2
    configs = {
        'depths': [2, 10, 6, 2],
        'outer_dims': [outer_dim, outer_dim * 2, outer_dim * 4, outer_dim * 4],
        'inner_dims': [inner_dim, inner_dim * 2, inner_dim * 4, inner_dim * 4],
        'outer_heads': [outer_head, outer_head * 2, outer_head * 4, outer_head * 4],
        'inner_heads': [inner_head, inner_head * 2, inner_head * 4, inner_head * 4],
    }
    model = PyramidTNT(configs=configs, img_size=256, qkv_bias=False, **kwargs)
    model.default_cfg = default_cfgs['tnt_b_patch16_256']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
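# Usage sketch for the pyramid builder above (assumes the module's imports;
# the dummy input shape matches the builder's img_size=256):
import torch
model = ptnt_b_patch16_256(pretrained=False)
logits = model(torch.randn(1, 3, 256, 256))  # expected shape: (1, num_classes)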
def create_vit(model_cfg):
    model_cfg = model_cfg.copy()
    backbone = model_cfg.pop("backbone")
    normalization = model_cfg.pop("normalization")
    model_cfg["n_cls"] = 1000
    mlp_expansion_ratio = 4
    model_cfg["d_ff"] = mlp_expansion_ratio * model_cfg["d_model"]

    if backbone in default_cfgs:
        default_cfg = default_cfgs[backbone]
    else:
        default_cfg = dict(
            pretrained=False,
            num_classes=1000,
            drop_rate=0.0,
            drop_path_rate=0.0,
            drop_block_rate=None,
        )
    default_cfg["input_size"] = (
        3,
        model_cfg["image_size"][0],
        model_cfg["image_size"][1],
    )
    model = VisionTransformer(**model_cfg)
    if backbone == "vit_base_patch8_384":
        path = os.path.expandvars("$TORCH_HOME/hub/checkpoints/vit_base_patch8_384.pth")
        state_dict = torch.load(path, map_location="cpu")
        filtered_dict = checkpoint_filter_fn(state_dict, model)
        model.load_state_dict(filtered_dict, strict=True)
    elif "deit" in backbone:
        load_pretrained(model, default_cfg, filter_fn=checkpoint_filter_fn)
    else:
        load_custom_pretrained(model, default_cfg)
    return model
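# Usage sketch (hypothetical keys and values; in the source project model_cfg
# is read from a config file, and at least backbone, normalization, image_size,
# and d_model are needed for the code above to run):
model_cfg = dict(
    backbone="deit_base_patch16_384",
    normalization="vit",
    image_size=(384, 384),
    patch_size=16,
    d_model=768,
    n_heads=12,
    n_layers=12,
)
model = create_vit(model_cfg)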