Example no. 1
def tnt_b_patch16_224(pretrained=False, **kwargs):
    patch_size = 16
    inner_stride = 4
    outer_dim = 640
    inner_dim = 40
    outer_num_heads = 10
    inner_num_heads = 4
    outer_dim = make_divisible(outer_dim, outer_num_heads)
    inner_dim = make_divisible(inner_dim, inner_num_heads)
    model = TNT(img_size=224,
                patch_size=patch_size,
                outer_dim=outer_dim,
                inner_dim=inner_dim,
                depth=12,
                outer_num_heads=outer_num_heads,
                inner_num_heads=inner_num_heads,
                qkv_bias=False,
                inner_stride=inner_stride,
                **kwargs)
    model.default_cfg = default_cfgs['tnt_b_patch16_224']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3),
                        filter_fn=_conv_filter)
    return model
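make_divisible is used above but not shown. A minimal sketch, assuming the usual MobileNet-style helper that rounds a dimension to a multiple of the divisor (here, the head count) without shrinking it by more than 10%:

def make_divisible(v, divisor=8, min_value=None):
    # Round v to the nearest multiple of divisor, but never below min_value.
    min_value = min_value or divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Ensure rounding down never removes more than 10% of the original value.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

With the values above, make_divisible(640, 10) and make_divisible(40, 4) return 640 and 40 unchanged, since both are already divisible by their head counts.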
Example no. 2
def _create_vision_transformer(variant,
                               pretrained=False,
                               distilled=False,
                               **kwargs):
    default_cfg = default_cfgs[variant]
    default_num_classes = default_cfg['num_classes']
    default_img_size = default_cfg['input_size'][-1]

    num_classes = kwargs.pop('num_classes', default_num_classes)
    img_size = kwargs.pop('img_size', default_img_size)
    repr_size = kwargs.pop('representation_size', None)
    if repr_size is not None and num_classes != default_num_classes:
        # Remove the representation layer when fine-tuning to a different number of classes.
        # This may not always be the desired behaviour, but it is a safer default than doing nothing. Perhaps a better interface?
        _logger.warning("Removing representation layer for fine-tuning.")
        repr_size = None

    # model_cls = DistilledVisionTransformer if distilled else VisionTransformer
    model_cls = VisionTransformer
    # model = model_cls(img_size=img_size, num_classes=num_classes, representation_size=repr_size, **kwargs)
    model = model_cls(img_size=img_size, num_classes=num_classes, **kwargs)
    model.default_cfg = default_cfg

    if pretrained:
        load_pretrained(model,
                        num_classes=num_classes,
                        in_chans=kwargs.get('in_chans', 3),
                        filter_fn=partial(checkpoint_filter_fn, model=model))
    return model
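A hedged usage sketch of the factory above, mirroring how per-variant entry points typically wrap it; the variant name and hyper-parameters here are illustrative, not taken from this file:

def vit_base_patch16_224(pretrained=False, **kwargs):
    # Hypothetical variant wrapper: fixed architecture kwargs, everything
    # else (num_classes, img_size, in_chans, ...) passed through.
    model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs)
    return _create_vision_transformer('vit_base_patch16_224',
                                      pretrained=pretrained,
                                      **model_kwargs)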
Example no. 3
def tnt_b_patch16_224(pretrained=False, **kwargs):
    model = TNT(patch_size=16, embed_dim=640, in_dim=40, depth=12, num_heads=10,
                in_num_head=4, qkv_bias=False, **kwargs)
    model.default_cfg = default_cfgs['tnt_b_patch16_224']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 4
def T2t_vit_16_ghost(pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 384 ** -0.5)
    model = T2T_ViT_Ghost(tokens_type='performer', embed_dim=384, depth=16, num_heads=6, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_16_ghost']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 5
def T2t_vit_19(pretrained=False, **kwargs):  # uses Performer for the tokens-to-token stage
    if pretrained:
        kwargs.setdefault('qk_scale', 448 ** -0.5)
    model = T2T_ViT(tokens_type='performer', embed_dim=448, depth=19, num_heads=7, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_19']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
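Why the setdefault: a standard attention block scales queries by head_dim ** -0.5, but these checkpoints appear to have been trained with embed_dim ** -0.5 (that is the value the override pins whenever pretrained weights are loaded). A quick check of the numbers for this variant:

embed_dim, num_heads = 448, 7
head_dim = embed_dim // num_heads        # 64
default_scale = head_dim ** -0.5         # 0.125, the usual per-head default
checkpoint_scale = embed_dim ** -0.5     # ~0.0473, what these weights expect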
Example no. 6
def T2t_vit_t_24(pretrained=False, **kwargs):  # uses a standard Transformer for the tokens-to-token stage
    if pretrained:
        kwargs.setdefault('qk_scale', 512 ** -0.5)
    model = T2T_ViT(tokens_type='transformer', embed_dim=512, depth=24, num_heads=8, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_t_24']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 7
def T2t_vit_14_wide(pretrained=False, **kwargs):
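    # NOTE: despite the "_14" name, depth is 4; this reads as a wide-and-shallow
    # variant (embed_dim 768 vs. 384 in the depth-14 models).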
    if pretrained:
        kwargs.setdefault('qk_scale', 512 ** -0.5)
    model = T2T_ViT(tokens_type='performer', embed_dim=768, depth=4, num_heads=12, mlp_ratio=3., **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_14_wide']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 8
def deepvit_patch16_224_re_attn_32b(pretrained=False, **kwargs):
    apply_transform = [False] * 0 + [True] * 32
    model = DeepVisionTransformer(
        patch_size=16, embed_dim=384, depth=[False] * 32, apply_transform=apply_transform, num_heads=12, mlp_ratio=3, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),  **kwargs)
    # We follow the same settings as the original ViT.
    model.default_cfg = default_cfgs['Deepvit_base_patch16_224_32B']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
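The apply_transform list marks, block by block, whether re-attention is used: [False] * 0 + [True] * 32 enables it in all 32 blocks, while deepvit_L_384 below keeps the first 20 blocks plain with [False] * 20 + [True] * 12. depth is likewise a 32-element list here, presumably so that its length sets the block count. A minimal sketch of the pattern:

depth = 32
num_plain = 0  # leading blocks that keep vanilla attention
apply_transform = [False] * num_plain + [True] * (depth - num_plain)
assert len(apply_transform) == depth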
Example no. 9
def create_vit(variant, mode, img_size, pretrained, patch_size, **kwargs):
    model = ViT(mode=mode, img_size=img_size, **kwargs)
    model.default_cfg = ViTcls.default_cfgs[variant]
    gs_new = (int(img_size[0] / patch_size), int(img_size[1] / patch_size))

    if pretrained:
        load_pretrained(model,
                        filter_fn=partial(checkpoint_filter_fn,
                                          model=model,
                                          gs_new=gs_new))
    return model
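gs_new is the new patch-grid size, presumably passed so checkpoint_filter_fn can interpolate pretrained position embeddings to the requested resolution. A worked example, assuming img_size is an (H, W) tuple as the indexing above suggests:

img_size, patch_size = (384, 384), 16
gs_new = (img_size[0] // patch_size, img_size[1] // patch_size)  # (24, 24)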
Example no. 10
def deepvit_L_384(pretrained=False, **kwargs):
    apply_transform = [False] * 20 + [True] * 12
    model = DeepVisionTransformer(
        img_size=384, patch_size=16, embed_dim=420, depth=[False] * 32, apply_transform=apply_transform, num_heads=12, mlp_ratio=3, qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6), use_cnn_embed=True, scale_adjustment=0.5, **kwargs)
    # We follow the same settings as the original ViT.
    model.default_cfg = default_cfgs['Deepvit_L_384']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
Example no. 11
def build_T2T_backbond(cfg, pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 384 ** -0.5)
    body = T2T_ViT(tokens_type='transformer', embed_dim=384, depth=14, num_heads=6, mlp_ratio=3., **kwargs)
    body.default_cfg = default_cfgs['T2t_vit_t_14']
    model = nn.Sequential(OrderedDict([("body", body)]))
    model.out_channels = cfg.MODEL.BACKBONE.BACKBONE_OUT_CHANNELS
    if pretrained:
        # Load weights into the wrapped T2T_ViT; the Sequential wrapper itself
        # has no num_classes attribute.
        load_pretrained(
            body, num_classes=body.num_classes, in_chans=kwargs.get('in_chans', 3))
    return model
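A hedged usage sketch of the wrapper above; the real cfg comes from the surrounding detection framework, and SimpleNamespace only stands in for it here:

from types import SimpleNamespace

cfg = SimpleNamespace(MODEL=SimpleNamespace(
    BACKBONE=SimpleNamespace(BACKBONE_OUT_CHANNELS=384)))
backbone = build_T2T_backbond(cfg, pretrained=False)
features = backbone.body  # the wrapped T2T_ViT is reachable by its OrderedDict key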
Example no. 12
def T2t_vit_dense(pretrained=False, **kwargs):
    model = T2T_ViT_Dense(growth_rate=64,
                          block_config=(3, 6, 6, 4),
                          embed_dim=128,
                          num_heads=8,
                          mlp_ratio=2.,
                          **kwargs)
    model.default_cfg = default_cfgs['t2t_vit_dense']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 13
def coat_lite_mini(pretrained=False, **kwargs):
    model = CoaT(patch_size=4,
                 embed_dims=[64, 128, 320, 512],
                 serial_depths=[2, 2, 2, 2],
                 parallel_depth=0,
                 num_heads=8,
                 mlp_ratios=[8, 8, 4, 4],
                 **kwargs)
    # FIXME use builder
    model.default_cfg = default_cfgs['coat_lite_mini']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 14
def T2t_vit_nys_14_resnext(pretrained=False, **kwargs):
    if pretrained:
        kwargs.setdefault('qk_scale', 384**-0.5)
    model = T2T_ViT(tokens_type='nystromformer',
                    embed_dim=384,
                    depth=14,
                    num_heads=32,
                    mlp_ratio=3.,
                    **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_14_resnext']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 15
def muxnet_l(pretrained=False, num_classes=1000, in_chans=3, **kwargs):
    """ MUXNet-l """
    default_cfg = default_cfgs['muxnet_l']
    # NOTE for train, drop_rate should be 0.2
    kwargs['drop_connect_rate'] = 0.15  # set when training, TODO add as cmd arg
    model = _gen_muxnet_l(channel_multiplier=1.0,
                          depth_multiplier=1.0,
                          num_classes=num_classes,
                          in_chans=in_chans,
                          **kwargs)
    model.default_cfg = default_cfg
    if pretrained:
        load_pretrained(model, default_cfg, num_classes, in_chans)
    return model
Example no. 16
def localvit_tnt_s_patch16_224(pretrained=False, **kwargs):
    model = LocalViT_TNT(patch_size=16,
                         embed_dim=384,
                         in_dim=24,
                         depth=12,
                         num_heads=6,
                         in_num_head=4,
                         qkv_bias=False,
                         **kwargs)
    model.default_cfg = default_cfgs['tnt_s_conv_patch16_224']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 17
def T2t_vit_nys_12(pretrained=False,
                   **kwargs):  # uses Nyströmformer for the tokens-to-token stage
    if pretrained:
        kwargs.setdefault('qk_scale', 256**-0.5)
    model = T2T_ViT(tokens_type='nystromformer',
                    embed_dim=256,
                    depth=12,
                    num_heads=4,
                    mlp_ratio=2.,
                    **kwargs)
    model.default_cfg = default_cfgs['T2t_vit_12']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 18
def tnt_s_patch16_224(pretrained=False, **kwargs):
    model = TNT(patch_size=16,
                embed_dim=384,
                in_dim=24,
                depth=12,
                num_heads=6,
                in_num_head=4,
                qkv_bias=False,
                **kwargs)
    model.default_cfg = default_cfgs['tnt_s_patch16_224']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3),
                        filter_fn=checkpoint_filter_fn)
    return model
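The filter_fn hook takes the raw checkpoint state dict and returns a remapped dict the model can load. The real checkpoint_filter_fn is repo-specific; this sketch only illustrates the calling convention, with a made-up prefix strip as the transformation:

def example_filter_fn(state_dict, model=None):
    # Illustrative only: strip a DataParallel-style 'module.' prefix from keys.
    return {k.replace('module.', '', 1): v for k, v in state_dict.items()}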
Example no. 19
def localvit_T2t_conv7(pretrained=False,
                       **kwargs):  # uses Performer for the tokens-to-token stage
    if pretrained:
        kwargs.setdefault('qk_scale', 256**-0.5)
    model = LocalViT_T2T(tokens_type='performer',
                         embed_dim=256,
                         depth=7,
                         num_heads=4,
                         mlp_ratio=2.,
                         reduction=128,
                         **kwargs)
    model.default_cfg = default_cfgs['localvit_T2t_conv7']
    if pretrained:
        load_pretrained(model,
                        num_classes=model.num_classes,
                        in_chans=kwargs.get('in_chans', 3))
    return model
Example no. 20
def ptnt_b_patch16_256(pretrained=False, **kwargs):
    outer_dim = 256
    inner_dim = 16
    outer_head = 4
    inner_head = 2    
    configs = {
        'depths': [2, 10, 6, 2],
        'outer_dims': [outer_dim, outer_dim*2, outer_dim*4, outer_dim*4],
        'inner_dims': [inner_dim, inner_dim*2, inner_dim*4, inner_dim*4],
        'outer_heads': [outer_head, outer_head*2, outer_head*4, outer_head*4],
        'inner_heads': [inner_head, inner_head*2, inner_head*4, inner_head*4],
    }
    
    model = PyramidTNT(configs=configs, img_size=256, qkv_bias=False, **kwargs)
    model.default_cfg = default_cfgs['tnt_b_patch16_256']
    if pretrained:
        load_pretrained(
            model, num_classes=model.num_classes, in_chans=kwargs.get('in_chans', 3), filter_fn=_conv_filter)
    return model
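A quick sanity check on the stage widths above: every stage keeps 64 dims per outer head and 8 dims per inner head, so each embedding width divides evenly by its head count.

outer_dims, outer_heads = [256, 512, 1024, 1024], [4, 8, 16, 16]
inner_dims, inner_heads = [16, 32, 64, 64], [2, 4, 8, 8]
assert all(d % h == 0 and d // h == 64 for d, h in zip(outer_dims, outer_heads))
assert all(d % h == 0 and d // h == 8 for d, h in zip(inner_dims, inner_heads))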
Example no. 21
def create_vit(model_cfg):
    model_cfg = model_cfg.copy()
    backbone = model_cfg.pop("backbone")

    normalization = model_cfg.pop("normalization")
    model_cfg["n_cls"] = 1000
    mlp_expansion_ratio = 4
    model_cfg["d_ff"] = mlp_expansion_ratio * model_cfg["d_model"]

    if backbone in default_cfgs:
        default_cfg = default_cfgs[backbone]
    else:
        default_cfg = dict(
            pretrained=False,
            num_classes=1000,
            drop_rate=0.0,
            drop_path_rate=0.0,
            drop_block_rate=None,
        )

    default_cfg["input_size"] = (
        3,
        model_cfg["image_size"][0],
        model_cfg["image_size"][1],
    )
    model = VisionTransformer(**model_cfg)
    if backbone == "vit_base_patch8_384":
        path = os.path.expandvars(
            "$TORCH_HOME/hub/checkpoints/vit_base_patch8_384.pth")
        state_dict = torch.load(path, map_location="cpu")
        filtered_dict = checkpoint_filter_fn(state_dict, model)
        model.load_state_dict(filtered_dict, strict=True)
    elif "deit" in backbone:
        load_pretrained(model, default_cfg, filter_fn=checkpoint_filter_fn)
    else:
        load_custom_pretrained(model, default_cfg)

    return model
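The expected shape of model_cfg, judging from the pops above; the values are illustrative, and any remaining keys are whatever this repo's VisionTransformer accepts:

model_cfg = {
    "backbone": "vit_base_patch8_384",   # or any key of default_cfgs / a "deit" name
    "normalization": "vit",              # popped but unused in this function
    "image_size": (384, 384),
    "d_model": 768,
    # plus the remaining VisionTransformer kwargs (patch size, depth, heads, ...)
}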