# NOTE(review): this block arrived as a single collapsed line; only formatting
# and comments were restored here -- every code token is unchanged.  The
# statements up to `return x` are the tail of an enclosing function whose
# `def` lies outside this view (2-space indent depth assumed; confirm against
# the full file).
  # Run the transformer encoder over the token sequence.
  x = Encoder(x, train=train, name='Transformer', **transformer)
  if classifier == 'token':
    # Classify from the first token of the sequence.
    x = x[:, 0]
  elif classifier == 'gap':
    # Global average pool over every axis between batch and features.
    x = jnp.mean(x, axis=list(range(1, x.ndim - 1)))  # (1,) or (1,2)
  if representation_size is not None:
    # Optional extra projection with tanh before the classification head.
    x = nn.Dense(x, representation_size, name='pre_logits')
    x = nn.tanh(x)
  else:
    # Presumably an identity pass-through kept under a stable 'pre_logits'
    # module name -- verify IdentityLayer's definition elsewhere in the file.
    x = IdentityLayer(x, name='pre_logits')
  # Final linear classification head; kernel is zero-initialized.
  x = nn.Dense(x, num_classes, name='head', kernel_init=nn.initializers.zeros)
  return x


# Named model configurations (hyperparameter dicts built by `configs`).
CONFIGS = {
    'ViT-B_16': configs.get_b16_config(),
    'ViT-B_32': configs.get_b32_config(),
    'ViT-L_16': configs.get_l16_config(),
    'ViT-L_32': configs.get_l32_config(),
    'ViT-H_14': configs.get_h14_config(),
    'testing': configs.get_testing(),
}

# Map each config name to a VisionTransformer with that config pre-bound.
KNOWN_MODELS = {
    name: VisionTransformer.partial(**config)
    for name, config in CONFIGS.items()
}
# NOTE(review): this block arrived as a single collapsed line and appears to
# be a second copy of the preceding block, differing in quote style, the
# added cls-token concatenation, and the missing "ViT-H_14" entry.  It
# re-assigns CONFIGS and KNOWN_MODELS at module level, shadowing any earlier
# definitions -- confirm which copy is current.  Only formatting and comments
# were restored here; every code token is unchanged.  The statements up to
# `return x` are the tail of an enclosing function whose `def` lies outside
# this view (2-space indent depth assumed; confirm against the full file).
  # Broadcast the cls embedding across the batch and prepend it to the token
  # sequence.  Assumes `n` is the batch size and `cls` has a leading
  # broadcastable dimension -- TODO confirm against the enclosing function.
  cls = jnp.tile(cls, [n, 1, 1])
  x = jnp.concatenate([cls, x], axis=1)
  # Run the transformer encoder over the token sequence.
  x = Encoder(x, train=train, name="Transformer", **transformer)
  if classifier == "token":
    # Classify from the first token (the cls token prepended above).
    x = x[:, 0]
  elif classifier == "gap":
    # Global average pool over every axis between batch and features.
    x = jnp.mean(x, axis=list(range(1, x.ndim - 1)))  # (1,) or (1,2)
  if representation_size is not None:
    # Optional extra projection with tanh before the classification head.
    x = nn.Dense(x, representation_size, name="pre_logits")
    x = nn.tanh(x)
  else:
    # Presumably an identity pass-through kept under a stable "pre_logits"
    # module name -- verify IdentityLayer's definition elsewhere in the file.
    x = IdentityLayer(x, name="pre_logits")
  # Final linear classification head; kernel is zero-initialized.
  x = nn.Dense(x, num_classes, name="head", kernel_init=nn.initializers.zeros)
  return x


# Named model configurations (hyperparameter dicts built by `configs`).
# NOTE(review): unlike the earlier copy, there is no "ViT-H_14" entry here.
CONFIGS = {
    "ViT-B_16": configs.get_b16_config(),
    "ViT-B_32": configs.get_b32_config(),
    "ViT-L_16": configs.get_l16_config(),
    "ViT-L_32": configs.get_l32_config(),
    "testing": configs.get_testing(),
}

# Map each config name to a VisionTransformer with that config pre-bound.
KNOWN_MODELS = {
    name: VisionTransformer.partial(**config)
    for name, config in CONFIGS.items()
}