Python ValueChoice Examples, nni.retiarii.nn.pytorch.ValueChoice Python Examples

Example #1

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Create two ``nn.Conv2d`` layers sized from dict-valued choices.

     Two ``ValueChoice`` objects are built over the same candidate list and
     both carry the label ``'a'``. ``conv`` takes its output channels from
     ``['b'][0]`` of the choice, ``conv1`` takes its input channels from
     ``['bp'][0]``.
     """
     super().__init__()
     candidates = [
         dict(b=[3], bp=[6]),
         dict(b=[6], bp=[12]),
     ]
     # Output channels of the first convolution.
     out_channels = nn.ValueChoice(candidates, label='a')['b'][0]
     self.conv = nn.Conv2d(3, out_channels, 1)
     # Input channels of the second convolution.
     in_channels = nn.ValueChoice(candidates, label='a')['bp'][0]
     self.conv1 = nn.Conv2d(in_channels, 3, 1)

Example #2

0

Show file

File: search.py Project: yinfupai/nni

 def __init__(self):
     """Assemble the layers of the search space.

     ``conv2`` is a ``LayerChoice`` between a plain ``nn.Conv2d`` and a
     ``DepthwiseSeparableConv``; the rate of ``dropout1`` and the width shared
     by ``fc1``/``fc2`` are ``ValueChoice`` objects.
     """
     super().__init__()
     self.conv1 = nn.Conv2d(1, 32, 3, 1)

     conv2_candidates = [
         nn.Conv2d(32, 64, 3, 1),
         DepthwiseSeparableConv(32, 64),
     ]
     self.conv2 = nn.LayerChoice(conv2_candidates)

     dropout_rate = nn.ValueChoice([0.25, 0.5, 0.75])
     self.dropout1 = nn.Dropout(dropout_rate)
     self.dropout2 = nn.Dropout(0.5)

     # The same choice object is the output of fc1 and the input of fc2.
     hidden_features = nn.ValueChoice([64, 128, 256])
     self.fc1 = nn.Linear(9216, hidden_features)
     self.fc2 = nn.Linear(hidden_features, 10)

Example #3

0

Show file

File: mobilenetv3.py Project: microsoft/nni

    def _make_stage(self, stage_idx, inp, oup, se, stride, act):
        """Build one stage of four ``InvertedResidual`` blocks with mutable depth.

        Args:
            stage_idx: Used only to label the stage's depth choice
                (``depth_{stage_idx}``).
            inp: Input channels of the first block.
            oup: Output channels of every block, and input channels of the
                second to fourth blocks.
            se: When truthy, each block receives a factory that builds a
                ``LayerChoice`` between ``nn.Identity`` and ``SELayer``;
                otherwise ``None`` is passed as ``squeeze_and_excite``.
            stride: Passed to the first block only; the other blocks do not
                pass a stride.
            act: Forwarded to every block as ``activation_layer``.

        Returns:
            An ``nn.Repeat`` over the four blocks with ``depth=(1, 4)``.

        Side effects:
            Increments ``self.layer_count`` by four; the counter is used to
            label the per-layer expand-ratio, kernel-size and SE choices.
        """

        def se_factory(label):
            # Capture the label by value. A lambda that formats
            # ``self.layer_count`` inside its body would read the counter only
            # when it is eventually called, i.e. after the loop below has
            # already advanced it, so every SE choice of the stage would end
            # up with the same (post-loop) label.
            return lambda hidden_ch: nn.LayerChoice(
                [nn.Identity(), SELayer(hidden_ch)], label=label)

        # initialize them first because they are related to layer_count.
        exp, ks, se_blocks = [], [], []
        for _ in range(4):
            exp.append(
                nn.ValueChoice(list(self.expand_ratios),
                               label=f'exp_{self.layer_count}'))
            ks.append(nn.ValueChoice([3, 5, 7],
                                     label=f'ks_{self.layer_count}'))
            if se:
                # if SE is true, assign a layer choice to SE
                se_blocks.append(se_factory(f'se_{self.layer_count}'))
            else:
                se_blocks.append(None)
            self.layer_count += 1

        blocks = [
            # stride = 2
            InvertedResidual(inp,
                             oup,
                             exp[0],
                             ks[0],
                             stride,
                             squeeze_and_excite=se_blocks[0],
                             activation_layer=act),
            # stride = 1, residual connection should be automatically enabled
            InvertedResidual(oup,
                             oup,
                             exp[1],
                             ks[1],
                             squeeze_and_excite=se_blocks[1],
                             activation_layer=act),
            InvertedResidual(oup,
                             oup,
                             exp[2],
                             ks[2],
                             squeeze_and_excite=se_blocks[2],
                             activation_layer=act),
            InvertedResidual(oup,
                             oup,
                             exp[3],
                             ks[3],
                             squeeze_and_excite=se_blocks[3],
                             activation_layer=act)
        ]

        # mutable depth
        return nn.Repeat(blocks, depth=(1, 4), label=f'depth_{stage_idx}')

Example #4

0

Show file

File: retiarii_transformer_demo.py Project: yinfupai/nni

 def __init__(self,
              n_token: int,
              n_head: int = 8,
              d_model: int = 512,
              d_ff: int = 2048):
     """Build the encoder, decoder, embedding and positional-encoding modules.

     Args:
         n_token: Size passed to ``nn.Embedding`` and output width of the
             decoder ``nn.Linear``.
         n_head: Second argument of ``nn.TransformerEncoderLayer``.
         d_model: Model width shared by every sub-module; also stored on
             ``self.d_model``.
         d_ff: Third argument of ``nn.TransformerEncoderLayer``.

     The dropout probability (label ``'p_dropout'``) and the number of
     encoder layers (label ``'n_layer'``) are ``ValueChoice`` objects.
     """
     super().__init__()
     dropout_choice = nn.ValueChoice([0.1, 0.2, 0.3, 0.4, 0.5],
                                     label='p_dropout')
     depth_choice = nn.ValueChoice(list(range(5, 10)), label='n_layer')

     encoder_layer = nn.TransformerEncoderLayer(d_model, n_head, d_ff,
                                                dropout_choice)
     self.encoder = nn.TransformerEncoder(encoder_layer, depth_choice)

     self.d_model = d_model
     self.decoder = nn.Linear(d_model, n_token)
     self.embeddings = nn.Embedding(n_token, d_model)
     self.position = PositionalEncoding(d_model)

Example #5

0

Show file

File: hello_nas.py Project: yinfupai/nni

 def __init__(self):
     """Declare the layers, marking the searchable parts with NNI mutables.

     * ``conv2`` -- ``LayerChoice`` between ``nn.Conv2d`` and
       ``DepthwiseSeparableConv``.
     * ``dropout1`` -- dropout rate picked from 0.25, 0.5 and 0.75.
     * ``fc1`` / ``fc2`` -- share one ``ValueChoice`` over 64, 128 and 256.
     """
     super().__init__()
     self.conv1 = nn.Conv2d(1, 32, 3, 1)

     # LayerChoice selects one layer out of the listed candidates.
     plain_conv = nn.Conv2d(32, 64, 3, 1)
     depthwise_conv = DepthwiseSeparableConv(32, 64)
     self.conv2 = nn.LayerChoice([plain_conv, depthwise_conv])

     # ValueChoice can be used as parameter of modules wrapped in
     # `nni.retiarii.nn.pytorch` or customized modules wrapped with
     # `@basic_unit`.
     rate_choice = nn.ValueChoice([0.25, 0.5, 0.75])
     self.dropout1 = nn.Dropout(rate_choice)
     self.dropout2 = nn.Dropout(0.5)

     width_choice = nn.ValueChoice([64, 128, 256])
     self.fc1 = nn.Linear(9216, width_choice)
     self.fc2 = nn.Linear(width_choice, 10)

Example #6

0

Show file

File: test_highlevel_apis.py Project: yimikai/nni

 def __init__(self):
     """Store a ``ValueChoice`` whose candidates are one-element lists."""
     super().__init__()
     rate_candidates = [[1.05], [1.1]]
     self.dropout_rate = nn.ValueChoice(rate_candidates)

Example #7

0

Show file

File: test_experiment.py Project: yinfupai/nni

 def __init__(self):
     """Build the layers, with mutable channels, pooling ops and a shortcut.

     ``channels`` is one ``ValueChoice`` shared by the output of ``conv1``
     and the input of ``conv2``; ``pool1``/``pool2`` are ``LayerChoice``
     objects and ``shortcut`` is an ``InputChoice(2, 1)``.
     """
     super().__init__()
     channels = nn.ValueChoice([4, 6, 8])
     self.conv1 = nn.Conv2d(1, channels, 5)

     first_pool_ops = [nn.MaxPool2d((2, 2)), nn.AvgPool2d((2, 2))]
     self.pool1 = nn.LayerChoice(first_pool_ops)

     self.conv2 = nn.Conv2d(channels, 16, 5)

     second_pool_ops = [
         nn.MaxPool2d(2),
         nn.AvgPool2d(2),
         nn.Conv2d(16, 16, 2, 2),
     ]
     self.pool2 = nn.LayerChoice(second_pool_ops)

     # 5*5 from image dimension
     flat_features = 16 * 5 * 5
     self.fc1 = nn.Linear(flat_features, 120)
     self.fc2 = nn.Linear(120, 84)
     self.fcplus = nn.Linear(84, 84)
     self.shortcut = nn.InputChoice(2, 1)
     self.fc3 = nn.Linear(84, 10)

Example #8

0

Show file

File: test_oneshot.py Project: maxpark/nni

 def __init__(self, value_choice=True):
     """Build the layers of the model.

     Args:
         value_choice: When truthy, the width shared by ``fc1``/``fc2`` is a
             ``ValueChoice`` over 32, 64 and 128; otherwise it is fixed to 64.
     """
     super().__init__()
     self.conv1 = nn.Conv2d(1, 32, 3, 1)

     conv_candidates = [
         nn.Conv2d(32, 64, 3, 1),
         DepthwiseSeparableConv(32, 64),
     ]
     self.conv2 = LayerChoice(conv_candidates)

     # One dropout module per candidate rate.
     dropout_candidates = [nn.Dropout(rate) for rate in (.25, .5, .75)]
     self.dropout1 = LayerChoice(dropout_candidates)
     self.dropout2 = nn.Dropout(0.5)

     hidden = nn.ValueChoice([32, 64, 128]) if value_choice else 64
     self.fc1 = nn.Linear(9216, hidden)
     self.fc2 = nn.Linear(hidden, 10)
     self.rpfc = nn.Linear(10, 10)
     self.input_ch = InputChoice(2, 1)

Example #9

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Create one convolution configured from a tuple-valued choice.

     Element 0 of the chosen tuple is the output channel count, element 1 is
     the kernel size.
     """
     super().__init__()
     conv_config = nn.ValueChoice([(6, 3), (8, 5)])
     out_channels = conv_config[0]
     kernel = conv_config[1]
     self.conv = nn.Conv2d(3, out_channels, kernel_size=kernel)

Example #10

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Create a ``LayerChoice`` of two linear layers with mutable widths.

     Each candidate ``nn.Linear`` has its own ``ValueChoice`` for the output
     width: 10/20 for the first, 30/40 for the second.
     """
     super().__init__()
     width_options = ([10, 20], [30, 40])
     candidates = [
         nn.Linear(3, nn.ValueChoice(widths)) for widths in width_options
     ]
     self.linear = nn.LayerChoice(candidates)

Example #11

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Store a ``ValueChoice`` over the two rates 0.0 and 1.0."""
     super().__init__()
     rate_candidates = [0.0, 1.0]
     self.dropout_rate = nn.ValueChoice(rate_candidates)

Example #12

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Create two convolutions whose output-channel choices share a label.

     Each layer gets its own ``ValueChoice([6, 8])`` object, both labelled
     ``'shared'``.
     """
     super().__init__()
     first_width = nn.ValueChoice([6, 8], label='shared')
     self.conv1 = nn.Conv2d(3, first_width, 1)
     second_width = nn.ValueChoice([6, 8], label='shared')
     self.conv2 = nn.Conv2d(3, second_width, 1)

Example #13

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Create a convolution with independent width and kernel-size choices."""
     super().__init__()
     # Created in the same order as the call arguments: width, then kernel.
     out_channels = nn.ValueChoice([6, 8])
     kernel = nn.ValueChoice([3, 5])
     self.conv = nn.Conv2d(3, out_channels, kernel_size=kernel)

Example #14

0

Show file

File: mobilenetv3.py Project: microsoft/nni

    def __init__(self,
                 num_labels: int = 1000,
                 base_widths: Tuple[int, ...] = (16, 16, 32, 64, 128, 256, 512,
                                                 1024),
                 width_multipliers: Tuple[float, ...] = (0.5, 0.625, 0.75, 1.0,
                                                         1.25, 1.5, 2.0),
                 expand_ratios: Tuple[int, ...] = (1, 2, 3, 4, 5, 6),
                 dropout_rate: float = 0.2,
                 bn_eps: float = 1e-3,
                 bn_momentum: float = 0.1):
        """Build the stem, five searchable stages, head and classifier.

        Args:
            num_labels: Output width of the final ``nn.Linear`` classifier.
            base_widths: One base channel count per width slot. Indices 0-7
                of the resulting ``self.widths`` are read below, so at least
                eight entries are required.
            width_multipliers: Multipliers applied to every base width; each
                product is passed through ``make_divisible(..., 8)`` to form
                the candidates of that slot's ``ValueChoice``.
            expand_ratios: Stored on ``self.expand_ratios`` for later use.
            dropout_rate: Rate of the ``nn.Dropout`` in the classifier.
            bn_eps: Forwarded to ``reset_parameters``.
            bn_momentum: Forwarded to ``reset_parameters``.
        """
        super().__init__()

        # One ValueChoice per base width, labelled ``width_{i}``.
        self.widths = [
            nn.ValueChoice([
                make_divisible(base_width * mult, 8)
                for mult in width_multipliers
            ],
                           label=f'width_{i}')
            for i, base_width in enumerate(base_widths)
        ]
        self.expand_ratios = expand_ratios

        blocks = [
            # Stem
            ConvBNReLU(3,
                       self.widths[0],
                       nn.ValueChoice([3, 5], label='ks_0'),
                       stride=2,
                       activation_layer=h_swish),
            SeparableConv(self.widths[0],
                          self.widths[0],
                          activation_layer=nn.ReLU),
        ]

        # counting for kernel sizes and expand ratios
        self.layer_count = 2

        # NOTE(review): stage index 1 is passed for four of the five stages
        # below. If `_make_stage` derives a label from this index, those
        # stages will share that label -- confirm this is intended.
        blocks += [
            # Body
            self._make_stage(1, self.widths[0], self.widths[1], False, 2,
                             nn.ReLU),
            self._make_stage(2, self.widths[1], self.widths[2], True, 2,
                             nn.ReLU),
            self._make_stage(1, self.widths[2], self.widths[3], False, 2,
                             h_swish),
            self._make_stage(1, self.widths[3], self.widths[4], True, 1,
                             h_swish),
            self._make_stage(1, self.widths[4], self.widths[5], True, 2,
                             h_swish),
        ]

        # Head
        blocks += [
            ConvBNReLU(self.widths[5],
                       self.widths[6],
                       1,
                       1,
                       activation_layer=h_swish),
            nn.AdaptiveAvgPool2d(1),
            ConvBNReLU(self.widths[6],
                       self.widths[7],
                       1,
                       1,
                       norm_layer=nn.Identity,
                       activation_layer=h_swish),
        ]

        self.blocks = nn.Sequential(*blocks)

        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(self.widths[7], num_labels),
        )

        # Applied last, after every sub-module has been created.
        reset_parameters(self, bn_momentum=bn_momentum, bn_eps=bn_eps)

Example #15

0

Show file

    def __init__(self,
                 op_candidates: List[str],
                 merge_op: Literal['all', 'loose_end'] = 'all',
                 num_nodes_per_cell: int = 4,
                 width: Union[Tuple[int, ...], int] = 16,
                 num_cells: Union[Tuple[int, ...], int] = 20,
                 dataset: Literal['cifar', 'imagenet'] = 'imagenet',
                 auxiliary_loss: bool = False):
        """Build the stem, three cell stages, optional auxiliary head and classifier.

        Args:
            op_candidates: Forwarded to ``CellBuilder``.
            merge_op: Forwarded to ``CellBuilder``.
            num_nodes_per_cell: Forwarded to ``CellBuilder``.
            width: Base channel count. An iterable becomes a ``ValueChoice``
                labelled ``'width'``; anything else is used as-is.
            num_cells: Total number of cells, split over three stages. An
                iterable becomes a ``ValueChoice`` labelled ``'depth'``.
            dataset: Selects the stem layout. ``num_labels`` is 10 for
                ``'cifar'`` and 1000 otherwise.
            auxiliary_loss: When true, stage 2 must be an ``nn.Sequential``;
                it is wrapped in ``SequentialBreakdown`` and an
                ``AuxiliaryHead`` is created.

        Raises:
            ValueError: If ``dataset`` is neither ``'imagenet'`` nor
                ``'cifar'``.
            AssertionError: If ``auxiliary_loss`` is set while stage 2 is not
                an ``nn.Sequential``.
        """
        super().__init__()

        self.dataset = dataset
        self.num_labels = 10 if dataset == 'cifar' else 1000
        self.auxiliary_loss = auxiliary_loss

        # preprocess the specified width and depth
        if isinstance(width, Iterable):
            C = nn.ValueChoice(list(width), label='width')
        else:
            C = width

        if isinstance(num_cells, Iterable):
            num_cells = nn.ValueChoice(list(num_cells), label='depth')
        # Per-stage cell counts; the three entries add up to num_cells.
        num_cells_per_stage = [
            i * num_cells // 3 - (i - 1) * num_cells // 3 for i in range(3)
        ]

        # the stem is different for networks targeted at different datasets
        if dataset == 'imagenet':
            self.stem0 = nn.Sequential(
                nn.Conv2d(3,
                          C // 2,
                          kernel_size=3,
                          stride=2,
                          padding=1,
                          bias=False),
                nn.BatchNorm2d(C // 2),
                nn.ReLU(inplace=True),
                nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(C),
            )
            self.stem1 = nn.Sequential(
                nn.ReLU(inplace=True),
                nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(C),
            )
            C_pprev = C_prev = C_curr = C
            last_cell_reduce = True
        elif dataset == 'cifar':
            self.stem = nn.Sequential(
                nn.Conv2d(3, 3 * C, 3, padding=1, bias=False),
                nn.BatchNorm2d(3 * C))
            C_pprev = C_prev = 3 * C
            C_curr = C
            last_cell_reduce = False
        else:
            # Without this branch the channel variables stay unbound and the
            # stage loop below fails with an opaque UnboundLocalError.
            raise ValueError(f'Unsupported dataset: {dataset}')

        self.stages = nn.ModuleList()
        for stage_idx in range(3):
            if stage_idx > 0:
                C_curr *= 2
            # For a stage, we get C_in, C_curr, and C_out.
            # C_in is only used in the first cell.
            # C_curr is number of channels for each operator in current stage.
            # C_out is usually `C * num_nodes_per_cell` because of concat operator.
            cell_builder = CellBuilder(op_candidates, C_pprev, C_prev, C_curr,
                                       num_nodes_per_cell, merge_op,
                                       stage_idx > 0, last_cell_reduce)
            stage = nn.Repeat(cell_builder, num_cells_per_stage[stage_idx])
            self.stages.append(stage)

            # C_pprev is the output channel number of the second-to-last cell
            # among all the cells already built.
            if len(stage) > 1:
                # Contains more than one cell
                C_pprev = len(stage[-2].output_node_indices) * C_curr
            else:
                # Look up in the out channels of last stage.
                C_pprev = C_prev

            # This was originally,
            # C_prev = num_nodes_per_cell * C_curr.
            # but due to loose end, it becomes,
            C_prev = len(stage[-1].output_node_indices) * C_curr

            # Useful in aligning the pprev and prev cell.
            last_cell_reduce = cell_builder.last_cell_reduce

            if stage_idx == 2:
                C_to_auxiliary = C_prev

        if auxiliary_loss:
            assert isinstance(
                self.stages[2], nn.Sequential
            ), 'Auxiliary loss can only be enabled in retrain mode.'
            self.stages[2] = SequentialBreakdown(self.stages[2])
            self.auxiliary_head = AuxiliaryHead(C_to_auxiliary,
                                                self.num_labels,
                                                dataset=self.dataset)

        self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Linear(C_prev, self.num_labels)

Example #16

0

Show file

File: shufflenet.py Project: microsoft/nni

    def __init__(self,
                 num_labels: int = 1000,
                 channel_search: bool = False,
                 affine: bool = False):
        """Build the first conv, the searchable feature blocks and the head.

        Args:
            num_labels: Output width of the final ``nn.Linear`` classifier.
            channel_search: When true, the mid channels of every block become
                a ``ValueChoice`` labelled ``channel_{n}``; otherwise they are
                fixed to half of the block's output channels.
            affine: Forwarded to every candidate block and to the last
                ``nn.BatchNorm2d``.
        """
        super().__init__()

        self.num_labels = num_labels
        self.channel_search = channel_search
        self.affine = affine

        # Blocks per stage: 4 stages, 20 blocks in total.
        self.stage_repeats = [4, 4, 8, 4]

        # Output channels of every stage, including the very first layer and
        # the very last layer.
        self.stage_out_channels = [-1, 16, 64, 160, 320, 640, 1024]

        # First layer.
        out_channels = self.stage_out_channels[1]
        self.first_conv = nn.Sequential(
            nn.Conv2d(3, out_channels, 3, 2, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

        layers = []
        # Running 1-based block number, used to name the choices.
        block_number = 0
        for stage_idx, num_repeat in enumerate(self.stage_repeats):
            for block_idx in range(num_repeat):
                block_number += 1

                # The previous output feeds the current block.
                in_channels = out_channels
                out_channels = self.stage_out_channels[stage_idx + 2]
                # Only the first block of a stage uses stride 2.
                stride = 1 if block_idx != 0 else 2

                # Mid channels can be searched.
                base_mid_channels = out_channels // 2
                if not self.channel_search:
                    mid_channels = int(base_mid_channels)
                else:
                    mid_candidates = [
                        int(base_mid_channels * (.2 * k)) for k in range(1, 9)
                    ]
                    mid_channels = nn.ValueChoice(
                        mid_candidates, label=f'channel_{block_number}')

                # Three ShuffleNet blocks (kernel 3, 5, 7), then one
                # Xception-style block.
                candidates = [
                    ShuffleNetBlock(in_channels,
                                    out_channels,
                                    mid_channels=mid_channels,
                                    kernel_size=kernel_size,
                                    stride=stride,
                                    affine=affine)
                    for kernel_size in (3, 5, 7)
                ]
                candidates.append(
                    ShuffleXceptionBlock(in_channels,
                                         out_channels,
                                         mid_channels=mid_channels,
                                         stride=stride,
                                         affine=affine))
                layers.append(
                    nn.LayerChoice(candidates, label=f'layer_{block_number}'))

        self.features = nn.Sequential(*layers)

        # Final layers.
        last_conv_channels = self.stage_out_channels[-1]
        self.conv_last = nn.Sequential(
            nn.Conv2d(out_channels, last_conv_channels, 1, 1, 0, bias=False),
            nn.BatchNorm2d(last_conv_channels, affine=affine),
            nn.ReLU(inplace=True),
        )
        self.globalpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Sequential(
            nn.Linear(last_conv_channels, num_labels, bias=False), )

        self._initialize_weights()

Example #17

0

Show file

File: test_highlevel_apis.py Project: xiaowu0162/nni

 def __init__(self):
     """Store an index ``ValueChoice`` over 0 and 1 and a ``MutableConv``."""
     super().__init__()
     index_candidates = [0, 1]
     self.index = nn.ValueChoice(index_candidates)
     self.conv = MutableConv()

Example #18

0

Show file

    def __init__(self,
                 op_candidates: List[str],
                 merge_op: Literal['all', 'loose_end'] = 'all',
                 num_nodes_per_cell: int = 4,
                 width: Union[Tuple[int, ...], int] = 16,
                 num_cells: Union[Tuple[int, ...], int] = 20,
                 dataset: Literal['cifar', 'imagenet'] = 'imagenet',
                 auxiliary_loss: bool = False):
        """Build the stem, three ``NDSStage`` stages, optional auxiliary head and classifier.

        Args:
            op_candidates: Forwarded to ``CellBuilder``.
            merge_op: Forwarded to ``CellBuilder``.
            num_nodes_per_cell: Forwarded to ``CellBuilder``; also used for
                each stage's ``estimated_out_channels``.
            width: Base channel count. An iterable becomes a ``ValueChoice``
                labelled ``'width'``; anything else is used as-is.
            num_cells: Total number of cells, split over three stages. An
                iterable becomes a ``ValueChoice`` labelled ``'depth'``. The
                result is kept on ``self.num_cells``.
            dataset: Selects the stem layout. ``num_labels`` is 10 for
                ``'cifar'`` and 1000 otherwise.
            auxiliary_loss: When true, stage 2 must be an ``nn.Sequential``;
                it is wrapped in ``SequentialBreakdown`` and an
                ``AuxiliaryHead`` is created.

        Raises:
            ValueError: If ``dataset`` is neither ``'imagenet'`` nor
                ``'cifar'``.
            AssertionError: If ``auxiliary_loss`` is set while stage 2 is not
                an ``nn.Sequential``.
        """
        super().__init__()

        self.dataset = dataset
        self.num_labels = 10 if dataset == 'cifar' else 1000
        self.auxiliary_loss = auxiliary_loss

        # preprocess the specified width and depth
        if isinstance(width, Iterable):
            C = nn.ValueChoice(list(width), label='width')
        else:
            C = width

        self.num_cells: nn.MaybeChoice[int] = cast(int, num_cells)
        if isinstance(num_cells, Iterable):
            self.num_cells = nn.ValueChoice(list(num_cells), label='depth')
        # Per-stage cell counts; the three differences telescope, so they add
        # up to num_cells.
        num_cells_per_stage = [
            (i + 1) * self.num_cells // 3 - i * self.num_cells // 3
            for i in range(3)
        ]

        # the stem is different for networks targeted at different datasets
        if dataset == 'imagenet':
            self.stem0 = nn.Sequential(
                nn.Conv2d(3,
                          cast(int, C // 2),
                          kernel_size=3,
                          stride=2,
                          padding=1,
                          bias=False),
                nn.BatchNorm2d(cast(int, C // 2)),
                nn.ReLU(inplace=True),
                nn.Conv2d(cast(int, C // 2),
                          cast(int, C),
                          3,
                          stride=2,
                          padding=1,
                          bias=False),
                nn.BatchNorm2d(C),
            )
            self.stem1 = nn.Sequential(
                nn.ReLU(inplace=True),
                nn.Conv2d(cast(int, C),
                          cast(int, C),
                          3,
                          stride=2,
                          padding=1,
                          bias=False),
                nn.BatchNorm2d(C),
            )
            C_pprev = C_prev = C_curr = C
            last_cell_reduce = True
        elif dataset == 'cifar':
            self.stem = nn.Sequential(
                nn.Conv2d(3, cast(int, 3 * C), 3, padding=1, bias=False),
                nn.BatchNorm2d(cast(int, 3 * C)))
            C_pprev = C_prev = 3 * C
            C_curr = C
            last_cell_reduce = False
        else:
            raise ValueError(f'Unsupported dataset: {dataset}')

        self.stages = nn.ModuleList()
        for stage_idx in range(3):
            # The per-operator channel count doubles from the second stage on.
            if stage_idx > 0:
                C_curr *= 2
            # For a stage, we get C_in, C_curr, and C_out.
            # C_in is only used in the first cell.
            # C_curr is number of channels for each operator in current stage.
            # C_out is usually `C * num_nodes_per_cell` because of concat operator.
            cell_builder = CellBuilder(op_candidates, C_pprev, C_prev, C_curr,
                                       num_nodes_per_cell, merge_op,
                                       stage_idx > 0, last_cell_reduce)
            stage: Union[NDSStage, nn.Sequential] = NDSStage(
                cell_builder, num_cells_per_stage[stage_idx])

            # Record channel/downsampling metadata on the stage object.
            if isinstance(stage, NDSStage):
                stage.estimated_out_channels_prev = cast(int, C_prev)
                stage.estimated_out_channels = cast(
                    int, C_curr * num_nodes_per_cell)
                stage.downsampling = stage_idx > 0

            self.stages.append(stage)

            # NOTE: output_node_indices will be computed on-the-fly in trial code.
            # When constructing model space, it's just all the nodes in the cell,
            # which happens to be the case of one-shot supernet.

            # C_pprev is the output channel number of the second-to-last cell
            # among all the cells already built.
            if len(stage) > 1:
                # Contains more than one cell
                C_pprev = len(cast(nn.Cell,
                                   stage[-2]).output_node_indices) * C_curr
            else:
                # Look up in the out channels of last stage.
                C_pprev = C_prev

            # This was originally,
            # C_prev = num_nodes_per_cell * C_curr.
            # but due to loose end, it becomes,
            C_prev = len(cast(nn.Cell, stage[-1]).output_node_indices) * C_curr

            # Useful in aligning the pprev and prev cell.
            last_cell_reduce = cell_builder.last_cell_reduce

            # Channel count fed to the auxiliary head (set on the last stage).
            if stage_idx == 2:
                C_to_auxiliary = C_prev

        if auxiliary_loss:
            assert isinstance(
                self.stages[2], nn.Sequential
            ), 'Auxiliary loss can only be enabled in retrain mode.'
            self.stages[2] = SequentialBreakdown(
                cast(nn.Sequential, self.stages[2]))
            self.auxiliary_head = AuxiliaryHead(
                C_to_auxiliary, self.num_labels,
                dataset=self.dataset)  # type: ignore

        self.global_pooling = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Linear(cast(int, C_prev), self.num_labels)

Example #19

0

Show file

File: autoformer.py Project: yinfupai/nni

    def __init__(
            self,
            search_embed_dim: Tuple[int, ...] = (192, 216, 240),
            search_mlp_ratio: Tuple[float, ...] = (3.5, 4.0),
            search_num_heads: Tuple[int, ...] = (3, 4),
            search_depth: Tuple[int, ...] = (12, 13, 14),
            img_size: int = 224,
            patch_size: int = 16,
            in_chans: int = 3,
            num_classes: int = 1000,
            qkv_bias: bool = False,
            drop_rate: float = 0.,
            attn_drop_rate: float = 0.,
            drop_path_rate: float = 0.,
            pre_norm: bool = True,
            global_pool: bool = False,
            abs_pos: bool = True,
            qk_scale: Optional[float] = None,
            rpe: bool = True,
    ):
        """Build the patch embedding, tokens, searchable encoder blocks and head.

        Args:
            search_embed_dim: Candidates for the embedding dimension. Used for
                both a ``ValueChoice`` and a ``ModelParameterChoice``, which
                share the label ``"embed_dim"``.
            search_mlp_ratio: Candidate MLP ratios; combined with
                ``search_num_heads`` via ``itertools.product`` to form the
                candidates of every layer's ``LayerChoice``.
            search_num_heads: Candidate head counts (see above).
            search_depth: Candidates for the number of blocks (label
                ``"depth"``); its maximum sets the length of the drop-path
                schedule.
            img_size: With ``patch_size``, determines ``self.patches_num`` and
                the ``rpe_length`` of every encoder layer.
            patch_size: Kernel size and stride of the patch-embedding conv.
            in_chans: Input channels of the patch-embedding conv.
            num_classes: Output width of the head; a value ``<= 0`` yields
                ``nn.Identity`` instead of ``nn.Linear``.
            qkv_bias: Forwarded to ``TransformerEncoderLayer``.
            drop_rate: Forwarded to ``TransformerEncoderLayer``.
            attn_drop_rate: Forwarded as ``attn_drop``.
            drop_path_rate: Upper end of the linearly spaced per-layer
                drop-path values.
            pre_norm: Forwarded to every encoder layer; when true a final
                ``nn.LayerNorm`` is also created as ``self.norm``.
            global_pool: Stored on ``self.global_pool``.
            abs_pos: When true, ``self.pos_embed`` is created.
            qk_scale: Forwarded to ``TransformerEncoderLayer``.
            rpe: Forwarded to ``TransformerEncoderLayer``.
        """
        super().__init__()

        # Both mutables below share the label "embed_dim".
        embed_dim = nn.ValueChoice(list(search_embed_dim), label="embed_dim")
        fixed_embed_dim = nn.ModelParameterChoice(
            list(search_embed_dim), label="embed_dim")
        depth = nn.ValueChoice(list(search_depth), label="depth")
        self.patch_embed = nn.Conv2d(
            in_chans,
            cast(int, embed_dim),
            kernel_size=patch_size,
            stride=patch_size)
        # Number of patches: (img_size // patch_size) squared.
        self.patches_num = int((img_size // patch_size) ** 2)
        self.global_pool = global_pool
        self.cls_token = nn.Parameter(torch.zeros(1, 1, cast(int, fixed_embed_dim)))
        trunc_normal_(self.cls_token, std=.02)

        # One drop-path value per possible layer, from 0 to drop_path_rate.
        dpr = [
            x.item() for x in torch.linspace(
                0,
                drop_path_rate,
                max(search_depth))]  # stochastic depth decay rule

        self.abs_pos = abs_pos
        if self.abs_pos:
            # One position per patch plus one extra slot.
            self.pos_embed = nn.Parameter(torch.zeros(
                1, self.patches_num + 1, cast(int, fixed_embed_dim)))
            trunc_normal_(self.pos_embed, std=.02)

        # Each repeated layer is a LayerChoice over every
        # (mlp_ratio, num_heads) combination, labelled ``layer{index}``.
        self.blocks = nn.Repeat(lambda index: nn.LayerChoice([
            TransformerEncoderLayer(embed_dim=embed_dim,
                                    fixed_embed_dim=fixed_embed_dim,
                                    num_heads=num_heads, mlp_ratio=mlp_ratio,
                                    qkv_bias=qkv_bias, drop_rate=drop_rate,
                                    attn_drop=attn_drop_rate,
                                    drop_path=dpr[index],
                                    rpe_length=img_size // patch_size,
                                    qk_scale=qk_scale, rpe=rpe,
                                    pre_norm=pre_norm,)
            for mlp_ratio, num_heads in itertools.product(search_mlp_ratio, search_num_heads)
        ], label=f'layer{index}'), depth)
        self.pre_norm = pre_norm
        # NOTE: ``self.norm`` only exists when pre_norm is true.
        if self.pre_norm:
            self.norm = nn.LayerNorm(cast(int, embed_dim))
        self.head = nn.Linear(
            cast(int, embed_dim),
            num_classes) if num_classes > 0 else nn.Identity()