Example #1
 def create_block(
     self,
     width_in: int,
     width_out: int,
     stride: int,
     params: RegNetParams,
     bot_mul: float,
     group_width: int = 1,
 ):
     block_constructor = self.BLOCK_TYPES[params.block_type.upper()]
     activation = self.ACTIVATION_TYPES[params.activation]()
     block = block_constructor(
         width_in,
         width_out,
         stride,
         params.bn_epsilon,
         params.bn_momentum,
         activation,
         bot_mul,
         group_width,
         params.se_ratio,
     ).cuda()
     with set_torch_seed(self.seed):
         init_weights(block)
         self.seed += 1
     return block
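
The snippet above (like the other factory methods below) runs `init_weights` inside a `set_torch_seed` context and then bumps `self.seed`, so each block receives distinct but reproducible initial weights. The helper itself is not shown in these examples; the following is a minimal sketch of what such a context manager could look like, assuming only the global torch CPU RNG state needs to be scoped (the real helper may also snapshot CUDA RNG state):

    import contextlib

    import torch

    @contextlib.contextmanager
    def set_torch_seed(seed: int):
        # Sketch: seed the torch RNG for the duration of the block, then
        # restore whatever RNG state was active before entering the context.
        rng_state = torch.get_rng_state()
        torch.manual_seed(seed)
        try:
            yield
        finally:
            torch.set_rng_state(rng_state)
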
Example #2
 def __init__(self, model_config: AttrDict, model_name: str):
     super().__init__()
     self.model_config = model_config
     self.seed = self.model_config._MODEL_INIT_SEED
     with set_torch_seed(self.seed):
         self._feature_blocks, self.trunk_depth = create_regnet_feature_blocks(
             factory=RegnetFSDPBlocksFactory(self.model_config.FSDP_CONFIG),
             model_config=model_config,
         )
Example #3
 def create_stem(self, params: RegNetParams):
     activation = self.ACTIVATION_TYPES[params.activation]()
     stem = self.STEM_TYPES[params.stem_type](3, params.stem_width,
                                              params.bn_epsilon,
                                              params.bn_momentum,
                                              activation)
     with set_torch_seed(self.seed):
         init_weights(stem)
         self.seed += 1
     return stem
Example #4
 def __init__(self, model_config: AttrDict, model_name: str):
     super().__init__()
     self.model_config = model_config
     self.seed = self.model_config._MODEL_INIT_SEED
     self.use_activation_checkpointing = (
         model_config.ACTIVATION_CHECKPOINTING.USE_ACTIVATION_CHECKPOINTING)
     self.activation_checkpointing_splits = (
         model_config.ACTIVATION_CHECKPOINTING.
         NUM_ACTIVATION_CHECKPOINTING_SPLITS)
     with set_torch_seed(self.seed):
         self._feature_blocks, self.trunk_depth = create_regnet_feature_blocks(
             factory=RegnetBlocksFactory(), model_config=model_config)
Example #5
 def __init__(self, model_config: AttrDict, model_name: str):
     super().__init__()
     self.model_config = model_config
     self.seed = self.model_config._MODEL_INIT_SEED
     self.use_activation_checkpointing = (
         model_config.ACTIVATION_CHECKPOINTING.USE_ACTIVATION_CHECKPOINTING)
     with set_torch_seed(self.seed):
         self._feature_blocks, self.trunk_depth = create_regnet_feature_blocks(
             factory=RegnetFSDPBlocksFactory(
                 fsdp_config=self.model_config.FSDP_CONFIG,
                 use_activation_checkpointing=self.use_activation_checkpointing,
             ),
             model_config=self.model_config,
         )
Example #6
    def create_stem(self, params: Union[RegNetParams, AnyNetParams]):
        # get the activation
        silu = None if get_torch_version() < [1, 7] else nn.SiLU()
        activation = {
            ActivationType.RELU: nn.ReLU(params.relu_in_place),
            ActivationType.SILU: silu,
        }[params.activation]

        # create stem
        stem = {
            StemType.RES_STEM_CIFAR: ResStemCifar,
            StemType.RES_STEM_IN: ResStemIN,
            StemType.SIMPLE_STEM_IN: SimpleStemIN,
        }[params.stem_type](3, params.stem_width, params.bn_epsilon,
                            params.bn_momentum, activation)

        # set stem seeds
        with set_torch_seed(self.seed):
            init_weights(stem)
            self.seed += 1
        return stem
Example #7
    def create_block(
        self,
        width_in: int,
        width_out: int,
        stride: int,
        params: Union[RegNetParams, AnyNetParams],
        bottleneck_multiplier: float,
        group_width: int = 1,
    ):
        # get the block constructor function to use
        block_constructor = {
            BlockType.VANILLA_BLOCK: VanillaBlock,
            BlockType.RES_BASIC_BLOCK: ResBasicBlock,
            BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock,
            BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock,
        }[params.block_type]

        # get the activation module
        silu = None if get_torch_version() < [1, 7] else nn.SiLU()
        activation = {
            ActivationType.RELU: nn.ReLU(params.relu_in_place),
            ActivationType.SILU: silu,
        }[params.activation]

        block = block_constructor(
            width_in,
            width_out,
            stride,
            params.bn_epsilon,
            params.bn_momentum,
            activation,
            group_width,
            bottleneck_multiplier,
            params.se_ratio,
        ).cuda()
        with set_torch_seed(self.seed):
            init_weights(block)
            self.seed += 1
        return block
Example #8
    def _get_heads(self):
        """
        This function creates the heads needed by the module.
        HEAD.PARAMS is a list containing parameters for (multiple) heads.
        Each head consists of head_modules that can be composed in different ways.

        * Head Module
            A head_module is specified as a list ["name", kwargs], for example,
            ["mlp", {"dims": [2048, 128]}]

        * Heads can be applied to different types of inputs.
          See `_setup_multi_input_head_mapping`

        Examples of Heads one can specify:
        * Case1: Simple Head containing single module - Single Input, Single output
            ["mlp", {"dims": [2048, 128]}]

        * Case2: Complex Head containing chain of head modules
          Single Input, Single output
            [
                ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
                ["siamese_concat_view", {"num_towers": 9}],
                ["mlp", {"dims": [9000, 128]}]
            ]

        * Case3: Multiple Heads (example 2 heads) - Single input, multiple output
            Can be used for multi-task learning
            # head 0
            [
                ["mlp", {"dims": [2048, 128]}]
            ],
            # head 1
            [
                ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
                ["siamese_concat_view", {"num_towers": 9}],
                ["mlp", {"dims": [9000, 128]}],
            ]

        * Case4: Multiple Heads (example 5 simple heads) - Single input, multiple output.
          For example, used in linear evaluation of models
            ["eval_mlp", {"in_channels": 64, "dims": [9216, 1000]}],
            ["eval_mlp", {"in_channels": 256, "dims": [9216, 1000]}],
            ["eval_mlp", {"in_channels": 512, "dims": [8192, 1000]}],
            ["eval_mlp", {"in_channels": 1024, "dims": [9216, 1000]}],
            ["eval_mlp", {"in_channels": 2048, "dims": [8192, 1000]}],

        """
        for head_param in self.model_config.HEAD.PARAMS:
            if isinstance(head_param[0], list):
                # head is composed of several modules
                head_type, head_modules = [], []
                for idx in range(len(head_param)):
                    with set_torch_seed(self.model_config._MODEL_INIT_SEED + idx):
                        head_modules.append(self._build_head_module(head_param[idx]))
                    head_type.append(head_param[idx][0])
                head_name = "->".join(head_type)
                head = nn.Sequential(*head_modules)
            else:
                # head is a single module
                head_name = head_param[0]
                with set_torch_seed(self.model_config._MODEL_INIT_SEED):
                    head = self._build_head_module(head_param)
            self.heads.append(head)
            self.head_names.append(head_name)
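
As a concrete illustration of the docstring above, here is a hypothetical `MODEL.HEAD.PARAMS` value (module names and dims are taken from the cases listed in the docstring, not from any real config) that `_get_heads` would turn into two heads: a single-module head named "mlp" and a chained head named "mlp->siamese_concat_view->mlp":

    # Hypothetical HEAD.PARAMS with two heads, for illustration only.
    HEAD_PARAMS = [
        # head 0: head_param[0] is a string, so it is built as a single module
        ["mlp", {"dims": [2048, 128]}],
        # head 1: head_param[0] is a list, so the modules are chained into an
        # nn.Sequential and the head name becomes "mlp->siamese_concat_view->mlp"
        [
            ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
            ["siamese_concat_view", {"num_towers": 9}],
            ["mlp", {"dims": [9000, 128]}],
        ],
    ]
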
Example #9
    def init_distributed_data_parallel_model(self):
        """
        Initialize FSDP if needed.

        This method overloads the ClassificationTask class's method from ClassyVision.
        """
        if not is_distributed_training_run():
            return

        # Make sure default cuda device is set. TODO (Min): we should ensure FSDP can
        # be enabled for 1-GPU as well, but the use case there is likely different.
        # I.e. perhaps we use it for cpu_offloading.
        assert get_cuda_device_index() > -1, \
            "Distributed training not set up correctly"

        # The model might be already wrapped by FSDP internally. Check regnet_fsdp.py.
        # Here, we wrap it at the outer most level.
        fsdp_config = self.config["MODEL"]["FSDP_CONFIG"]
        if is_primary():
            logging.info(f"Using FSDP, config: {fsdp_config}")

        # First, wrap the head's prototype_i layers if it is SWAV.
        # TODO (Min): make this more general for different models, which may have multiple
        #             heads.
        if len(self.base_model.heads) != 1:
            raise ValueError(
                f"FSDP only support 1 head, not {len(self.base_model.heads)} heads"
            )
        head0 = self.base_model.heads[0]
        if isinstance(head0, SwAVPrototypesHead):
            # This is important for convergence!
            #
            # Since we "normalize" this layer in the update hook, we need to keep its
            # weights in full precision. Its output goes into the loss and is used
            # for clustering, so we need that in full precision as well.
            fp_fsdp_config = fsdp_config.copy()
            fp_fsdp_config["flatten_parameters"] = False
            fp_fsdp_config["mixed_precision"] = False
            fp_fsdp_config["fp32_reduce_scatter"] = False
            for j in range(head0.nmb_heads):
                module = getattr(head0, "prototypes" + str(j))
                module = FSDP(module=module, **fp_fsdp_config)
                setattr(head0, "prototypes" + str(j), module)
        head0 = FSDP(module=head0, **fsdp_config)
        self.base_model.heads[0] = head0

        # Init the head properly since the weights are potentially initialized on different
        # ranks with different seeds. We first summon the full params from all workers.
        # Then, within that context, we set a fixed random seed so that all workers init the
        # weights the same way. Finally, we reset the layer's weights using reset_parameters().
        #
        # TODO (Min): This will go away once we have a way to sync from rank 0.
        with head0.summon_full_params():
            with set_torch_seed(self.config["SEED_VALUE"]):
                for m in head0.modules():
                    if isinstance(m, Linear):
                        m.reset_parameters()
        head0._reset_lazy_init()
        head0.prototypes0._reset_lazy_init()

        # TODO (Min): We can load a checkpoint, but it ends up setting the trunk's
        # _is_root flag to true. We need to set it back to None here.
        # Also, right now, the head's weights are only partially loaded from the
        # checkpoint because we dump the checkpoint after the head is wrapped, but
        # load it before it is wrapped.
        # For very big models, we need to re-work the checkpoint logic because we
        # don't have enough memory to load the entire model on one node. We need to
        # use the local_state_dict() API to load checkpoint shards.
        for module in self.base_model.trunk.modules():
            if isinstance(module, FSDP):
                module._is_root = None

        # Then, wrap the whole model. We replace the base_model since it is used
        # when a checkpoint is taken.
        self.base_model = FSDP(module=self.base_model, **fsdp_config)
        self.distributed_model = self.base_model
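
The `summon_full_params()` block above is what makes the head's weights identical across ranks: every worker re-runs `reset_parameters()` under the same fixed seed, overwriting whatever rank-dependent initialization happened earlier. Stripped of the FSDP-specific context, the core idea is just a seeded re-initialization pass; a rough standalone sketch (the helper name is made up for illustration):

    import torch
    from torch.nn import Linear

    def reinit_linear_layers_in_sync(module: torch.nn.Module, seed: int) -> None:
        # Hypothetical helper: every worker calls this with the same seed, so
        # every worker ends up with identical Linear weights even if their
        # original initialization used different, rank-dependent seeds.
        torch.manual_seed(seed)
        for m in module.modules():
            if isinstance(m, Linear):
                m.reset_parameters()
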
Example #10
    def __init__(self, model_config: AttrDict, model_name: str):
        super().__init__()
        self.model_config = model_config

        assert model_config.INPUT_TYPE in ["rgb", "bgr"], "Input type not supported"
        trunk_config = model_config.TRUNK.TRUNK_PARAMS.REGNET

        assert "name" not in trunk_config, "Please specify the RegNet Params dictionary"

        ################################################################################

        params = RegNetParams(
            depth=trunk_config["depth"],
            w_0=trunk_config["w_0"],
            w_a=trunk_config["w_a"],
            w_m=trunk_config["w_m"],
            group_width=trunk_config["group_width"],
            stem_type=trunk_config.get("stem_type", "simple_stem_in").upper(),
            stem_width=trunk_config.get("stem_width", 32),
            block_type=trunk_config.get("block_type", "res_bottleneck_block").upper(),
            activation=trunk_config.get("activation_type", "relu").upper(),
            use_se=trunk_config.get("use_se", True),
            se_ratio=trunk_config.get("se_ratio", 0.25),
            bn_epsilon=trunk_config.get("bn_epsilon", 1e-05),
            bn_momentum=trunk_config.get("bn_momentum", 0.1),
        )

        # We need all workers (on all nodes) to have the same weights.
        # Unlike DDP, FSDP does not sync weights using rank 0 on start.
        # Therefore, we init stem and trunk_output below within the seed context.
        #
        # TODO (Min): we can make this seed come from the config or env.
        stem = None
        trunk_output = None
        seed = model_config._MODEL_INIT_SEED
        logging.info(f"_MODEL_INIT_SEED: {seed}")
        with set_torch_seed(seed):
            # Ad hoc stem
            #
            # Important: do NOT retain modules in self.stem or self.trunk_output. It may
            # seem harmless, but it appears that autograd will compute grads in a
            # different order. Different ordering can cause a deterministic OOM, even
            # when the peak memory otherwise is only 24GB out of 32GB.
            #
            # When debugging this, it is not enough to just dump the total module
            # params. You need to diff the module string representations.
            activation = {"RELU": nn.ReLU(True)}[params.activation]

            stem = {
                "RES_STEM_CIFAR": ResStemCifar,
                "RES_STEM_IN": ResStemIN,
                "SIMPLE_STEM_IN": SimpleStemIN,
            }[params.stem_type](
                3, params.stem_width, params.bn_epsilon, params.bn_momentum, activation
            )
            init_weights(stem)
            stem = auto_wrap_bn(stem, single_rank_pg=False)

            # Instantiate all the AnyNet blocks in the trunk
            block_fun = {
                "VANILLA_BLOCK": VanillaBlock,
                "RES_BASIC_BLOCK": ResBasicBlock,
                "RES_BOTTLENECK_BLOCK": ResBottleneckBlock,
            }[params.block_type]

            current_width = params.stem_width

            self.trunk_depth = 0

            blocks = []

            for i, (width_out, stride, depth, bot_mul, group_width) in enumerate(
                params.get_expanded_params()
            ):
                blocks.append(
                    (
                        f"block{i+1}",
                        AnyStage(
                            model_config,
                            current_width,
                            width_out,
                            stride,
                            depth,
                            block_fun,
                            activation,
                            bot_mul,
                            group_width,
                            params,
                            stage_index=i + 1,
                        ),
                    )
                )

                self.trunk_depth += blocks[-1][1].stage_depth

                current_width = width_out

            trunk_output = nn.Sequential(OrderedDict(blocks))

        ################################################################################

        # Now map the models to the structure we want to expose for SSL tasks
        # The upstream RegNet model is made of:
        # - `stem`
        # - n x blocks in trunk_output, named `block1, block2, ..`

        # We're only interested in the stem and successive blocks;
        # everything else is not picked up on purpose.
        feature_blocks: List[Tuple[str, nn.Module]] = []

        # - get the stem
        feature_blocks.append(("conv1", stem))

        # - get all the feature blocks
        for k, v in trunk_output.named_children():
            assert k.startswith("block"), f"Unexpected layer name {k}"
            block_index = len(feature_blocks) + 1
            feature_blocks.append((f"res{block_index}", v))

        # - finally, add avgpool and flatten.
        feature_blocks.append(("avgpool", nn.AdaptiveAvgPool2d((1, 1))))
        feature_blocks.append(("flatten", Flatten(1)))

        self._feature_blocks = nn.ModuleDict(feature_blocks)
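
The resulting `_feature_blocks` ModuleDict exposes the trunk as named stages ("conv1", "res2", ..., "avgpool", "flatten"). As a rough sketch of how such a structure can be consumed (this is not the trunk's actual forward implementation), one can run the blocks in insertion order and collect whichever named outputs a task requests:

    from typing import Dict, List, Optional

    import torch
    import torch.nn as nn

    def run_feature_blocks(
        feature_blocks: nn.ModuleDict,
        x: torch.Tensor,
        out_feat_keys: Optional[List[str]] = None,
    ) -> Dict[str, torch.Tensor]:
        # nn.ModuleDict preserves insertion order, so iterating over it runs
        # conv1 -> res2 -> ... -> avgpool -> flatten.
        out_feat_keys = out_feat_keys or []
        feats: Dict[str, torch.Tensor] = {}
        for name, block in feature_blocks.items():
            x = block(x)
            if name in out_feat_keys:
                feats[name] = x
        feats["last"] = x
        return feats
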