def create_block(
    self,
    width_in: int,
    width_out: int,
    stride: int,
    params: RegNetParams,
    bot_mul: float,
    group_width: int = 1,
):
    block_constructor = self.BLOCK_TYPES[params.block_type.upper()]
    activation = self.ACTIVATION_TYPES[params.activation]()
    block = block_constructor(
        width_in,
        width_out,
        stride,
        params.bn_epsilon,
        params.bn_momentum,
        activation,
        bot_mul,
        group_width,
        params.se_ratio,
    ).cuda()
    with set_torch_seed(self.seed):
        init_weights(block)
        self.seed += 1
    return block
def __init__(self, model_config: AttrDict, model_name: str):
    super().__init__()
    self.model_config = model_config
    self.seed = self.model_config._MODEL_INIT_SEED
    with set_torch_seed(self.seed):
        self._feature_blocks, self.trunk_depth = create_regnet_feature_blocks(
            factory=RegnetFSDPBlocksFactory(self.model_config.FSDP_CONFIG),
            model_config=model_config,
        )
def create_stem(self, params: RegNetParams):
    activation = self.ACTIVATION_TYPES[params.activation]()
    stem = self.STEM_TYPES[params.stem_type](
        3,
        params.stem_width,
        params.bn_epsilon,
        params.bn_momentum,
        activation,
    )
    with set_torch_seed(self.seed):
        init_weights(stem)
        self.seed += 1
    return stem
def __init__(self, model_config: AttrDict, model_name: str):
    super().__init__()
    self.model_config = model_config
    self.seed = self.model_config._MODEL_INIT_SEED
    self.use_activation_checkpointing = (
        model_config.ACTIVATION_CHECKPOINTING.USE_ACTIVATION_CHECKPOINTING
    )
    self.activation_checkpointing_splits = (
        model_config.ACTIVATION_CHECKPOINTING.NUM_ACTIVATION_CHECKPOINTING_SPLITS
    )
    with set_torch_seed(self.seed):
        self._feature_blocks, self.trunk_depth = create_regnet_feature_blocks(
            factory=RegnetBlocksFactory(), model_config=model_config
        )
def __init__(self, model_config: AttrDict, model_name: str):
    super().__init__()
    self.model_config = model_config
    self.seed = self.model_config._MODEL_INIT_SEED
    self.use_activation_checkpointing = (
        model_config.ACTIVATION_CHECKPOINTING.USE_ACTIVATION_CHECKPOINTING
    )
    with set_torch_seed(self.seed):
        self._feature_blocks, self.trunk_depth = create_regnet_feature_blocks(
            factory=RegnetFSDPBlocksFactory(
                fsdp_config=self.model_config.FSDP_CONFIG,
                use_activation_checkpointing=self.use_activation_checkpointing,
            ),
            model_config=self.model_config,
        )
def create_stem(self, params: Union[RegNetParams, AnyNetParams]):
    # get the activation module
    silu = None if get_torch_version() < [1, 7] else nn.SiLU()
    activation = {
        ActivationType.RELU: nn.ReLU(params.relu_in_place),
        ActivationType.SILU: silu,
    }[params.activation]

    # create the stem
    stem = {
        StemType.RES_STEM_CIFAR: ResStemCifar,
        StemType.RES_STEM_IN: ResStemIN,
        StemType.SIMPLE_STEM_IN: SimpleStemIN,
    }[params.stem_type](
        3,
        params.stem_width,
        params.bn_epsilon,
        params.bn_momentum,
        activation,
    )

    # initialize the stem weights under the factory seed
    with set_torch_seed(self.seed):
        init_weights(stem)
        self.seed += 1
    return stem
def create_block(
    self,
    width_in: int,
    width_out: int,
    stride: int,
    params: Union[RegNetParams, AnyNetParams],
    bottleneck_multiplier: float,
    group_width: int = 1,
):
    # get the block constructor function to use
    block_constructor = {
        BlockType.VANILLA_BLOCK: VanillaBlock,
        BlockType.RES_BASIC_BLOCK: ResBasicBlock,
        BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock,
        BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock,
    }[params.block_type]

    # get the activation module
    silu = None if get_torch_version() < [1, 7] else nn.SiLU()
    activation = {
        ActivationType.RELU: nn.ReLU(params.relu_in_place),
        ActivationType.SILU: silu,
    }[params.activation]

    block = block_constructor(
        width_in,
        width_out,
        stride,
        params.bn_epsilon,
        params.bn_momentum,
        activation,
        group_width,
        bottleneck_multiplier,
        params.se_ratio,
    ).cuda()

    with set_torch_seed(self.seed):
        init_weights(block)
        self.seed += 1
    return block
def _get_heads(self):
    """
    This function creates the heads needed by the module.
    HEAD.PARAMS is a list containing parameters for (multiple) heads.
    Each head consists of head_modules that can be composed in different ways.

    * Head Module
        A head_module is specified as a list ["name", kwargs], for example,
        ["mlp", {"dims": [2048, 128]}]

    * Heads can be applied to different types of inputs.
      See `_setup_multi_input_head_mapping`.

    Examples of heads one can specify:
    * Case 1: Simple head containing a single module - single input, single output
        ["mlp", {"dims": [2048, 128]}]

    * Case 2: Complex head containing a chain of head modules - single input, single output
        [
            ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
            ["siamese_concat_view", {"num_towers": 9}],
            ["mlp", {"dims": [9000, 128]}],
        ]

    * Case 3: Multiple heads (example: 2 heads) - single input, multiple outputs.
      Can be used for multi-task learning.
        # head 0
        [
            ["mlp", {"dims": [2048, 128]}]
        ],
        # head 1
        [
            ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
            ["siamese_concat_view", {"num_towers": 9}],
            ["mlp", {"dims": [9000, 128]}],
        ]

    * Case 4: Multiple heads (example: 5 simple heads) - single input, multiple outputs.
      For example, used in linear evaluation of models.
        ["eval_mlp", {"in_channels": 64, "dims": [9216, 1000]}],
        ["eval_mlp", {"in_channels": 256, "dims": [9216, 1000]}],
        ["eval_mlp", {"in_channels": 512, "dims": [8192, 1000]}],
        ["eval_mlp", {"in_channels": 1024, "dims": [9216, 1000]}],
        ["eval_mlp", {"in_channels": 2048, "dims": [8192, 1000]}],
    """
    for head_param in self.model_config.HEAD.PARAMS:
        if isinstance(head_param[0], list):
            # head is composed of several modules
            head_type, head_modules = [], []
            for idx in range(len(head_param)):
                with set_torch_seed(self.model_config._MODEL_INIT_SEED + idx):
                    head_modules.append(self._build_head_module(head_param[idx]))
                head_type.append(head_param[idx][0])
            head_name = "->".join(head_type)
            head = nn.Sequential(*head_modules)
        else:
            # head is a single module
            head_name = head_param[0]
            with set_torch_seed(self.model_config._MODEL_INIT_SEED):
                head = self._build_head_module(head_param)
        self.heads.append(head)
        self.head_names.append(head_name)
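# A minimal, self-contained sketch (illustrative values taken from Case 2 of the
# docstring above) of how a chained head entry is reduced to its registered name.
# `_build_head_module` is intentionally not used here; this only mirrors the
# name-building logic of _get_heads() for a composed head.
chained_head_param = [
    ["mlp", {"dims": [2048, 1000], "use_bn": False, "use_relu": False}],
    ["siamese_concat_view", {"num_towers": 9}],
    ["mlp", {"dims": [9000, 128]}],
]
head_name_example = "->".join(module_spec[0] for module_spec in chained_head_param)
# head_name_example == "mlp->siamese_concat_view->mlp"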
def init_distributed_data_parallel_model(self):
    """
    Initialize FSDP if needed.

    This method overloads the ClassificationTask class's method from ClassyVision.
    """
    if not is_distributed_training_run():
        return

    # Make sure the default cuda device is set. TODO (Min): we should ensure FSDP can
    # be enabled for 1-GPU as well, but the use case there is likely different.
    # I.e. perhaps we use it for cpu_offloading.
    assert get_cuda_device_index() > -1, "Distributed training not setup correctly"

    # The model might already be wrapped by FSDP internally. Check regnet_fsdp.py.
    # Here, we wrap it at the outermost level.
    fsdp_config = self.config["MODEL"]["FSDP_CONFIG"]
    if is_primary():
        logging.info(f"Using FSDP, config: {fsdp_config}")

    # First, wrap the head's prototypes{j} layers if it is SwAV.
    # TODO (Min): make this more general for different models, which may have
    # multiple heads.
    if len(self.base_model.heads) != 1:
        raise ValueError(
            f"FSDP only supports 1 head, not {len(self.base_model.heads)} heads"
        )
    head0 = self.base_model.heads[0]
    if isinstance(head0, SwAVPrototypesHead):
        # This is important for convergence!
        #
        # Since we "normalize" this layer in the update hook, we need to keep its
        # weights in full precision. Its output goes into the loss and is used
        # for clustering, so we need to have that in full precision as well.
        fp_fsdp_config = fsdp_config.copy()
        fp_fsdp_config["flatten_parameters"] = False
        fp_fsdp_config["mixed_precision"] = False
        fp_fsdp_config["fp32_reduce_scatter"] = False
        for j in range(head0.nmb_heads):
            module = getattr(head0, "prototypes" + str(j))
            module = FSDP(module=module, **fp_fsdp_config)
            setattr(head0, "prototypes" + str(j), module)
    head0 = FSDP(module=head0, **fsdp_config)
    self.base_model.heads[0] = head0

    # Init the head properly since the weights are potentially initialized on
    # different ranks with different seeds. We first summon the full params from
    # all workers. Then, within that context, we set a fixed random seed so that
    # all workers init the weights the same way. Finally, we reset the layers'
    # weights using reset_parameters().
    #
    # TODO (Min): This will go away once we have a way to sync from rank 0.
    with head0.summon_full_params():
        with set_torch_seed(self.config["SEED_VALUE"]):
            for m in head0.modules():
                if isinstance(m, Linear):
                    m.reset_parameters()
    head0._reset_lazy_init()
    head0.prototypes0._reset_lazy_init()

    # TODO (Min): We can load a checkpoint, but it ends up setting the trunk's
    # _is_root flag to True. We need to set it back to None here.
    # Also, right now, the head's weights are only partially loaded from the
    # checkpoint because we dump the checkpoint after the head is wrapped, but
    # load it before it is wrapped.
    # For very big models, we need to rework the checkpoint logic because we
    # don't have enough memory to load the entire model on one node. We need to
    # use the local_state_dict() API to load checkpoint shards.
    for module in self.base_model.trunk.modules():
        if isinstance(module, FSDP):
            module._is_root = None

    # Then, wrap the whole model. We replace the base_model since it is used
    # when the checkpoint is taken.
    self.base_model = FSDP(module=self.base_model, **fsdp_config)
    self.distributed_model = self.base_model
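# The weight-init trick above relies on `set_torch_seed` seeding the RNG only
# inside the `with` block. Below is a minimal sketch of such a context manager,
# given only for illustration: it is an assumption about the helper's behavior,
# not its actual implementation, and the real helper may differ (e.g. in how it
# handles CUDA RNG state).
import contextlib

import torch


@contextlib.contextmanager
def set_torch_seed_sketch(seed: int):
    # Save the current RNG state, seed deterministically, and restore the state
    # on exit so that later random ops are not affected by the temporary seed.
    cpu_rng_state = torch.get_rng_state()
    cuda_rng_state = (
        torch.cuda.get_rng_state_all() if torch.cuda.is_available() else None
    )
    torch.manual_seed(seed)
    try:
        yield
    finally:
        torch.set_rng_state(cpu_rng_state)
        if cuda_rng_state is not None:
            torch.cuda.set_rng_state_all(cuda_rng_state)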
def __init__(self, model_config: AttrDict, model_name: str):
    super().__init__()
    self.model_config = model_config
    assert model_config.INPUT_TYPE in ["rgb", "bgr"], "Input type not supported"
    trunk_config = model_config.TRUNK.TRUNK_PARAMS.REGNET
    assert "name" not in trunk_config, "Please specify the RegNet Params dictionary"

    ################################################################################

    params = RegNetParams(
        depth=trunk_config["depth"],
        w_0=trunk_config["w_0"],
        w_a=trunk_config["w_a"],
        w_m=trunk_config["w_m"],
        group_width=trunk_config["group_width"],
        stem_type=trunk_config.get("stem_type", "simple_stem_in").upper(),
        stem_width=trunk_config.get("stem_width", 32),
        block_type=trunk_config.get("block_type", "res_bottleneck_block").upper(),
        activation=trunk_config.get("activation_type", "relu").upper(),
        use_se=trunk_config.get("use_se", True),
        se_ratio=trunk_config.get("se_ratio", 0.25),
        bn_epsilon=trunk_config.get("bn_epsilon", 1e-05),
        bn_momentum=trunk_config.get("bn_momentum", 0.1),
    )

    # We need all workers (on all nodes) to have the same weights.
    # Unlike DDP, FSDP does not sync weights from rank 0 on start.
    # Therefore, we init the stem and trunk_output below within the seed context.
    #
    # TODO (Min): we can make this seed come from the config or env.
    stem = None
    trunk_output = None
    seed = model_config._MODEL_INIT_SEED
    logging.info(f"_MODEL_INIT_SEED: {seed}")
    with set_torch_seed(seed):
        # Ad hoc stem
        #
        # Important: do NOT retain modules in self.stem or self.trunk_output. It may
        # seem to be harmless, but it appears that autograd will then compute grads
        # in a different order. Different ordering can cause deterministic OOM, even
        # when the peak memory otherwise is only 24GB out of 32GB.
        #
        # When debugging this, it is not enough to just dump the total module
        # params. You need to diff the module string representations.
        activation = {"RELU": nn.ReLU(True)}[params.activation]
        stem = {
            "RES_STEM_CIFAR": ResStemCifar,
            "RES_STEM_IN": ResStemIN,
            "SIMPLE_STEM_IN": SimpleStemIN,
        }[params.stem_type](
            3,
            params.stem_width,
            params.bn_epsilon,
            params.bn_momentum,
            activation,
        )
        init_weights(stem)
        stem = auto_wrap_bn(stem, single_rank_pg=False)

        # Instantiate all the AnyNet blocks in the trunk
        block_fun = {
            "VANILLA_BLOCK": VanillaBlock,
            "RES_BASIC_BLOCK": ResBasicBlock,
            "RES_BOTTLENECK_BLOCK": ResBottleneckBlock,
        }[params.block_type]

        current_width = params.stem_width
        self.trunk_depth = 0
        blocks = []
        for i, (width_out, stride, depth, bot_mul, group_width) in enumerate(
            params.get_expanded_params()
        ):
            blocks.append(
                (
                    f"block{i + 1}",
                    AnyStage(
                        model_config,
                        current_width,
                        width_out,
                        stride,
                        depth,
                        block_fun,
                        activation,
                        bot_mul,
                        group_width,
                        params,
                        stage_index=i + 1,
                    ),
                )
            )
            self.trunk_depth += blocks[-1][1].stage_depth
            current_width = width_out

        trunk_output = nn.Sequential(OrderedDict(blocks))

    ################################################################################

    # Now map the model to the structure we want to expose for SSL tasks.
    # The upstream RegNet model is made of:
    # - `stem`
    # - n x blocks in trunk_output, named `block1, block2, ...`
    # We're only interested in the stem and the successive blocks;
    # everything else is not picked up on purpose.
    feature_blocks: List[Tuple[str, nn.Module]] = []

    # - get the stem
    feature_blocks.append(("conv1", stem))

    # - get all the feature blocks
    for k, v in trunk_output.named_children():
        assert k.startswith("block"), f"Unexpected layer name {k}"
        block_index = len(feature_blocks) + 1
        feature_blocks.append((f"res{block_index}", v))

    # - finally, add avgpool and flatten
    feature_blocks.append(("avgpool", nn.AdaptiveAvgPool2d((1, 1))))
    feature_blocks.append(("flatten", Flatten(1)))

    self._feature_blocks = nn.ModuleDict(feature_blocks)
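# For illustration only (assuming a standard 4-stage RegNet): the ModuleDict built
# above would expose, in forward order, the keys
#   "conv1", "res2", "res3", "res4", "res5", "avgpool", "flatten"
# which are the names SSL tasks use to select intermediate features.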