def _init_apex_grad_scaler(self):
    """
    Prepare the Apex flavour of automatic mixed precision.

    Reads ``config.MODEL.AMP_PARAMS.AMP_ARGS`` (these are the Apex Amp
    arguments) into ``self.amp_args`` and logs the selection.

    Raises:
        RuntimeError: if Apex is not available in the current environment.
    """
    if is_apex_available():
        # Despite the generic name, "amp_args" holds the Apex Amp arguments.
        self.amp_args = self.config.MODEL.AMP_PARAMS.AMP_ARGS
        logging.info(f"Setting AMP: using apex, args {self.amp_args}")
    else:
        raise RuntimeError("Apex is not available. Can't use mixed precision")
def _build_optimizer(self):
    """
    Create the optimizer described by the user's ``OPTIMIZER`` config.

    LARC is supported on top of SGD and requires Apex to be installed. The
    Apex check is skipped when the optimizer name is "sgd_fsdp".

    Returns:
        The object returned by ``build_optimizer`` for the ``OPTIMIZER`` config.

    Raises:
        AssertionError: if LARC is requested (for an optimizer other than
            "sgd_fsdp") while Apex is not available.
    """
    opt_cfg = self.config["OPTIMIZER"]

    # Only the LARC path (outside of "sgd_fsdp") depends on Apex.
    larc_needs_apex = opt_cfg.use_larc and opt_cfg.name != "sgd_fsdp"
    if larc_needs_apex:
        assert is_apex_available(), "Apex must be available to use LARC"

    return build_optimizer(opt_cfg)
def set_amp_args(self):
    """
    Configure automatic mixed precision (AMP) from ``config.MODEL.AMP_PARAMS``.

    Two AMP implementations are available, Apex's and PyTorch's:

    - Apex: ``self.amp_args`` is set to ``AMP_PARAMS.AMP_ARGS``, a dictionary
      of arguments to be passed to ``amp.initialize``
      (e.g. ``{"opt_level": "O1"}``).
      See https://nvidia.github.io/apex/amp.html for more info.
    - PyTorch: no arguments are needed; ``self.amp_grad_scaler`` is created.
      A shard-aware scaler is used for the FSDP task or the "zero" optimizer.

    When ``USE_AMP`` is off, ``self.amp_args`` and ``self.amp_type`` are both
    set to None.

    Raises:
        AssertionError: if AMP is enabled on a non-CUDA device, or if a
            shard-aware scaler is required but fairscale does not provide it.
        RuntimeError: if Apex AMP is requested but Apex is not available.
    """
    amp_cfg = self.config.MODEL.AMP_PARAMS

    # Guard clause: AMP is switched off in the config.
    if not amp_cfg.USE_AMP:
        self.amp_args, self.amp_type = None, None
        logging.info("Not using Automatic Mixed Precision")
        return

    assert (
        self.device.type == "cuda"
    ), "Mixed precision is only available on CUDA devices for now"

    # An unrecognized AMP_TYPE makes this lookup fail, which is intended.
    self.amp_type = AmpType[amp_cfg.AMP_TYPE.upper()]

    if self.amp_type == AmpType.APEX:
        if not is_apex_available():
            raise RuntimeError("Apex is not available. Can't use mixed precision")

        # Despite the generic name, "amp_args" holds the Apex Amp arguments.
        self.amp_args = amp_cfg.AMP_ARGS
        logging.info(f"Setting AMP: using apex, args {self.amp_args}")

    elif self.amp_type == AmpType.PYTORCH:
        # A sharded optimizer or FSDP data parallelism needs a GradScaler
        # that is aware of the sharding.
        is_fsdp_task = (
            self.config["TRAINER"]["TASK_NAME"] == "self_supervision_fsdp_task"
        )
        if is_fsdp_task or self.config["OPTIMIZER"]["name"] == "zero":
            assert is_fairscale_sharded_available(), (
                "To use ZeRO with PyTorch AMP, ShardedGradScaler() "
                "from fairscale is needed. Please upgrade fairscale"
            )
            from fairscale.optim.grad_scaler import ShardedGradScaler

            self.amp_grad_scaler = ShardedGradScaler()
            logging.info("Setting AMP: using sharded grad scaler")
        else:
            self.amp_grad_scaler = TorchGradScaler()
            logging.info("Setting AMP: using pytorch grad scaler")

    # Summary line for every enabled configuration. The PyTorch branch does
    # not assign amp_args, so this reads the value already on the instance.
    logging.info(f"Setting AMP: {self.amp_type} - args: {self.amp_args}")
def set_amp_args(self):
    """
    Configure automatic mixed precision (AMP) from ``config.MODEL.AMP_PARAMS``.

    Two AMP implementations are available, Apex's and PyTorch's:

    - Apex: ``self.amp_args`` is set to ``AMP_PARAMS.AMP_ARGS``, a dictionary
      of arguments to be passed to ``amp.initialize``
      (e.g. ``{"opt_level": "O1"}``).
      See https://nvidia.github.io/apex/amp.html for more info.
    - PyTorch: no arguments are needed; ``self.amp_grad_scaler`` is created.
      fairscale's ``ShardedGradScaler`` is used when the optimizer is sharded
      ("zero") or when the FSDP task is selected, otherwise the regular
      PyTorch ``GradScaler``.

    When ``USE_AMP`` is off, ``self.amp_args`` and ``self.amp_type`` are both
    set to None.

    Raises:
        AssertionError: if AMP is enabled on a non-CUDA device, or if a
            shard-aware scaler is required but fairscale does not provide it.
        RuntimeError: if Apex AMP is requested but Apex is not available.
    """
    amp_params = self.config.MODEL.AMP_PARAMS

    if amp_params.USE_AMP:
        assert (
            self.device.type == "cuda"
        ), "Mixed precision is only available on CUDA devices for now"

        # This will rightly fail if the setting is not correct
        self.amp_type = AmpType[amp_params.AMP_TYPE.upper()]

        if self.amp_type == AmpType.APEX:
            # Check Apex availability
            if not is_apex_available():
                raise RuntimeError(
                    "Apex is not available. Can't use mixed precision"
                )

            # "amp_args" are actually Apex Amp args
            self.amp_args = amp_params.AMP_ARGS

        elif self.amp_type == AmpType.PYTORCH:
            # If the optimizer is sharded or FSDP data parallel is used, the
            # GradScaler needs to be shard-aware.
            needs_sharded_scaler = (
                self.config["TRAINER"]["TASK_NAME"] == "self_supervision_fsdp_task"
                or self.config["OPTIMIZER"]["name"] == "zero"
            )
            if needs_sharded_scaler:
                assert is_fairscale_sharded_available(), (
                    "To use ZeRO with PyTorch AMP, ShardedGradScaler() "
                    "from fairscale is needed. Please upgrade fairscale"
                )
                # Imported here, after the availability check, so the name is
                # always defined on this path and the module still imports on
                # fairscale versions that lack the sharded scaler.
                from fairscale.optim.grad_scaler import ShardedGradScaler

                self.amp_grad_scaler = ShardedGradScaler()
                logging.info("Setting AMP: using sharded grad scaler")
            else:
                self.amp_grad_scaler = TorchGradScaler()
                logging.info("Setting AMP: using pytorch grad scaler")

        # The PyTorch branch does not assign amp_args, so this summary reads
        # the value already present on the instance.
        logging.info(f"Setting AMP: {self.amp_type} - args: {self.amp_args}")
    else:
        self.amp_args, self.amp_type = None, None
        logging.info("Not using Automatic Mixed Precision")
def _set_classy_state(self, state):
    """
    Restore this task from a checkpointed ``state`` dictionary.

    Usually called when resuming training from a previous model checkpoint.
    In order, this restores the train/eval flag, loads the model weights and
    moves the model to ``self.device``, calls ``_set_ema_model_state``,
    restores the meters, the optimizer and (when present) the AMP state, sets
    the phase / update counters and the losses, re-creates the data iterator
    and finally puts the model in train or eval mode for the current phase.

    Args:
        state: checkpoint dictionary. Keys read here: "train", "base_model",
            "meters", "optimizer", "phase_idx", "train_phase_idx",
            "num_updates", "losses" and, optionally, "amp".
    """
    logging.info("=======Updating classy state_dict from checkpoint=======")

    # Only the state-specific pieces are loaded here; the other extra
    # variables are initialized from the checkpoint in the trainer step.
    self.train = state["train"]
    self.base_model.set_classy_state(state["base_model"])

    # Unlike training from scratch, the model must be placed on the correct
    # device at this point. The optimizer inspects the model parameters
    # (momentum etc.) to get the device info. When training from scratch those
    # do not exist yet and the optimizer simply receives CUDA inputs from the
    # model, so it works. When loading from a checkpoint the parameters
    # already exist and are on CPU (the model isn't copied to gpu yet), and
    # copy_model_to_gpu() doesn't modify the optimizer params device: the
    # optimizer would be built with CPU inputs while the model sends CUDA.
    self.base_model.to(self.device)

    self._set_ema_model_state(state)

    for current_meter, saved_meter_state in zip(self.meters, state["meters"]):
        current_meter.set_classy_state(saved_meter_state)

    self.optimizer.set_classy_state(state["optimizer"])

    # Restore the AMP state. This is called after amp.initialize is done.
    if "amp" in state:
        if self.amp_type != AmpType.APEX:
            self.amp_grad_scaler.load_state_dict(state["amp"])
        elif is_apex_available():
            apex.amp.load_state_dict(state["amp"])
        else:
            logging.warning(
                "Loading a checkpoint which has amp state but apex isn't available now"
            )

    self.phase_idx = state["phase_idx"]
    self.train_phase_idx = state["train_phase_idx"]
    self.num_updates = state["num_updates"]
    self.losses = state["losses"]

    if self.train:
        phase_type = "train"
    else:
        phase_type = "test"
    current_phase = self.phases[self.phase_idx]

    # Re-create the data iterator. Restoring from a checkpoint means we need
    # to (1) set the right epoch and (2) set the right start_iter.
    # The epoch number is `phase_idx + 1` since the checkpoint's value is the
    # epoch finished. start_iter is computed in recreate_data_iterator based
    # on the iteration number from the checkpoint state.
    self.recreate_data_iterator(
        phase_type,
        epoch=self.phase_idx + 1,
        compute_start_iter=True,
        train_phase_idx=self.train_phase_idx + 1,
    )

    # Put the model in train or eval mode depending on the current phase.
    self.base_model.train(current_phase["train"])

    if self.train and self.train_phase_idx >= 0:
        self.optimizer.on_epoch(self.where)
from classy_vision.tasks import ClassificationTask, register_task from classy_vision.tasks.classification_task import AmpType, BroadcastBuffersMode from fairscale.nn import FullyShardedDataParallel from iopath.common.file_io import g_pathmgr from torch.cuda.amp import GradScaler as TorchGradScaler from vissl.config import AttrDict from vissl.data import build_dataloader, build_dataset, print_sampler_config from vissl.models import build_model, convert_sync_bn from vissl.optimizers import get_optimizer_param_groups from vissl.utils.activation_checkpointing import manual_gradient_reduction from vissl.utils.checkpoint import CheckpointLoader from vissl.utils.ema_model import ModelEmaV2 from vissl.utils.misc import is_apex_available, is_fairscale_sharded_available if is_apex_available(): import apex @register_task("self_supervision_task") class SelfSupervisionTask(ClassificationTask): """ A task prepares and holds all the components of a training like optimizer, datasets, dataloaders, losses, meters etc. Task also contains the variable like training iteration, epoch number etc. that are updated during the training. We prepare every single component according to the parameter settings user wants and specified in the yaml config file. Task also supports 2 additional things: 1) converts the model BatchNorm layers to the synchronized batchnorm