def begin_fit(self):
    """Switch the model to FP16 and hand the optimizer FP32 master params.

    The network is cast to half precision (`convert_network` keeps
    batchnorm layers in FP32), then FP32 master copies of the parameters
    are created so the optimization step is done in FP32.
    """
    self.run.model = fp16.convert_network(self.model, dtype=torch.float16)
    # FP16 model param groups and their FP32 master copies, group by group.
    self.model_pgs, self.master_pgs = get_master(self.opt, self.flat_master)
    # Fix: keep the optimizer's existing param-group dicts so their
    # hyperparameters (lr, wd, ...) survive, and swap in only the FP32
    # master params. Assigning `self.master_pgs` directly would replace
    # the dicts the optimizer expects with bare parameter lists (see the
    # zip/'params' pattern used by the corrected variant in this file).
    param_groups = self.opt.param_groups
    for pg, master in zip(param_groups, self.master_pgs):
        pg['params'] = master
    self.run.opt.param_groups = param_groups  # Put those param groups inside our runner.
    if self.dynamic:
        # Iterations since the last gradient overflow (dynamic loss scaling).
        self.count = 0
def begin_fit(self):
    """Cast the network to FP16 and point the optimizer at FP32 masters."""
    # Half-precision copy of the network (batchnorm layers stay in FP32).
    self.run.model = fp16.convert_network(self.model, dtype=torch.float16)
    self.model_pgs, self.master_pgs = get_master(self.opt, self.flat_master)
    # Reuse the optimizer's existing group dicts so the hyperparameter
    # values they carry are kept; only the 'params' entry is replaced by
    # the FP32 master copies.
    groups = self.opt.param_groups
    for group, masters in zip(groups, self.master_pgs):
        group['params'] = masters
    # Hand the updated param groups to the runner's optimizer.
    self.run.opt.param_groups = groups
    if self.dynamic:
        # Overflow-free iteration counter for dynamic loss scaling.
        self.count = 0
def on_train_begin(self):
    """Convert the network to float16 and set up FP32 master parameters."""
    learner = self.learner
    learner._model = convert_network(learner._model, float16)
    # Split the optimizer's parameters into FP16 model groups and their
    # FP32 master copies.
    groups = get_param_groups(learner._optimizer)
    self.model_param_groups, self.master_param_groups = groups
    # Point the optimizer at the FP32 master copies for the update step.
    copy_param_to_optimizer(learner._optimizer, self.master_param_groups)
    if self.dynamic:
        # Overflow-free iteration counter for dynamic loss scaling.
        self.count = 0
def begin_fit(self):
    """Prepare mixed-precision training before fitting.

    Casts the model to FP16 (`convert_network` keeps batchnorm layers in
    FP32), builds FP32 master copies of the parameters, and hands those
    masters to the optimizer so the update step runs in full precision.
    """
    # Helper 1: Convert model (except for any batchnorm layers) to FP16.
    self.run.model = fp16.convert_network(self.model, dtype=torch.float16)
    # Helper 2: Create an FP32 master copy of the parameter weights.
    self.model_param_groups, self.master_param_groups = get_master(
        self.opt, self.flat_master)
    # Fix: keep the optimizer's existing param-group dicts so their
    # hyperparameters (lr, momentum, ...) are preserved, and replace only
    # the 'params' entry with the FP32 masters. Assigning the master list
    # directly would drop those dicts (the pattern below matches the
    # corrected begin_fit variant in this file).
    param_groups = self.opt.param_groups
    for pg, master in zip(param_groups, self.master_param_groups):
        pg['params'] = master
    self.run.opt.param_groups = param_groups
    # Count iterations without gradient overflow (dynamic loss scaling).
    if self.dynamic:
        self.count = 0
def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
    """Apply amp `properties` to `models` and `optimizers` in place.

    Casts the models per `properties.cast_model_type`, patches their
    forward methods to cast inputs/outputs, wraps the optimizers for
    mixed-precision stepping, and creates one loss scaler per loss.
    Returns models/optimizers unwrapped to mirror the (single vs. list)
    shapes the caller passed in.
    """
    from apex.parallel import DistributedDataParallel as apex_DDP
    from .amp import init as amp_init

    # Normalize `optimizers` to a list; remember the caller's shape for the return.
    optimizers_was_list = False
    if isinstance(optimizers, torch.optim.Optimizer):
        optimizers = [optimizers]
    elif optimizers is None:
        optimizers = []
    elif isinstance(optimizers, list):
        optimizers_was_list = True
        check_optimizers(optimizers)
    else:
        # check_optimizers may raise a more targeted error first — confirm
        # against its definition; the TypeError below is the fallback.
        check_optimizers([optimizers])
        raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")

    # Normalize `models` to a list the same way.
    if isinstance(models, torch.nn.Module):
        models_was_list = False
        models = [models]
    elif isinstance(models, list):
        models_was_list = True
    else:
        raise TypeError("models must be either a single model or a list of models.")

    check_models(models)
    if not _amp_state.allow_incoming_model_not_fp32:
        check_params_fp32(models)

    # In the future, when FP16_Optimizer can be deprecated and master weights can
    # become an attribute, remember to stash master weights before casting the model.
    if properties.cast_model_type:
        if properties.keep_batchnorm_fp32:
            for model in models:
                # convert_network keeps batchnorm layers in FP32.
                convert_network(model, properties.cast_model_type)
        else:
            for model in models:
                model.to(properties.cast_model_type)

        # Inputs are cast to the model's compute dtype; outputs to
        # `cast_model_outputs` if given, else back to FP32.
        input_caster = functools.partial(to_type, properties.cast_model_type)
        if cast_model_outputs is not None:
            output_caster = functools.partial(to_type, cast_model_outputs)
        else:
            output_caster = functools.partial(to_type, torch.float32)

        for model in models:
            # Patch the forward method to cast incoming data to the correct type, and
            # outgoing data to float32, so "the user never needs to call .half()."
            # I like writing things explicitly more than decorators.
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*applier(args, input_caster),
                                     **applier(kwargs, input_caster))
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

        # State dict trick to recast any preexisting per-param state tensors
        for optimizer in optimizers:
            optimizer.load_state_dict(optimizer.state_dict())
    elif cast_model_outputs is not None:
        # Model dtype is untouched; only the forward outputs are cast.
        output_caster = functools.partial(to_type, cast_model_outputs)

        for model in models:
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*args, **kwargs)
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

    for i, optimizer in enumerate(optimizers):
        # Still need to special case this for the first pass
        if isinstance(optimizer, FusedAdam):
            optimizers[i] = wrap_fused_adam(optimizer, properties)
        else:
            optimizers[i] = _process_optimizer(optimizer, properties)

    # One loss scaler per loss so each can adapt independently.
    _amp_state.loss_scalers = []
    for _ in range(num_losses):
        _amp_state.loss_scalers.append(LossScaler(properties.loss_scale))

    if properties.patch_torch_functions:
        # handle is unused here. It's accessible later through a global value anyway.
        handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
        for optimizer in optimizers:
            # Disable Amp casting for the optimizer step, because it should only be
            # applied to FP32 master params anyway.
            def patch_step(old_step):
                def new_step(*args, **kwargs):
                    with disable_casts():
                        output = old_step(*args, **kwargs)
                    return output
                return new_step

            optimizer.step = patch_step(optimizer.step)

    # Unwrap the lists to mirror the shapes the caller passed in.
    if optimizers_was_list:
        if models_was_list:
            return models, optimizers
        else:
            return models[0], optimizers
    else:
        if models_was_list:
            if len(optimizers) == 0:
                return models
            else:
                return models, optimizers[0]
        else:
            if len(optimizers) == 0:
                return models[0]
            else:
                return models[0], optimizers[0]
def _initialize(models, optimizers, properties):
    """Apply amp `properties` to `models` and `optimizers` in place.

    Older variant: casts models per `properties.cast_model_type`, patches
    forwards to cast inputs, and wraps optimizers in FP16_Optimizer or
    attaches a LossScaler depending on `properties.master_weights`.
    Returns models/optimizers unwrapped to mirror the caller's shapes.
    """
    from apex.parallel import DistributedDataParallel as apex_DDP
    from .amp import init as amp_init

    # Normalize both arguments to lists; remember the original shapes.
    if isinstance(optimizers, torch.optim.Optimizer):
        optimizers_was_list = False
        optimizers = [optimizers]
    elif isinstance(optimizers, list):
        optimizers_was_list = True
    else:
        raise TypeError(
            "optimizers must be either a single optimizer or a list of optimizers."
        )
    if isinstance(models, torch.nn.Module):
        models_was_list = False
        models = [models]
    elif isinstance(models, list):
        models_was_list = True
    else:
        raise TypeError(
            "models must be either a single model or a list of models.")

    check_models(models)
    check_params_fp32(models)
    check_optimizers(optimizers)

    # In the future, when FP16_Optimizer can be deprecated and master weights can
    # become an attribute, remember to stash master weights before casting the model.
    if properties.cast_model_type:
        if properties.keep_batchnorm_fp32:
            for model in models:
                # convert_network keeps batchnorm layers in FP32.
                convert_network(model, properties.cast_model_type)
        else:
            for model in models:
                model.to(properties.cast_model_type)

        caster = functools.partial(to_type, properties.cast_model_type)

        # Patch the forward method to cast incoming data to the correct type.
        # I like writing things explicitly more than decorators.
        # NOTE(review): only the `model` name left over from the loop above
        # (i.e. the last model) has its forward patched here — looks like a
        # missing `for model in models:` loop in this copy; confirm against
        # upstream apex before relying on multi-model behavior.
        def patch_forward(old_fwd):
            def new_fwd(*args, **kwargs):
                return old_fwd(*applier(args, caster),
                               **applier(kwargs, caster))
            return new_fwd

        model.forward = patch_forward(model.forward)

        # State dict trick to recast any preexisting per-param state tensors
        for optimizer in optimizers:
            optimizer.load_state_dict(optimizer.state_dict())

    if properties.master_weights:
        for i, optimizer in enumerate(optimizers):
            if isinstance(optimizer, FusedAdam):
                optimizers[i] = wrap_fused_adam(optimizer, properties)
            # NOTE(review): a FusedAdam wrapped above is immediately
            # overwritten by the branch below — possibly a dropped
            # `else`/`elif` in this copy; confirm against upstream apex.
            if properties.loss_scale == "dynamic":
                optimizers[i] = FP16_Optimizer_general(optimizer,
                                                       dynamic_loss_scale=True,
                                                       verbose=False)
            else:
                optimizers[i] = FP16_Optimizer_general(
                    optimizer,
                    static_loss_scale=properties.loss_scale,
                    verbose=False)
    else:
        # No master weights: just attach a loss scaler to each optimizer.
        for optimizer in optimizers:
            optimizer.loss_scaler = LossScaler(properties.loss_scale)

    if properties.patch_torch_functions:
        # handle is unused here. It's accessible later through a global value anyway.
        handle = amp_init(loss_scale=properties.loss_scale)
        for optimizer in optimizers:
            # Disable Amp casting for the optimizer step, because it should only be
            # applied to FP32 master params anyway.
            def patch_step(old_step):
                def new_step(*args, **kwargs):
                    with disable_casts():
                        output = old_step(*args, **kwargs)
                    return output
                return new_step

            optimizer.step = patch_step(optimizer.step)

    # Unwrap the lists to mirror the shapes the caller passed in.
    if optimizers_was_list:
        if models_was_list:
            return models, optimizers
        else:
            return models[0], optimizers
    else:
        if models_was_list:
            return models, optimizers[0]
        else:
            return models[0], optimizers[0]
# Exercise `model_to_half`: after conversion the Linear layers should be
# FP16 while the BatchNorm in between stays FP32.
model = model_to_half(model)

def check_weights(model):
    """Assert the expected per-layer dtypes: FP16, FP32 (batchnorm), FP16."""
    for i, t in enumerate([torch.float16, torch.float32, torch.float16]):
        assert model[i].weight.dtype == t
        assert model[i].bias.dtype == t

check_weights(model)

# In Apex, the function that does this for us is `convert_network`. We can use
# it to put the model in FP16 or back to FP32.
model = nn.Sequential(nn.Linear(10, 30), nn.BatchNorm1d(30), nn.Linear(30, 2)).cuda()
model = fp16.convert_network(model, torch.float16)
check_weights(model)

# ### Creating the master copy of the parameters
# From our model parameters (mostly in FP16), we'll want to create a copy in
# FP32 (master parameters) that we will use for the step in the optimizer.
# Optionally, we concatenate all the parameters to do one flat big tensor,
# which can make that step a little bit faster.
from torch.nn.utils import parameters_to_vector

def get_master(model, flat_master=False):
    # Only trainable parameters need FP32 master copies.
    model_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    if flat_master:
        # (definition continues beyond this chunk)
        master_param = parameters_to_vector(