def __init__(self, optimizer, *args, **kwargs):
    # Note: is_available must be *called*; the bare attribute is always truthy.
    if not torch.cuda.is_available():
        raise SystemError('Cannot use fp16 without CUDA')

    self.optimizer = optimizer
    self.state = optimizer.state
    self.param_groups = optimizer.param_groups

    # Swap every FP16 param for an FP32 master copy inside the wrapped
    # optimizer's param groups, keeping both lists for later syncing.
    # HALF_TYPES is expected to be a module-level tuple of half tensor types,
    # e.g. (torch.HalfTensor, torch.cuda.HalfTensor).
    self.fp16_params = []
    self.fp32_params = []
    for group in self.param_groups:
        for i, param in enumerate(group['params']):
            self.fp16_params.append(param)
            fp32_param = param
            if isinstance(fp32_param.data, HALF_TYPES):
                fp32_param = param.clone().float().detach()
                fp32_param.requires_grad = param.requires_grad
            self.fp32_params.append(fp32_param)
            group['params'][i] = fp32_param

    # Loss-scaler selection: an explicit scaler wins, then dynamic scaling,
    # then a static scaler (defaulting to a scale of 1).
    if kwargs.get('loss_scaler') is not None:
        self.loss_scaler = kwargs['loss_scaler']
    elif kwargs.get('dynamic_scale'):
        self.loss_scaler = DynamicLossScaler()
    else:
        self.loss_scaler = LossScaler(kwargs.get('scale', 1))

    self.overflow = False
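# --- A minimal usage sketch for the variant above (illustrative, not part of
# the original source). It assumes the class is named FP16_Optimizer, as in
# the variants below; the keyword names mirror the kwargs this __init__
# inspects ('loss_scaler', 'dynamic_scale', 'scale'), and the model is a
# placeholder.
import torch

if torch.cuda.is_available():
    model = torch.nn.Linear(4, 4).cuda().half()
    inner = torch.optim.SGD(model.parameters(), lr=0.1)
    opt = FP16_Optimizer(inner, dynamic_scale=True)   # picks DynamicLossScaler
    # opt = FP16_Optimizer(inner, scale=128.0)        # or a static LossScaler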
def __init__(self, optimizer, static_loss_scale=1.0, dynamic_loss_scale=False):
    if not torch.cuda.is_available():
        raise SystemError('Cannot use fp16 without CUDA')

    self.fp16_param_groups = []
    self.fp32_param_groups = []
    self.fp32_flattened_groups = []
    for i, param_group in enumerate(optimizer.param_groups):
        print("FP16_Optimizer processing param group {}:".format(i))
        fp16_params_this_group = []
        fp32_params_this_group = []
        for param in param_group['params']:
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                          .format(param.size()))
                    fp16_params_this_group.append(param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                          .format(param.size()))
                    fp32_params_this_group.append(param)
                else:
                    raise TypeError("Wrapped parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        # Flatten all FP16 params in this group into a single FP32 master
        # buffer (_flatten_dense_tensors comes from torch._utils; Variable
        # from torch.autograd).
        fp32_flattened_this_group = None
        if len(fp16_params_this_group) > 0:
            fp32_flattened_this_group = _flatten_dense_tensors(
                [param.detach().data.clone().float()
                 for param in fp16_params_this_group])
            fp32_flattened_this_group = Variable(fp32_flattened_this_group,
                                                 requires_grad=True)
            # Preallocate the gradient buffer for the flat master.
            fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
                *fp32_flattened_this_group.size())

        # The flat master goes first, followed by any pre-existing FP32 params.
        if fp32_flattened_this_group is not None:
            param_group['params'] = [fp32_flattened_this_group] + fp32_params_this_group
        else:
            param_group['params'] = fp32_params_this_group

        self.fp16_param_groups.append(fp16_params_this_group)
        self.fp32_param_groups.append(fp32_params_this_group)
        self.fp32_flattened_groups.append(fp32_flattened_this_group)

    # Rebuild the wrapped optimizer of the same class over the rewritten
    # (master) param groups.
    self.optimizer = optimizer.__class__(optimizer.param_groups)
    self.param_groups = self.optimizer.param_groups

    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        self.loss_scaler = DynamicLossScaler()
    else:
        self.dynamic_loss_scale = False
        self.loss_scaler = LossScaler(static_loss_scale)

    self.overflow = False
    self.first_closure_call_this_step = True
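# --- Round-trip sketch of the flattening helpers used above (illustrative).
# _flatten_dense_tensors / _unflatten_dense_tensors are private helpers in
# torch._utils, so their location may change between PyTorch releases.
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

tensors = [torch.randn(2, 3), torch.randn(5)]
flat = _flatten_dense_tensors(tensors)             # one contiguous 1-D buffer
views = _unflatten_dense_tensors(flat, tensors)    # reshaped views over flat
assert all(torch.equal(t, v) for t, v in zip(tensors, views))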
def __init__(self,
             init_optimizer,
             static_loss_scale=1.0,
             dynamic_loss_scale=False,
             dynamic_loss_args=None,
             verbose=False):
    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")

    self.verbose = verbose

    self.optimizer = init_optimizer
    # init_state_dict sets up an alternative way to cast per-param state tensors.
    # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
    # init_state_dict = init_optimizer.state_dict()

    self.fp16_groups = []
    self.fp32_from_fp16_groups = []
    self.fp32_from_fp32_groups = []
    for i, param_group in enumerate(self.optimizer.param_groups):
        self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
        fp16_params_this_group = []
        fp32_params_this_group = []
        fp32_from_fp16_params_this_group = []
        # Use a separate index to avoid shadowing the group index above.
        for j, param in enumerate(param_group['params']):
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                                     .format(param.size()))
                    if param.nelement() % 8 != 0:
                        print("Warning: param size {} is not a multiple of 8, "
                              "which can prevent full tensor-core utilization"
                              .format(param.size()))
                    fp16_params_this_group.append(param)
                    # Create an FP32 master copy that the wrapped optimizer updates.
                    master_param = param.detach().clone().float()
                    master_param.requires_grad = True
                    param_group['params'][j] = master_param
                    fp32_from_fp16_params_this_group.append(master_param)
                    # Reset existing state dict key to the new master param.
                    # We still need to recast per-param state tensors, if any, to FP32.
                    if param in self.optimizer.state:
                        self.optimizer.state[master_param] = self.optimizer.state.pop(param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                                     .format(param.size()))
                    fp32_params_this_group.append(param)
                    param_group['params'][j] = param
                else:
                    raise TypeError("Wrapped parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        self.fp16_groups.append(fp16_params_this_group)
        self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
        self.fp32_from_fp32_groups.append(fp32_params_this_group)

    # Leverage state_dict() and load_state_dict() to recast preexisting
    # per-param state tensors.
    self.optimizer.load_state_dict(self.optimizer.state_dict())
    # alternative way to cast per-param state tensors:
    # self.optimizer.load_state_dict(init_state_dict)

    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        if dynamic_loss_args is not None:
            self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
        else:
            self.loss_scaler = DynamicLossScaler()
    else:
        self.dynamic_loss_scale = False
        self.loss_scaler = LossScaler(static_loss_scale)

    self.overflow = False
    self.first_closure_call_this_step = True

    # clip_grad_norm is expected to be a module-level helper imported by the
    # surrounding fp16 utilities.
    self.clip_grad_norm = clip_grad_norm
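# --- A hedged training-loop sketch for the apex-style wrapper above. It
# assumes this class also defines zero_grad(), backward(loss) (which applies
# the loss scale), and step() (which unscales, updates the FP32 masters, and
# copies the results back into the FP16 params); only __init__ is shown here.
import torch

if torch.cuda.is_available():
    model = torch.nn.Linear(8, 8).cuda().half()
    optimizer = FP16_Optimizer(torch.optim.SGD(model.parameters(), lr=0.1),
                               dynamic_loss_scale=True)
    inputs = torch.randn(4, 8, device='cuda', dtype=torch.half)
    loss = model(inputs).float().sum()   # accumulate the loss in FP32
    optimizer.zero_grad()
    optimizer.backward(loss)             # scaled backward pass
    optimizer.step()                     # unscale, step masters, sync FP16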
def __init__(self,
             init_optimizer,
             static_loss_scale=1.0,
             dynamic_loss_scale=False,
             dynamic_loss_args=None):
    if not torch.cuda.is_available():
        raise SystemError("Cannot use fp16 without CUDA.")

    self.fp16_groups = []
    self.fp32_from_fp16_groups = []
    self.fp32_from_fp32_groups = []
    for i, param_group in enumerate(init_optimizer.param_groups):
        print("FP16_Optimizer processing param group {}:".format(i))
        fp16_params_this_group = []
        fp32_params_this_group = []
        for param in param_group['params']:
            if param.requires_grad:
                if param.type() == 'torch.cuda.HalfTensor':
                    print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                          .format(param.size()))
                    fp16_params_this_group.append(param)
                elif param.type() == 'torch.cuda.FloatTensor':
                    print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                          .format(param.size()))
                    fp32_params_this_group.append(param)
                else:
                    raise TypeError("Wrapped parameters must be either "
                                    "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                                    "Received {}".format(param.type()))

        # FP32 master copies of the FP16 params; these are what the wrapped
        # optimizer actually updates.
        fp32_from_fp16_params_this_group = [
            param.detach().clone().float() for param in fp16_params_this_group
        ]
        for param in fp32_from_fp16_params_this_group:
            param.requires_grad = True

        param_group['params'] = (fp32_from_fp16_params_this_group
                                 + fp32_params_this_group)

        self.fp16_groups.append(fp16_params_this_group)
        self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
        self.fp32_from_fp32_groups.append(fp32_params_this_group)

    # Rebuild the wrapped optimizer of the same class over the rewritten
    # (master) param groups.
    self.optimizer = init_optimizer.__class__(init_optimizer.param_groups)

    if dynamic_loss_scale:
        self.dynamic_loss_scale = True
        if dynamic_loss_args is not None:
            self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
        else:
            self.loss_scaler = DynamicLossScaler()
    else:
        self.dynamic_loss_scale = False
        self.loss_scaler = LossScaler(static_loss_scale)

    self.overflow = False
    self.first_closure_call_this_step = True
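# --- After self.optimizer.step() updates the FP32 masters, wrappers like the
# one above push the new values back into the FP16 model params. A minimal
# sketch of that sync as a hypothetical helper method (not part of the
# original source), using the group lists built in __init__:
def _copy_masters_to_model_params(self):
    for fp16_group, fp32_group in zip(self.fp16_groups,
                                      self.fp32_from_fp16_groups):
        for fp16_param, fp32_param in zip(fp16_group, fp32_group):
            # In-place copy; casts the FP32 master values down to FP16.
            fp16_param.data.copy_(fp32_param.data)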