Code Example #1
    def __init__(self, optimizer, *args, **kwargs):
        if not torch.cuda.is_available():
            raise SystemError('Cannot use fp16 without CUDA')

        self.optimizer = optimizer
        self.state = optimizer.state
        self.param_groups = optimizer.param_groups

        self.fp16_params = []
        self.fp32_params = []
        for group in self.param_groups:
            for i, param in enumerate(group['params']):
                self.fp16_params.append(param)
                fp32_param = param
                if isinstance(fp32_param.data, HALF_TYPES):
                    fp32_param = param.clone().float().detach()
                fp32_param.requires_grad = param.requires_grad
                self.fp32_params.append(fp32_param)
                group['params'][i] = fp32_param

        if 'loss_scaler' in kwargs and kwargs['loss_scaler'] is not None:
            self.loss_scaler = kwargs['loss_scaler']
        elif 'dynamic_scale' in kwargs and kwargs['dynamic_scale']: 
            self.loss_scaler = DynamicLossScaler()
        else:
            scale = kwargs['scale'] if 'scale' in kwargs else 1
            self.loss_scaler = LossScaler(scale)

        self.overflow = False
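
A minimal usage sketch for this constructor, assuming the surrounding class is an FP16 optimizer wrapper (its name is not shown in the snippet, so FP16Wrapper below is a placeholder) and that LossScaler/DynamicLossScaler come from the same module:

import torch

model = torch.nn.Linear(32, 8).cuda().half()            # toy fp16 model
base_optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# dynamic_scale=True makes the wrapper build a DynamicLossScaler internally;
# alternatively pass scale=<float> for a static LossScaler, or a ready-made
# scaler via loss_scaler=<scaler object>.
opt = FP16Wrapper(base_optimizer, dynamic_scale=True)
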
Code Example #2
File: fp16_optimizer.py    Project: zge/tacotron2-vae
    def __init__(self,
                 optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False):
        if not torch.cuda.is_available():
            raise SystemError('Cannot use fp16 without CUDA')

        self.fp16_param_groups = []
        self.fp32_param_groups = []
        self.fp32_flattened_groups = []
        for i, param_group in enumerate(optimizer.param_groups):
            print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            for param in param_group['params']:
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        print(
                            "FP16_Optimizer received torch.cuda.HalfTensor with {}"
                            .format(param.size()))
                        fp16_params_this_group.append(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        print(
                            "FP16_Optimizer received torch.cuda.FloatTensor with {}"
                            .format(param.size()))
                        fp32_params_this_group.append(param)
                    else:
                        raise TypeError(
                            "Wrapped parameters must be either "
                            "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                            "Received {}".format(param.type()))

            fp32_flattened_this_group = None
            if len(fp16_params_this_group) > 0:
                fp32_flattened_this_group = _flatten_dense_tensors([
                    param.detach().data.clone().float()
                    for param in fp16_params_this_group
                ])

                fp32_flattened_this_group = Variable(fp32_flattened_this_group,
                                                     requires_grad=True)

                fp32_flattened_this_group.grad = fp32_flattened_this_group.new(
                    *fp32_flattened_this_group.size())

            # python's lovely list concatenation via +
            if fp32_flattened_this_group is not None:
                param_group['params'] = [fp32_flattened_this_group
                                         ] + fp32_params_this_group
            else:
                param_group['params'] = fp32_params_this_group

            self.fp16_param_groups.append(fp16_params_this_group)
            self.fp32_param_groups.append(fp32_params_this_group)
            self.fp32_flattened_groups.append(fp32_flattened_this_group)

        # print("self.fp32_flattened_groups = ", self.fp32_flattened_groups)
        # print("self.fp16_param_groups = ", self.fp16_param_groups)

        self.optimizer = optimizer.__class__(optimizer.param_groups)

        # self.optimizer.load_state_dict(optimizer.state_dict())

        self.param_groups = self.optimizer.param_groups

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True
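
This variant keeps one flattened fp32 master buffer per parameter group. The sketch below isolates just that flatten/unflatten mechanism, using the same private torch._utils helpers the snippet relies on; the toy CPU tensors are only for illustration:

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

fp16_params = [torch.randn(3, 3).half(), torch.randn(5).half()]

# One contiguous fp32 master buffer covering all fp16 params in the group.
flat_master = _flatten_dense_tensors([p.detach().float() for p in fp16_params])

# Views into the flat buffer with the original shapes, used to copy the
# updated master values back into the fp16 parameters after each step.
master_views = _unflatten_dense_tensors(flat_master, fp16_params)
for p, v in zip(fp16_params, master_views):
    p.data.copy_(v)
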
Code Example #3
    def __init__(self, 
                 init_optimizer, 
                 static_loss_scale=1.0, 
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None,
                 verbose=False):
        if not torch.cuda.is_available():
            raise SystemError("Cannot use fp16 without CUDA.")

        self.verbose = verbose

        self.optimizer = init_optimizer
        # init_state_dict sets up an alternative way to cast per-param state tensors.
        # Stashing here in case https://github.com/pytorch/pytorch/issues/7733 makes it necessary.
        # init_state_dict = init_optimizer.state_dict()

        self.fp16_groups = []
        self.fp32_from_fp16_groups = []
        self.fp32_from_fp32_groups = []
        for i, param_group in enumerate(self.optimizer.param_groups):
            self.maybe_print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_fp16_params_this_group = []
            for i, param in enumerate(param_group['params']):
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.HalfTensor with {}"
                                         .format(param.size()))
                        if param.nelement() % 8 != 0:
                            print(f'Warning: non-8 tensor with size {param.size()}')
                        fp16_params_this_group.append(param)
                        master_param = param.detach().clone().float()
                        master_param.requires_grad = True
                        param_group['params'][i] = master_param
                        fp32_from_fp16_params_this_group.append(master_param)
                        # Reset existing state dict key to the new master param.
                        # We still need to recast per-param state tensors, if any, to FP32.
                        if param in self.optimizer.state:
                           self.optimizer.state[master_param] = self.optimizer.state.pop(param) 
                    elif param.type() == 'torch.cuda.FloatTensor':
                        self.maybe_print("FP16_Optimizer received torch.cuda.FloatTensor with {}"
                                         .format(param.size()))
                        fp32_params_this_group.append(param)
                        param_group['params'][i] = param
                    else:
                        raise TypeError("Wrapped parameters must be either "
                                        "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "  
                                        "Received {}".format(param.type()))
            
            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        # Leverage state_dict() and load_state_dict() to recast preexisting per-param state tensors
        self.optimizer.load_state_dict(self.optimizer.state_dict())
        # alternative way to cast per-param state tensors:
        # self.optimizer.load_state_dict(init_state_dict)

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            if dynamic_loss_args is not None:
                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
            else:
                self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True

        self.clip_grad_norm = clip_grad_norm  # gradient-clipping helper chosen at module level (e.g. torch.nn.utils.clip_grad_norm_)
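
This example mirrors apex's FP16_Optimizer. Assuming the rest of the class provides that API's backward() and step() methods (backward() scales the loss before backpropagation; step() unscales the gradients, updates the fp32 master weights and copies them back into the fp16 params), a training step typically looks like the sketch below; the model and data are placeholders:

import torch
import torch.nn.functional as F

model = torch.nn.Linear(128, 10).cuda().half()           # toy fp16 model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

x = torch.randn(64, 128).cuda().half()                   # dummy batch
target = torch.randint(0, 10, (64,)).cuda()

optimizer.zero_grad()
loss = F.cross_entropy(model(x).float(), target)         # compute the loss in fp32
optimizer.backward(loss)    # replaces loss.backward(): applies the loss scale
optimizer.step()            # unscale grads, update fp32 masters, copy back to fp16
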
Code Example #4
    def __init__(self,
                 init_optimizer,
                 static_loss_scale=1.0,
                 dynamic_loss_scale=False,
                 dynamic_loss_args=None):
        if not torch.cuda.is_available():
            raise SystemError("Cannot use fp16 without CUDA.")

        self.fp16_groups = []
        self.fp32_from_fp16_groups = []
        self.fp32_from_fp32_groups = []
        for i, param_group in enumerate(init_optimizer.param_groups):
            print("FP16_Optimizer processing param group {}:".format(i))
            fp16_params_this_group = []
            fp32_params_this_group = []
            for param in param_group['params']:
                if param.requires_grad:
                    if param.type() == 'torch.cuda.HalfTensor':
                        print(
                            "FP16_Optimizer received torch.cuda.HalfTensor with {}"
                            .format(param.size()))
                        fp16_params_this_group.append(param)
                    elif param.type() == 'torch.cuda.FloatTensor':
                        print(
                            "FP16_Optimizer received torch.cuda.FloatTensor with {}"
                            .format(param.size()))
                        fp32_params_this_group.append(param)
                    else:
                        raise TypeError(
                            "Wrapped parameters must be either "
                            "torch.cuda.FloatTensor or torch.cuda.HalfTensor. "
                            "Received {}".format(param.type()))

            fp32_from_fp16_params_this_group = [
                param.detach().clone().float()
                for param in fp16_params_this_group
            ]
            for param in fp32_from_fp16_params_this_group:
                param.requires_grad = True

            param_group[
                'params'] = fp32_from_fp16_params_this_group + fp32_params_this_group

            self.fp16_groups.append(fp16_params_this_group)
            self.fp32_from_fp16_groups.append(fp32_from_fp16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        self.optimizer = init_optimizer.__class__(init_optimizer.param_groups)

        if dynamic_loss_scale:
            self.dynamic_loss_scale = True
            if dynamic_loss_args is not None:
                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
            else:
                self.loss_scaler = DynamicLossScaler()
        else:
            self.dynamic_loss_scale = False
            self.loss_scaler = LossScaler(static_loss_scale)

        self.overflow = False
        self.first_closure_call_this_step = True
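
All four examples fall back to a DynamicLossScaler when dynamic loss scaling is requested. Its implementation is not shown here, so the following is only a simplified stand-in that illustrates the algorithm such a scaler implements: shrink the scale when gradients overflow, and grow it again after a window of clean steps.

class ToyDynamicLossScaler:
    """Simplified illustration only -- not the DynamicLossScaler used above."""

    def __init__(self, init_scale=2.0 ** 15, scale_factor=2.0, scale_window=1000):
        self.loss_scale = init_scale
        self.scale_factor = scale_factor    # grow/shrink multiplier
        self.scale_window = scale_window    # clean steps required before growing
        self._good_steps = 0

    def update_scale(self, has_overflow):
        if has_overflow:
            # Inf/NaN in the gradients: shrink the scale and skip this step.
            self.loss_scale /= self.scale_factor
            self._good_steps = 0
        else:
            self._good_steps += 1
            if self._good_steps % self.scale_window == 0:
                # A long run of clean steps: it is safe to scale up again.
                self.loss_scale *= self.scale_factor
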