Beispiel #1
0
    def __init__(self, dataset, sink_size, epoch_num, iter_first_order):
        super().__init__(dataset, sink_size, epoch_num)
        sink_count = 1
        if hasattr(dataset, '__loop_size__'):
            loop_size = dataset.__loop_size__ + iter_first_order
            if loop_size <= dataset.get_dataset_size(
            ) and dataset.get_dataset_size() % loop_size != 0:
                raise ValueError(
                    f'Dataset size {dataset.get_dataset_size()} and '
                    f'sink_size {loop_size} are not matched.')
            sink_count = math.ceil(dataset.get_dataset_size() / loop_size) * 2
        self.sink_count = sink_count
        ms_role = os.getenv("MS_ROLE")
        if ms_role in ("MS_PSERVER", "MS_SCHED"):
            self.sink_count = 1
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, and not using full_batch,
        # use a complete tensor to compile, and slice tensor to run. The batch dimension of tensors for
        # compile is device_number times the batch dimension of tensors for run. Now only support LoopSink.
        if _need_to_full():
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(self.dataset_shapes,
                                                  device_num)

        def op():
            return tuple()

        self.op = op
Beispiel #2
0
    def __init__(self, network, total_steps=1, sens=16384.0):
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())

        lr = dynamic_lr(0.01, total_steps, 5000)
        self.optimizer = nn.Adam(self.weights,
                                 learning_rate=lr,
                                 beta1=0.9,
                                 beta2=0.999,
                                 eps=1e-8,
                                 loss_scale=sens)

        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(
                self.optimizer.parameters, mean, degree)
Beispiel #3
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = F.identity
     self.parallel_mode = _get_parallel_mode()
     if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         self.mean = _get_gradients_mean()
         self.degree = _get_device_num()
         self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree)
     self.use_grad_accumulation = False
     if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.STAND_ALONE):
         self.use_grad_accumulation = True
     if self.use_grad_accumulation:
         self.max_accumulation_step = get_auto_parallel_context("grad_accumulation_step")
         if self.max_accumulation_step <= 1:
             self.max_accumulation_step = 1
             self.use_grad_accumulation = False
     if self.use_grad_accumulation:
         self.grad_accumulation = GradientAccumulation(self.max_accumulation_step, self.optimizer)
Beispiel #4
0
    def __init__(self,
                 network,
                 loss_fn=None,
                 optimizer=None,
                 metrics=None,
                 eval_network=None,
                 eval_indexes=None,
                 amp_level="O0",
                 frequency=278,
                 stop_epoch=100,
                 **kwargs):
        self._network = network
        self._loss_fn = loss_fn
        self._optimizer = optimizer
        self._loss_scale_manager = None
        self._loss_scale_manager_set = False
        self._keep_bn_fp32 = True
        self._check_kwargs(kwargs)
        self._amp_level = amp_level
        self._process_amp_args(kwargs)
        self._parallel_mode = _get_parallel_mode()
        self._device_number = _get_device_num()
        self._global_rank = _get_global_rank()
        self._parameter_broadcast = _get_parameter_broadcast()
        self._frequency = frequency
        self._stop_epoch = stop_epoch

        self._train_network = self._build_train_network()
        self._build_eval_network(metrics, eval_network, eval_indexes)
        self._build_predict_network()
Beispiel #5
0
 def _use_parallel_optimizer(self):
     """Indicates whether to use automatic parallelism."""
     if context.get_auto_parallel_context("enable_parallel_optimizer"):
         if _get_parallel_mode() == ParallelMode.DATA_PARALLEL and context.get_context("device_target") == "Ascend":
             self.use_parallel = True
         elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                 and context.get_context("device_target") != "Ascend":
             raise RuntimeError("Parallel optimizer only supports Ascend in data parallel mode.")
         elif _get_parallel_mode() in (ParallelMode.STAND_ALONE, ParallelMode.HYBRID_PARALLEL):
             raise RuntimeError("Parallel optimizer is not supported in {}.".format(_get_parallel_mode()))
         else:
             self.use_parallel = False
     else:
         self.use_parallel = False
     if self.use_parallel:
         if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
             raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
         self.dev_num = _get_device_num()
         if self.dev_num > self.param_length:
             raise RuntimeError("Parallel optimizer can not be applied when the number of parameters {} is"
                                " less than the number of devices {}".format(self.param_length, self.dev_num))
         self.param_rank = self._get_parameter_group_id()
         self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
         self.param_names = []
         for param in self.parameters:
             self.param_names.append(param.name)
     else:
         self.optim_filter = (True,) * self.param_length
Beispiel #6
0
def _init_device_info():
    """
    INTERNAL USE ONLY!
    As rank_id need to pass into deep layer for numa and device_queue.
    One process work with only one rank_id, In standalone scenario,
    rank_id may come from env 'CUDA_VISIBLE_DEVICES', For distribute
    scenario, rank_id come from _get_global_rank()
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank, _get_device_num
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_device_info:
                cuda_id = int(cuda_device_info.split(",")[0].strip())
                if cuda_id != rank_id:
                    rank_id = cuda_id
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        rank_id = _get_global_rank()
        device_num = _get_device_num()
        # Ascend only support multi-process scenario
        if device_num > 1:
            _config.set_rank_id(rank_id)
Beispiel #7
0
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = None
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)
Beispiel #8
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCellWithGradClip, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     self.hyper_map = C.HyperMap()
     self.greater = P.Greater()
     self.select = P.Select()
     self.norm = nn.Norm(keep_dims=True)
     self.dtype = P.DType()
     self.cast = P.Cast()
     self.concat = P.Concat(axis=0)
     self.ten = Tensor(np.array([10.0]).astype(np.float32))
     parallel_mode = _get_parallel_mode()
     if parallel_mode in (ParallelMode.DATA_PARALLEL,
                          ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         mean = _get_mirror_mean()
         degree = _get_device_num()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
Beispiel #9
0
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        mean = _get_gradients_mean()
        degree = _get_device_num()
        parameter_length = len(self.feature_map)
        self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (
                Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
Beispiel #10
0
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        self._tuple_add = _TupleAdd()
        self._tuple_mul = _TupleMul()
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        self.do_privacy = False
        self.grad_mask_tup = ()  # tuple containing grad_mask(cell)
        self.de_weight_tup = ()  # tuple containing de_weight(cell)
        self._suppress_pri_ctrl = None
Beispiel #11
0
    def __init__(self, network, optimizer, sens=1.0, micro_batches=None, norm_clip=1.0, mech=None):
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

        # dp params
        self._micro_batches = micro_batches
        norm_clip = check_param_type('norm_clip', norm_clip, float)
        self._l2_norm = check_value_positive('norm_clip', norm_clip)
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._mech = mech
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
Beispiel #12
0
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              weight_decay=0.0,
              loss_scale=1.0,
              num_hidden_layers=24,
              batch_size=12,
              damping=0.03,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and
              'bias' not in x.name.lower()):
     super(THOR, self).__init__(learning_rate, params, weight_decay,
                                loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.num_hidden_layers = num_hidden_layers
     self.sqrt = P.Sqrt()
     self.assign = P.Assign()
     self.cast = P.Cast()
     self.thor = True
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.one = Tensor(1, mstype.int32)
     self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                               name="cov_step",
                               requires_grad=False)
     mean = _get_gradients_mean()
     degree = _get_device_num()
     self.grad_reducer_g = DistributedGradReducerThor(
         self.parameters, 3, mean, degree)
Beispiel #13
0
    def __init__(self, network, config, sens=1000.0):
        super(TrainStepWrap, self).__init__()
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)

        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)
        self.optimizer_w = FTRL(learning_rate=config.ftrl_lr,
                                params=self.weights_w,
                                l1=5e-4,
                                l2=5e-4,
                                initial_accum=0.1,
                                loss_scale=sens)

        #self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr,loss_scale=sens)
        self.optimizer_d = Adam(self.weights_d,
                                learning_rate=config.adam_lr,
                                eps=1e-6,
                                loss_scale=sens)

        self.hyper_map = C.HyperMap()

        self.grad_w = C.GradOperation('grad_w',
                                      get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation('grad_d',
                                      get_by_list=True,
                                      sens_param=True)

        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer_w = DistributedGradReducer(
                self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(
                self.optimizer_d.parameters, mean, degree)
    def __init__(self, network, optimizer, scale_update_cell=None):

        super(TransformerTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.all_reduce = P.AllReduce()

        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
        self.add_flags(has_effect=True)
Beispiel #15
0
    def __init__(self, dataset, first_order_order):
        self.dataset = dataset
        self.device_num = _get_device_num()
        self.global_rank = _get_global_rank()
        self.repeat_count = dataset.get_repeat_count()
        self.repeat_ind = 0
        self.loop_count = dataset.get_dataset_size()
        self.ind = 0

        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        self.need_to_full = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                              ParallelMode.AUTO_PARALLEL)
Beispiel #16
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.freeze = isinstance(optimizer, acc.FreezeOpt)
     self.optimizer = optimizer
     if not self.freeze:
         self.weights = self.optimizer.parameters
     self.train_strategy = getattr(self.optimizer, 'train_strategy', None)
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = F.identity
     self.parallel_mode = _get_parallel_mode()
     self.reducer_flag = self.parallel_mode in (
         ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
     self.use_grad_accumulation = self.parallel_mode in (
         ParallelMode.DATA_PARALLEL, ParallelMode.STAND_ALONE)
     if self.use_grad_accumulation:
         self.max_accumulation_step = get_auto_parallel_context(
             "grad_accumulation_step")
         if self.max_accumulation_step <= 1:
             self.max_accumulation_step = 1
             self.use_grad_accumulation = False
     self.grad_accumulation = None
     if self.use_grad_accumulation:
         self.grad_accumulation = GradientAccumulation(
             self.max_accumulation_step, self.optimizer)
     if self.reducer_flag:
         self.mean = _get_gradients_mean()
         self.degree = _get_device_num()
         if self.freeze:
             self.grad_reducers = (DistributedGradReducer(
                 opt.parameters, self.mean, self.degree)
                                   for opt in self.optimizer.opts)
             self.freeze_nets = tuple(
                 _TrainFreezeCell(self.network, self.sens, self.grad,
                                  reducer, self.use_grad_accumulation,
                                  self.max_accumulation_step, opt) for
                 reducer, opt in zip(self.grad_reducers, self.optimizer))
         else:
             self.grad_reducer = DistributedGradReducer(
                 self.optimizer.parameters, self.mean, self.degree)
     else:
         if self.freeze:
             self.freeze_nets = tuple(
                 _TrainFreezeCell(self.network, self.sens, self.grad, self.
                                  grad_reducer, self.use_grad_accumulation,
                                  self.max_accumulation_step, opt)
                 for opt in self.optimizer.opts)
     self.step = Parameter(Tensor(0, dtype=mstype.int32))
Beispiel #17
0
    def __init__(self, network, optimizer, scale_update_cell=None, micro_batches=None, norm_clip=1.0, mech=None):
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params
        self._micro_batches = micro_batches
        norm_clip = check_param_type('norm_clip', norm_clip, float)
        self._l2_norm = check_value_positive('norm_clip', norm_clip)
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._mech = mech
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
Beispiel #18
0
    def __init__(self, network, sens=1024.0, host_device_mix=False, parameter_server=False):
        super(TrainStepWrap, self).__init__()
        parallel_mode = _get_parallel_mode()
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)
        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)

        if (host_device_mix and is_auto_parallel) or parameter_server:
            self.optimizer_d = LazyAdam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
            self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
            self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
        else:
            self.optimizer_d = Adam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        self.hyper_map = C.HyperMap()
        self.grad_w = C.GradOperation('grad_w', get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation('grad_d', get_by_list=True,
                                      sens_param=True)
        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = _get_parallel_mode()
        self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                              ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
Beispiel #19
0
    def __init__(self, dataset, iter_first_order):
        super(_DatasetIterMSLoopSink, self).__init__(dataset)
        loop_size = dataset.__loop_size__ + iter_first_order
        self.loop_count = int(dataset.get_dataset_size() / loop_size * 2)
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, use a complete tensor to
        # compile, and slice tensor to run. The batch dimension of tensors for compile is device_number
        # times the batch dimension of tensors for run. Now only support LoopSink.
        if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num)

        def op():
            return tuple()

        self.op = op
Beispiel #20
0
    def __init__(self, dataset, sink_size, epoch_num, iter_first_order):
        super().__init__(dataset, sink_size, epoch_num)
        self.sink_count = self.get_sink_count(dataset, sink_size, iter_first_order)
        ms_role = os.getenv("MS_ROLE")
        if ms_role in ("MS_PSERVER", "MS_SCHED"):
            self.sink_count = 1
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, and not using full_batch,
        # use a complete tensor to compile, and slice tensor to run. The batch dimension of tensors for
        # compile is device_number times the batch dimension of tensors for run. Now only support LoopSink.
        if _need_to_full():
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num)

        def op():
            return tuple()

        self.op = op
Beispiel #21
0
    def __init__(self, dataset):
        super(_DatasetIterGE, self).__init__(dataset)
        self.loop_count = self.get_loop_count(dataset)
        parallel_mode = _get_parallel_mode()
        self.need_to_full = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                              ParallelMode.AUTO_PARALLEL)
        batch_expand_num = 1
        if self.need_to_full:
            batch_expand_num = _get_device_num()
        tensor_list_run = _construct_tensor_list(self.dataset_types,
                                                 self.dataset_shapes,
                                                 batch_expand_num)

        def op():
            return tensor_list_run

        self.op = op
Beispiel #22
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     parallel_mode = _get_parallel_mode()
     if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         mean = _get_mirror_mean()
         degree = _get_device_num()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
Beispiel #23
0
    def __init__(self, network, optimizer, scale_sense=None):
        super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.scale_sense = None
        self.loss_scaling_manager = None
        if isinstance(scale_sense, Cell):
            self.loss_scaling_manager = scale_sense
            self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(),
                                                dtype=mstype.float32),
                                         name="scale_sense")
        if isinstance(scale_sense, Tensor):
            self.scale_sense = Parameter(scale_sense, name='scale_sense')
Beispiel #24
0
    def __init__(self, network, lr, eps, loss_scale=1000.0):
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = Adam(self.weights, learning_rate=lr, eps=eps, loss_scale=loss_scale)
        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = loss_scale

        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, mean, degree)
Beispiel #25
0
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
                 weight_decay=0.0, loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
        super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
        Validator.check_value_type("momentum", momentum, [float], self.cls_name)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.use_nesterov = Validator.check_bool(use_nesterov)
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        self.feature_map_new = [x ** 0.5 for x in self.feature_map]
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.matmul = P.MatMul()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.assign = P.Assign()
        self.mul = P.Mul()

        mean = _get_gradients_mean()
        degree = _get_device_num()

        parameter_length = len(self.feature_map)
        self.grad_reducer_thorA = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.grad_reducer_thorG = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.weight_decay = weight_decay
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.update_gradient = P.UpdateThorGradient(split_dim=128)
Beispiel #26
0
    def __init__(self, dataset):
        self.loop_size = 1
        if not hasattr(dataset, '__ME_INITED__'):
            if not hasattr(dataset, '__loop_size__'):
                self.loop_size = dataset.get_dataset_size()
            else:
                self.loop_size = dataset.__loop_size__
            dataset.__ME_INITED__ = _exec_datagraph(dataset,
                                                    self.loop_size).queue_name

        self.ind = 0
        self.dataset = dataset
        dataset_types, dataset_shapes = _get_types_and_shapes(dataset)
        self.dataset_types, self.dataset_shapes = dataset_types, dataset_shapes
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, use a complete tensor to
        # compile, and slice tensor to run. The batch dimension of tensors for compile is device_number
        # times the batch dimension of tensors for run
        if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL,
                                    ParallelMode.AUTO_PARALLEL):
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(dataset_shapes, device_num)
Beispiel #27
0
    def __init__(self,
                 loss_netD,
                 loss_netG,
                 optimizerD,
                 optimizerG,
                 sens=1,
                 auto_prefix=True):
        super(TrainOneStepCell, self).__init__(auto_prefix=auto_prefix)
        self.loss_netD = loss_netD  # loss network
        self.loss_netD.set_grad()
        self.loss_netD.add_flags(defer_inline=True)

        self.loss_netG = loss_netG
        self.loss_netG.set_grad()
        self.loss_netG.add_flags(defer_inline=True)

        self.weights_G = optimizerG.parameters
        self.optimizerG = optimizerG
        self.weights_D = optimizerD.parameters
        self.optimizerD = optimizerD

        self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

        # Parallel processing
        self.reducer_flag = False
        self.grad_reducer_G = F.identity
        self.grad_reducer_D = F.identity
        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                  ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer_G = DistributedGradReducer(
                self.weights_G, mean, degree)
            self.grad_reducer_D = DistributedGradReducer(
                self.weights_D, mean, degree)
Beispiel #28
0
    def __init__(self,
                 learning_rate,
                 parameters,
                 weight_decay=0.0,
                 loss_scale=1.0):
        super(Optimizer, self).__init__(auto_prefix=False)
        if parameters is not None and not isinstance(parameters, list):
            parameters = list(parameters)

        if not parameters:
            raise ValueError("Optimizer got an empty parameter list.")

        if not isinstance(parameters[0], (dict, Parameter)):
            raise TypeError(
                "Only a list of Parameter or dict can be supported.")

        if isinstance(loss_scale, int):
            loss_scale = float(loss_scale)
        validator.check_value_type("loss_scale", loss_scale, [float],
                                   self.cls_name)
        validator.check_positive_float(loss_scale, "loss_scale", self.cls_name)
        self.loss_scale = loss_scale

        weight_decay = self._preprocess_weight_decay(weight_decay)

        self._unique = True
        self._target = context.get_context("device_target")
        self.dynamic_lr = False
        self.assignadd = None
        self.global_step = None
        self.is_group = False
        self.is_group_lr = False
        self.is_group_params_ordered = False
        learning_rate = self._preprocess_single_lr(learning_rate)
        if isinstance(parameters[0], dict):
            self.is_group = True
            self.group_params = []
            self.group_lr = []
            self.group_weight_decay = []
            self._init_group_params(parameters, learning_rate, weight_decay)

        # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
        if self.dynamic_lr:
            self.assignadd = P.AssignAdd()
            self.global_step = Parameter(initializer(0, [1], mindspore.int32),
                                         name='global_step')

        if self.is_group_lr:
            if self.dynamic_lr:
                self.learning_rate = CellList(self.group_lr)
            else:
                self.learning_rate = ParameterTuple(self.group_lr)
        else:
            self.learning_rate = self._build_single_lr(learning_rate,
                                                       'learning_rate')
        if self.is_group:
            self.parameters = ParameterTuple(self.group_params)
            self.weight_decay = tuple(self.group_weight_decay)
            self.weight_decay_tensor_tuple = tuple(
                Tensor(x, mstype.float32) for x in self.group_weight_decay)
            decay_filter = lambda x: x > 0
            self.decay_flags = tuple(
                decay_filter(x) for x in self.weight_decay)
            self.exec_weight_decay = any(self.decay_flags)
        else:
            self.parameters = ParameterTuple(parameters)
            self.weight_decay = weight_decay * loss_scale
            self.weight_decay_tensor = Tensor(self.weight_decay,
                                              mstype.float32)
            decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
            self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
            self.exec_weight_decay = self.weight_decay > 0
        # when a parameter has been unique, there is no need do another unique in optimizer.
        for param in self.parameters:
            if param.unique:
                self._unique = False
                break
        ps_filter = lambda x: x.is_param_ps
        self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
        ps_cache_filter = lambda x: x.cache_enable
        self.cache_enable = tuple(ps_cache_filter(x) for x in self.parameters)
        self.reciprocal_scale = Tensor(1.0 / loss_scale, mstype.float32)
        self.need_scale = loss_scale != 1.0
        self.global_step_increase_tensor = Tensor(1, mstype.int32)
        self.param_length = len(self.parameters)
        self.map_ = C.Map()
        if context.get_auto_parallel_context("enable_parallel_optimizer"):
            if _get_parallel_mode(
            ) == ParallelMode.DATA_PARALLEL and context.get_context(
                    "device_target") == "Ascend":
                self.use_parallel = True
            elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                    and context.get_context("device_target") != "Ascend":
                raise RuntimeError(
                    "Parallel optimizer only supports Ascend in data parallel mode."
                )
            elif _get_parallel_mode() in (ParallelMode.STAND_ALONE,
                                          ParallelMode.HYBRID_PARALLEL):
                raise RuntimeError(
                    "Parallel optimizer is not supported in {}.".format(
                        _get_parallel_mode()))
            else:
                self.use_parallel = False
        else:
            self.use_parallel = False
        if self.use_parallel:
            if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
                raise RuntimeError(
                    "Parallel optimizer does not support optimizer {}".format(
                        self.cls_name))
            self.dev_num = _get_device_num()
            if self.dev_num > self.param_length:
                raise RuntimeError(
                    "Parallel optimizer can not be applied when the number of parameters {} is"
                    " less than the number of devices {}".format(
                        self.param_length, self.dev_num))
            self.param_rank = self._get_parameter_group_id()
            self.optim_filter = tuple(
                map(lambda x: x == _get_global_rank(), self.param_rank))
            self.param_names = []
            for param in self.parameters:
                self.param_names.append(param.name)

        else:
            self.optim_filter = (True, ) * self.param_length
Beispiel #29
0
    def __init__(self,
                 network,
                 optimizer,
                 norm_bound=1.0,
                 sens=1.0,
                 micro_batches=None,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        # dp params
        if micro_batches is None:
            msg = 'micro_batches must give in differential privacy, but got value: {}'.format(
                micro_batches)
            LOGGER.error(TAG, msg)
            raise ValueError(msg)
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._tuple_add = _TupleAdd()
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._hyper_map = C.HyperMap()
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._micro_float = Tensor(micro_batches, mstype.float32)

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech.
                _initial_noise_multiplier)
Beispiel #30
0
    def __init__(self,
                 network,
                 optimizer,
                 scale_update_cell=None,
                 micro_batches=None,
                 norm_bound=1.0,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech.
                _initial_noise_multiplier)