Ejemplo n.º 1
0
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        self._tuple_add = _TupleAdd()
        self._tuple_mul = _TupleMul()
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        self.do_privacy = False
        self.grad_mask_tup = ()  # tuple containing grad_mask(cell)
        self.de_weight_tup = ()  # tuple containing de_weight(cell)
        self._suppress_pri_ctrl = None
Ejemplo n.º 2
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCellWithGradClip, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     self.hyper_map = C.HyperMap()
     self.greater = P.Greater()
     self.select = P.Select()
     self.norm = nn.Norm(keep_dims=True)
     self.dtype = P.DType()
     self.cast = P.Cast()
     self.concat = P.Concat(axis=0)
     self.ten = Tensor(np.array([10.0]).astype(np.float32))
     parallel_mode = _get_parallel_mode()
     if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         mean = _get_mirror_mean()
         degree = _get_device_num()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
Ejemplo n.º 3
0
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = None
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)
Ejemplo n.º 4
0
 def _process_vocab_cache(self, slice_mode):
     """PS embeddingLookup cache check and process."""
     self.cache_enable = False
     if self.vocab_cache_size > 0:
         if self.target == 'CPU':
             logger.warning("The configuration of 'vocab_cache_size' is valid only in 'DEVICE' target, "
                            "current target is CPU, so it will be ignored.")
             return
         enable_ps = _get_ps_context("enable_ps")
         if not enable_ps:
             logger.warning("The configuration of 'vocab_cache_size' is valid only in parameter server trainning "
                            "mode, current mode is not parameter server trainning mode, so it will be ignored.")
             return
         parallel_mode = _get_parallel_mode()
         is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
         if is_auto_parallel:
             device_num = get_group_size()
             full_batch = _get_full_batch()
             if device_num > 1 and not (full_batch and slice_mode == "table_row_slice"):
                 raise ValueError("The embeddingLookup cache of parameter server parallel only be used "
                                  "in 'full_batch' and 'table_row_slice' parallel strategy.")
             self.vocab_cache_size = self.vocab_cache_size * device_num
         self.cache_enable = True
         if _is_role_worker():
             self.vocab_size = self.vocab_cache_size
Ejemplo n.º 5
0
    def __init__(self, network, total_steps=1, sens=16384.0):
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())

        lr = dynamic_lr(0.01, total_steps, 5000)
        self.optimizer = nn.Adam(self.weights,
                                 learning_rate=lr,
                                 beta1=0.9,
                                 beta2=0.999,
                                 eps=1e-8,
                                 loss_scale=sens)

        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(
                self.optimizer.parameters, mean, degree)
Ejemplo n.º 6
0
    def __init__(self, network, optimizer, sens=1.0, micro_batches=None, norm_clip=1.0, mech=None):
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

        # dp params
        self._micro_batches = micro_batches
        norm_clip = check_param_type('norm_clip', norm_clip, float)
        self._l2_norm = check_value_positive('norm_clip', norm_clip)
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._mech = mech
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
Ejemplo n.º 7
0
    def __init__(self,
                 network,
                 loss_fn=None,
                 optimizer=None,
                 metrics=None,
                 eval_network=None,
                 eval_indexes=None,
                 amp_level="O0",
                 frequency=278,
                 stop_epoch=100,
                 **kwargs):
        self._network = network
        self._loss_fn = loss_fn
        self._optimizer = optimizer
        self._loss_scale_manager = None
        self._loss_scale_manager_set = False
        self._keep_bn_fp32 = True
        self._check_kwargs(kwargs)
        self._amp_level = amp_level
        self._process_amp_args(kwargs)
        self._parallel_mode = _get_parallel_mode()
        self._device_number = _get_device_num()
        self._global_rank = _get_global_rank()
        self._parameter_broadcast = _get_parameter_broadcast()
        self._frequency = frequency
        self._stop_epoch = stop_epoch

        self._train_network = self._build_train_network()
        self._build_eval_network(metrics, eval_network, eval_indexes)
        self._build_predict_network()
Ejemplo n.º 8
0
 def _process_vocab_cache(self, slice_mode):
     """PS embeddingLookup cache check and process."""
     self.cache_enable = False
     if self.vocab_cache_size > 0:
         if self.target == 'CPU':
             logger.warning("The configuration of 'vocab_cache_size' is valid only in 'DEVICE' target, "
                            "current target is CPU, so it will be ignored.")
             return
         enable_ps = _get_ps_context("enable_ps")
         if not enable_ps:
             logger.warning("The configuration of 'vocab_cache_size' is valid only in parameter server trainning "
                            "mode, current mode is not parameter server trainning mode, so it will be ignored.")
             return
         parallel_mode = _get_parallel_mode()
         is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
         if is_auto_parallel:
             rank_size = get_group_size()
             rank_id = get_rank()
             full_batch = _get_full_batch()
             if rank_size > 1 and not (full_batch and slice_mode == "table_row_slice"):
                 raise ValueError("The embeddingLookup cache of parameter server parallel only be used "
                                  "in 'full_batch' and 'table_row_slice' parallel strategy.")
             self.vocab_cache_size = self.vocab_cache_size * rank_size
             _set_rank_id(rank_id)
         self.cache_enable = True
         if _is_role_worker():
             self.vocab_size = self.vocab_cache_size
             if context.get_context("enable_sparse") != self.sparse:
                 raise ValueError("The value of parameter 'sparse' must be same for all EmbeddingLookup "
                                  "kernels and equal the value of 'enable_sparse' in context setting in "
                                  "parameter server cache mode")
Ejemplo n.º 9
0
    def __init__(self, config):
        super(WideDeepModel, self).__init__()
        self.batch_size = config.batch_size
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            self.batch_size = self.batch_size * get_group_size()
        self.field_size = config.field_size
        self.vocab_size = config.vocab_size
        self.emb_dim = config.emb_dim
        self.deep_layer_dims_list = config.deep_layer_dim
        self.deep_layer_act = config.deep_layer_act
        self.init_args = config.init_args
        self.weight_init, self.bias_init = config.weight_bias_init
        self.weight_bias_init = config.weight_bias_init
        self.emb_init = config.emb_init
        self.drop_out = config.dropout_flag
        self.keep_prob = config.keep_prob
        self.deep_input_dims = self.field_size * self.emb_dim
        self.layer_dims = self.deep_layer_dims_list + [1]
        self.all_dim_list = [self.deep_input_dims] + self.layer_dims

        init_acts = [('Wide_w', [self.vocab_size, 1], self.emb_init),
                     ('V_l2', [self.vocab_size, self.emb_dim], self.emb_init),
                     ('Wide_b', [1], self.emb_init)]
        var_map = init_var_dict(self.init_args, init_acts)
        self.wide_w = var_map["Wide_w"]
        self.wide_b = var_map["Wide_b"]
        self.embedding_table = var_map["V_l2"]
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act, convert_dtype=True)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act, convert_dtype=True)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act, convert_dtype=True)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act, convert_dtype=True)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act, convert_dtype=True)

        self.gather_v2 = P.GatherV2().set_strategy(((1, 8), (1, 1)))
        self.gather_v2_1 = P.GatherV2()
        self.mul = P.Mul()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.reshape = P.Reshape()
        self.square = P.Square()
        self.shape = P.Shape()
        self.tile = P.Tile()
        self.concat = P.Concat(axis=1)
        self.cast = P.Cast()
Ejemplo n.º 10
0
 def __init__(self,
              vocab_size,
              embedding_size,
              param_init='normal',
              target='CPU',
              slice_mode='batch_slice',
              manual_shapes=None):
     super(EmbeddingLookup, self).__init__()
     self.target = target
     if target not in ('CPU', 'DEVICE'):
         raise ValueError(
             'Attr \'target\' of \'EmbeddingLookup\' Op passed ' +
             str(target) +
             ', should be one of values in \'CPU\', \'DEVICE\'.')
     self.gatherv2 = P.GatherV2()
     self.embeddinglookup = P.EmbeddingLookup().add_prim_attr(
         'primitive_target', 'CPU')
     self.embedding_table = Parameter(initializer(
         param_init, [vocab_size, embedding_size]),
                                      name='embedding_table')
     parallel_mode = _get_parallel_mode()
     is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                          ParallelMode.AUTO_PARALLEL)
     if slice_mode == EmbeddingLookUpSplitMode.FIELD_SLICE and is_auto_parallel:
         if not manual_shapes:
             raise ValueError(
                 "in slice field mode, the manual_shapes should not be none"
             )
         if not isinstance(manual_shapes, tuple):
             raise TypeError(
                 "manual_shapes type must be tuple(int) cannot be {}!".
                 format(type(manual_shapes)))
         for dim in manual_shapes:
             Validator.check_integer('manul shape dim', dim, 0, Rel.GT,
                                     self.cls_name)
         self.gatherv2.add_prim_attr("manual_split", manual_shapes)
         self.embeddinglookup.add_prim_attr("manual_split", manual_shapes)
         self.gatherv2.set_strategy(
             ((get_group_size(), 1), (1, get_group_size())))
         self.embeddinglookup.set_strategy(
             ((get_group_size(), 1), (1, get_group_size())))
     elif slice_mode == EmbeddingLookUpSplitMode.TABLE_ROW_SLICE and is_auto_parallel:
         self.gatherv2.set_strategy(((get_group_size(), 1), (1, 1)))
         self.embeddinglookup.set_strategy(((get_group_size(), 1), (1, 1)))
     elif slice_mode == EmbeddingLookUpSplitMode.TABLE_COLUMN_SLICE and is_auto_parallel:
         self.gatherv2.set_strategy(((1, get_group_size()), (1, 1)))
         self.embeddinglookup.set_strategy(((1, get_group_size()), (1, 1)))
     elif slice_mode == EmbeddingLookUpSplitMode.BATCH_SLICE and is_auto_parallel:
         self.gatherv2.set_strategy(((1, 1), (get_group_size(), 1)))
         self.embeddinglookup.set_strategy(((1, 1), (get_group_size(), 1)))
     else:
         if is_auto_parallel:
             raise ValueError(
                 "slice_mode should support mode in nn.EmbeddingLookUpSplitMode, but get "
                 + str(slice_mode))
Ejemplo n.º 11
0
    def __init__(self, network, config, sens=1000.0):
        super(TrainStepWrap, self).__init__()
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)

        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)
        self.optimizer_w = FTRL(learning_rate=config.ftrl_lr,
                                params=self.weights_w,
                                l1=5e-4,
                                l2=5e-4,
                                initial_accum=0.1,
                                loss_scale=sens)

        #self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr,loss_scale=sens)
        self.optimizer_d = Adam(self.weights_d,
                                learning_rate=config.adam_lr,
                                eps=1e-6,
                                loss_scale=sens)

        self.hyper_map = C.HyperMap()

        self.grad_w = C.GradOperation('grad_w',
                                      get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation('grad_d',
                                      get_by_list=True,
                                      sens_param=True)

        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer_w = DistributedGradReducer(
                self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(
                self.optimizer_d.parameters, mean, degree)
Ejemplo n.º 12
0
    def _amp_build_train_network(self,
                                 network,
                                 optimizer,
                                 loss_fn=None,
                                 level='O0',
                                 **kwargs):
        """
        Build the mixed precision training cell automatically.

        Args:
            network (Cell): Definition of the network.
            loss_fn (Union[None, Cell]): Definition of the loss_fn. If None,
                the `network` should have the loss inside. Default: None.
            optimizer (Optimizer): Optimizer to update the Parameter.
            level (str): Supports [O0, O2]. Default: "O0".
                - O0: Do not change.
                - O2: Cast network to float16, keep batchnorm and `loss_fn`
                  (if set) run in float32, using dynamic loss scale.
            cast_model_type (:class:`mindspore.dtype`): Supports `mstype.float16`
                or `mstype.float32`. If set to `mstype.float16`, use `float16`
                mode to train. If set, overwrite the level setting.
            keep_batchnorm_fp32 (bool): Keep Batchnorm run in `float32`. If set,
                overwrite the level setting.
            loss_scale_manager (Union[None, LossScaleManager]): If None, not
                scale the loss, or else scale the loss by LossScaleManager.
                If set, overwrite the level setting.
        """
        validator.check_value_type('network', network, nn.Cell, None)
        validator.check_value_type('optimizer', optimizer, nn.Optimizer, None)
        validator.check('level', level, "", ['O0', 'O2'], Rel.IN, None)
        self._check_kwargs(kwargs)
        config = dict(_config_level[level], **kwargs)
        config = edict(config)

        if config.cast_model_type == mstype.float16:
            network.to_float(mstype.float16)

            if config.keep_batchnorm_fp32:
                _do_keep_batchnorm_fp32(network)

        if loss_fn:
            network = _add_loss_network(network, loss_fn,
                                        config.cast_model_type)

        if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL,
                                    ParallelMode.AUTO_PARALLEL):
            network = _VirtualDatasetCell(network)

        loss_scale = 1.0
        if config.loss_scale_manager is not None:
            print("----model config have loss scale manager !")
        network = TrainOneStepCell(network, optimizer,
                                   sens=loss_scale).set_train()
        return network
Ejemplo n.º 13
0
    def __init__(self, network, optimizer, scale_update_cell=None):

        super(TransformerTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.all_reduce = P.AllReduce()

        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
        self.add_flags(has_effect=True)
Ejemplo n.º 14
0
 def _use_parallel_optimizer(self):
     """Indicates whether to use automatic parallelism."""
     if context.get_auto_parallel_context("enable_parallel_optimizer"):
         if _get_parallel_mode(
         ) == ParallelMode.DATA_PARALLEL and context.get_context(
                 "device_target") == "Ascend":
             self.use_parallel = True
         elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                 and context.get_context("device_target") != "Ascend":
             raise RuntimeError(
                 "Parallel optimizer only supports Ascend in data parallel mode."
             )
         elif _get_parallel_mode() in (ParallelMode.STAND_ALONE,
                                       ParallelMode.HYBRID_PARALLEL):
             raise RuntimeError(
                 "Parallel optimizer is not supported in {}.".format(
                     _get_parallel_mode()))
         else:
             self.use_parallel = False
     else:
         self.use_parallel = False
     if self.use_parallel:
         if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
             raise RuntimeError(
                 "Parallel optimizer does not support optimizer {}".format(
                     self.cls_name))
         self.dev_num = _get_device_num()
         if self.dev_num > self.param_length:
             raise RuntimeError(
                 "Parallel optimizer can not be applied when the number of parameters {} is"
                 " less than the number of devices {}".format(
                     self.param_length, self.dev_num))
         self.param_rank = self._get_parameter_group_id()
         self.optim_filter = tuple(
             map(lambda x: x == _get_global_rank(), self.param_rank))
         self.param_names = []
         for param in self.parameters:
             self.param_names.append(param.name)
     else:
         self.optim_filter = (True, ) * self.param_length
Ejemplo n.º 15
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.freeze = isinstance(optimizer, acc.FreezeOpt)
     self.optimizer = optimizer
     if not self.freeze:
         self.weights = self.optimizer.parameters
     self.train_strategy = getattr(self.optimizer, 'train_strategy', None)
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = F.identity
     self.parallel_mode = _get_parallel_mode()
     self.reducer_flag = self.parallel_mode in (
         ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
     self.use_grad_accumulation = self.parallel_mode in (
         ParallelMode.DATA_PARALLEL, ParallelMode.STAND_ALONE)
     if self.use_grad_accumulation:
         self.max_accumulation_step = get_auto_parallel_context(
             "grad_accumulation_step")
         if self.max_accumulation_step <= 1:
             self.max_accumulation_step = 1
             self.use_grad_accumulation = False
     self.grad_accumulation = None
     if self.use_grad_accumulation:
         self.grad_accumulation = GradientAccumulation(
             self.max_accumulation_step, self.optimizer)
     if self.reducer_flag:
         self.mean = _get_gradients_mean()
         self.degree = _get_device_num()
         if self.freeze:
             self.grad_reducers = (DistributedGradReducer(
                 opt.parameters, self.mean, self.degree)
                                   for opt in self.optimizer.opts)
             self.freeze_nets = tuple(
                 _TrainFreezeCell(self.network, self.sens, self.grad,
                                  reducer, self.use_grad_accumulation,
                                  self.max_accumulation_step, opt) for
                 reducer, opt in zip(self.grad_reducers, self.optimizer))
         else:
             self.grad_reducer = DistributedGradReducer(
                 self.optimizer.parameters, self.mean, self.degree)
     else:
         if self.freeze:
             self.freeze_nets = tuple(
                 _TrainFreezeCell(self.network, self.sens, self.grad, self.
                                  grad_reducer, self.use_grad_accumulation,
                                  self.max_accumulation_step, opt)
                 for opt in self.optimizer.opts)
     self.step = Parameter(Tensor(0, dtype=mstype.int32))
Ejemplo n.º 16
0
    def __init__(self, network, optimizer, scale_update_cell=None, micro_batches=None, norm_clip=1.0, mech=None):
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params
        self._micro_batches = micro_batches
        norm_clip = check_param_type('norm_clip', norm_clip, float)
        self._l2_norm = check_value_positive('norm_clip', norm_clip)
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._mech = mech
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
Ejemplo n.º 17
0
    def __init__(self, dataset, iter_first_order):
        super(_DatasetIterMSLoopSink, self).__init__(dataset)
        loop_size = dataset.__loop_size__ + iter_first_order
        self.loop_count = int(dataset.get_dataset_size() / loop_size * 2)
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, use a complete tensor to
        # compile, and slice tensor to run. The batch dimension of tensors for compile is device_number
        # times the batch dimension of tensors for run. Now only support LoopSink.
        if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(self.dataset_shapes, device_num)

        def op():
            return tuple()

        self.op = op
Ejemplo n.º 18
0
 def __init__(self, network, config):
     super(NetWithLossClass, self).__init__(auto_prefix=False)
     host_device_mix = bool(config.host_device_mix)
     parallel_mode = _get_parallel_mode()
     is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                          ParallelMode.AUTO_PARALLEL)
     self.no_l2loss = host_device_mix and is_auto_parallel
     self.network = network
     self.l2_coef = config.l2_coef
     self.loss = P.SigmoidCrossEntropyWithLogits()
     self.square = P.Square()
     self.reduceMean_false = P.ReduceMean(keep_dims=False)
     if is_auto_parallel:
         self.reduceMean_false.add_prim_attr("cross_batch", True)
     self.reduceSum_false = P.ReduceSum(keep_dims=False)
Ejemplo n.º 19
0
 def __init__(self, network, optimizer, sens=1.0):
     super(MyTrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     parallel_mode = _get_parallel_mode()
     if parallel_mode in (ParallelMode.DATA_PARALLEL,
                          ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    True, 8)
Ejemplo n.º 20
0
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     parallel_mode = _get_parallel_mode()
     if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
         self.reducer_flag = True
     if self.reducer_flag:
         mean = _get_mirror_mean()
         degree = _get_device_num()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
Ejemplo n.º 21
0
    def __init__(self, dataset):
        super(_DatasetIterGE, self).__init__(dataset)
        self.loop_count = self.get_loop_count(dataset)
        parallel_mode = _get_parallel_mode()
        self.need_to_full = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                              ParallelMode.AUTO_PARALLEL)
        batch_expand_num = 1
        if self.need_to_full:
            batch_expand_num = _get_device_num()
        tensor_list_run = _construct_tensor_list(self.dataset_types,
                                                 self.dataset_shapes,
                                                 batch_expand_num)

        def op():
            return tensor_list_run

        self.op = op
Ejemplo n.º 22
0
    def __init__(self, network, optimizer, scale_sense=None):
        super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.scale_sense = None
        self.loss_scaling_manager = None
        if isinstance(scale_sense, Cell):
            self.loss_scaling_manager = scale_sense
            self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(),
                                                dtype=mstype.float32),
                                         name="scale_sense")
        if isinstance(scale_sense, Tensor):
            self.scale_sense = Parameter(scale_sense, name='scale_sense')
Ejemplo n.º 23
0
    def __init__(self, network, lr, eps, loss_scale=1000.0):
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = Adam(self.weights, learning_rate=lr, eps=eps, loss_scale=loss_scale)
        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = loss_scale

        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, mean, degree)
Ejemplo n.º 24
0
    def __init__(self, dataset):
        self.loop_size = 1
        if not hasattr(dataset, '__ME_INITED__'):
            if not hasattr(dataset, '__loop_size__'):
                self.loop_size = dataset.get_dataset_size()
            else:
                self.loop_size = dataset.__loop_size__
            dataset.__ME_INITED__ = _exec_datagraph(dataset,
                                                    self.loop_size).queue_name

        self.ind = 0
        self.dataset = dataset
        dataset_types, dataset_shapes = _get_types_and_shapes(dataset)
        self.dataset_types, self.dataset_shapes = dataset_types, dataset_shapes
        # for self._parallel_mode equal to semi_auto_parallel or auto_parallel, use a complete tensor to
        # compile, and slice tensor to run. The batch dimension of tensors for compile is device_number
        # times the batch dimension of tensors for run
        if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL,
                                    ParallelMode.AUTO_PARALLEL):
            device_num = _get_device_num()
            self.dataset_shapes = _to_full_shapes(dataset_shapes, device_num)
Ejemplo n.º 25
0
    def __init__(self,
                 loss_netD,
                 loss_netG,
                 optimizerD,
                 optimizerG,
                 sens=1,
                 auto_prefix=True):
        super(TrainOneStepCell, self).__init__(auto_prefix=auto_prefix)
        self.loss_netD = loss_netD  # loss network
        self.loss_netD.set_grad()
        self.loss_netD.add_flags(defer_inline=True)

        self.loss_netG = loss_netG
        self.loss_netG.set_grad()
        self.loss_netG.add_flags(defer_inline=True)

        self.weights_G = optimizerG.parameters
        self.optimizerG = optimizerG
        self.weights_D = optimizerD.parameters
        self.optimizerD = optimizerD

        self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

        # Parallel processing
        self.reducer_flag = False
        self.grad_reducer_G = F.identity
        self.grad_reducer_D = F.identity
        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                  ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer_G = DistributedGradReducer(
                self.weights_G, mean, degree)
            self.grad_reducer_D = DistributedGradReducer(
                self.weights_D, mean, degree)
Ejemplo n.º 26
0
    def __init__(self,
                 learning_rate,
                 parameters,
                 weight_decay=0.0,
                 loss_scale=1.0):
        super(Optimizer, self).__init__(auto_prefix=False)
        if parameters is not None and not isinstance(parameters, list):
            parameters = list(parameters)

        if not parameters:
            raise ValueError("Optimizer got an empty parameter list.")

        if not isinstance(parameters[0], (dict, Parameter)):
            raise TypeError(
                "Only a list of Parameter or dict can be supported.")

        if isinstance(loss_scale, int):
            loss_scale = float(loss_scale)
        validator.check_value_type("loss_scale", loss_scale, [float],
                                   self.cls_name)
        validator.check_positive_float(loss_scale, "loss_scale", self.cls_name)
        self.loss_scale = loss_scale

        weight_decay = self._preprocess_weight_decay(weight_decay)

        self._unique = True
        self._target = context.get_context("device_target")
        self.dynamic_lr = False
        self.assignadd = None
        self.global_step = None
        self.is_group = False
        self.is_group_lr = False
        self.is_group_params_ordered = False
        learning_rate = self._preprocess_single_lr(learning_rate)
        if isinstance(parameters[0], dict):
            self.is_group = True
            self.group_params = []
            self.group_lr = []
            self.group_weight_decay = []
            self._init_group_params(parameters, learning_rate, weight_decay)

        # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
        if self.dynamic_lr:
            self.assignadd = P.AssignAdd()
            self.global_step = Parameter(initializer(0, [1], mindspore.int32),
                                         name='global_step')

        if self.is_group_lr:
            if self.dynamic_lr:
                self.learning_rate = CellList(self.group_lr)
            else:
                self.learning_rate = ParameterTuple(self.group_lr)
        else:
            self.learning_rate = self._build_single_lr(learning_rate,
                                                       'learning_rate')
        if self.is_group:
            self.parameters = ParameterTuple(self.group_params)
            self.weight_decay = tuple(self.group_weight_decay)
            self.weight_decay_tensor_tuple = tuple(
                Tensor(x, mstype.float32) for x in self.group_weight_decay)
            decay_filter = lambda x: x > 0
            self.decay_flags = tuple(
                decay_filter(x) for x in self.weight_decay)
            self.exec_weight_decay = any(self.decay_flags)
        else:
            self.parameters = ParameterTuple(parameters)
            self.weight_decay = weight_decay * loss_scale
            self.weight_decay_tensor = Tensor(self.weight_decay,
                                              mstype.float32)
            decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
            self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
            self.exec_weight_decay = self.weight_decay > 0
        # when a parameter has been unique, there is no need do another unique in optimizer.
        for param in self.parameters:
            if param.unique:
                self._unique = False
                break
        ps_filter = lambda x: x.is_param_ps
        self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
        ps_cache_filter = lambda x: x.cache_enable
        self.cache_enable = tuple(ps_cache_filter(x) for x in self.parameters)
        self.reciprocal_scale = Tensor(1.0 / loss_scale, mstype.float32)
        self.need_scale = loss_scale != 1.0
        self.global_step_increase_tensor = Tensor(1, mstype.int32)
        self.param_length = len(self.parameters)
        self.map_ = C.Map()
        if context.get_auto_parallel_context("enable_parallel_optimizer"):
            if _get_parallel_mode(
            ) == ParallelMode.DATA_PARALLEL and context.get_context(
                    "device_target") == "Ascend":
                self.use_parallel = True
            elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                    and context.get_context("device_target") != "Ascend":
                raise RuntimeError(
                    "Parallel optimizer only supports Ascend in data parallel mode."
                )
            elif _get_parallel_mode() in (ParallelMode.STAND_ALONE,
                                          ParallelMode.HYBRID_PARALLEL):
                raise RuntimeError(
                    "Parallel optimizer is not supported in {}.".format(
                        _get_parallel_mode()))
            else:
                self.use_parallel = False
        else:
            self.use_parallel = False
        if self.use_parallel:
            if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
                raise RuntimeError(
                    "Parallel optimizer does not support optimizer {}".format(
                        self.cls_name))
            self.dev_num = _get_device_num()
            if self.dev_num > self.param_length:
                raise RuntimeError(
                    "Parallel optimizer can not be applied when the number of parameters {} is"
                    " less than the number of devices {}".format(
                        self.param_length, self.dev_num))
            self.param_rank = self._get_parameter_group_id()
            self.optim_filter = tuple(
                map(lambda x: x == _get_global_rank(), self.param_rank))
            self.param_names = []
            for param in self.parameters:
                self.param_names.append(param.name)

        else:
            self.optim_filter = (True, ) * self.param_length
Ejemplo n.º 27
0
    def compile(self,
                obj,
                *args,
                phase='predict',
                params=None,
                do_convert=True):
        """
        Compiles graph.

        Args:
            obj (Function/Cell): The function or cell instance need compile.
            args (tuple): Function or cell input arguments.
            phase (str): The name of compile phase. Default: 'predict'.
            params (OrderedDict): The parameters dictionary used for init data graph. Default: None.
            do_convert (bool): When set to True, convert ME graph to GE graph after compiling graph.

        Return:
            Str, the full phase of the cell.
            Bool, if the graph has been compiled before, return False, else return True.
        """
        obj.check_names()
        args_names, args_list = _generate_pip_args(obj, *args)
        dic = dict(zip(args_names, args_list))
        key = generate_key(phase, dic)
        self.phase_prefix = str(key[1])
        if phase == 'export':
            phase = phase + '.' + str(obj.create_time)
        else:
            phase = self.phase_prefix + phase + '.' + str(obj.create_time)
        enable_debug_runtime = context.get_context("enable_debug_runtime")
        enable_ge = context.get_context("enable_ge")

        use_vm = not enable_ge or (enable_debug_runtime
                                   and context.get_context("mode")
                                   == context.PYNATIVE_MODE)

        if phase in self.compile_cache.keys():
            logger.debug("%r graph has existed.", phase)
            return phase, False

        result = self._executor.compile(obj, args_list, phase, use_vm)
        self.compile_cache[phase] = phase
        if not result:
            raise RuntimeError("Executor compile failed.")
        graph = self._executor.get_func_graph(phase)

        if graph is None:
            logger.error("%r graph compile failed.", phase)
        if not do_convert:
            return phase, True
        if not enable_debug_runtime or enable_ge:
            if _get_parallel_mode() in ["auto_parallel", "semi_auto_parallel"]:
                obj.parameter_layout_dict = self._executor.get_parameter_layout(
                    phase)
                obj.load_parameter_slice(params)

        # the following GE init process is not needed when use vm or ms backend
        if enable_ge:
            # decide whether to sink based on whether the inputs is virtual or not
            if args_list and isinstance(args_list[0],
                                        Tensor) and args_list[0].virtual_flag:
                _set_dataset_mode_config('sink')
            else:
                _set_dataset_mode_config('normal')

            self._build_data_graph(obj, params, phase)

            if "export" not in phase:
                init_phase = "init_subgraph" + "." + str(obj.create_time)
                _exec_init_graph(obj, init_phase)
        elif not enable_ge and "export" in phase:
            self._build_data_graph(obj, params, phase)

        return phase, True
Ejemplo n.º 28
0
    def __init__(self,
                 network,
                 optimizer,
                 norm_bound=1.0,
                 sens=1.0,
                 micro_batches=None,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        # dp params
        if micro_batches is None:
            msg = 'micro_batches must give in differential privacy, but got value: {}'.format(
                micro_batches)
            LOGGER.error(TAG, msg)
            raise ValueError(msg)
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._tuple_add = _TupleAdd()
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._hyper_map = C.HyperMap()
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._micro_float = Tensor(micro_batches, mstype.float32)

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech.
                _initial_noise_multiplier)
Ejemplo n.º 29
0
    def __init__(self,
                 network,
                 optimizer,
                 scale_update_cell=None,
                 micro_batches=None,
                 norm_bound=1.0,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech.
                _initial_noise_multiplier)
Ejemplo n.º 30
0
    def _amp_build_train_network(self,
                                 network,
                                 optimizer,
                                 loss_fn=None,
                                 level='O0',
                                 **kwargs):
        """
        Build the mixed precision training cell automatically.

        Args:
            network (Cell): Definition of the network.
            loss_fn (Union[None, Cell]): Definition of the loss_fn. If None,
                the `network` should have the loss inside. Default: None.
            optimizer (Optimizer): Optimizer to update the Parameter.
            level (str): Supports [O0, O2]. Default: "O0".

                - O0: Do not change.
                - O2: Cast network to float16, keep batchnorm and `loss_fn`
                  (if set) run in float32, using dynamic loss scale.

            cast_model_type (:class:`mindspore.dtype`): Supports `mstype.float16`
                or `mstype.float32`. If set to `mstype.float16`, use `float16`
                mode to train. If set, overwrite the level setting.
            keep_batchnorm_fp32 (bool): Keep Batchnorm run in `float32`. If set,
                overwrite the level setting.
            loss_scale_manager (Union[None, LossScaleManager]): If None, not
                scale the loss, or else scale the loss by LossScaleManager.
                If set, overwrite the level setting.
        """
        validator.check_value_type('network', network, nn.Cell, None)
        validator.check_value_type('optimizer', optimizer, nn.Optimizer, None)
        validator.check('level', level, "", ['O0', 'O2'], Rel.IN, None)
        self._check_kwargs(kwargs)
        config = dict(_config_level[level], **kwargs)
        config = edict(config)

        if config.cast_model_type == mstype.float16:
            network.to_float(mstype.float16)

            if config.keep_batchnorm_fp32:
                _do_keep_batchnorm_fp32(network)

        if loss_fn:
            network = _add_loss_network(network, loss_fn,
                                        config.cast_model_type)

        if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL,
                                    ParallelMode.AUTO_PARALLEL):
            network = _VirtualDatasetCell(network)

        loss_scale = 1.0
        if config.loss_scale_manager is not None:
            loss_scale_manager = config.loss_scale_manager
            loss_scale = loss_scale_manager.get_loss_scale()
            update_cell = loss_scale_manager.get_update_cell()
            if update_cell is not None:
                # only cpu not support `TrainOneStepWithLossScaleCell` for control flow.
                if not context.get_context(
                        "enable_ge") and context.get_context(
                            "device_target") == "CPU":
                    msg = "Only `loss_scale_manager=None` and " \
                          "`loss_scale_manager=FixedLossScaleManager(drop_overflow" \
                          "_update=False)` are supported in current version. " \
                          "If you use `O2` option, please use " \
                          "`loss_scale_manager=None` or `FixedLossScaleManager`"
                    LOGGER.error(TAG, msg)
                    raise ValueError(msg)
                network = _TrainOneStepWithLossScaleCell(
                    network,
                    optimizer,
                    scale_update_cell=update_cell,
                    micro_batches=self._micro_batches,
                    norm_bound=self._norm_bound,
                    clip_mech=self._clip_mech,
                    noise_mech=self._noise_mech).set_train()
                return network

        network = _TrainOneStepCell(network,
                                    optimizer,
                                    self._norm_bound,
                                    loss_scale,
                                    micro_batches=self._micro_batches,
                                    clip_mech=self._clip_mech,
                                    noise_mech=self._noise_mech).set_train()
        return network