def __init__(self, params, config):
    super(GlobalNormPipline, self).__init__()
    self.norm = nn.Norm()
    self.hyper_map = C.HyperMap()
    self.allreduce_filter = tuple("projection.bias" not in x.name and "layernorm" not in x.name
                                  and "position_embedding.embedding_table" not in x.name for x in params)
    self.allreduce_group_size = ()
    for item in self.allreduce_filter:
        if item:
            self.allreduce_group_size = self.allreduce_group_size + (1.0,)
        else:
            self.allreduce_group_size = self.allreduce_group_size + (config.mp * 1.0,)
    self.length = len(params)
    group_list, group_name = _get_model_parallel_group(config.mp)
    print("rank_list", group_name)
    print("group_size_list", self.allreduce_group_size)
    create_group(group_name, group_list)
    self.allreduce = P.AllReduce(group=group_name)
    pipeline_group_list, pipeline_group_name = _get_pipeline_group()
    print("pipeline_group_name", pipeline_group_name)
    create_group(pipeline_group_name, pipeline_group_list)
    self.allreduce2 = P.AllReduce(group=pipeline_group_name)

def __init__(self, group, transpose_a=False, transpose_b=False):
    super(MatmulReduce, self).__init__()
    self.matmul1 = P.MatMul(transpose_a, transpose_b)
    self.allreduce1 = P.AllReduce(group=group)
    self.matmul2 = P.MatMul(transpose_a, transpose_b)
    self.pow = P.Pow()
    self.reduce_sum = P.ReduceSum()
    self.allreduce2 = P.AllReduce(group=group)

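# A minimal sketch (assumption, not from the source): a construct() wiring the ops
# declared above into the usual model-parallel pattern — sum partial matmul results
# across the group, then reduce a scalar statistic across the group again.
def construct(self, x, y, z):
    out = self.matmul1(x, y)
    out = self.allreduce1(out)         # combine partial products from each device
    out = self.matmul2(out, z)
    out = self.pow(out, 2.0)
    out = self.reduce_sum(out, ())     # collapse to a scalar
    out = self.allreduce2(out)         # aggregate the scalar across the group
    return out
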
def __init__(self):
    super(Net2, self).__init__()
    # `x` is expected to be defined at module scope (an array-like with .shape).
    self.x1 = Parameter(initializer(Tensor(x), x.shape), name='x1')
    self.op0 = "sum"
    self.op1 = "sum"
    self.op2 = "sum"
    self.all_reduce1 = P.AllReduce(self.op0, group=NCCL_WORLD_COMM_GROUP)
    self.all_reduce2 = P.AllReduce(self.op1, group=NCCL_WORLD_COMM_GROUP)
    self.all_reduce3 = P.AllReduce(self.op2, group=NCCL_WORLD_COMM_GROUP)

def __init__(self, group, transpose_a=False, transpose_b=False):
    super(MatmulAllgather, self).__init__()
    self.allgather = P.AllGather(group=group)
    self.matmul = P.MatMul(transpose_a, transpose_b)
    self.pow = P.Pow()
    self.reduce_sum = P.ReduceSum()
    self.allreduce = P.AllReduce(group=group)

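# A minimal sketch (assumption): the matching construct() — gather the sharded
# input from every device, multiply, then reduce a scalar across the group.
def construct(self, x, y):
    x = self.allgather(x)              # concatenate shards along axis 0
    out = self.matmul(x, y)
    out = self.pow(out, 2.0)
    out = self.reduce_sum(out, ())
    out = self.allreduce(out)
    return out
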
def __init__(self, network, optimizer, scale_update_cell=None):
    super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    self.alloc_status = NPUAllocFloatStatus()
    self.get_status = NPUGetFloatStatus()
    self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.reducer_flag = False
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = None
    if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")

def __init__(self, network, optimizer, sens=1.0):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
    self.hyper_map = C.HyperMap()
    self.alloc_status = NPUAllocFloatStatus()
    self.get_status = NPUGetFloatStatus()
    self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_status = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertSquadCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")

def __init__(self, network, optimizer, scale_update_cell=None,
             accumulation_steps=1, enable_global_norm=False):
    super(BertTrainAccumulateStepsWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.accumulation_steps = accumulation_steps
    self.enable_global_norm = enable_global_norm
    self.one = Tensor(np.array([1]).astype(np.int32))
    self.zero = Tensor(np.array([0]).astype(np.int32))
    self.local_step = Parameter(initializer(0, [1], mstype.int32), name="local_step")
    self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
    self.accu_overflow = Parameter(initializer(0, [1], mstype.int32), name="accu_overflow")
    self.loss = Parameter(initializer(0, [1], mstype.float32), name="accu_loss")
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.overflow_reducer = F.identity
    if self.is_distributed:
        self.overflow_reducer = P.AllReduce()
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.logical_or = P.LogicalOr()
    self.not_equal = P.NotEqual()
    self.select = P.Select()
    self.reshape = P.Reshape()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")

def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.clip_type = gradient_cfg.clip_type
    self.clip_value = gradient_cfg.clip_value
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
    self.saved_params = self.weights.clone(prefix='saved')
    self.length = len(self.weights)
    self.quant_embedding_list = []
    self.quant_weight_list = []
    for i, key in enumerate(self.saved_params):
        if 'embedding_lookup' in key.name:
            self.quant_embedding_list.append(i)
        elif 'weight' in key.name and 'dense_1' not in key.name:
            self.quant_weight_list.append(i)
    self.quant_embedding_list_length = len(self.quant_embedding_list)
    self.quant_weight_list_length = len(self.quant_weight_list)
    self.quantize_embedding = QuantizeWeightCell(num_bits=network.embedding_bits,
                                                 compute_type=network.compute_type,
                                                 clip_value=network.weight_clip_value)
    self.quantize_weight = QuantizeWeightCell(num_bits=network.weight_bits,
                                              compute_type=network.compute_type,
                                              clip_value=network.weight_clip_value)

def __init__(self, loop_count=1):
    super().__init__()
    self.op_list = ()
    self.fushion_flag = [0, 1, 1, 0, 1, 0]
    for i in self.fushion_flag:
        op = P.AllReduce().add_prim_attr('fusion', i)
        self.op_list = self.op_list + (op,)
    self.hyper_map = C.HyperMap()

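# A minimal sketch (assumption): applying the ops built above. AllReduce operators
# that share the same non-zero 'fusion' attribute value are candidates for being
# fused into a single communication kernel by the graph optimizer.
def construct(self, x):
    out = ()
    for op in self.op_list:
        out = out + (op(x),)   # each AllReduce carries its own fusion index
    return out
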
def __init__(self, network):
    super(ClassifyCorrectCell, self).__init__(auto_prefix=False)
    self._network = network
    self.argmax = P.Argmax()
    self.equal = P.Equal()
    self.cast = P.Cast()
    self.reduce_sum = P.ReduceSum()
    self.allreduce = P.AllReduce(P.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)

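# A minimal sketch (assumption): the distributed-accuracy construct() this cell
# implies — count correct predictions locally, then AllReduce-SUM the count so
# every device sees the global total.
def construct(self, data, label):
    outputs = self._network(data)
    y_pred = self.argmax(outputs)
    y_correct = self.equal(y_pred, label)
    y_correct = self.cast(y_correct, mstype.float32)
    y_correct = self.reduce_sum(y_correct)
    total_correct = self.allreduce(y_correct)
    return total_correct
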
def __init__(self, network, optimizer, scale_update_cell=None):
    super(TransformerTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.all_reduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode does not support: ", self.parallel_mode)
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_status = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
    self.add_flags(has_effect=True)

def __init__(self, network, optimizer, scale_update_cell=None,
             micro_batches=None, norm_clip=1.0, mech=None):
    super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.depend_parameter_use = ControlDepend(depend_mode=1)
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
    self.add_flags(has_effect=True)
    # dp params
    self._micro_batches = micro_batches
    norm_clip = check_param_type('norm_clip', norm_clip, float)
    self._l2_norm = check_value_positive('norm_clip', norm_clip)
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._mech = mech
    self._tuple_add = _TupleAdd()
    self._hyper_map = C.HyperMap()
    self._micro_float = Tensor(micro_batches, mstype.float32)

def __init__(self, network, optimizer, scale_sense):
    super(DFCNNCTCTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.optimizer = optimizer
    if isinstance(scale_sense, nn.Cell):
        self.loss_scaling_manager = scale_sense
        self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(), dtype=mstype.float32),
                                     name="scale_sense")
    elif isinstance(scale_sense, Tensor):
        if scale_sense.shape == (1,) or scale_sense.shape == ():
            self.scale_sense = Parameter(scale_sense, name='scale_sense')
        else:
            raise ValueError("The shape of scale_sense must be (1,) or (), but got {}".format(scale_sense.shape))
    else:
        raise TypeError("The scale_sense must be Cell or Tensor, but got {}".format(type(scale_sense)))
    self.network.set_grad()
    self.weights = ParameterTuple(network.trainable_params())
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode does not support: ", self.parallel_mode)
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
    self.addn = P.AddN()
    self.reshape = P.Reshape()
    self.hyper_map = C.HyperMap()
    self.less_equal = P.LessEqual()
    self.allreduce = P.AllReduce()

def __init__(self, network, optimizer, scale_update_cell=None,
             enable_global_norm=True, config=None):
    super(PANGUALPHATrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.config = config
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.enable_global_norm = enable_global_norm
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
    self.clip = ClipByGlobalNorm(self.weights, self.config, pipeline=False)

def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertFinetuneCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.gpu_target = False
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_status = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))

def __init__(self,
             num_features,
             eps=1e-5,
             momentum=0.9,
             affine=True,
             gamma_init='ones',
             beta_init='zeros',
             moving_mean_init='zeros',
             moving_var_init='ones',
             use_batch_statistics=None,
             device_num_each_group=1,
             input_dims='2d',
             data_format='NCHW'):
    super(_BatchNorm, self).__init__()
    if num_features < 1:
        raise ValueError("num_features must be at least 1")
    if momentum < 0 or momentum > 1:
        raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))
    self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
    if context.get_context("device_target") != "GPU" and self.format == "NHWC":
        raise ValueError("NHWC format only support in GPU target.")
    self.use_batch_statistics = use_batch_statistics
    self.num_features = num_features
    self.eps = eps
    self.input_dims = input_dims
    self.moving_mean = Parameter(initializer(moving_mean_init, num_features),
                                 name="mean", requires_grad=False)
    self.moving_variance = Parameter(initializer(moving_var_init, num_features),
                                     name="variance", requires_grad=False)
    self.gamma = Parameter(initializer(gamma_init, num_features),
                           name="gamma", requires_grad=affine)
    self.beta = Parameter(initializer(beta_init, num_features),
                          name="beta", requires_grad=affine)
    self.group = validator.check_positive_int(device_num_each_group)
    self.is_global = False
    if self.group != 1:
        self.rank_id = get_rank()
        self.rank_size = get_group_size()
        self.device_list = [i for i in range(0, self.rank_size)]
        self.rank_list = self.list_group(self.device_list, self.group)
        self.rank_list_idx = len(self.rank_list)
        for i in range(self.rank_list_idx):
            if self.rank_id in self.rank_list[i] and self.group != 1:
                self.is_global = True
                management.create_group('group' + str(i), self.rank_list[i])
                self.all_reduce = P.AllReduce(P.ReduceOp.SUM, 'group' + str(i)).add_prim_attr('fusion', 1)
    self.shape = P.Shape()
    self.reduce_mean = P.ReduceMean(keep_dims=True)
    self.square = P.Square()
    self.sqrt = P.Sqrt()
    self.cast = P.Cast()
    self.dtype = P.DType()
    self.reshape = P.Reshape()
    self.is_ascend = context.get_context("device_target") == "Ascend"
    self.is_gpu = context.get_context("device_target") == "GPU"
    self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
    self.momentum = 1.0 - momentum
    if context.get_context("enable_ge"):
        self.is_ge_backend = True
    else:
        self.is_ge_backend = False
    if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
        self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps)
    elif self.is_gpu:
        self.bn_train = P.FusedBatchNormEx(mode=1,
                                           epsilon=self.eps,
                                           momentum=self.momentum,
                                           data_format=self.format)
    else:
        self.bn_train = P.FusedBatchNorm(mode=1, epsilon=self.eps, momentum=self.momentum)
    self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps, data_format=self.format)
    self.enable_global_sync = self.is_global and (self.is_ge_backend or
                                                  (self.is_graph_mode and self.is_ascend))
    self.enable_default_train = self.is_graph_mode and not self.is_global and \
                                (self.is_ge_backend or self.is_ascend)
    data_parallel_strategy = ((1,), (1,))
    data_parallel_strategy_one = ((1,), ())
    self.sub_mean = P.Sub().shard(data_parallel_strategy)
    self.sub_var = P.Sub().shard(data_parallel_strategy)
    self.mul_mean = P.Mul().shard(data_parallel_strategy_one)
    self.mul_var = P.Mul().shard(data_parallel_strategy_one)
    self.assign_sub_mean = P.AssignSub().shard(data_parallel_strategy)
    self.assign_sub_var = P.AssignSub().shard(data_parallel_strategy)

def __init__(self):
    super(ParameterReduce, self).__init__()
    self.cast = P.Cast()
    self.reduce = P.AllReduce()

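# A minimal sketch (assumption): cast the input to float32, then sum it across ranks.
def construct(self, x):
    return self.reduce(self.cast(x, mstype.float32))
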
def __init__(self):
    super(AllReduceNet, self).__init__()
    self.all_reduce = P.AllReduce()

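# A minimal sketch (assumption): the construct() and driver this cell needs.
# AllReduce defaults to ReduceOp.SUM over the world group; init() must be called
# once per process before any communication op is created or executed.
def construct(self, x):
    return self.all_reduce(x)

# Typical driver, launched with one process per device:
#   from mindspore.communication.management import init
#   init()
#   net = AllReduceNet()
#   out = net(Tensor(np.ones([2, 3]).astype(np.float32)))  # each element == device count
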
from mindspore.ops import Primitive
from mindspore.ops import operations as P
from mindspore.ops import _constants as Constants

depend = P.Depend()
all_reduce = P.AllReduce()
broadcast = P.Broadcast(1)
tensor_move = Primitive('TensorMove')
make_tuple = Primitive('MakeTuple')
tuple_getitem = Primitive(Constants.kTupleGetItem)
assign_add = P.AssignAdd()
apply_momentun = P.ApplyMomentum()
relu = P.ReLU()


class FnDict:
    def __init__(self):
        self.fnDict = {}

    def __call__(self, fn):
        self.fnDict[fn.__name__] = fn

def construct(self, grads):
    square_sum_dp = self.hyper_map(get_square_sum, grads, self.values)
    global_norms = F.sqrt(P.AllReduce()(F.addn(square_sum_dp)))
    return global_norms

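# A plausible companion (assumption, not the source's definition): a get_square_sum
# that returns each gradient's squared L2 norm, scaled by a per-parameter factor so
# the AllReduce above does not over-count parameters replicated across devices.
get_square_sum = C.MultitypeFuncGraph("get_square_sum")

@get_square_sum.register("Tensor", "Number")
def _get_square_sum(grad, value):
    # squared L2 norm of one gradient tensor, divided by its group-size factor
    norm = P.ReduceSum(False)(F.square(grad), ()) / value
    return F.expand_dims(F.cast(norm, mstype.float32), 0)
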
def __init__(self, network, optimizer, scale_update_cell=None, micro_batches=None,
             norm_bound=1.0, noise_mech=None, clip_mech=None):
    super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.depend_parameter_use = ControlDepend(depend_mode=1)
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
    self.add_flags(has_effect=True)
    # dp params
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._add = P.TensorAdd()
    self._norm = nn.Norm()
    self._tuple_add = _TupleAdd()
    self._hyper_map = C.HyperMap()
    self._micro_float = Tensor(micro_batches, mstype.float32)
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()
    self._noise_mech_param_updater = None
    if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=self._noise_mech._decay_policy,
            decay_rate=self._noise_mech._noise_decay_rate,
            cur_noise_multiplier=self._noise_mech._noise_multiplier,
            init_noise_multiplier=self._noise_mech._initial_noise_multiplier)

from mindspore.ops import operations as P
from mindspore.ops.operations import _grad_ops as G
from mindspore.ops import Primitive

# pylint: disable=unused-variable

tuple_getitem = Primitive('tuple_getitem')
add = P.TensorAdd()
allreduce = P.AllReduce()
allreduce.add_prim_attr('fusion', 1)
make_tuple = Primitive('make_tuple')
conv = P.Conv2D(out_channel=64, kernel_size=7, mode=1, pad_mode="valid",
                pad=0, stride=1, dilation=1, group=1)
bn = P.FusedBatchNorm()
relu = P.ReLU()
conv_bn1 = Primitive('ConvBN1')
bn2_add_relu = Primitive('BN2AddRelu')
bn2_relu = Primitive('BN2Relu')
fused_bn1 = Primitive('FusedBN1')
fused_bn2 = Primitive('FusedBN2')
fused_bn3 = Primitive('FusedBN3')
bn_grad = G.FusedBatchNormGrad()
bn_grad1 = Primitive('BNGrad1')
bn_grad2 = Primitive('BNGrad2')
bn_grad3 = Primitive('BNGrad3')

def __init__(self,
             num_features,
             eps=1e-5,
             momentum=0.9,
             affine=True,
             gamma_init='ones',
             beta_init='zeros',
             moving_mean_init='zeros',
             moving_var_init='ones',
             use_batch_statistics=True,
             device_num_each_group=1):
    super(_BatchNorm, self).__init__()
    if num_features < 1:
        raise ValueError("num_features must be at least 1")
    if momentum < 0 or momentum > 1:
        raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))
    self.use_batch_statistics = use_batch_statistics
    self.num_features = num_features
    self.eps = eps
    self.moving_mean = Parameter(initializer(moving_mean_init, num_features),
                                 name="mean", requires_grad=False)
    self.moving_variance = Parameter(initializer(moving_var_init, num_features),
                                     name="variance", requires_grad=False)
    self.gamma = Parameter(initializer(gamma_init, num_features),
                           name="gamma", requires_grad=affine)
    self.beta = Parameter(initializer(beta_init, num_features),
                          name="beta", requires_grad=affine)
    self.group = check_int_positive(device_num_each_group)
    self.is_global = False
    if self.group != 1:
        self.rank_id = get_rank()
        self.rank_size = get_group_size()
        self.device_list = [i for i in range(0, self.rank_size)]
        self.rank_list = self.list_group(self.device_list, self.group)
        self.rank_list_idx = len(self.rank_list)
        for i in range(self.rank_list_idx):
            if self.rank_id in self.rank_list[i] and self.group != 1:
                self.is_global = True
                management.create_group('group' + str(i), self.rank_list[i])
                self.all_reduce = P.AllReduce(P.ReduceOp.SUM, 'group' + str(i)).add_prim_attr('fusion', 1)
    self.shape = P.Shape()
    self.reduce_mean = P.ReduceMean(keep_dims=True)
    self.square = P.Square()
    self.sqrt = P.Sqrt()
    self.cast = P.Cast()
    self.dtype = P.DType()
    self.reshape = P.Reshape()
    self.is_ascend = context.get_context("device_target") == "Ascend"
    if context.get_context("enable_ge"):
        self.is_ge_backend = True
        self.momentum = Tensor(1.0 - momentum, mstype.float32)
    else:
        self.is_ge_backend = False
        self.momentum = 1.0 - momentum
    if self.is_ge_backend or self.is_ascend:
        self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps)
    else:
        self.bn_train = P.FusedBatchNorm(mode=1, epsilon=self.eps, momentum=self.momentum)
    self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps)
    data_parallel_strategy = ((1,), (1,))
    data_parallel_strategy_one = ((1,), ())
    self.sub_mean = P.Sub().set_strategy(data_parallel_strategy)
    self.sub_var = P.Sub().set_strategy(data_parallel_strategy)
    self.mul_mean = P.Mul().set_strategy(data_parallel_strategy_one)
    self.mul_var = P.Mul().set_strategy(data_parallel_strategy_one)
    self.assign_sub_mean = P.AssignSub().set_strategy(data_parallel_strategy)
    self.assign_sub_var = P.AssignSub().set_strategy(data_parallel_strategy)

def __init__(self):
    super(DynamicAllReduceNet, self).__init__()
    self.op = "sum"
    self.all_reduce = P.AllReduce(self.op, group=NCCL_WORLD_COMM_GROUP)
    self.d = inner.GpuConvertToDynamicShape()

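# A minimal sketch (assumption): pairing the dynamic-shape conversion with the
# reduction, so AllReduce runs on a tensor whose shape is only fixed at runtime.
def construct(self, x):
    x = self.d(x)                 # mark x as dynamically shaped
    return self.all_reduce(x)     # SUM across the NCCL world group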