def __init__(self):
    super(NetNorm, self).__init__()
    self.norm_1 = nn.Norm(axis=0)
    self.norm_2 = nn.Norm(axis=1)
    self.norm_3 = nn.Norm(axis=-1)
    self.norm_4 = nn.Norm(axis=-1, keep_dims=True)
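# The cell above only defines the norm ops; as a rough, hypothetical
# illustration of what nn.Norm returns for these axis/keep_dims settings
# (assuming an older MindSpore release that still ships mindspore.nn.Norm):
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

x = Tensor(np.array([[3.0, 4.0], [6.0, 8.0]], np.float32))
print(nn.Norm(axis=0)(x))                   # column-wise L2 norms, shape (2,)
print(nn.Norm(axis=-1)(x))                  # row-wise L2 norms: [5., 10.]
print(nn.Norm(axis=-1, keep_dims=True)(x))  # same values, but shape (2, 1)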
def __init__(self, network, optimizer, sens=1.0):
    super(TrainOneStepCellWithGradClip, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    # ops and constants used by the gradient-clip-by-norm logic
    self.hyper_map = C.HyperMap()
    self.greater = P.Greater()
    self.select = P.Select()
    self.norm = nn.Norm(keep_dims=True)
    self.dtype = P.DType()
    self.cast = P.Cast()
    self.concat = P.Concat(axis=0)
    self.ten = Tensor(np.array([10.0]).astype(np.float32))
    # enable gradient all-reduce under data/hybrid parallel training
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
def __init__(self, params, config):
    super(GlobalNormPipline, self).__init__()
    self.norm = nn.Norm()
    self.hyper_map = C.HyperMap()
    self.allreduce_filter = tuple("projection.bias" not in x.name and
                                  "layernorm" not in x.name and
                                  "position_embedding.embedding_table" not in x.name
                                  for x in params)
    # weight 1.0 for parameters that pass the filter, config.mp otherwise
    self.allreduce_group_size = ()
    for item in self.allreduce_filter:
        if item:
            self.allreduce_group_size = self.allreduce_group_size + (1.0,)
        else:
            self.allreduce_group_size = self.allreduce_group_size + (config.mp * 1.0,)
    self.length = len(params)
    group_list, group_name = _get_model_parallel_group(config.mp)
    print("rank_list", group_name)
    print("group_size_list", self.allreduce_group_size)
    create_group(group_name, group_list)
    self.allreduce = P.AllReduce(group=group_name)
    pipeline_group_list, pipeline_group_name = _get_pipeline_group()
    print("pipeline_group_name", pipeline_group_name)
    create_group(pipeline_group_name, pipeline_group_list)
    self.allreduce2 = P.AllReduce(group=pipeline_group_name)
def __init__(self,
             is_training,
             query_size,
             key_size,
             num_units,
             normalize=False,
             initializer_range=0.1,
             compute_type=mstype.float16):
    super(BahdanauAttention, self).__init__()
    self.is_training = is_training
    self.mask = None
    self.query_size = query_size
    self.key_size = key_size
    self.normalize = normalize
    self.num_units = num_units
    self.linear_att = Parameter(Tensor(np.random.uniform(-initializer_range,
                                                         initializer_range,
                                                         size=[num_units]),
                                       dtype=mstype.float32),
                                name='linear_att')
    if self.normalize:
        self.normalize_scalar = Parameter(Tensor(np.array([1.0 / num_units]),
                                                 dtype=mstype.float32),
                                          name='normalize_scalar')
        self.normalize_bias = Parameter(Tensor(np.zeros(num_units),
                                               dtype=mstype.float32),
                                        name='normalize_bias')
    self.transpose = P.Transpose()
    self.transpose_orders = (1, 0, 2)
    self.shape_op = P.Shape()
    # query/key projections into the attention space
    self.linear_q = nn.Dense(query_size,
                             num_units,
                             has_bias=False,
                             weight_init=Uniform(initializer_range)).to_float(compute_type)
    self.linear_k = nn.Dense(key_size,
                             num_units,
                             has_bias=False,
                             weight_init=Uniform(initializer_range)).to_float(compute_type)
    self.expand = P.ExpandDims()
    self.tile = P.Tile()
    self.norm = nn.Norm(axis=-1)
    self.mul = P.Mul()
    self.matmul = P.MatMul()
    self.batchMatmul = P.BatchMatMul()
    self.tanh = nn.Tanh()
    self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
    self.softmax = nn.Softmax(axis=-1)
    self.reshape = P.Reshape()
    self.cast = P.Cast()
def __init__(self, fixed_atoms=False, dim=3):
    super().__init__()
    self.fixed_atoms = fixed_atoms
    self.reducesum = P.ReduceSum()
    self.pow = P.Pow()
    # self.concat = P.Concat()
    # self.pack = P.Pack()
    self.gatherd = P.GatherD()
    self.norm = nn.Norm(-1)
    self.gather_neighbors = GatherNeighbors(dim, fixed_atoms)
def __init__(self, params, config):
    super(GlobalNorm, self).__init__()
    self.norm = nn.Norm()
    self.hyper_map = C.HyperMap()
    self.config = config
    self.allreduce_filter = tuple("projection.bias" not in x.name and
                                  "layernorm" not in x.name and
                                  "embedding_table" not in x.name
                                  for x in params)
    self.length = len(params)
    self.values = []
    self.group_size = get_group_size()
    # weight 1.0 for parameters that pass the filter, group_size otherwise
    for item in self.allreduce_filter:
        if item:
            self.values.append(1.0)
        else:
            self.values.append(self.group_size * 1.0)
    self.values = tuple(self.values)
def __init__(self, network, optimizer, norm_bound=1.0, sens=1.0,
             micro_batches=None, noise_mech=None, clip_mech=None):
    super(_TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

    # dp params
    if micro_batches is None:
        msg = 'micro_batches must be given in differential privacy, but got value: {}'.format(
            micro_batches)
        LOGGER.error(TAG, msg)
        raise ValueError(msg)
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._tuple_add = _TupleAdd()
    self._add = P.TensorAdd()
    self._norm = nn.Norm()
    self._hyper_map = C.HyperMap()
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()
    self._micro_float = Tensor(micro_batches, mstype.float32)

    self._noise_mech_param_updater = None
    if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=self._noise_mech._decay_policy,
            decay_rate=self._noise_mech._noise_decay_rate,
            cur_noise_multiplier=self._noise_mech._noise_multiplier,
            init_noise_multiplier=self._noise_mech._initial_noise_multiplier)
def __init__(self, network, optimizer, scale_update_cell=None, micro_batches=None,
             norm_bound=1.0, noise_mech=None, clip_mech=None):
    super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.depend_parameter_use = ControlDepend(depend_mode=1)
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL]
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
                                           dtype=mstype.float32),
                                    name="loss_scale")
    self.add_flags(has_effect=True)

    # dp params
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._add = P.TensorAdd()
    self._norm = nn.Norm()
    self._tuple_add = _TupleAdd()
    self._hyper_map = C.HyperMap()
    self._micro_float = Tensor(micro_batches, mstype.float32)
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()

    self._noise_mech_param_updater = None
    if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=self._noise_mech._decay_policy,
            decay_rate=self._noise_mech._noise_decay_rate,
            cur_noise_multiplier=self._noise_mech._noise_multiplier,
            init_noise_multiplier=self._noise_mech._initial_noise_multiplier)
def __init__(self):
    super(GlobalNorm, self).__init__()
    self.norm = nn.Norm()
    self.hyper_map = C.HyperMap()
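# The construct of this cell is not shown; purely as a hypothetical sketch of
# the global-norm idea it serves (per-tensor L2 norms folded into one scalar),
# written here as a plain function rather than the original graph code:
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

def global_norm_sketch(grads):
    """Combine per-tensor L2 norms into a single global L2 norm."""
    norm = nn.Norm()
    square_sum = sum(float(norm(g).asnumpy()) ** 2 for g in grads)
    return np.sqrt(square_sum)

grads = (Tensor(np.array([3.0, 4.0], np.float32)),
         Tensor(np.array([6.0, 8.0], np.float32)))
print(global_norm_sketch(grads))  # sqrt(5**2 + 10**2) ≈ 11.18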
def _compute_norm(grad):
    norm = nn.Norm()
    norm = norm(F.cast(grad, mstype.float32))
    ret = F.expand_dims(F.cast(norm, mstype.float32), 0)
    return ret
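# Hypothetical usage sketch of the helper above: applied to one gradient
# tensor it yields a 1-element float32 tensor, so per-parameter norms can
# later be concatenated. Assumes the module-level imports the helper relies
# on (mindspore.nn as nn, mindspore.ops.functional as F, mstype) are in scope.
import numpy as np
from mindspore import Tensor

grad = Tensor(np.array([[3.0, 4.0]], np.float16))
print(_compute_norm(grad))  # Tensor of shape (1,), value ~[5.0]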
def __init__(self, reduction='mean'):
    super().__init__()
    self.norm = nn.Norm(-1)
    self.reduce_mean = P.ReduceMean()
def __init__(self):
    super(NormNet, self).__init__()
    self.norm = nn.Norm()