Example 1
    def __init__(self):
        super(NetNorm, self).__init__()

        self.norm_1 = nn.Norm(axis=0)
        self.norm_2 = nn.Norm(axis=1)
        self.norm_3 = nn.Norm(axis=-1)
        self.norm_4 = nn.Norm(axis=-1, keep_dims=True)
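All the snippets on this page build MindSpore's nn.Norm cell, which computes the L2 norm of its input along the given axis (over all axes when none is given) and drops the reduced dimension unless keep_dims=True. A minimal sketch of what the four cells above compute, assuming a MindSpore 1.x environment:

    import numpy as np
    import mindspore.nn as nn
    from mindspore import Tensor

    x = Tensor(np.array([[3.0, 4.0], [6.0, 8.0]], np.float32))
    print(nn.Norm(axis=0)(x))                   # column norms: [6.7082, 8.9443]
    print(nn.Norm(axis=1)(x))                   # row norms:    [5., 10.]
    print(nn.Norm(axis=-1)(x))                  # same as axis=1 for a 2-D input
    print(nn.Norm(axis=-1, keep_dims=True)(x))  # shape (2, 1) instead of (2,)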
Example 2
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainOneStepCellWithGradClip, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        self.hyper_map = C.HyperMap()
        self.greater = P.Greater()
        self.select = P.Select()
        self.norm = nn.Norm(keep_dims=True)
        self.dtype = P.DType()
        self.cast = P.Cast()
        self.concat = P.Concat(axis=0)
        # clipping threshold: gradients are rescaled once their global norm exceeds 10
        self.ten = Tensor(np.array([10.0]).astype(np.float32))
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
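The matching construct method is not shown here; what follows is a hedged sketch (assumed, not from the source) of how these operators are typically combined to clip gradients once their global norm exceeds self.ten, with per_param_norms standing in for the per-parameter norms gathered via self.hyper_map:

    global_norm = self.norm(self.concat(per_param_norms))
    cond = self.greater(global_norm, self.cast(self.ten, self.dtype(global_norm)))
    clip = self.select(cond, global_norm, self.cast(self.ten, self.dtype(global_norm)))
    # each gradient is then scaled by self.ten / clip, so norms above 10 shrink
    # back to 10 while smaller gradients pass through unchanged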
Example 3
    def __init__(self, params, config):
        super(GlobalNormPipline, self).__init__()
        self.norm = nn.Norm()
        self.hyper_map = C.HyperMap()
        self.allreduce_filter = tuple(
            "projection.bias" not in x.name and "layernorm" not in x.name
            and "position_embedding.embedding_table" not in x.name
            for x in params)
        self.allreduce_group_size = ()
        for item in self.allreduce_filter:
            if item:
                self.allreduce_group_size = self.allreduce_group_size + (1.0,)
            else:
                self.allreduce_group_size = self.allreduce_group_size + (config.mp * 1.0,)
        self.length = len(params)
        group_list, group_name = _get_model_parallel_group(config.mp)
        print("rank_list", group_name)
        print("group_size_list", self.allreduce_group_size)
        create_group(group_name, group_list)
        self.allreduce = P.AllReduce(group=group_name)
        pipeline_group_list, pipeline_group_name = _get_pipeline_group()
        print("pipeline_group_name", pipeline_group_name)
        create_group(pipeline_group_name, pipeline_group_list)
        self.allreduce2 = P.AllReduce(group=pipeline_group_name)
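The weighting suggests that allreduce_group_size assigns 1.0 to parameters sharded across the model-parallel group and config.mp to parameters replicated on every model-parallel rank (projection biases, layernorms, the position embedding table), so that summing squared norms across ranks does not count a replicated parameter mp times. A hedged sketch of the reduction this sets up (the real construct would use the AllReduce operators created above):

    # square_sums[i] = ||grad_i||**2, group_sizes = self.allreduce_group_size
    total = sum(s / g for s, g in zip(square_sums, group_sizes))
    global_norm = total ** 0.5  # after summing `total` across ranks via AllReduce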
Example 4
    def __init__(self,
                 is_training,
                 query_size,
                 key_size,
                 num_units,
                 normalize=False,
                 initializer_range=0.1,
                 compute_type=mstype.float16):
        super(BahdanauAttention, self).__init__()
        self.is_training = is_training
        self.mask = None
        self.query_size = query_size
        self.key_size = key_size
        self.normalize = normalize
        self.num_units = num_units
        self.linear_att = Parameter(
            Tensor(np.random.uniform(-initializer_range, initializer_range,
                                     size=[num_units]),
                   dtype=mstype.float32),
            name='linear_att')
        if self.normalize:
            self.normalize_scalar = Parameter(
                Tensor(np.array([1.0 / num_units]), dtype=mstype.float32),
                name='normalize_scalar')
            self.normalize_bias = Parameter(
                Tensor(np.zeros(num_units), dtype=mstype.float32),
                name='normalize_bias')
        self.transpose = P.Transpose()
        self.transpose_orders = (1, 0, 2)
        self.shape_op = P.Shape()

        self.linear_q = nn.Dense(
            query_size,
            num_units,
            has_bias=False,
            weight_init=Uniform(initializer_range)).to_float(compute_type)

        self.linear_k = nn.Dense(
            key_size,
            num_units,
            has_bias=False,
            weight_init=Uniform(initializer_range)).to_float(compute_type)
        self.expand = P.ExpandDims()
        self.tile = P.Tile()

        self.norm = nn.Norm(axis=-1)
        self.mul = P.Mul()
        self.matmul = P.MatMul()
        self.batchMatmul = P.BatchMatMul()
        self.tanh = nn.Tanh()

        self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
        self.softmax = nn.Softmax(axis=-1)
        self.reshape = P.Reshape()
        self.cast = P.Cast()
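Here nn.Norm(axis=-1) supports the weight-normalized variant of Bahdanau attention: when normalize=True, the attention vector linear_att is rescaled to unit length before scoring. Schematically (the standard formulation, hedged, since the construct method is not shown):

    score(q, k) = g * (v / ||v||) . tanh(W_q q + W_k k + b)

where v is linear_att, g is normalize_scalar, b is normalize_bias, and W_q, W_k are the linear_q and linear_k projections.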
Example 5
    def __init__(self, fixed_atoms=False, dim=3):
        super().__init__()
        self.fixed_atoms = fixed_atoms
        self.reducesum = P.ReduceSum()
        self.pow = P.Pow()
        # self.concat = P.Concat()
        # self.pack = P.Pack()
        self.gatherd = P.GatherD()
        self.norm = nn.Norm(-1)

        self.gather_neighbors = GatherNeighbors(dim, fixed_atoms)
Example 6
    def __init__(self, params, config):
        super(GlobalNorm, self).__init__()
        self.norm = nn.Norm()
        self.hyper_map = C.HyperMap()
        self.config = config
        self.allreduce_filter = tuple(
            "projection.bias" not in x.name and "layernorm" not in x.name
            and "embedding_table" not in x.name for x in params)
        self.length = len(params)
        self.values = []
        self.group_size = get_group_size()
        for item in self.allreduce_filter:
            if item:
                self.values.append(1.0)
            else:
                self.values.append(self.group_size * 1.0)
        self.values = tuple(self.values)
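As in Example 3, these values rescale each parameter's squared norm before the cross-device reduction. A hedged sketch of the matching construct (assumed; get_square_sum is a hypothetical helper returning ||grad||**2 / value):

    def construct(self, grads):
        # scale each squared norm, sum, reduce across all devices, then sqrt
        square_sum = self.hyper_map(get_square_sum, grads, self.values)
        global_norm = F.sqrt(P.AllReduce()(F.addn(square_sum)))
        return global_norm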
Example 7
    def __init__(self,
                 network,
                 optimizer,
                 norm_bound=1.0,
                 sens=1.0,
                 micro_batches=None,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        # dp params
        if micro_batches is None:
            msg = 'micro_batches must be given for differential privacy, ' \
                  'but got value: {}'.format(micro_batches)
            LOGGER.error(TAG, msg)
            raise ValueError(msg)
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._tuple_add = _TupleAdd()
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._hyper_map = C.HyperMap()
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._micro_float = Tensor(micro_batches, mstype.float32)

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech._initial_noise_multiplier)
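These attributes implement per-micro-batch gradient clipping for differentially private training: construct (not shown) splits the input batch with self._split, clips each slice's gradients to _norm_bound, adds noise from _noise_mech to the sum, and averages by _micro_float. As a hedged formula:

    clipped_i = grads_i * min(1, norm_bound / ||grads_i||)
    grads = (sum_i clipped_i + noise) / micro_batches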
Example 8
    def __init__(self,
                 network,
                 optimizer,
                 scale_update_cell=None,
                 micro_batches=None,
                 norm_bound=1.0,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech._initial_noise_multiplier)
Example 9
    def __init__(self):
        super(GlobalNorm, self).__init__()
        self.norm = nn.Norm()
        self.hyper_map = C.HyperMap()
Example 10
def _compute_norm(grad):
    norm = nn.Norm()
    norm = norm(F.cast(grad, mstype.float32))
    ret = F.expand_dims(F.cast(norm, mstype.float32), 0)
    return ret
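The expand_dims turns each scalar norm into a 1-element tensor so the per-parameter results can be concatenated. A hedged usage sketch, following the same pattern as Example 2:

    per_param = C.HyperMap()(_compute_norm, grads)        # tuple of 1-element tensors
    global_norm = nn.Norm()(P.Concat(axis=0)(per_param))  # norm of the norms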
Example 11
    def __init__(self, reduction='mean'):
        super().__init__()
        self.norm = nn.Norm(-1)
        self.reduce_mean = P.ReduceMean()
Example 12
    def __init__(self):
        super(NormNet, self).__init__()
        self.norm = nn.Norm()