Example #1
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        mean = _get_gradients_mean()
        degree = _get_device_num()
        parameter_length = len(self.feature_map)
        self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (
                Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
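The 54-entry feature_map table in the snippet above (it reappears in Example #17) apparently holds the reciprocal spatial size of each ResNet-50 layer output, with 1.0 for the final dense layer. A minimal sketch under that assumption:

# Sketch (assumption): each feature_map entry is 1 / (H * W) of the layer's
# output feature map in ResNet-50; the last entry, 1.0, is the dense layer.
spatial_sizes = [112] + [56] * 11 + [28] * 13 + [14] * 19 + [7] * 9
feature_map = [1.0 / (s * s) for s in spatial_sizes] + [1.0]
assert len(feature_map) == 54               # matches the range(54) loop above
assert feature_map[0] == 1.0 / 12544        # 112 * 112
assert feature_map[1] == 1.0 / 3136         # 56 * 56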
 def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0,
              loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
     super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.num_hidden_layers = num_hidden_layers
     self.sqrt = P.Sqrt()
     self.assign = P.Assign()
     self.cast = P.Cast()
     self.thor = True
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.one = Tensor(1, mstype.int32)
     self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
Example #3
    def get_weight_bias(self):
        stdv = 1 / math.sqrt(self.hidden_s)
        gate_size = 4 * self.hidden_s
        w_list_value = []
        b_list_value = []

        for i in range(self.num_layers):
            b0 = np.zeros(gate_size, dtype=np.float16)
            w_shape = self.input_s if i == 0 else (self.num_directions * self.hidden_s)
            w_np = np.random.uniform(-stdv, stdv, (w_shape + self.hidden_s, gate_size)).astype(np.float16)
            w_list_value.append(Parameter(initializer(Tensor(w_np), [w_shape + self.hidden_s, gate_size]),
                                          name="weight_fw" + str(i)))

            if self.has_bias:
                b_np = np.random.uniform(-stdv, stdv, gate_size).astype(np.float16)
                b_list_value.append(Parameter(initializer(Tensor(b_np), [gate_size]), name="bias_fw" + str(i)))
            else:
                b_list_value.append(Parameter(initializer(Tensor(b0), [gate_size]), name="bias_fw" + str(i)))

            if self.bidirectional:
                w_bw_np = np.random.uniform(-stdv, stdv, (w_shape + self.hidden_s, gate_size)).astype(np.float16)
                w_list_value.append(Parameter(initializer(Tensor(w_bw_np), [w_shape + self.hidden_s, gate_size]),
                                              name="weight_bw" + str(i)))
                b_bw_np = np.random.uniform(-stdv, stdv, (4 * self.hidden_s)).astype(
                    np.float16) if self.has_bias else b0
                b_list_value.append(Parameter(initializer(Tensor(b_bw_np), [gate_size]), name="bias_bw" + str(i)))
        w_list_value = ParameterTuple(w_list_value)
        b_list_value = ParameterTuple(b_list_value)
        return w_list_value, b_list_value
Example #4
    def __init__(self, seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
        super(Net, self).__init__()

        num_directions = 1
        if bidirectional:
            num_directions = 2
        input_np = np.array([[[0.6755, -1.6607, 0.1367], [0.4276, -0.7850, -0.3758]],
                             [[-0.6424, -0.6095, 0.6639], [0.7918, 0.4147, -0.5089]],
                             [[-1.5612, 0.0120, -0.7289], [-0.6656, -0.6626, -0.5883]],
                             [[-0.9667, -0.6296, -0.7310], [0.1026, -0.6821, -0.4387]],
                             [[-0.4710, 0.6558, -0.3144], [-0.8449, -0.2184, -0.1806]]
                             ]).astype(np.float32)
        self.x = Parameter(initializer(Tensor(input_np), [seq_len, batch_size, input_size]), name='x')
        self.hlist = []
        self.clist = []
        self.hlist.append(Parameter(initializer(
            Tensor(
                np.array([0.1, 0.1, 0.1, 0.1]).reshape((num_directions, batch_size, hidden_size)).astype(
                    np.float32)),
            [num_directions, batch_size, hidden_size]), name='h'))
        self.clist.append(Parameter(initializer(
            Tensor(
                np.array([0.2, 0.2, 0.2, 0.2]).reshape((num_directions, batch_size, hidden_size)).astype(
                    np.float32)),
            [num_directions, batch_size, hidden_size]), name='c'))
        self.h = ParameterTuple(tuple(self.hlist))
        self.c = ParameterTuple(tuple(self.clist))
        wih = np.array([[3.4021e-01, -4.6622e-01, 4.5117e-01],
                        [-6.4257e-02, -2.4807e-01, 1.3550e-02],  # i
                        [-3.2140e-01, 5.5578e-01, 6.3589e-01],
                        [1.6547e-01, -7.9030e-02, -2.0045e-01],
                        [-6.9863e-01, 5.9773e-01, -3.9062e-01],
                        [-3.0253e-01, -1.9464e-01, 7.0591e-01],
                        [-4.0835e-01, 3.6751e-01, 4.7989e-01],
                        [-5.6894e-01, -5.0359e-01, 4.7491e-01]]).astype(np.float32).reshape([1, -1])
        whh = np.array([[-0.4820, -0.2350],
                        [-0.1195, 0.0519],
                        [0.2162, -0.1178],
                        [0.6237, 0.0711],
                        [0.4511, -0.3961],
                        [-0.5962, 0.0906],
                        [0.1867, -0.1225],
                        [0.1831, 0.0850]]).astype(np.float32).reshape([1, -1])
        bih = np.zeros((1, 8)).astype(np.float32)
        w_np = np.concatenate((wih, whh, bih), axis=1).reshape([-1, 1, 1])
        self.w = Parameter(initializer(Tensor(w_np), w_np.shape), name='weight0')
        self.lstm = StackLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                              has_bias=has_bias, bidirectional=bidirectional, dropout=dropout)
        self.lstm.weight = ParameterTuple(tuple([self.w]))
Example #5
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.reducer_flag = False
        self.less_equal = LessEqual()
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
Example #6
 def __init__(self, network, optimizer, scale_update_cell=None):
     super(BertSquadCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = None
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("mirror_mean")
         degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
def test_grad_fv_and_insert_gradient_of():
    class FvAndInsertGradientNet(nn.Cell):
        def __init__(self):
            super(FvAndInsertGradientNet, self).__init__()
            self.gather = P.GatherV2()
            self.damping = Tensor(np.array([0.03, 0.03], np.float32))
            self.cov_step = Parameter(0, name="cov_step", requires_grad=False)
            self.freq = Tensor(278, ms.int32)
            self.getG = P.InsertGradientOf(self.save_gradient)

            self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

        def save_gradient(self, dout):
            self.cov_step = self.cov_step + self.freq
            return dout

        def construct(self, *inputs):
            # fv self.z from construct_wrapper
            x, = inputs
            self.z = x

            # insert_gradient_of
            self.gather(self.damping, self.cov_step, 0)
            out = self.getG(x)
            return out

    net = FvAndInsertGradientNet()
    input_data = Tensor(np.array([1.0], np.float32))
    # if grad_all_list is used, the generated graph will have env_setitem;
    # as the gradient for the inputs is constant zero, it will depend on the result of grad.
    grad_net = grad_by_list(net, ParameterTuple(net.trainable_params()))
    print(grad_net(input_data))
Example #8
 def __init__(self, net):
     super(GradNet, self).__init__()
     self.weights = ParameterTuple(net.trainable_params())
     self.net = net
     grad_op = C.GradOperation(get_all=False, get_by_list=True, sens_param=True)
     sens = Tensor(np.ones([3, 4, 5]), dtype=mstype.float32)
     self.grad = Bprop(self.net, True, self.weights, grad_op, sens)
def test_insert_gradient_of():
    class InsertGradientNet(nn.Cell):
        def __init__(self):
            super(InsertGradientNet, self).__init__()
            self.gather = P.GatherV2()
            self.damping = Tensor(np.array([0.03, 0.03], np.float32))
            self.cov_step = Parameter(0, name="cov_step", requires_grad=False)
            self.freq = Tensor(278, ms.int32)
            self.getG = P.InsertGradientOf(self.save_gradient)

        def save_gradient(self, dout):
            self.cov_step = self.cov_step + self.freq
            return dout

        def construct(self, x):
            self.gather(self.damping, self.cov_step, 0)
            out = P.ReLU()(x)
            out = self.getG(out)
            out = self.getG(out)
            return out

    net = InsertGradientNet()
    input_data = np.array([[1.2, 2.1], [2.2, 3.2]]).astype(np.float32)
    grad_net = grad_all_list(net, ParameterTuple(net.trainable_params()))
    print(grad_net(Tensor(input_data)))
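The grad_by_list and grad_all_list helpers used by these tests are not shown on this page; presumably they are module-level GradOperation instances along the lines of the following sketch (an assumption, not the tests' actual definitions):

# Sketch (assumption): typical definitions of the gradient helpers used above.
import mindspore.ops.composite as C

grad_by_list = C.GradOperation(get_by_list=True)                  # gradients w.r.t. a ParameterTuple only
grad_all_list = C.GradOperation(get_all=True, get_by_list=True)   # gradients w.r.t. inputs and the ParameterTuple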
Example #10
 def __init__(self, network):
     super(GradByListNet, self).__init__()
     self.grad = C.GradOperation(get_all=True,
                                 sens_param=True,
                                 get_by_list=True)
     self.network = network
     self.params = ParameterTuple(network.trainable_params())
 def __init__(self, network, optimizer):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.add_flags(defer_inline=True)
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
Example #12
 def __init__(self, net):
     super(GradNet, self).__init__()
     self.weights = ParameterTuple(net.trainable_params())
     self.net = net
     grad_op = C.GradOperation(
         name='grad', get_all=True, get_by_list=False, sens_param=True)
     self.grad = Bprop(self.net, False, self.weights, grad_op)
    def __init__(self, network, optimizer, sens=1.0):
        super(TransformerTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
Example #14
 def __init__(self, network):
     super(Grad, self).__init__()
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.grad = C.GradOperation('grad',
                                 get_by_list=True,
                                 sens_param=True)
Example #15
 def __init__(self, net):
     super(NetGrad, self).__init__()
     self.grad_op = C.GradOperation('grad',
                                    get_by_list=True,
                                    sens_param=False)
     self.net = net
     self.weights = ParameterTuple(self.net.trainable_params())
Example #16
 def __init__(self, network):
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_train()
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = nn.Momentum(self.weights, 0.1, 0.9)
     self.hyper_map = C.HyperMap()
     self.grad = C.GradOperation(get_by_list=True)
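Example #16 shows only the constructor; below is a minimal, self-contained sketch of how such a cell is typically completed with a construct() and driven. LinearNet, the layer sizes and the random input are hypothetical and for illustration only.

# Self-contained sketch (assumption): completing and using a one-step training cell.
import numpy as np
import mindspore.nn as nn
import mindspore.ops.composite as C
from mindspore import Tensor, ParameterTuple
from mindspore.ops import operations as P
from mindspore.ops import functional as F


class LinearNet(nn.Cell):
    def __init__(self):
        super(LinearNet, self).__init__()
        self.fc = nn.Dense(4, 1)
        self.reduce_mean = P.ReduceMean()

    def construct(self, x):
        return self.reduce_mean(self.fc(x))        # scalar "loss"


class SimpleTrainOneStep(nn.Cell):
    def __init__(self, network):
        super(SimpleTrainOneStep, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = nn.Momentum(self.weights, 0.1, 0.9)
        self.grad = C.GradOperation(get_by_list=True)

    def construct(self, x):
        loss = self.network(x)
        grads = self.grad(self.network, self.weights)(x)   # gradients w.r.t. the ParameterTuple
        return F.depend(loss, self.optimizer(grads))       # apply the update, return the loss


step = SimpleTrainOneStep(LinearNet())
print(step(Tensor(np.random.randn(8, 4).astype(np.float32))))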
Example #17
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
                 weight_decay=0.0, loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
        super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
        Validator.check_value_type("momentum", momentum, [float], self.cls_name)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.use_nesterov = Validator.check_bool(use_nesterov)
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        self.feature_map_new = [x ** 0.5 for x in self.feature_map]
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.matmul = P.MatMul()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.assign = P.Assign()
        self.mul = P.Mul()

        mean = _get_gradients_mean()
        degree = _get_device_num()

        parameter_length = len(self.feature_map)
        self.grad_reducer_thorA = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.grad_reducer_thorG = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.weight_decay = weight_decay
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.update_gradient = P.UpdateThorGradient(split_dim=128)
Example #18
    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 has_bias=True,
                 batch_first=False,
                 dropout=0.0,
                 bidirectional=False):
        super(StackLSTM, self).__init__()
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.transpose = P.Transpose()

        # direction number
        num_directions = 2 if bidirectional else 1

        # input_size list
        input_size_list = [input_size]
        for i in range(num_layers - 1):
            input_size_list.append(hidden_size * num_directions)

        # layers
        layers = []
        for i in range(num_layers):
            layers.append(
                nn.LSTMCell(input_size=input_size_list[i],
                            hidden_size=hidden_size,
                            has_bias=has_bias,
                            batch_first=batch_first,
                            bidirectional=bidirectional,
                            dropout=dropout))

        # weights
        weights = []
        for i in range(num_layers):
            # weight size
            weight_size = (input_size_list[i] +
                           hidden_size) * num_directions * hidden_size * 4
            if has_bias:
                bias_size = num_directions * hidden_size * 4
                weight_size = weight_size + bias_size

            # numpy weight
            stdv = 1 / math.sqrt(hidden_size)
            w_np = np.random.uniform(-stdv, stdv,
                                     (weight_size, 1, 1)).astype(np.float32)

            # lstm weight
            weights.append(
                Parameter(initializer(Tensor(w_np), w_np.shape),
                          name="weight" + str(i)))

        # stacked cells and their flattened weights
        self.lstms = layers
        self.weight = ParameterTuple(tuple(weights))
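As a quick check of the weight_size arithmetic above (the factor 4 covers the i, f, g and o LSTM gates), the unidirectional, biased configuration from Example #4 with input_size=3 and hidden_size=2 gives 48 elements, matching the 24 + 16 + 8 values concatenated into w_np there:

# Illustration only: weight_size for input_size=3, hidden_size=2,
# num_directions=1, has_bias=True (the shapes used in Example #4).
input_size, hidden_size, num_directions = 3, 2, 1
weight_size = (input_size + hidden_size) * num_directions * hidden_size * 4   # 40 weight elements
weight_size += num_directions * hidden_size * 4                               # + 8 bias elements
assert weight_size == 48   # 24 (wih) + 16 (whh) + 8 (bih) in Example #4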
Example #19
    def __init__(self,
                 learning_rate,
                 parameters,
                 weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in
                 x.name):
        super(Optimizer, self).__init__()
        if isinstance(learning_rate, float):
            self.dynamic_lr = False
            self.gather = None
            self.assignadd = None
            self.global_step = None
            validator.check_number_range("learning rate", learning_rate, 0.0,
                                         float("inf"), Rel.INC_LEFT)
        else:
            self.dynamic_lr = True
            self.gather = P.GatherV2()
            self.assignadd = P.AssignAdd()
            self.global_step = Parameter(initializer(0, [1], mindspore.int32),
                                         name='global_step')
            if isinstance(learning_rate, Iterable):
                learning_rate = Tensor(
                    np.array(list(learning_rate)).astype(np.float32))
            elif isinstance(learning_rate, Tensor):
                if learning_rate.dim() > 1:
                    raise ValueError(
                        "Learning rate should be a 0 or 1 dim `Tensor`,"
                        f"but got {learning_rate.dim()}.")
                if learning_rate.dim() == 1 and learning_rate.size() < 2:
                    logger.warning(
                        "If want to use the dynamic learning rate, please make sure that the number "
                        "of elements in the list, tuple or tensor passed is greater than 1."
                    )
            else:
                raise TypeError(
                    "Learning rate should be float, Tensor or Iterable.")

        if loss_scale <= 0.0:
            raise ValueError(
                "Loss scale should be greater than 0, but got {}".format(
                    loss_scale))
        if weight_decay < 0.0:
            raise ValueError(
                "Weight decay should be equal or greater than 0, but got {}".
                format(weight_decay))

        self.learning_rate = Parameter(learning_rate, name="learning_rate")
        self.parameters = ParameterTuple(parameters)
        self.reciprocal_scale = 1.0 / loss_scale
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)

        if not self.parameters:
            raise ValueError("optimizer got an empty parameter list.")
Example #20
def test_grad_refactor_13():
    class Net(nn.Cell):
        """ Net definition """
        def __init__(self):
            super(Net, self).__init__()
            self.z = Parameter(Tensor(np.ones([2]).astype(np.float32)), name='z')
        def construct(self, x, y):
            return x * self.z * y
    net = Net()
    weights = ParameterTuple(net.trainable_params())
    C.grad_by_list(net, weights)(Tensor(np.ones([2]).astype(np.float32)), Tensor(np.zeros([2]).astype(np.float32)))
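Since y is all zeros here, the expected gradient for the only trainable parameter z is x * y = [0.0, 0.0] (with the default sensitivity of ones).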
Example #21
 def __init__(self, net):
     super(GradNet, self).__init__()
     self.weights = ParameterTuple(net.trainable_params())
     self.net = net
     self.sens = Parameter(Tensor(np.ones([3, 4, 5]),
                                  dtype=mstype.float32),
                           name='sens',
                           requires_grad=False)
     self.grad = C.GradOperation('grad',
                                 get_by_list=True,
                                 sens_param=True)
Example #22
 def __init__(self, num_class, label, mask, l2_coeff, params):
     super(MaskedSoftMaxLoss, self).__init__()
     self.num_class = num_class
     self.label = label
     self.mask = mask
     self.softmax = P.SoftmaxCrossEntropyWithLogits()
     self.reduce_mean = P.ReduceMean()
     self.cast = P.Cast()
     self.l2_coeff = l2_coeff
     self.params = ParameterTuple(list(param for param in params if param.name[-4:] != 'bias'))
     self.reduce_sum = P.ReduceSum()
     self.num_params = len(self.params)
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainOneStepWithLarsCell, self).__init__(auto_prefix=False)
     self.network = network
     self.slice_index, self.params_len, weights = get_net_trainable_reordered_params(
         self.network)
     self.weights = ParameterTuple(weights)
     self.optimizer = optimizer
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.sens = Parameter(Tensor([sens], mstype.float32),
                           name='sens',
                           requires_grad=False)
     self.weight_decay = 1.0
     self.lars = P.Lars(epsilon=1.0, hyperpara=1.0)
Example #24
 def __init__(self, func, wrt_params, params, grad_op, sens=None):
     super(Bprop, self).__init__(auto_prefix=False)
     self.func = func
     self.wrt_params = wrt_params
     self.params = None
     if self.wrt_params and params:
         self.params = ParameterTuple(params)
     self.grad = grad_op
     self.with_sens = False
     self.sens = sens
     if sens:
         self.sens = Tensor(sens, dtype=mstype.float32)
         self.with_sens = True
Example #25
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(TransformerTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()

        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
Example #26
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              A_inv_max,
              G_inv_max,
              weight_decay=0.0,
              loss_scale=1.0,
              use_nesterov=False,
              decay_filter=lambda x: x.name not in []):
     super(SKFAC_GPU, self).__init__(learning_rate, params, weight_decay,
                                     loss_scale)
     Validator.check_value_type("momentum", momentum, [float],
                                self.cls_name)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32))
     self.params = self.parameters
     self.use_nesterov = Validator.check_bool(use_nesterov)
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.matmul = P.MatMul()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.A_inv_max = ParameterTuple(A_inv_max)
     self.G_inv_max = ParameterTuple(G_inv_max)
     self.assign = P.Assign()
     self.mul = P.Mul()
     self.weight_decay = weight_decay
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
Example #27
def TrainWrap(net, loss_fn=None, optimizer=None, weights=None):
    """
    TrainWrap
    """
    if loss_fn is None:
        loss_fn = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)
    loss_net = nn.WithLossCell(net, loss_fn)
    loss_net.set_train()
    if weights is None:
        weights = ParameterTuple(net.trainable_params())
    if optimizer is None:
        optimizer = nn.Adam(weights, learning_rate=0.003, beta1=0.9, beta2=0.999, eps=1e-5, use_locking=False,
                            use_nesterov=False, weight_decay=4e-5, loss_scale=1.0)
    train_net = nn.TrainOneStepCell(loss_net, optimizer)
    return train_net
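A minimal usage sketch for TrainWrap follows; MyNet, the layer sizes and the random batch are hypothetical and only show how the returned TrainOneStepCell is driven.

# Usage sketch (assumption): wrap a small network and run one training step.
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor


class MyNet(nn.Cell):
    def __init__(self):
        super(MyNet, self).__init__()
        self.fc = nn.Dense(32, 10)

    def construct(self, x):
        return self.fc(x)


train_net = TrainWrap(MyNet())   # default loss, Adam optimizer, all trainable params
data = Tensor(np.random.randn(16, 32).astype(np.float32))
label = Tensor(np.random.randint(0, 10, (16,)).astype(np.int32))
loss = train_net(data, label)    # one optimizer step; returns the scalar loss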
def test_load_grad():
    class LoadNet(nn.Cell):
        def __init__(self):
            super().__init__()
            self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

        def construct(self, x, y):
            x = x * y * self.z
            return x

    x = Tensor(np.array([2.0], np.float32))
    y = Tensor(np.array([3.0], np.float32))
    load_net = LoadNet()
    grad_net = grad_all_list(load_net,
                             ParameterTuple(load_net.trainable_params()))
    print(grad_net(x, y))
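With x = 2.0, y = 3.0 and z = 1.0, the expected gradients are y * z = 3.0 for x, x * z = 2.0 for y, and x * y = 6.0 for the parameter z.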
Example #29
def test_switch_layer_with_single_prim():
    class SwitchLayerCell(nn.Cell):
        def __init__(self):
            super(SwitchLayerCell, self).__init__()
            self.layers = (nn.ReLU(), nn.ReLU())
            self.z3 = Parameter(
                Tensor(np.full([128, 96], 0.6, dtype=np.float32)), name='z3')

        def construct(self, index, x):
            ret = self.layers[index](x) * self.z3
            return ret

    index = Tensor(0, dtype=mstype.int32)
    net = SwitchLayerCell()
    net(index, Tensor(np.full([128, 96], 0.6, dtype=np.float32)))
    C.grad_by_list(net, ParameterTuple(net.trainable_params()))(index,
                                                                Tensor(np.full([128, 96], 0.6, dtype=np.float32)))
    C.grad_all(net)(index, Tensor(np.full([128, 96], 0.6, dtype=np.float32)))
    def __init__(self, network, optimizer, sens=1.0):
        super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()