def __init__(self, network, sens=1000.0):
     """Wide&Deep training wrapper: separate optimizers for the two parts.

     Args:
         network: wide&deep cell; outputs 0 (wide loss) and 1 (deep loss)
             are wrapped below via IthOutputCell.
         sens: gradient sensitivity / loss scale used by the Adam side.
     """
     super(TrainStepWarp, self).__init__()
     self.network = network
     self.network.set_train()
     self.trainable_params = network.trainable_params()
     weights_w = []
     weights_d = []
     for params in self.trainable_params:
         # Fix: route each parameter to exactly one optimizer. Previously
         # every parameter was appended to BOTH lists, so FTRL and Adam
         # each updated the full network on every step.
         if 'wide' in params.name:
             weights_w.append(params)
         else:
             weights_d.append(params)
     self.weights_w = ParameterTuple(weights_w)
     self.weights_d = ParameterTuple(weights_d)
     self.optimizer_w = FTRL(learning_rate=1e-2,
                             params=self.weights_w,
                             l1=1e-8,
                             l2=1e-8,
                             initial_accum=1.0)
     self.optimizer_d = Adam(self.weights_d,
                             learning_rate=3.5e-4,
                             eps=1e-8,
                             loss_scale=sens)
     self.hyper_map = C.HyperMap()
     self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
     self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.loss_net_w = IthOutputCell(network, output_index=0)
     self.loss_net_d = IthOutputCell(network, output_index=1)
# Esempio n. 2 (example separator from scraping; commented out so the file parses)
def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
    """Build zero-filled initial hidden/cell states (h, c) for an LSTM."""
    num_directions = 2 if bidirectional else 1
    per_layer_shape = (num_directions, batch_size, hidden_size)

    if context.get_context("device_target") == "CPU":
        # The CPU backend consumes per-layer Parameters grouped in
        # ParameterTuples rather than a single stacked Tensor.
        def _zero_state(prefix, idx):
            return Parameter(initializer(
                Tensor(np.zeros(per_layer_shape).astype(np.float32)),
                [num_directions, batch_size, hidden_size]),
                             name=prefix + str(idx))

        h = ParameterTuple(tuple(_zero_state('h', i)
                                 for i in range(num_layers)))
        c = ParameterTuple(tuple(_zero_state('c', i)
                                 for i in range(num_layers)))
        return h, c

    # Other backends take one stacked Tensor per state.
    stacked_shape = (num_layers * num_directions, batch_size, hidden_size)
    h = Tensor(np.zeros(stacked_shape).astype(np.float32))
    c = Tensor(np.zeros(stacked_shape).astype(np.float32))
    return h, c
# Esempio n. 3 (example separator from scraping; commented out so the file parses)
    def __init__(self,
                 input_dim,
                 hidden_num,
                 hidden_dim,
                 output_dim,
                 mu,
                 lamb,
                 nonlinear="leaky-relu",
                 norm_prod='paths',
                 square_prod=False):
        """Per-variable MLP parameter container for a structure-learning model.

        Builds, for each of the ``input_dim`` variables, the weights and
        biases of an MLP with ``hidden_num`` hidden layers of width
        ``hidden_dim`` and an output of width ``output_dim``. Parameters are
        stacked along axis 0 (one slice per variable). Also sets up a dense
        adjacency matrix with a zero diagonal.

        Args:
            input_dim: number of variables (and of per-variable MLPs).
            hidden_num: hidden layers per variable-MLP.
            hidden_dim: width of each hidden layer.
            output_dim: per-variable output width.
            mu: coefficient stored on the instance for later use.
            lamb: coefficient stored on the instance for later use.
            nonlinear: activation name; only stored here, applied elsewhere.
            norm_prod: stored mode string — presumably selects how the
                adjacency product is normalized; confirm where it is read.
            square_prod: stored flag; semantics defined where it is read.
        """
        super(BaseModel, self).__init__()
        self.input_dim = input_dim
        self.hidden_num = hidden_num
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.mu = mu
        self.lamb = lamb
        self.nonlinear = nonlinear
        self.norm_prod = norm_prod
        self.square_prod = square_prod

        self.normal = msd.Normal(dtype=mstype.float32)
        self.extra_params = []

        # initialize current adjacency matrix: all ones minus the identity,
        # i.e. every edge allowed except self-loops
        self.adjacency = msnp.ones(
            (self.input_dim, self.input_dim), dtype=mstype.float32) - msnp.eye(
                self.input_dim, dtype=mstype.float32)

        # Generate layer_list: [input_dim, hidden_dim * hidden_num, output_dim]
        layer_list = [self.hidden_dim] * self.hidden_num
        layer_list.insert(0, self.input_dim)
        layer_list.append(self.output_dim)

        # Instantiate the parameters of each layer in the model of each variable;
        # axis 0 of every Parameter indexes the variable.
        tmp_weights = list()
        tmp_biases = list()
        for i, item in enumerate(layer_list[:-1]):
            in_dim = item
            out_dim = layer_list[i + 1]
            tmp_weights.append(
                Parameter(msnp.zeros((self.input_dim, out_dim, in_dim),
                                     dtype=mstype.float32),
                          requires_grad=True,
                          name='w' + str(i)))
            tmp_biases.append(
                Parameter(msnp.zeros((self.input_dim, out_dim),
                                     dtype=mstype.float32),
                          requires_grad=True,
                          name='b' + str(i)))

        self.weights = ParameterTuple(tmp_weights)
        self.biases = ParameterTuple(tmp_biases)

        # reset initialization parameters (zeros above are placeholders)
        self.reset_params()
# Esempio n. 4 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, config, sens=1000.0):
        """Wide&Deep train-step wrapper: FTRL for 'wide' params, Adam for the rest.

        Args:
            network: wide&deep cell; outputs 0 and 1 (wide/deep losses) are
                wrapped via IthOutputCell.
            config: supplies the learning rates ``ftrl_lr`` and ``adam_lr``.
            sens: loss scale / gradient sensitivity shared by both optimizers.
        """
        super(TrainStepWrap, self).__init__()
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        # Split parameters by name: the 'wide' part trains with FTRL,
        # everything else with Adam.
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)

        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)
        self.optimizer_w = FTRL(learning_rate=config.ftrl_lr,
                                params=self.weights_w,
                                l1=5e-4,
                                l2=5e-4,
                                initial_accum=0.1,
                                loss_scale=sens)

        #self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr,loss_scale=sens)
        self.optimizer_d = Adam(self.weights_d,
                                learning_rate=config.adam_lr,
                                eps=1e-6,
                                loss_scale=sens)

        self.hyper_map = C.HyperMap()

        # Legacy GradOperation signature: leading name argument (old MindSpore API).
        self.grad_w = C.GradOperation('grad_w',
                                      get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation('grad_d',
                                      get_by_list=True,
                                      sens_param=True)

        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        # Distributed training: mirror gradients across devices when running
        # in data- or hybrid-parallel mode.
        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer_w = DistributedGradReducer(
                self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(
                self.optimizer_d.parameters, mean, degree)
# Esempio n. 5 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, sens=1024.0, host_device_mix=False, parameter_server=False, sparse=False):
        """Wide&Deep train-step wrapper with sparse / parameter-server support.

        Parameters whose name contains 'wide' train with FTRL; the rest with
        (Lazy)Adam. LazyAdam plus CPU-targeted optimizers are selected for
        sparse auto-parallel or parameter-server training.

        Args:
            network: wide&deep cell; outputs 0/1 are the wide/deep losses.
            sens: loss scale applied to both optimizers.
            host_device_mix: run optimizer computation on the host CPU.
            parameter_server: train with a parameter server.
            sparse: use sparse-friendly optimizers under auto-parallel.
        """
        super(TrainStepWrap, self).__init__()
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        # Route parameters by name: 'wide' half vs deep half.
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)
        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)

        # Sparse auto-parallel / parameter-server runs use LazyAdam and may
        # offload both optimizers to the CPU.
        if (sparse and is_auto_parallel) or parameter_server:
            self.optimizer_d = LazyAdam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
            if host_device_mix or parameter_server:
                self.optimizer_w.target = "CPU"
                self.optimizer_d.target = "CPU"
        else:
            self.optimizer_d = Adam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        self.hyper_map = C.HyperMap()
        self.grad_w = C.GradOperation(get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation(get_by_list=True,
                                      sens_param=True)
        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)
        self.loss_net_w.set_grad()
        self.loss_net_d.set_grad()

        # Distributed gradient reduction for data-/hybrid-parallel modes.
        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                              ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = context.get_auto_parallel_context("device_num")
            self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
# Esempio n. 6 (example separator from scraping; commented out so the file parses)
def test_pynative_lenet_with_new_interface():
    """Train LeNet in PyNative mode via ForwardValueAndGrad; check final loss."""
    context.set_context(mode=context.PYNATIVE_MODE)

    epoch_size = 20
    batch_size = 32
    image_batch = Tensor(np.ones([batch_size, 1, 32, 32]).astype(np.float32))
    label_batch = Tensor(np.ones([batch_size]).astype(np.int32))

    net = LeNet()
    criterion = CrossEntropyLoss()
    net_with_criterion = WithLossCell(net, criterion)
    net_with_criterion.set_train()

    # Only gradients of trainable parameters are requested.
    weights = ParameterTuple(p for p in net.get_parameters()
                             if p.requires_grad)
    optimizer = Momentum(weights, 0.1, 0.9)

    forward_value_and_grad = nn.ForwardValueAndGrad(network=net_with_criterion,
                                                    weights=weights,
                                                    get_by_list=True)
    total_time = 0
    for epoch in range(epoch_size):
        start = time.time()
        loss_output, grads = forward_value_and_grad(image_batch, label_batch)
        optimizer(grads)
        cost_time = time.time() - start
        total_time = total_time + cost_time

        print("======epoch: ", epoch, " loss: ", loss_output.asnumpy(),
              " cost time: ", cost_time)
    # Loss after 20 epochs must land in a narrow expected band.
    assert loss_output.asnumpy() < 0.005
    assert loss_output.asnumpy() > 0.003
# Esempio n. 7 (example separator from scraping; commented out so the file parses)
 def __init__(self, parameters, learning_rate=0.001, batch_size=1):
     """Plain SGD state: parameter tuple plus scalar hyper-parameter tensors."""
     super(SGD, self).__init__()
     self.parameters = ParameterTuple(parameters)
     # Hyper-parameters are kept as 1-element float32 tensors.
     lr_arr = np.array([learning_rate]).astype(np.float32)
     bs_arr = np.array([batch_size]).astype(np.float32)
     self.learning_rate = Tensor(lr_arr)
     self.batch_size = Tensor(bs_arr)
     self.hyper_map = C.HyperMap()
# Esempio n. 8 (example separator from scraping; commented out so the file parses)
    def __init__(self,
                 input_dim,
                 hidden_num,
                 hidden_dim,
                 output_dim,
                 mu,
                 lamb,
                 nonlinear="leaky-relu",
                 norm_prod='paths',
                 square_prod=False):
        """Nonlinear-Gaussian ANM: base per-variable MLPs plus log-std params.

        Extends the base model with one extra trainable scalar per variable
        holding the log standard deviation of that variable's Gaussian noise.
        All constructor arguments are forwarded to the base class unchanged.
        """
        super(NonlinearGaussANM, self).__init__(input_dim=input_dim,
                                                hidden_num=hidden_num,
                                                hidden_dim=hidden_dim,
                                                output_dim=output_dim,
                                                mu=mu,
                                                lamb=lamb,
                                                nonlinear=nonlinear,
                                                norm_prod=norm_prod,
                                                square_prod=square_prod)

        # extra parameters are log_std
        # NOTE(review): every entry is 1.0, so the shuffle below is a no-op
        # and np.log(extra_param) is always 0.0 — presumably a deliberate
        # zero-init for log-std; confirm against the original implementation.
        extra_params = np.ones((self.input_dim, ))
        np.random.shuffle(extra_params)
        extra_params_list = list()
        for i, extra_param in enumerate(extra_params):
            extra_params_list.append(
                Parameter(MsTensor(np.log(extra_param).reshape(1),
                                   dtype=mstype.float32),
                          requires_grad=True,
                          name='e' + str(i)))

        # each element in the list represents a variable,
        # the size of the element is the number of extra_params per var
        self.extra_params = ParameterTuple(extra_params_list)
# Esempio n. 9 (example separator from scraping; commented out so the file parses)
def test_train_lenet_with_new_interface(num_classes=10,
                                        epoch=20,
                                        batch_size=32):
    """Train LeNet5 on GPU through ForwardValueAndGrad with an explicit sens."""
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    model = LeNet5(num_classes)
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_with_criterion = WithLossCell(model, loss_fn)
    net_with_criterion.set_train()

    weights = ParameterTuple(model.trainable_params())
    optimizer = nn.Momentum(weights, 0.1, 0.9)

    # sens_param=True: the gradient sensitivity is fed in as an extra input.
    train_network = ForwardValueAndGrad(network=net_with_criterion,
                                        weights=weights,
                                        get_by_list=True,
                                        sens_param=True)
    losses = []
    for _ in range(epoch):
        data = Tensor(
            np.ones([batch_size, 1, 32, 32]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        sens = Tensor(np.ones([1]).astype(np.float32))
        loss, grads = train_network(data, label, sens)
        grads = F.identity(grads)
        optimizer(grads)
        losses.append(loss)
    # Final loss must fall into the expected band.
    assert losses[-1].asnumpy() < 0.01
    assert losses[-1].asnumpy() > 0.001
# Esempio n. 10 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, lr, momentum, is_train=True):
     """Bundle a network with a Momentum optimizer and a by-list grad op."""
     super(TrainStepWrap, self).__init__(auto_prefix=False)
     self.is_train = is_train
     self.network = network
     trainables = ParameterTuple(network.trainable_params())
     self.weights = trainables
     self.optimizer = Momentum(trainables, lr, momentum)
     self.grad = C.GradOperation(get_by_list=True)
# Esempio n. 11 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, optimizer, scale_update_cell=None):
     """BERT train-one-step cell with loss scaling and overflow detection.

     Args:
         network: the cell to train (loss already attached).
         optimizer: optimizer applied to the network's trainable parameters.
         scale_update_cell: dynamic loss-scale manager; when given, its
             current scale is stored in the ``loss_scale`` Parameter.
     """
     super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = ops.GradOperation(
         get_by_list=True,
         sens_param=True)
     self.reducer_flag = False
     self.allreduce = ops.AllReduce()
     # Gradients are mirrored across devices only in data-/hybrid-parallel mode.
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
         self.reducer_flag = True
     # ops.identity is the no-op "reducer" used for single-device runs.
     self.grad_reducer = ops.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = ops.Cast()
     # NPU float-status primitives: used to detect inf/nan overflow produced
     # during the backward pass when training with a loss scale.
     self.alloc_status = ops.NPUAllocFloatStatus()
     self.get_status = ops.NPUGetFloatStatus()
     self.clear_before_grad = ops.NPUClearFloatStatus()
     self.reduce_sum = ops.ReduceSum(keep_dims=False)
     # NOTE(review): ControlDepend is deprecated in newer MindSpore releases
     # in favor of ops.Depend — confirm the target framework version.
     self.depend_parameter_use = ops.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = ops.LessEqual()
     self.hyper_map = ops.HyperMap()
     # loss_scale stays None when no dynamic scale manager is supplied.
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                     name="loss_scale")
# Esempio n. 12 (example separator from scraping; commented out so the file parses)
def test_big_batchSize_with_new_interface(num_classes=10,
                                          epoch=8,
                                          batch_size=338):
    """Sanity-train resnet50 with a large batch through ForwardValueAndGrad."""
    model = resnet50(num_classes)
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    net_with_criterion = WithLossCell(model, loss_fn)
    net_with_criterion.set_train()

    # Differentiate only w.r.t. trainable parameters.
    weights = ParameterTuple(p for p in model.get_parameters()
                             if p.requires_grad)
    optimizer = Momentum(weights, 0.1, 0.9)

    # Fixed sensitivity of 1.0 baked into the grad network.
    train_network = ForwardValueAndGrad(network=net_with_criterion,
                                        weights=weights,
                                        get_by_list=True,
                                        sens_param=True,
                                        sens=1.0)
    losses = []
    for _ in range(epoch):
        data = Tensor(
            np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        loss, grads = train_network(data, label)
        optimizer(F.identity(grads))
        losses.append(loss)
    assert losses[-1].asnumpy() < 0.8
# Esempio n. 13 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, total_steps=1, sens=16384.0):
        """Adam train-step wrapper with a scheduled LR and fixed loss scale."""
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())

        # Learning-rate schedule from dynamic_lr (base 0.01 over total_steps;
        # third argument presumably a warm-up length — confirm its semantics).
        schedule = dynamic_lr(0.01, total_steps, 5000)
        self.optimizer = nn.Adam(self.weights,
                                 learning_rate=schedule,
                                 beta1=0.9,
                                 beta2=0.999,
                                 eps=1e-8,
                                 loss_scale=sens)

        self.hyper_map = C.HyperMap()
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens

        # Mirror gradients across devices under data-/hybrid-parallelism.
        self.grad_reducer = None
        self.reducer_flag = _get_parallel_mode() in (
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            self.grad_reducer = DistributedGradReducer(
                self.optimizer.parameters, _get_gradients_mean(),
                _get_device_num())
# Esempio n. 14 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, grad_op):
     """Hold a network plus the grad operation applied to it (no sens input)."""
     # Cells get auto_prefix disabled; anything else uses the default init.
     if isinstance(network, nn.Cell):
         super(NNBackwardWithNoSens, self).__init__(auto_prefix=False)
     else:
         super(NNBackwardWithNoSens, self).__init__()
     self.params = ParameterTuple(network.trainable_params())
     self.grad = grad_op
     self.network = network
# Esempio n. 15 (example separator from scraping; commented out so the file parses)
 def __init__(self, network):
     """Training wrapper: Momentum(0.1, 0.9) over all trainable parameters."""
     super(TrainStepWrap, self).__init__()
     self.network = network
     self.network.set_train()
     trainables = ParameterTuple(network.trainable_params())
     self.weights = trainables
     self.optimizer = nn.Momentum(trainables, 0.1, 0.9)
     self.hyper_map = C.HyperMap()
     self.grad = C.GradOperation(get_by_list=True)
# Esempio n. 16 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, sens):
     """Wrapper variant differentiating w.r.t. ALL parameters with a fixed sens."""
     super(TrainStepWrap2, self).__init__()
     self.network = network
     self.network.set_train()
     # get_parameters() (not trainable_params): includes non-trainable ones too.
     all_params = ParameterTuple(network.get_parameters())
     self.weights = all_params
     self.optimizer = nn.Momentum(all_params, 0.1, 0.9)
     self.hyper_map = C.HyperMap()
     # Legacy GradOperation signature with a leading name argument.
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.sens = sens
# Esempio n. 17 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, lr=5e-8, eps=1e-8, loss_scale=1000.0):
     """Adam-based train-step wrapper; sens mirrors the optimizer loss scale."""
     super(TrainStepWrap, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_train()
     trainables = ParameterTuple(network.trainable_params())
     self.weights = trainables
     self.optimizer = Adam(trainables,
                           learning_rate=lr,
                           eps=eps,
                           loss_scale=loss_scale)
     self.hyper_map = C.HyperMap()
     # Legacy GradOperation signature with a leading name argument.
     self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
     self.sens = loss_scale
 def __init__(self, grad, network, wrt_params=False, real_inputs_count=None):
     """Store a grad operator plus the network it differentiates."""
     super().__init__()
     self.network = network
     self.grad = grad
     # Mirror whether the grad op expects an explicit sens input.
     self.sens_param = grad.sens_param
     self.wrt_params = wrt_params
     self.real_inputs_count = real_inputs_count
     # Parameter list is only needed when differentiating w.r.t. weights.
     if wrt_params:
         self.params = ParameterTuple(network.trainable_params())
# Esempio n. 19 (example separator from scraping; commented out so the file parses)
    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 has_bias=True,
                 batch_first=False,
                 dropout=0.0,
                 bidirectional=False):
        """Stacked (multi-layer) LSTM built from nn.LSTMCell instances.

        Arguments mirror nn.LSTM. For each layer, a single flat weight
        Parameter is allocated covering the input-hidden and hidden-hidden
        matrices (times 4 for the LSTM gates) plus optional biases, and
        initialized uniformly in [-1/sqrt(hidden_size), 1/sqrt(hidden_size)].
        """
        super(StackLSTM, self).__init__()
        self.num_layers = num_layers
        self.batch_first = batch_first
        self.transpose = P.Transpose()

        # direction number
        num_directions = 2 if bidirectional else 1

        # input_size list: layer 0 sees the raw input; deeper layers see the
        # (possibly bidirectional) output of the previous layer.
        input_size_list = [input_size]
        for i in range(num_layers - 1):
            input_size_list.append(hidden_size * num_directions)

        # layers
        layers = []
        for i in range(num_layers):
            layers.append(
                nn.LSTMCell(input_size=input_size_list[i],
                            hidden_size=hidden_size,
                            has_bias=has_bias,
                            batch_first=batch_first,
                            bidirectional=bidirectional,
                            dropout=dropout))

        # weights
        weights = []
        for i in range(num_layers):
            # weight size: (input-hidden + hidden-hidden) per direction,
            # times 4 for the LSTM gates
            weight_size = (input_size_list[i] +
                           hidden_size) * num_directions * hidden_size * 4
            if has_bias:
                bias_size = num_directions * hidden_size * 4
                weight_size = weight_size + bias_size

            # numpy weight, uniform in [-stdv, stdv]
            stdv = 1 / math.sqrt(hidden_size)
            w_np = np.random.uniform(-stdv, stdv,
                                     (weight_size, 1, 1)).astype(np.float32)

            # lstm weight
            weights.append(
                Parameter(initializer(Tensor(w_np), w_np.shape),
                          name="weight" + str(i)))

        #
        self.lstms = layers
        self.weight = ParameterTuple(tuple(weights))
# Esempio n. 20 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):
     """One training step: by-list gradients with optional distributed reduce."""
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     # Constant sensitivity tensor fed to the grad operation.
     self.sens = Tensor(np.ones(1, dtype=np.float32) * sens)
     self.reducer_flag = reduce_flag
     if reduce_flag:
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
# Esempio n. 21 (example separator from scraping; commented out so the file parses)
 def __init__(self, mixture_size: int, do_layer_norm: bool = False) -> None:
     """Learnable scalar mixture of ``mixture_size`` tensors (ELMo-style).

     Args:
         mixture_size: number of layers being mixed; one scalar weight each.
         do_layer_norm: whether the forward pass should layer-normalize inputs.
     """
     super(Scalar_mix, self).__init__()
     self.mixture_size = mixture_size
     self.do_layer_norm = do_layer_norm
     # One trainable scalar per mixed layer, plus a shared gamma.
     self.scalar_parameters = ParameterTuple([Parameter(Tensor(np.array([0.0]), mindspore.float32)) \
                                              for _ in range(mixture_size)])
     self.gamma = Parameter(Tensor(np.array([0.0]), mindspore.float32))
     self.sum = P.ReduceSum()
     self.sqrt = P.Sqrt()
     self.cat = P.Concat()
     # Fix: P.ExpandDims takes no constructor arguments — the axis is passed
     # at call time (e.g. self.unsqueeze(x, 0)). P.ExpandDims(0) raises.
     self.unsqueeze = P.ExpandDims()
# Esempio n. 22 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, optimizer, grad_sum, sens=1.0):
     """Forward/backward step that accumulates gradients into grad_sum."""
     super(TrainForwardBackward, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.network.add_flags(defer_inline=True)
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad_sum = grad_sum
     self.sens = sens
     self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
     self.hyper_map = ops.HyperMap()
# Esempio n. 23 (example separator from scraping; commented out so the file parses)
 def __init__(self,
              fn: Callable,
              args: List[Any],
              delta: float = 1e-3,
              max_error: float = 1e-3,
              input_selector=None,
              output_selector=None,
              sampling_times=-1,
              reduce_output=False) -> None:
     """Gradient checker for fn's trainable parameters (by-list grad)."""
     # Legacy GradOperation signature with a leading name argument.
     by_list_grad = GradOperation('grad', get_by_list=True, sens_param=True)
     self.params = ParameterTuple(fn.trainable_params())
     super(NNGradChecker, self).__init__(fn, by_list_grad, args, delta,
                                         max_error, input_selector,
                                         output_selector, sampling_times,
                                         reduce_output)
# Esempio n. 24 (example separator from scraping; commented out so the file parses)
 def init_weights(self, pretrained=''):
     """Load weights from `pretrained`, checking non-head params exist in it."""
     # Guard clause: fail fast when the path is not a regular file.
     assert os.path.isfile(pretrained), '{} is not a file'.format(pretrained)
     ckpt = load_checkpoint(pretrained)
     # Every parameter outside the deconv/final head must be in the checkpoint.
     for p in ParameterTuple(self.trainable_params()):
         if p.name.split('.')[0] not in ('deconv_layers',
                                         'final_layer'):
             assert p.name in ckpt, "parameter %s not in checkpoint" % p.name
     load_param_into_net(self, ckpt)
     print('loading pretrained model {}'.format(pretrained))
# Esempio n. 25 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, optimizer, scale_update_cell=None):
        """GRU train-one-step cell with loss scaling and overflow detection.

        Selects GPU (FloatStatus/AddN) or NPU (NPUAllocFloatStatus et al.)
        overflow-detection primitives by device target, sets up optional
        distributed gradient reduction, and keeps the loss scale from
        ``scale_update_cell`` when one is supplied.
        """
        super(GRUTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()

        # Validate the parallel mode, then enable gradient mirroring only for
        # data-/hybrid-parallel execution.
        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        # Overflow detection differs by backend: GPU uses FloatStatus over the
        # gradients; other targets use the NPU float-status registers.
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        # loss_scale stays None unless a dynamic scale manager is supplied.
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
# Esempio n. 26 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, optimizer, scale_update_cell=None, micro_batches=None, norm_clip=1.0, mech=None):
        """Differentially-private train-one-step cell with loss scaling.

        Besides the usual loss-scale/overflow machinery, prepares the helpers
        for DP training: a Split op that cuts each batch into ``micro_batches``
        slices, a gradient clipper bounded by ``norm_clip`` (l2), and the
        noise mechanism ``mech`` — presumably applied per micro-batch in the
        forward/backward pass; confirm in construct().

        Args:
            network: cell to train (loss attached).
            optimizer: optimizer for the trainable parameters.
            scale_update_cell: optional dynamic loss-scale manager.
            micro_batches: number of per-batch slices for DP accounting.
            norm_clip: positive float bound on per-slice gradient l2 norm.
            mech: noise mechanism instance, or None.
        """
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        # Legacy GradOperation signature with a leading name argument.
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        # Overflow detection differs by backend (GPU FloatStatus vs NPU registers).
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        # F.identity is the no-op reducer for single-device execution.
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        # loss_scale stays None unless a dynamic scale manager is supplied.
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params: validate norm_clip and prepare micro-batch helpers.
        self._micro_batches = micro_batches
        norm_clip = check_param_type('norm_clip', norm_clip, float)
        self._l2_norm = check_value_positive('norm_clip', norm_clip)
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._mech = mech
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
# Esempio n. 27 (example separator from scraping; commented out so the file parses)
 def __init__(self,
              net_with_loss,
              optimizer,
              sens=1.0,
              reduce_flag=False,
              mean=False,
              degree=None):
     """One-step cell whose grad op takes no explicit sensitivity input."""
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.net_with_loss = net_with_loss
     self.weights = ParameterTuple(net_with_loss.trainable_params())
     self.optimizer = optimizer
     # sens_param=False: gradients of the scalar loss are taken directly.
     self.grad = C.GradOperation(get_by_list=True, sens_param=False)
     self.reduce_flag = reduce_flag
     if reduce_flag:
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
# Esempio n. 28 (example separator from scraping; commented out so the file parses)
 def __init__(self, network, network_backbone, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):
     """One-step training cell that also keeps a handle to the backbone."""
     super(TrainOneStepCell, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.backbone = network_backbone
     self.weights = ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     # Sens dtype depends on the backend — fp16 on Ascend, fp32 elsewhere
     # (presumably to match the backward pass precision on Ascend; confirm).
     if context.get_context("device_target") == "Ascend":
         sens_dtype = np.float16
     else:
         sens_dtype = np.float32
     self.sens = Tensor((np.ones((1,)) * sens).astype(sens_dtype))
     self.reduce_flag = reduce_flag
     if reduce_flag:
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    mean, degree)
# Esempio n. 29 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, optimizer, scale_sense):
        """CNN-CTC train-one-step cell with a configurable loss-scale sense.

        Args:
            network: cell to train (loss attached).
            optimizer: optimizer applied to the trainable parameters.
            scale_sense: either an nn.Cell (dynamic loss-scale manager whose
                current scale seeds the ``scale_sense`` Parameter) or a scalar
                Tensor of shape (1,) or () (fixed scale).

        Raises:
            ValueError: if a Tensor ``scale_sense`` has any other shape.
            TypeError: if ``scale_sense`` is neither a Cell nor a Tensor.
        """
        super(DFCNNCTCTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer

        if isinstance(scale_sense, nn.Cell):
            self.loss_scaling_manager = scale_sense
            self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(),
                                                dtype=mstype.float32), name="scale_sense")
        elif isinstance(scale_sense, Tensor):
            if scale_sense.shape == (1,) or scale_sense.shape == ():
                self.scale_sense = Parameter(scale_sense, name='scale_sense')
            else:
                raise ValueError("The shape of scale_sense must be (1,) or (), but got {}".format(
                    scale_sense.shape))
        else:
            raise TypeError("The scale_sense must be Cell or Tensor, but got {}".format(
                type(scale_sense)))

        self.network.set_grad()
        self.weights = ParameterTuple(network.trainable_params())

        self.grad = C.GradOperation(get_by_list=True,
                                    sens_param=True)

        # Distributed gradient reduction for data-/hybrid-parallel modes.
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)

        # Helper primitives used by the forward/backward step.
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.less_equal = P.LessEqual()
        self.allreduce = P.AllReduce()
# Esempio n. 30 (example separator from scraping; commented out so the file parses)
    def __init__(self, network, optimizer):
        """
        Append an optimizer to the training network after that the construct
        function can be called to create the backward graph.
        Arguments:
            network: The training network.
                Note that loss function should have been added.
            optimizer: optimizer for updating the weights
        """
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.get_parameters())

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an optimizer'.format(
                type(optimizer).__name__))

        self.has_lr_schedule = False
        self.optimizer = optimizer