def __init__(self, network, sens=1000.0):
    super(TrainStepWarp, self).__init__()
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    # Note: unlike the filtered variants below, every trainable parameter
    # is placed in both the wide and the deep parameter lists here.
    for params in self.trainable_params:
        weights_w.append(params)
        weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)
    self.optimizer_w = FTRL(learning_rate=1e-2,
                            params=self.weights_w,
                            l1=1e-8,
                            l2=1e-8,
                            initial_accum=1.0)
    self.optimizer_d = Adam(self.weights_d,
                            learning_rate=3.5e-4,
                            eps=1e-8,
                            loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
def lstm_default_state(batch_size, hidden_size, num_layers, bidirectional):
    """init default input."""
    num_directions = 1
    if bidirectional:
        num_directions = 2

    if context.get_context("device_target") == "CPU":
        h_list = []
        c_list = []
        for i in range(num_layers):
            hi = Parameter(initializer(
                Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)),
                [num_directions, batch_size, hidden_size]), name='h' + str(i))
            h_list.append(hi)
            ci = Parameter(initializer(
                Tensor(np.zeros((num_directions, batch_size, hidden_size)).astype(np.float32)),
                [num_directions, batch_size, hidden_size]), name='c' + str(i))
            c_list.append(ci)
        h = ParameterTuple(tuple(h_list))
        c = ParameterTuple(tuple(c_list))
        return h, c

    h = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
    c = Tensor(np.zeros((num_layers * num_directions, batch_size, hidden_size)).astype(np.float32))
    return h, c
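# --- Usage sketch (not from the original source; sizes are illustrative). On CPU
# the initial states come back as ParameterTuples of per-layer
# (num_directions, batch, hidden) Parameters; on other targets they are single
# stacked Tensors of shape (num_layers * num_directions, batch, hidden).
h0, c0 = lstm_default_state(batch_size=32, hidden_size=128,
                            num_layers=2, bidirectional=True)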
def __init__(self,
             input_dim,
             hidden_num,
             hidden_dim,
             output_dim,
             mu,
             lamb,
             nonlinear="leaky-relu",
             norm_prod='paths',
             square_prod=False):
    super(BaseModel, self).__init__()
    self.input_dim = input_dim
    self.hidden_num = hidden_num
    self.hidden_dim = hidden_dim
    self.output_dim = output_dim
    self.mu = mu
    self.lamb = lamb
    self.nonlinear = nonlinear
    self.norm_prod = norm_prod
    self.square_prod = square_prod
    self.normal = msd.Normal(dtype=mstype.float32)
    self.extra_params = []

    # initialize current adjacency matrix
    self.adjacency = msnp.ones(
        (self.input_dim, self.input_dim), dtype=mstype.float32) - msnp.eye(
            self.input_dim, dtype=mstype.float32)

    # Generate layer_list
    layer_list = [self.hidden_dim] * self.hidden_num
    layer_list.insert(0, self.input_dim)
    layer_list.append(self.output_dim)

    # Instantiate the parameters of each layer in the model of each variable
    tmp_weights = list()
    tmp_biases = list()
    for i, item in enumerate(layer_list[:-1]):
        in_dim = item
        out_dim = layer_list[i + 1]
        tmp_weights.append(
            Parameter(msnp.zeros((self.input_dim, out_dim, in_dim),
                                 dtype=mstype.float32),
                      requires_grad=True,
                      name='w' + str(i)))
        tmp_biases.append(
            Parameter(msnp.zeros((self.input_dim, out_dim),
                                 dtype=mstype.float32),
                      requires_grad=True,
                      name='b' + str(i)))
    self.weights = ParameterTuple(tmp_weights)
    self.biases = ParameterTuple(tmp_biases)

    # reset initialization parameters
    self.reset_params()
def __init__(self, network, config, sens=1000.0):
    super(TrainStepWrap, self).__init__()
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    # Split parameters by name: the wide branch is optimized with FTRL,
    # everything else belongs to the deep branch and uses Adam.
    for params in self.trainable_params:
        if 'wide' in params.name:
            weights_w.append(params)
        else:
            weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)
    self.optimizer_w = FTRL(learning_rate=config.ftrl_lr,
                            params=self.weights_w,
                            l1=5e-4,
                            l2=5e-4,
                            initial_accum=0.1,
                            loss_scale=sens)
    # self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr, loss_scale=sens)
    self.optimizer_d = Adam(self.weights_d,
                            learning_rate=config.adam_lr,
                            eps=1e-6,
                            loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation('grad_w', get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation('grad_d', get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)

    self.reducer_flag = False
    self.grad_reducer_w = None
    self.grad_reducer_d = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer_w = DistributedGradReducer(
            self.optimizer_w.parameters, mean, degree)
        self.grad_reducer_d = DistributedGradReducer(
            self.optimizer_d.parameters, mean, degree)
def __init__(self, network, sens=1024.0, host_device_mix=False,
             parameter_server=False, sparse=False):
    super(TrainStepWrap, self).__init__()
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                         ParallelMode.AUTO_PARALLEL)
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    for params in self.trainable_params:
        if 'wide' in params.name:
            weights_w.append(params)
        else:
            weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)

    if (sparse and is_auto_parallel) or parameter_server:
        self.optimizer_d = LazyAdam(
            self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
        self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        if host_device_mix or parameter_server:
            self.optimizer_w.target = "CPU"
            self.optimizer_d.target = "CPU"
    else:
        self.optimizer_d = Adam(
            self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
        self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)

    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
    self.loss_net_w.set_grad()
    self.loss_net_d.set_grad()

    self.grad_reducer_w = None
    self.grad_reducer_d = None
    self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                          ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = context.get_auto_parallel_context("device_num")
        self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
        self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
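# --- Sketch (an assumption, not taken from the original source): the construct
# method that typically accompanies a two-branch wrapper like the one above. It
# scales each branch's loss by `sens`, differentiates the wide and deep losses
# separately, reduces gradients in data-parallel mode, and applies both
# optimizers. Argument names are illustrative; P and F are assumed to be the
# usual `mindspore.ops.operations` and `mindspore.ops.functional` aliases.
def construct(self, batch_ids, batch_wts, label):
    loss_w = self.loss_net_w(batch_ids, batch_wts, label)
    loss_d = self.loss_net_d(batch_ids, batch_wts, label)
    sens_w = P.Fill()(P.DType()(loss_w), P.Shape()(loss_w), self.sens)
    sens_d = P.Fill()(P.DType()(loss_d), P.Shape()(loss_d), self.sens)
    grads_w = self.grad_w(self.loss_net_w, self.weights_w)(batch_ids, batch_wts, label, sens_w)
    grads_d = self.grad_d(self.loss_net_d, self.weights_d)(batch_ids, batch_wts, label, sens_d)
    if self.reducer_flag:
        grads_w = self.grad_reducer_w(grads_w)
        grads_d = self.grad_reducer_d(grads_d)
    return (F.depend(loss_w, self.optimizer_w(grads_w)),
            F.depend(loss_d, self.optimizer_d(grads_d)))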
def test_pynative_lenet_with_new_interface():
    context.set_context(mode=context.PYNATIVE_MODE)

    epoch_size = 20
    batch_size = 32
    inputs = Tensor(np.ones([batch_size, 1, 32, 32]).astype(np.float32))
    labels = Tensor(np.ones([batch_size]).astype(np.int32))

    net = LeNet()
    criterion = CrossEntropyLoss()
    net_with_criterion = WithLossCell(net, criterion)
    net_with_criterion.set_train()

    weights = ParameterTuple(filter(lambda x: x.requires_grad, net.get_parameters()))
    optimizer = Momentum(weights, 0.1, 0.9)

    forward_value_and_grad = nn.ForwardValueAndGrad(network=net_with_criterion,
                                                    weights=weights,
                                                    get_by_list=True)
    total_time = 0
    for epoch in range(0, epoch_size):
        start_time = time.time()
        loss_output, grads = forward_value_and_grad(inputs, labels)
        optimizer(grads)
        end_time = time.time()
        cost_time = end_time - start_time
        total_time = total_time + cost_time
        print("======epoch: ", epoch, " loss: ", loss_output.asnumpy(),
              " cost time: ", cost_time)
    assert loss_output.asnumpy() < 0.005
    assert loss_output.asnumpy() > 0.003
def __init__(self, parameters, learning_rate=0.001, batch_size=1):
    super(SGD, self).__init__()
    self.parameters = ParameterTuple(parameters)
    self.learning_rate = Tensor(np.array([learning_rate]).astype(np.float32))
    self.batch_size = Tensor(np.array([batch_size]).astype(np.float32))
    self.hyper_map = C.HyperMap()
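# --- Sketch (an assumption): one way this hand-rolled SGD could complete its
# update step, mapping a plain update rule over (gradient, parameter) pairs
# with HyperMap. `_sgd_apply` is a hypothetical helper, shown for illustration
# only; F is assumed to be the `mindspore.ops.functional` alias.
_sgd_apply = C.MultitypeFuncGraph("_sgd_apply")

@_sgd_apply.register("Tensor", "Tensor", "Tensor", "Tensor")
def _update(lr, batch_size, gradient, parameter):
    """parameter <- parameter - lr * gradient / batch_size"""
    return P.Assign()(parameter, parameter - lr * gradient / batch_size)

def construct(self, gradients):
    return self.hyper_map(F.partial(_sgd_apply, self.learning_rate, self.batch_size),
                          gradients, self.parameters)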
def __init__(self,
             input_dim,
             hidden_num,
             hidden_dim,
             output_dim,
             mu,
             lamb,
             nonlinear="leaky-relu",
             norm_prod='paths',
             square_prod=False):
    super(NonlinearGaussANM, self).__init__(input_dim=input_dim,
                                            hidden_num=hidden_num,
                                            hidden_dim=hidden_dim,
                                            output_dim=output_dim,
                                            mu=mu,
                                            lamb=lamb,
                                            nonlinear=nonlinear,
                                            norm_prod=norm_prod,
                                            square_prod=square_prod)
    # extra parameters are log_std
    extra_params = np.ones((self.input_dim,))
    np.random.shuffle(extra_params)
    extra_params_list = list()
    for i, extra_param in enumerate(extra_params):
        extra_params_list.append(
            Parameter(MsTensor(np.log(extra_param).reshape(1),
                               dtype=mstype.float32),
                      requires_grad=True,
                      name='e' + str(i)))
    # each element in the list represents a variable,
    # the size of the element is the number of extra_params per var
    self.extra_params = ParameterTuple(extra_params_list)
def test_train_lenet_with_new_interface(num_classes=10, epoch=20, batch_size=32):
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    network = LeNet5(num_classes)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_with_criterion = WithLossCell(network, criterion)
    net_with_criterion.set_train()

    weights = ParameterTuple(network.trainable_params())
    optimizer = nn.Momentum(weights, 0.1, 0.9)

    train_network = ForwardValueAndGrad(network=net_with_criterion,
                                        weights=weights,
                                        get_by_list=True,
                                        sens_param=True)
    losses = []
    for i in range(0, epoch):
        data = Tensor(np.ones([batch_size, 1, 32, 32]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        sens = Tensor(np.ones([1]).astype(np.float32))
        loss, grads = train_network(data, label, sens)
        grads = F.identity(grads)
        optimizer(grads)
        losses.append(loss)
    assert losses[-1].asnumpy() < 0.01
    assert losses[-1].asnumpy() > 0.001
def __init__(self, network, lr, momentum, is_train=True):
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = Momentum(self.weights, lr, momentum)
    self.grad = C.GradOperation(get_by_list=True)
    self.is_train = is_train
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = ops.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = ops.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = ops.Cast()
    self.alloc_status = ops.NPUAllocFloatStatus()
    self.get_status = ops.NPUGetFloatStatus()
    self.clear_before_grad = ops.NPUClearFloatStatus()
    self.reduce_sum = ops.ReduceSum(keep_dims=False)
    self.depend_parameter_use = ops.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = ops.LessEqual()
    self.hyper_map = ops.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
                                           dtype=mstype.float32),
                                    name="loss_scale")
def test_big_batchSize_with_new_interface(num_classes=10, epoch=8, batch_size=338):
    net = resnet50(num_classes)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    net_with_criterion = WithLossCell(net, criterion)
    net_with_criterion.set_train()

    weights = ParameterTuple(filter(lambda x: x.requires_grad, net.get_parameters()))
    optimizer = Momentum(weights, 0.1, 0.9)

    train_network = ForwardValueAndGrad(network=net_with_criterion,
                                        weights=weights,
                                        get_by_list=True,
                                        sens_param=True,
                                        sens=1.0)
    losses = []
    for i in range(0, epoch):
        data = Tensor(np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
        label = Tensor(np.ones([batch_size]).astype(np.int32))
        loss, grads = train_network(data, label)
        grads = F.identity(grads)
        optimizer(grads)
        losses.append(loss)
    assert losses[-1].asnumpy() < 0.8
def __init__(self, network, total_steps=1, sens=16384.0):
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_train()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    lr = dynamic_lr(0.01, total_steps, 5000)
    self.optimizer = nn.Adam(self.weights,
                             learning_rate=lr,
                             beta1=0.9,
                             beta2=0.999,
                             eps=1e-8,
                             loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    self.reducer_flag = False
    self.grad_reducer = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, mean, degree)
def __init__(self, network, grad_op):
    if isinstance(network, nn.Cell):
        super(NNBackwardWithNoSens, self).__init__(auto_prefix=False)
    else:
        super(NNBackwardWithNoSens, self).__init__()
    self.network = network
    self.grad = grad_op
    self.params = ParameterTuple(network.trainable_params())
def __init__(self, network):
    super(TrainStepWrap, self).__init__()
    self.network = network
    self.network.set_train()
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = nn.Momentum(self.weights, 0.1, 0.9)
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation(get_by_list=True)
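# --- Sketch (an assumption): the conventional construct for a wrapper like the
# one above, differentiating the wrapped network w.r.t. its weights and feeding
# the gradients to the optimizer. Input names are illustrative; F is assumed to
# be the `mindspore.ops.functional` alias.
def construct(self, data, label):
    weights = self.weights
    loss = self.network(data, label)
    grads = self.grad(self.network, weights)(data, label)
    return F.depend(loss, self.optimizer(grads))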
def __init__(self, network, sens):
    super(TrainStepWrap2, self).__init__()
    self.network = network
    self.network.set_train()
    self.weights = ParameterTuple(network.get_parameters())
    self.optimizer = nn.Momentum(self.weights, 0.1, 0.9)
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.sens = sens
def __init__(self, network, lr=5e-8, eps=1e-8, loss_scale=1000.0):
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_train()
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = Adam(self.weights,
                          learning_rate=lr,
                          eps=eps,
                          loss_scale=loss_scale)
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.sens = loss_scale
def __init__(self, grad, network, wrt_params=False, real_inputs_count=None):
    super().__init__()
    self.network = network
    self.grad = grad
    self.sens_param = self.grad.sens_param
    self.wrt_params = wrt_params
    self.real_inputs_count = real_inputs_count
    if self.wrt_params:
        self.params = ParameterTuple(self.network.trainable_params())
def __init__(self,
             input_size,
             hidden_size,
             num_layers=1,
             has_bias=True,
             batch_first=False,
             dropout=0.0,
             bidirectional=False):
    super(StackLSTM, self).__init__()
    self.num_layers = num_layers
    self.batch_first = batch_first
    self.transpose = P.Transpose()

    # direction number
    num_directions = 2 if bidirectional else 1

    # input_size list
    input_size_list = [input_size]
    for i in range(num_layers - 1):
        input_size_list.append(hidden_size * num_directions)

    # layers
    layers = []
    for i in range(num_layers):
        layers.append(nn.LSTMCell(input_size=input_size_list[i],
                                  hidden_size=hidden_size,
                                  has_bias=has_bias,
                                  batch_first=batch_first,
                                  bidirectional=bidirectional,
                                  dropout=dropout))

    # weights
    weights = []
    for i in range(num_layers):
        # weight size
        weight_size = (input_size_list[i] + hidden_size) * num_directions * hidden_size * 4
        if has_bias:
            bias_size = num_directions * hidden_size * 4
            weight_size = weight_size + bias_size
        # numpy weight
        stdv = 1 / math.sqrt(hidden_size)
        w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32)
        # lstm weight
        weights.append(Parameter(initializer(Tensor(w_np), w_np.shape),
                                 name="weight" + str(i)))

    self.lstms = layers
    self.weight = ParameterTuple(tuple(weights))
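# --- Usage sketch (an assumption; sizes are illustrative): pairing StackLSTM
# with lstm_default_state from earlier in this section. With batch_first=False
# the expected input layout is (seq_len, batch, input_size).
lstm = StackLSTM(input_size=300, hidden_size=128, num_layers=2, bidirectional=True)
h0, c0 = lstm_default_state(batch_size=32, hidden_size=128,
                            num_layers=2, bidirectional=True)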
def __init__(self, network, optimizer, sens=1.0, reduce_flag=False, mean=True, degree=None):
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = Tensor((np.ones(1, dtype=np.float32)) * sens)
    self.reducer_flag = reduce_flag
    if self.reducer_flag:
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
def __init__(self, mixture_size: int, do_layer_norm: bool = False) -> None:
    super(Scalar_mix, self).__init__()
    self.mixture_size = mixture_size
    self.do_layer_norm = do_layer_norm
    self.scalar_parameters = ParameterTuple(
        [Parameter(Tensor(np.array([0.0]), mindspore.float32))
         for _ in range(mixture_size)])
    self.gamma = Parameter(Tensor(np.array([0.0]), mindspore.float32))
    self.sum = P.ReduceSum()
    self.sqrt = P.Sqrt()
    self.cat = P.Concat()
    # P.ExpandDims takes no constructor arguments; the axis is passed at call time.
    self.unsqueeze = P.ExpandDims()
def __init__(self, network, optimizer, grad_sum, sens=1.0):
    super(TrainForwardBackward, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad_sum = grad_sum
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.hyper_map = ops.HyperMap()
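# --- Sketch (an assumption): with a `grad_sum` buffer of Parameters, a matching
# construct usually computes this step's gradients and accumulates them in
# place with HyperMap instead of applying the optimizer immediately.
# `_sum_op` is a hypothetical accumulate helper shown for illustration.
_sum_op = ops.MultitypeFuncGraph("grad_sum_op")

@_sum_op.register("Tensor", "Tensor")
def _cumulative_grad(grad_sum, grad):
    """grad_sum <- grad_sum + grad"""
    return ops.AssignAdd()(grad_sum, grad)

def construct(self, *inputs):
    weights = self.weights
    loss = self.network(*inputs)
    sens = ops.Fill()(ops.DType()(loss), ops.Shape()(loss), self.sens)
    grads = self.grad(self.network, weights)(*inputs, sens)
    return ops.depend(loss, self.hyper_map(_sum_op, self.grad_sum, grads))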
def __init__(self,
             fn: Callable,
             args: List[Any],
             delta: float = 1e-3,
             max_error: float = 1e-3,
             input_selector=None,
             output_selector=None,
             sampling_times=-1,
             reduce_output=False) -> None:
    grad_op = GradOperation('grad', get_by_list=True, sens_param=True)
    self.params = ParameterTuple(fn.trainable_params())
    super(NNGradChecker, self).__init__(fn, grad_op, args, delta, max_error,
                                        input_selector, output_selector,
                                        sampling_times, reduce_output)
def init_weights(self, pretrained=''):
    if os.path.isfile(pretrained):
        # load params from pretrained
        param_dict = load_checkpoint(pretrained)
        weight = ParameterTuple(self.trainable_params())
        for w in weight:
            if w.name.split('.')[0] not in ('deconv_layers', 'final_layer'):
                assert w.name in param_dict, "parameter %s not in checkpoint" % w.name
        load_param_into_net(self, param_dict)
        print('loading pretrained model {}'.format(pretrained))
    else:
        assert False, '{} is not a file'.format(pretrained)
def __init__(self, network, optimizer, scale_update_cell=None):
    super(GRUTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode is not supported: {}".format(self.parallel_mode))
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
                                           dtype=mstype.float32))
def __init__(self,
             network,
             optimizer,
             scale_update_cell=None,
             micro_batches=None,
             norm_clip=1.0,
             mech=None):
    super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.depend_parameter_use = ControlDepend(depend_mode=1)
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL]
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(),
                                           dtype=mstype.float32),
                                    name="loss_scale")
    self.add_flags(has_effect=True)

    # dp params
    self._micro_batches = micro_batches
    norm_clip = check_param_type('norm_clip', norm_clip, float)
    self._l2_norm = check_value_positive('norm_clip', norm_clip)
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._mech = mech
    self._tuple_add = _TupleAdd()
    self._hyper_map = C.HyperMap()
    self._micro_float = Tensor(micro_batches, mstype.float32)
def __init__(self, net_with_loss, optimizer, sens=1.0, reduce_flag=False,
             mean=False, degree=None):
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.net_with_loss = net_with_loss
    self.weights = ParameterTuple(net_with_loss.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=False)
    self.reduce_flag = reduce_flag
    if reduce_flag:
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
def __init__(self, network, network_backbone, optimizer, sens=1.0,
             reduce_flag=False, mean=True, degree=None):
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.backbone = network_backbone
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    if context.get_context("device_target") == "Ascend":
        self.sens = Tensor((np.ones((1,)) * sens).astype(np.float16))
    else:
        self.sens = Tensor((np.ones((1,)) * sens).astype(np.float32))
    self.reduce_flag = reduce_flag
    if reduce_flag:
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
def __init__(self, network, optimizer, scale_sense):
    super(DFCNNCTCTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.optimizer = optimizer

    if isinstance(scale_sense, nn.Cell):
        self.loss_scaling_manager = scale_sense
        self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(),
                                            dtype=mstype.float32),
                                     name="scale_sense")
    elif isinstance(scale_sense, Tensor):
        if scale_sense.shape == (1,) or scale_sense.shape == ():
            self.scale_sense = Parameter(scale_sense, name='scale_sense')
        else:
            raise ValueError("The shape of scale_sense must be (1,) or (), but got {}".format(
                scale_sense.shape))
    else:
        raise TypeError("The scale_sense must be Cell or Tensor, but got {}".format(
            type(scale_sense)))

    self.network.set_grad()
    self.weights = ParameterTuple(network.trainable_params())
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode is not supported: {}".format(self.parallel_mode))
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
    self.addn = P.AddN()
    self.reshape = P.Reshape()
    self.hyper_map = C.HyperMap()
    self.less_equal = P.LessEqual()
    self.allreduce = P.AllReduce()
def __init__(self, network, optimizer):
    """
    Append an optimizer to the training network, after which the construct
    function can be called to create the backward graph.

    Arguments:
        network: The training network. Note that the loss function should
            already have been added.
        optimizer: Optimizer for updating the weights.
    """
    super(_TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.get_parameters())
    if not isinstance(optimizer, Optimizer):
        raise TypeError('{} is not an optimizer'.format(type(optimizer).__name__))
    self.has_lr_schedule = False
    self.optimizer = optimizer