    def construct(self, x):
        t = x
        for op in self.op_seq:
            t = op(t)
        return t


def test_op_seq_test():
    context.set_context(mode=context.GRAPH_MODE)
    net = OpSeqNet()
    input_np = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(input_np)
    net(input_me)


_grad_fusion = C.MultitypeFuncGraph("grad_fusion")


@_grad_fusion.register("Tensor", "Function")
def tensor_grad_scale(x, op):
    return op(x)


class AllReduceTest(Cell):
    def __init__(self, loop_count=1):
        super().__init__()
        self.op_list = ()
        self.fusion_flag = [0, 1, 1, 0, 1, 0]
        for i in self.fusion_flag:
            op = P.AllReduce().add_prim_attr('fusion', i)
            self.op_list = self.op_list + (op,)
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""sgd"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.common.parameter import Parameter
from mindspore._checkparam import ParamValidator as validator
from .optimizer import Optimizer

sgd_opt = C.MultitypeFuncGraph("sgd_opt")


@sgd_opt.register("Function", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor")
def _tensor_run_opt(opt, learning_rate, momentum, gradient, weight, accum, stat):
    """Apply sgd optimizer to the weight parameter."""
    success = True
    success = F.depend(success, opt(weight, gradient, learning_rate, accum, momentum, stat))
    return success


@sgd_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
def _tensor_run_opt_ext(opt, learning_rate, momentum, gradient, weight, accum, stat):
    """Apply sgd optimizer to the weight parameter using Tensor."""
    success = True
    success = F.depend(success, opt(weight, gradient, learning_rate, accum, momentum, stat))
    return success
    def bprop(dims, dtype, out, dout):
        return zeros_like(dims)

    return bprop


@bprop_getters.register(P.DType)
def get_bprop_dtype(self):
    """Generate bprop for DType"""

    def bprop(x, out, dout):
        return (zeros_like(x),)

    return bprop


dout_cast = C.MultitypeFuncGraph("dout_cast")


@dout_cast.register("Tensor", "Tensor")
def dout_cast_tensor(dout, x):
    cast = P.Cast()
    get_dtype = P.DType()
    dx = cast(dout, get_dtype(x))
    return dx


@dout_cast.register("Number", "Number")
def dout_cast_number(dout, x):
    cast = P.Cast()
    get_dtype = P.DType()
    dx = cast(dout, get_dtype(x))
    return dx
            target_ids, target_mask, label_ids, label_weights,
            self.cast(F.tuple_to_array((self.sens,)), mstype.float32))
        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
        if self.reducer_flag:
            # apply grad reducer on grads
            grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        return F.depend(loss, succ)


grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()


@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    return grad * F.cast(reciprocal(scale), F.dtype(grad))


class TransformerTrainOneStepWithLossScaleCell(nn.Cell):
    """
    Encapsulation class of Transformer network training.

    Appends an optimizer to the training network; after that, the construct
    function can be called to create the backward graph.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
""" test_multitype """
import numpy as np

from mindspore.common.api import ms_function
from mindspore.ops import Primitive
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from mindspore import Tensor
from ...ut_filter import non_graph_engine

tensor_add = P.TensorAdd()
scalar_add = Primitive('scalar_add')
add = C.MultitypeFuncGraph('add')


@add.register("Number", "Number")
def add_scalar(x, y):
    return scalar_add(x, y)


@add.register("Tensor", "Tensor")
def add_tensor(x, y):
    return tensor_add(x, y)


@ms_function
def mainf(x, y):
    return add(x, y)
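
# A minimal dispatch sketch (assumed, not part of the original test file): the
# single `add` MultitypeFuncGraph resolves per call on the argument types, so
# scalar inputs reach add_scalar and Tensor inputs reach add_tensor.
@non_graph_engine
def test_add_dispatch_sketch():
    scalar_out = mainf(1, 2)  # matches the ("Number", "Number") registration
    tensor_out = mainf(Tensor(np.ones([2, 2]).astype(np.float32)),
                       Tensor(np.ones([2, 2]).astype(np.float32)))  # ("Tensor", "Tensor")
    return scalar_out, tensor_out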
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""FTRL"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.common import Tensor
import mindspore.common.dtype as mstype
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer, _apply_decay, _grad_scale

_ftrl_opt = C.MultitypeFuncGraph("ftrl_opt")


@_ftrl_opt.register("Function", "Function", "Function", "Function", "Number", "Number", "Number", "Tensor",
                    "Tensor", "RowTensor", "Tensor", "Tensor", "Bool")
def _tensor_run_opt_with_sparse(opt, spars_opt, push, pull, l1, l2, lr_power, learning_rate, linear,
                                gradient, weight, moment, ps_parameter):
    """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter:
        op_shape = P.Shape()
        shapes = (op_shape(weight), op_shape(moment), op_shape(linear),
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""lars optimizer"""
from typing import Iterable

from mindspore.common import dtype as mstype
from mindspore.common import Tensor
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore._checkparam import Validator as validator
from .optimizer import grad_scale, Optimizer

lars_opt = C.MultitypeFuncGraph("lars_opt")


@lars_opt.register("Function", "Number", "Tensor", "Tensor", "Tensor", "Bool", "Bool")
def _tensor_run_opt(lars, weight_decay, learning_rate, gradient, weight, decay_flag, lars_flag):
    """Apply lars optimizer to the weight parameter."""
    if lars_flag:
        op_reduce_sum = P.SquareSumAll()
        w_square_sum, grad_square_sum = op_reduce_sum(weight, gradient)
        if decay_flag:
            grad_t = lars(weight, gradient, w_square_sum, grad_square_sum, weight_decay, learning_rate)
        else:
            num_zero = 0.0
            grad_t = lars(weight, gradient, w_square_sum, grad_square_sum, num_zero, learning_rate)
        return grad_t
    return gradient
'''
"""Automatic differentiation with grad clip."""
import numpy as np

from mindspore.parallel._utils import (_get_device_num, _get_gradients_mean,
                                       _get_parallel_mode)
from mindspore.context import ParallelMode
from mindspore.common import dtype as mstype
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P
from mindspore.nn.cell import Cell
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
import mindspore.nn as nn
from mindspore.common.tensor import Tensor

compute_norm = C.MultitypeFuncGraph("compute_norm")


@compute_norm.register("Tensor")
def _compute_norm(grad):
    norm = nn.Norm()
    norm = norm(F.cast(grad, mstype.float32))
    ret = F.expand_dims(F.cast(norm, mstype.float32), 0)
    return ret


grad_div = C.MultitypeFuncGraph("grad_div")


@grad_div.register("Tensor", "Tensor")
def _grad_div(val, grad):
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""PROXIMAL_ADA_GRAD"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.common import Tensor
import mindspore.common.dtype as mstype
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer

_proximal_ada_grad_opt = C.MultitypeFuncGraph("proximal_ada_grad_opt")


@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "IndexedSlices",
                                 "Tensor", "Tensor")
def _tensor_run_opt_with_sparse(opt, sparse_opt, learning_rate, l1, l2, gradient, weight, accum):
    """Apply sparse proximal_ada_grad optimizer to the weight parameter when the gradient is sparse."""
    success = True
    success = F.depend(success, sparse_opt(weight, accum, learning_rate, l1, l2,
                                           gradient.values(), gradient.indices()))
    return success


@_proximal_ada_grad_opt.register("Function", "Function", "Tensor", "Tensor", "Tensor", "Tensor",
                                 "Tensor", "Tensor")
def _tensor_run_opt(opt, sparse_opt, l1, l2, learning_rate, gradient, weight, accum):
    """Apply proximal_ada_grad optimizer to the weight parameter."""
    success = True
    success = F.depend(success, opt(weight, accum, learning_rate, l1, l2, gradient))
    return success
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""grad_reducer_thor"""
import mindspore.common.dtype as mstype
from mindspore.communication.management import GlobalComm, get_group_size
from mindspore.nn.cell import Cell
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.ops.operations.comm_ops import AllReduce, ReduceOp

reduce_opt = C.MultitypeFuncGraph("reduce_opt")

_all_reduce_A = AllReduce()


def _init_optimizer_allreduce(group):
    global _all_reduce_A
    _all_reduce_A = AllReduce(ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
    _all_reduce_A.add_prim_attr('fusion', group)


@reduce_opt.register("Function", "Number", "Tensor")
def _tensors_allreduce_mean(mul, degree, grad):
    degree = F.scalar_cast(degree, F.dtype(grad))
    grad = _all_reduce_A(grad)
    cast_op = P.Cast()
import mindspore.common.dtype as mstype
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.parameter import ParameterTuple
from mindspore.context import ParallelMode
from mindspore import nn
from mindspore.communication.management import get_group_size
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore import context
from src.fasttext_model import FastText

GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0

clip_grad = C.MultitypeFuncGraph("clip_grad")


@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (Tensor): Gradient to clip.

    Outputs:
        Tensor, the clipped gradient.
    """
# ============================================================================
"""adam"""
import numpy as np

from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer

_adam_opt = C.MultitypeFuncGraph("adam_opt")
_adam_push_pull_opt = C.MultitypeFuncGraph("_adam_push_pull_opt")


@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", "Tensor",
                    "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be in range [0.0, 1.0].
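
# Reference sketch (an assumption for readability, not the fused graph above):
# the decoupled weight-decay Adam step that _update_run_op is expected to
# perform, written with plain NumPy arrays so the update rule is easy to follow.
def _adam_weight_decay_reference(beta1, beta2, eps, lr, weight_decay,
                                 param, m, v, gradient, decay_flag=True):
    m = beta1 * m + (1.0 - beta1) * gradient               # 1st moment estimate
    v = beta2 * v + (1.0 - beta2) * gradient * gradient    # 2nd moment estimate
    update = m / (np.sqrt(v) + eps)
    if decay_flag:
        update = update + weight_decay * param             # decoupled weight decay
    return param - lr * update, m, v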
from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer
from .. import layer

num_one = Tensor(np.ones([1]), mstype.float32)

_lamb_opt = C.MultitypeFuncGraph("lamb_opt")


@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor",
                    "Tensor", "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        global_step (Tensor): Global step.
        for i in range(self.dev_num - 1):
            status = F.depend(
                F.control_depend(new_param_group[i], new_param_group[i + 1][0]), status)
        return status

    def construct(self, *hyper_params):
        raise NotImplementedError


op_add = P.AddN()
op_gather = P.GatherV2()
op_mul = P.Mul()

_apply_decay = C.MultitypeFuncGraph("apply_decay")


@_apply_decay.register("Tensor", "Bool", "Tensor", "RowTensor")
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        values = op_add((op_gather(weight, indices, 0) * F.cast(weight_decay, F.dtype(weight)), gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient
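
# Usage sketch (hypothetical; mirrors how an Optimizer typically maps this graph
# over its parameters): the shared weight_decay is bound with F.partial, and each
# (decay_flag, weight, gradient) triple selects the matching registration (the
# RowTensor branch is shown above; a dense branch normally follows it).
_decay_hyper_map = C.HyperMap()


def _decay_weight_sketch(weight_decay, decay_flags, params, gradients):
    return _decay_hyper_map(F.partial(_apply_decay, weight_decay), decay_flags, params, gradients)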
# ============================================================================
"""adam"""
import numpy as np

from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer

_adam_opt = C.MultitypeFuncGraph("adam_opt")


@_adam_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor", "Tensor",
                    "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer
from .. import layer
from .. import graph_kernels as G

num_one = Tensor(np.ones([1]), mstype.float32)

_lamb_opt = C.MultitypeFuncGraph("lamb_opt")


@_lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Number", "Tensor", "Tensor", "Tensor",
                    "Tensor", "Bool", "Bool")
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
from mindspore.ops import composite as C


class SquaredLoss(nn.Cell):
    """Squared loss function."""
    def __init__(self):
        super(SquaredLoss, self).__init__()
        self.reshape = P.Reshape()
        self.shape = P.Shape()
        self.two = Tensor(np.array([2.0]).astype(np.float32))
        self.reduce_sum = P.ReduceSum()

    def construct(self, y_hat, y):
        ret = y_hat - self.reshape(y, self.shape(y_hat))
        return self.reduce_sum((ret * ret) / self.two, (0,))


opt_step = C.MultitypeFuncGraph("opt_step")


@opt_step.register("Tensor", "Tensor", "Tensor", "Tensor")
def update_opt_step(learning_rate, batch_size, parameter, gradient):
    """
    Update opt step.

    Args:
        learning_rate (Tensor): Learning rate.
        batch_size (Tensor): Batch Size.
        parameter (Tensor): Parameter.
        gradient (Tensor): Gradients.

    Returns:
    """
    next_param = parameter - learning_rate * gradient / batch_size
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.context import ParallelMode
from mindspore.communication.management import get_group_size
from mindspore import context
from .bert_for_pre_training import clip_grad
from .finetune_eval_model import BertCLSModel, BertNERModel, BertSquadModel
from .utils import CrossEntropyCalculation

GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0

grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()


@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    return grad * reciprocal(scale)


_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
grad_overflow = P.FloatStatus()


@_grad_overflow.register("Tensor")
def _tensor_grad_overflow(grad):
    return grad_overflow(grad)
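
# Usage sketch (assumed, following the common loss-scale pattern): after the
# backward pass the scaled gradients are divided back down by mapping grad_scale
# over the gradient tuple (scaling_sens must be a Tensor so the registration
# above matches), and per-gradient overflow status is gathered with
# _grad_overflow for the later overflow check.
_scale_hyper_map = C.HyperMap()


def _unscale_and_check_sketch(scaling_sens, grads):
    grads = _scale_hyper_map(F.partial(grad_scale, scaling_sens), grads)
    overflow_flags = _scale_hyper_map(_grad_overflow, grads)
    return grads, overflow_flags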
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""ADA_GRAD"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore._checkparam import Validator as validator
from .optimizer import Optimizer

_ada_grad_opt = C.MultitypeFuncGraph("ada_grad_opt")


@_ada_grad_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor")
def _tensor_run_opt(opt, learning_rate, weight, accum, gradient):
    """Apply ada_grad optimizer to the weight parameter."""
    success = True
    success = F.depend(success, opt(weight, accum, learning_rate, gradient))
    return success


def _check_param_value(accum, update_slots, prim_name=None):
    """Check inputs param."""
    validator.check_value_type("accum", accum, [float], prim_name)
    validator.check_value_type("update_slots", update_slots, [bool], prim_name)
    validator.check_non_negative_float(accum, "accum", prim_name)
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.context import ParallelMode
from mindspore.communication.management import get_group_size
from mindspore import context
from .bert_model import BertModel

GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0

clip_grad = C.MultitypeFuncGraph("clip_grad")


@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (Tensor): Gradient to clip.

    Outputs:
        Tensor, the clipped gradient.
    """
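
# Usage sketch (hypothetical, matching the usual training-cell pattern): the two
# clipping constants are bound with F.partial and the graph is mapped over the
# tuple of gradients returned by the backward pass.
_clip_hyper_map = C.HyperMap()


def _clip_grads_sketch(grads):
    return _clip_hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)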
        ones_ = self.fill(self.dtype(cond), self.shape(cond), 1.0)
        l2sum_safe = self.select_(cond, l2sum, self.cast(ones_, self.dtype(l2sum)))
        l2norm = self.select_(cond, self.sqrt(l2sum_safe), l2sum)
        intermediate = x * clip_norm
        max_norm = self.max_op(l2norm, clip_norm)
        values_clip = self.cast(intermediate, mstype.float32) / self.expand_dims(max_norm, -1)
        values_clip = self.reshape(values_clip, self.shape(x))
        values_clip = F.identity(values_clip)
        return values_clip


clip_grad = C.MultitypeFuncGraph("clip_grad")


# pylint: disable=consider-using-in
@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (Tensor): Gradient to clip.

    Outputs:
        Tensor, the clipped gradient.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""rmsprop"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore._checkparam import ParamValidator as validator
import mindspore.common.dtype as mstype
from mindspore.common import Tensor
from .optimizer import Optimizer, grad_scale, apply_decay

rmsprop_opt = C.MultitypeFuncGraph("rmsprop_opt")
centered_rmsprop_opt = C.MultitypeFuncGraph("centered_rmsprop_opt")


@rmsprop_opt.register("Function", "Number", "Number", "Number", "Number", "Tensor", "Tensor", "Tensor", "Tensor")
def _rmsprop_opt(opt, learning_rate, decay, epsilon, momentum, weight, ms, mom, grad):
    """Apply rmsprop optimizer to the weight parameter."""
    success = True
    success = F.depend(success, opt(weight, ms, mom, grad, learning_rate, decay, momentum, epsilon))
    return success
# limitations under the License.
# ============================================================================
"""THOR"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.ops import _selected_ops
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common.tensor import Tensor
import mindspore.common.dtype as mstype
from mindspore._checkparam import check_bool
from mindspore._checkparam import Validator as validator
from mindspore.nn.optim.optimizer import Optimizer
from mindspore.parallel._utils import _get_device_num, _get_mirror_mean
from src.grad_reducer_thor import DistributedGradReducerThor

_momentum_opt = C.MultitypeFuncGraph("momentum_opt")

op_add = P.AddN()
apply_decay = C.MultitypeFuncGraph("apply_decay")


@apply_decay.register("Number", "Bool", "Tensor", "Tensor")
def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        return op_add((weight * weight_decay, gradient))
    return gradient


@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor")
import numpy as np

from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer
from .. import layer

num_one = Tensor(np.ones([1]), mstype.float32)

lamb_opt = C.MultitypeFuncGraph("lamb_opt")


@lamb_opt.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
                   "Tensor", "Bool")
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, param, m, v, gradient, decay_flag):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0.
        global_step (Tensor): Global step.
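
# Rough reference (an assumption, not the graph above): the textbook LAMB rule
# the kernel is built around -- Adam-style moments with bias correction by
# global_step, optional decoupled weight decay, and a per-layer trust ratio.
# The fused implementation may differ in details such as trust-ratio clipping.
def _lamb_reference(beta1, beta2, eps, lr, weight_decay, global_step,
                    param, m, v, gradient, decay_flag=True):
    m = beta1 * m + (1.0 - beta1) * gradient
    v = beta2 * v + (1.0 - beta2) * gradient * gradient
    m_hat = m / (1.0 - beta1 ** (global_step + 1))          # bias-corrected 1st moment
    v_hat = v / (1.0 - beta2 ** (global_step + 1))          # bias-corrected 2nd moment
    update = m_hat / (np.sqrt(v_hat) + eps)
    if decay_flag:
        update = update + weight_decay * param
    w_norm = np.linalg.norm(param)
    g_norm = np.linalg.norm(update)
    trust_ratio = w_norm / g_norm if w_norm > 0 and g_norm > 0 else 1.0
    return param - lr * trust_ratio * update, m, v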
        status = F.control_depend(optim_result, new_param_group[0][0])
        for i in range(self.dev_num - 1):
            status = F.depend(
                F.control_depend(new_param_group[i], new_param_group[i + 1][0]), status)
        return status

    def construct(self, *hyper_params):
        raise NotImplementedError


op_add = P.AddN()
op_gather = P.GatherV2()

_apply_decay = C.MultitypeFuncGraph("apply_decay")


@_apply_decay.register("Number", "Bool", "Tensor", "RowTensor")
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        values = op_add((op_gather(weight, indices, 0) * weight_decay, gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient


@_apply_decay.register("Number", "Bool", "Tensor", "Tensor")
                 compute_dtype=mstype.float16,
                 use_past=False):
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.expand_ratio = expand_ratio
        self.post_layernorm_residual = post_layernorm_residual
        self.dropout_rate = dropout_rate
        self.compute_dtype = compute_dtype
        self.use_past = use_past


get_square_sum = C.MultitypeFuncGraph("get_square_sum")


@get_square_sum.register("Tensor")
def _get_square_sum(grad):
    norm = P.ReduceSum(False)(F.square(grad), ())
    norm = F.expand_dims(F.cast(norm, mstype.float32), 0)
    return norm


apply_global_norm = C.MultitypeFuncGraph("apply_global_norm")


@apply_global_norm.register("Tensor", "Tensor", "Tensor")
def _apply_global_norm(clip_norm, global_norm, grad):
    grad = grad * clip_norm / global_norm
    return grad
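
# Clipping sketch (assumed wiring, mirroring the usual global-norm pattern;
# clip_norm is expected to be a scalar Tensor so the registrations above match):
# per-gradient square sums are gathered with get_square_sum, combined into the
# global norm, and every gradient is rescaled with apply_global_norm. Taking the
# maximum with clip_norm leaves gradients untouched when the norm is already small.
_norm_hyper_map = C.HyperMap()
_sqrt = P.Sqrt()
_addn = P.AddN()
_maximum = P.Maximum()


def _clip_by_global_norm_sketch(grads, clip_norm):
    square_sum = _norm_hyper_map(get_square_sum, grads)
    global_norm = _maximum(_sqrt(_addn(square_sum)), clip_norm)
    return _norm_hyper_map(F.partial(apply_global_norm, clip_norm, global_norm), grads)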
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""momentum"""
from mindspore.ops import functional as F, composite as C, operations as P
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
import mindspore.common.dtype as mstype
from mindspore._checkparam import Validator
from .optimizer import Optimizer
from .optimizer import opt_init_args_register

_momentum_opt = C.MultitypeFuncGraph("momentum_opt")


@_momentum_opt.register("Function", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Bool", "Bool")
def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment, ps_parameter, cache_enable):
    """Apply momentum optimizer to the weight parameter using Tensor."""
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        _ps_pull = P.Pull()
        _ps_push = P.Push("ApplyMomentum", [])
        shapes = (op_shape(learning_rate), op_shape(gradient), op_shape(momentum))
        success = F.depend(True,
        out_shp = shape_op(dout)
        ind_shp = shape_op(indices)
        # Example: out_shape:(3,2,3) axis 1 -> (1,0,2)
        perm_1 = _generate_shape_index(out_shp, ind_shp, axis)
        values_transpose = transpose(dout, perm_1)
        params_grad = unsorted_segment_sum(values_transpose, indices, shape_op(x)[axis])
        # Example: out_shape:(3,2,3) axis 2 -> (1,2,0)
        perm_2 = _generate_inverse_index(x_shp, axis)
        params_grad = transpose(params_grad, perm_2)
        return params_grad, zeros_like(indices), zeros_like(axis)

    return bprop


adam_opt_for_map = C.MultitypeFuncGraph("adam_opt_for_map")


@adam_opt_for_map.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
                           "RowTensor", "Bool")
def _update_run_op_for_map_row_tensor(beta1, beta2, eps, lr, weight_decay_tensor, param,
                                      m, v, gradient, decay_flag):
    return gradient.values


@adam_opt_for_map.register("Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor", "Tensor",
                           "Tensor", "Bool")
def _update_run_op_for_map_tensor(beta1, beta2, eps, lr, weight_decay_tensor, param,
                                  m, v, gradient, decay_flag):
    op_mul = P.Mul()
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""lazy adam"""
from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer
from mindspore.ops import operations as P
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.tensor import Tensor
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from .optimizer import Optimizer

_lazy_adam_opt = C.MultitypeFuncGraph("lazy_adam_opt")


@_lazy_adam_opt.register("Function", "Function", "Function", "Function", "Bool", "Bool", "Bool", "Tensor", "Tensor",
                         "Tensor", "Tensor", "Tensor", "Tensor", "RowTensor", "Tensor", "Tensor", "Tensor",
                         "Bool", "Bool")
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
                         beta2_power, beta1, beta2, eps, lr, gradient, params, m, v, ps_parameter, cache_enable):
    """Apply sparse lazy adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
            next_params = ops(param_group[root])
            new_param_group.append(next_params)
            for i in range(F.tuple_len(next_params)):
                F.assign(key_group[root][i], next_params[i])
        return new_param_group

    def construct(self, *hyper_params):
        raise NotImplementedError


op_add = P.AddN()
op_gather = P.Gather()
op_mul = P.Mul()
op_gc = inner.Centralization()

_apply_decay = C.MultitypeFuncGraph("apply_decay")
_apply_grad_centralization = C.MultitypeFuncGraph("apply_grad_centralization")


@_apply_decay.register("Tensor", "Bool", "Tensor", "RowTensor")
def _tensor_apply_decay_with_sparse(weight_decay, if_apply, weight, gradient):
    """Get grad with weight_decay."""
    if if_apply:
        indices = gradient.indices
        values = op_add((op_gather(weight, indices, 0) * F.cast(weight_decay, F.dtype(weight)), gradient.values))
        shape = gradient.dense_shape
        return RowTensor(indices, values, shape)
    return gradient