def __init__(self, parameters, mean=True, degree=None):
    super(DistributedGradReducer, self).__init__(auto_prefix=False)
    self.map_ = C.Map()
    if degree is None:
        self.degree = get_group_size()
    else:
        if not isinstance(degree, int) or degree <= 0:
            raise ValueError("Parameter 'degree' in DistributedGradReducer should be an int larger than 0.")
        self.degree = degree
    self.mean = mean
    self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
    is_parallel_optimizer = context.get_auto_parallel_context("enable_parallel_optimizer")
    split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices()
    if is_parallel_optimizer and split_indices:
        self.split_fusion = True
        self.op_list = _init_allreduce_operators(len(parameters), split_indices)
    else:
        self.split_fusion = False
        self.allreduce = AllReduce().add_prim_attr('fusion', 1)
    self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in parameters)
    self.enable_parameter_server = any(self.ps_parameters)
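# --- Hedged usage sketch (not part of the source above) ----------------------
# Shows how a DistributedGradReducer built by an __init__ like the one above is
# typically consumed: a train-step wrapper all-reduces the gradient tuple before
# handing it to the optimizer. Assumes an initialized communication group
# (mindspore.communication.init()) and a data-parallel context; `TrainStep`,
# `network` and `optimizer` are illustrative names, not from the source.
import mindspore.nn as nn
from mindspore.ops import composite as C
from mindspore.ops import functional as F


class TrainStep(nn.Cell):
    def __init__(self, network, optimizer):
        super(TrainStep, self).__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer
        self.weights = optimizer.parameters
        self.grad = C.GradOperation(get_by_list=True)
        # mean=True averages the all-reduced gradients over the device group
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean=True)

    def construct(self, data, label):
        loss = self.network(data, label)
        grads = self.grad(self.network, self.weights)(data, label)
        grads = self.grad_reducer(grads)  # all-reduce each gradient across devices
        return F.depend(loss, self.optimizer(grads))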
def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0,
             decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
    super(AdamWeightDecaySparse, self).__init__(learning_rate, params)
    if self.is_group:
        raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
    _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
    self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
    self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
    self.eps = Tensor(np.array([eps]).astype(np.float32))
    self.weight_decay_tensor = Tensor(np.array([weight_decay]).astype(np.float32))
    self.params = self.parameters
    self.moments1 = self.params.clone(prefix="adam_m", init='zeros')
    self.moments2 = self.params.clone(prefix="adam_v", init='zeros')
    self.decay_flag = tuple(decay_filter(x) for x in self.params)
    self.map = C.Map()
def __init__(self, parameters, mean=True, degree=None):
    super(DistributedGradReducer, self).__init__(auto_prefix=False)
    self.map_ = C.Map()
    self.mul = P.Mul()
    if degree is None:
        self.degree = get_group_size()
    else:
        if not isinstance(degree, int) or degree <= 0:
            raise ValueError("Parameter 'degree' in DistributedGradReducer should be an int larger than 0.")
        self.degree = degree
    self.mean = mean
    self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
    _init_optimizer_communication()
def __init__(self, parameters, mean=True, degree=None):
    super(DistributedGradReducer, self).__init__(auto_prefix=False)
    self.map_ = C.Map()
    if degree is None:
        self.degree = get_group_size()
    else:
        if not isinstance(degree, int) or degree <= 0:
            raise ValueError("Parameter 'degree' in DistributedGradReducer should be an int larger than 0.")
        self.degree = degree
    self.mean = mean
    self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
    self.opt_list = _init_allreduce_operators(len(parameters))
    self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in parameters)
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
    super(Optimizer, self).__init__(auto_prefix=False)
    if parameters is not None and not isinstance(parameters, list):
        parameters = list(parameters)
    if not parameters:
        raise ValueError("Optimizer got an empty parameter list.")
    if not isinstance(parameters[0], (dict, Parameter)):
        raise TypeError("Only a list of Parameter or dict can be supported.")
    if isinstance(loss_scale, int):
        loss_scale = float(loss_scale)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_positive_float(loss_scale, "loss_scale", self.cls_name)
    self.loss_scale = loss_scale
    weight_decay = self._preprocess_weight_decay(weight_decay)
    self._unique = True
    self._target = context.get_context("device_target")
    self.dynamic_lr = False
    self.assignadd = None
    self.global_step = None
    self.is_group = False
    self.is_group_lr = False
    self.is_group_params_ordered = False
    learning_rate = self._preprocess_single_lr(learning_rate)
    if isinstance(parameters[0], dict):
        self.is_group = True
        self.group_params = []
        self.group_lr = []
        self.group_weight_decay = []
        self._init_group_params(parameters, learning_rate, weight_decay)
    # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
    if self.dynamic_lr:
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
    if self.is_group_lr:
        if self.dynamic_lr:
            self.learning_rate = CellList(self.group_lr)
        else:
            self.learning_rate = ParameterTuple(self.group_lr)
    else:
        self.learning_rate = self._build_single_lr(learning_rate, 'learning_rate')
    if self.is_group:
        self.parameters = ParameterTuple(self.group_params)
        self.weight_decay = tuple(self.group_weight_decay)
        self.weight_decay_tensor_tuple = tuple(Tensor(x, mstype.float32) for x in self.group_weight_decay)
        decay_filter = lambda x: x > 0
        self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
        self.exec_weight_decay = any(self.decay_flags)
    else:
        self.parameters = ParameterTuple(parameters)
        self.weight_decay = weight_decay * loss_scale
        self.weight_decay_tensor = Tensor(self.weight_decay, mstype.float32)
        decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.exec_weight_decay = self.weight_decay > 0
    # When a parameter has already been unique, there is no need to do another unique in the optimizer.
    for param in self.parameters:
        if param.unique:
            self._unique = False
            break
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
    ps_cache_filter = lambda x: x.cache_enable
    self.cache_enable = tuple(ps_cache_filter(x) for x in self.parameters)
    self.reciprocal_scale = Tensor(1.0 / loss_scale, mstype.float32)
    self.need_scale = loss_scale != 1.0
    self.global_step_increase_tensor = Tensor(1, mstype.int32)
    self.param_length = len(self.parameters)
    self.map_ = C.Map()
    if context.get_auto_parallel_context("enable_parallel_optimizer"):
        if _get_parallel_mode() == ParallelMode.DATA_PARALLEL and context.get_context("device_target") == "Ascend":
            self.use_parallel = True
        elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL and context.get_context("device_target") != "Ascend":
            raise RuntimeError("Parallel optimizer only supports Ascend in data parallel mode.")
        elif _get_parallel_mode() in (ParallelMode.STAND_ALONE, ParallelMode.HYBRID_PARALLEL):
            raise RuntimeError("Parallel optimizer is not supported in {}.".format(_get_parallel_mode()))
        else:
            self.use_parallel = False
    else:
        self.use_parallel = False
    if self.use_parallel:
        if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
            raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
        self.dev_num = _get_device_num()
        if self.dev_num > self.param_length:
            raise RuntimeError("Parallel optimizer cannot be applied when the number of parameters {} is"
                               " less than the number of devices {}".format(self.param_length, self.dev_num))
        self.param_rank = self._get_parameter_group_id()
        self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
        self.param_names = []
        for param in self.parameters:
            self.param_names.append(param.name)
    else:
        self.optim_filter = (True,) * self.param_length
def __init__(self, *args, **kwargs):
    super(KungFuMomentum, self).__init__(*args, **kwargs)
    self.map_ = C.Map()
    self.all_reduce = KungFuAllReduce()
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
    super(Optimizer, self).__init__(auto_prefix=False)
    if parameters is not None and not isinstance(parameters, list):
        parameters = list(parameters)
    if not parameters:
        raise ValueError("Optimizer got an empty parameter list.")
    if not isinstance(parameters[0], (dict, Parameter)):
        raise TypeError("Only a list of Parameter or dict can be supported.")
    if isinstance(loss_scale, int):
        loss_scale = float(loss_scale)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_positive_float(loss_scale, "loss_scale", self.cls_name)
    self.loss_scale = loss_scale
    weight_decay = self._preprocess_weight_decay(weight_decay)
    self.grad_centralization = False
    self._unique = True
    self._target = context.get_context("device_target")
    self.dynamic_lr = False
    self.assignadd = None
    self.global_step = None
    self.is_group = False
    self.is_group_lr = False
    self.is_group_params_ordered = False
    learning_rate = self._preprocess_single_lr(learning_rate)
    if isinstance(parameters[0], dict):
        self.is_group = True
        self.group_params = []
        self.group_lr = []
        self.group_weight_decay = []
        self.group_grad_centralization = []
        self._init_group_params(parameters, learning_rate, weight_decay, self.grad_centralization)
    # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
    if self.dynamic_lr:
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
    if self.is_group_lr:
        self.learning_rate = CellList(self.group_lr) if self.dynamic_lr else ParameterTuple(self.group_lr)
    else:
        self.learning_rate = self._build_single_lr(learning_rate, 'learning_rate')
    if self.is_group:
        self.parameters = ParameterTuple(self.group_params)
        self.weight_decay = tuple(self.group_weight_decay)
        self.weight_decay_tensor_tuple = tuple(Tensor(x, mstype.float32) for x in self.group_weight_decay)
        decay_filter = lambda x: x > 0
        self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
        self.exec_weight_decay = any(self.decay_flags)
        self.grad_centralization_flags = tuple(self.group_grad_centralization)
    else:
        self.parameters = ParameterTuple(parameters)
        self.weight_decay = weight_decay * loss_scale
        self.weight_decay_tensor = Tensor(self.weight_decay, mstype.float32)
        decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.exec_weight_decay = self.weight_decay > 0
    # When a parameter has already been unique, there is no need to do another unique in the optimizer.
    for param in self.parameters:
        if param.unique:
            self._unique = False
            break
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
    cache_filter = lambda x: x.cache_enable
    self.cache_enable = tuple(cache_filter(x) for x in self.parameters)
    self.reciprocal_scale = Tensor(1.0 / loss_scale, mstype.float32)
    self.need_scale = loss_scale != 1.0
    self.global_step_increase_tensor = Tensor(1, mstype.int32)
    self.param_length = len(self.parameters)
    self.map_ = C.Map()
    self._use_parallel_optimizer()
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
    super(Optimizer, self).__init__(auto_prefix=False)
    if parameters is not None and not isinstance(parameters, list):
        parameters = list(parameters)
    if not parameters:
        raise ValueError("Optimizer got an empty parameter list.")
    if not isinstance(parameters[0], (dict, Parameter)):
        raise TypeError("Only a list of Parameter or dict can be supported.")
    if isinstance(loss_scale, int):
        loss_scale = float(loss_scale)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, self.cls_name)
    self.loss_scale = loss_scale
    weight_decay = self._preprocess_weight_decay(weight_decay)
    self.dynamic_lr = False
    self.assignadd = None
    self.global_step = None
    self.is_group = False
    self.is_group_lr = False
    self.is_group_params_ordered = False
    learning_rate = self._preprocess_single_lr(learning_rate)
    if isinstance(parameters[0], dict):
        self.is_group = True
        self.group_params = []
        self.group_lr = []
        self.group_weight_decay = []
        self._init_group_params(parameters, learning_rate, weight_decay)
    # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
    if self.dynamic_lr:
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
    if self.is_group_lr:
        if self.dynamic_lr:
            self.learning_rate = CellList(self.group_lr)
        else:
            self.learning_rate = ParameterTuple(self.group_lr)
    else:
        self.learning_rate = self._build_single_lr(learning_rate, 'learning_rate')
    if self.is_group:
        self.parameters = ParameterTuple(self.group_params)
        self.weight_decay = tuple(self.group_weight_decay)
        decay_filter = lambda x: x > 0
        self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
        self.exec_weight_decay = any(self.decay_flags)
    else:
        self.parameters = ParameterTuple(parameters)
        self.weight_decay = weight_decay * loss_scale
        decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.exec_weight_decay = self.weight_decay > 0
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
    self.reciprocal_scale = 1.0 / loss_scale
    self.param_length = len(self.parameters)
    self.map_ = C.Map()
    use_parallel = context.get_auto_parallel_context("enable_parallel_optimizer")
    self.use_parallel = use_parallel
    if use_parallel:
        if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
            raise RuntimeError("Optimizer segmentation does not support optimizer {}".format(self.cls_name))
        if _get_parallel_mode() != ParallelMode.DATA_PARALLEL:
            raise RuntimeError("Optimizer segmentation does not support parallel mode {}".format(_get_parallel_mode()))
        self.dev_num = _get_device_num()
        if self.dev_num > self.param_length:
            raise RuntimeError("Optimizer segmentation cannot be applied when the number of parameters {} is"
                               " less than the number of devices {}".format(self.param_length, self.dev_num))
        self.param_rank = self._get_parameter_group_id()
        self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
        self.param_names = []
        for param in self.parameters:
            self.param_names.append(param.name)
    else:
        self.optim_filter = (True,) * self.param_length
T.not_: multitype_ops.logical_not,
T.eq: multitype_ops.equal,
T.ne: multitype_ops.not_equal,
T.lt: multitype_ops.less,
T.gt: multitype_ops.greater,
T.le: multitype_ops.less_equal,
T.ge: multitype_ops.greater_equal,
T.is_: F.is_,
T.is_not: F.is_not,
T.contains: multitype_ops.in_,
T.not_contains: multitype_ops.not_in_,

# system functions
T.len: M.ms_len,
T.bool_: M.bool_,
T.map: C.Map(),
T.partial: F.partial,
T.zip: C.zip_operation,
T.print: F.print_,
T.enumerate: M.enumerate_,
T.isinstance: M.isinstance_,

# custom-defined operations
T.iter: M.ms_iter,
T.next: M.ms_next,
T.hasnext: M.hasnext,
T.make_tuple: F.make_tuple,
T.make_dict: F.make_dict,
T.make_list: F.make_list,
T.make_slice: F.make_slice,
def __init__(self):
    super().__init__()
    self.relu = nn.ReLU()
    self.hypermap = C.Map()
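# --- Hedged usage sketch (not part of the source above) ----------------------
# A minimal, self-contained example of the C.Map() pattern that recurs in these
# snippets: register a MultitypeFuncGraph for the element type and let Map apply
# it to every entry of a tuple. `_square` and `SquareEach` are illustrative
# names, not from the source.
import numpy as np
import mindspore
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import composite as C
from mindspore.ops import operations as P

_square = C.MultitypeFuncGraph("square")


@_square.register("Tensor")
def _square_tensor(x):
    # element-wise square of one tensor in the tuple
    return P.Square()(x)


class SquareEach(nn.Cell):
    def __init__(self):
        super(SquareEach, self).__init__()
        self.hypermap = C.Map()

    def construct(self, xs):
        # applies _square to every tensor in the tuple xs
        return self.hypermap(_square, xs)


if __name__ == "__main__":
    net = SquareEach()
    out = net((Tensor(np.array([1.0, 2.0]), mindspore.float32),
               Tensor(np.array([3.0]), mindspore.float32)))
    print(out)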
def __init__(self, *args, **kwargs):
    super(KungFuMomentum, self).__init__(*args, **kwargs)
    from kungfu_mindspore_ops import KungFuAllReduce
    self.map_ = C.Map()
    self.all_reduce = KungFuAllReduce()
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
    super(Optimizer, self).__init__(auto_prefix=False)
    if parameters and not isinstance(parameters, list):
        parameters = list(parameters)
    if not parameters:
        raise ValueError("Optimizer got an empty parameter list.")
    if not isinstance(parameters[0], (dict, Parameter)):
        raise TypeError("Only a list of Parameter or dict can be supported.")
    if isinstance(loss_scale, int):
        loss_scale = float(loss_scale)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, self.cls_name)
    if isinstance(weight_decay, int):
        weight_decay = float(weight_decay)
    validator.check_value_type("weight_decay", weight_decay, [float], self.cls_name)
    validator.check_number_range("weight_decay", weight_decay, 0.0, float("inf"), Rel.INC_LEFT, self.cls_name)
    self.is_group = False
    self.is_group_lr = False
    self.is_group_params_ordered = False
    self.loss_scale = loss_scale
    if isinstance(learning_rate, int):
        learning_rate = float(learning_rate)
    if isinstance(learning_rate, float):
        self.dynamic_lr = False
        self.gather = None
        self.assignadd = None
        self.global_step = None
        self.scalar_lr = learning_rate
    else:
        self.dynamic_lr = True
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
        self.scalar_lr = None
    learning_rate = self._get_single_lr(learning_rate)
    if isinstance(parameters[0], dict):
        self.is_group = True
        self.group_params = []
        self.group_lr = []
        self.group_weight_decay = []
        self._init_group_params(parameters, learning_rate, weight_decay)
    if self.is_group_lr:
        self.learning_rate = ParameterTuple(self.group_lr)
    else:
        self.learning_rate = Parameter(learning_rate, name="learning_rate")
    if self.is_group:
        self.parameters = ParameterTuple(self.group_params)
        self.weight_decay = tuple(self.group_weight_decay)
        decay_filter = lambda x: x > 0
        self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
    else:
        self.parameters = ParameterTuple(parameters)
        self.weight_decay = weight_decay * loss_scale
        decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.reciprocal_scale = 1.0 / loss_scale
    self.exec_weight_decay = any(self.decay_flags)
    self.param_length = len(self.parameters)
    self.map_ = C.Map()
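# --- Hedged sketch (not part of the source above) ----------------------------
# Illustrates how the fields prepared by an Optimizer __init__ like the one
# above (self.map_, the weight-decay value, self.decay_flags, the parameter
# tuple) are typically consumed: a MultitypeFuncGraph folds weight decay into
# each gradient, skipping parameters whose decay flag is False. `_apply_decay`
# and `DecayDemo` are illustrative names; registration signatures may differ
# between MindSpore versions.
import numpy as np
import mindspore
import mindspore.nn as nn
from mindspore import Tensor, Parameter, ParameterTuple
from mindspore.ops import composite as C
from mindspore.ops import functional as F
from mindspore.ops import operations as P

_apply_decay = C.MultitypeFuncGraph("apply_decay")


@_apply_decay.register("Tensor", "Bool", "Tensor", "Tensor")
def _tensor_apply_decay(weight_decay, if_apply, weight, gradient):
    # grad <- grad + weight_decay * weight, only where the decay flag is True
    if if_apply:
        return P.AddN()((weight * weight_decay, gradient))
    return gradient


class DecayDemo(nn.Cell):
    def __init__(self):
        super(DecayDemo, self).__init__()
        self.map_ = C.Map()
        self.weight_decay = Tensor(0.01, mindspore.float32)
        self.params = ParameterTuple((
            Parameter(Tensor(np.ones(2), mindspore.float32), name="weight"),
            Parameter(Tensor(np.ones(2), mindspore.float32), name="beta")))
        # 'beta'/'gamma' parameters are excluded, mirroring decay_filter above
        self.decay_flags = (True, False)

    def construct(self, gradients):
        # maps the bound decay function over (flags, params, grads) in lockstep
        return self.map_(F.partial(_apply_decay, self.weight_decay),
                         self.decay_flags, self.params, gradients)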