def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deep layer for numa and device_queue,
    and one process works with only one rank_id.
    In the standalone scenario, rank_id may come from the env 'CUDA_VISIBLE_DEVICES';
    in the distributed scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank, _get_device_num
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_device_info:
                cuda_id = int(cuda_device_info.split(",")[0].strip())
                if cuda_id != rank_id:
                    rank_id = cuda_id
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        rank_id = _get_global_rank()
        device_num = _get_device_num()
        # Ascend only supports the multi-process scenario
        if device_num > 1:
            _config.set_rank_id(rank_id)
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deep layer for numa and device_queue,
    and one process works with only one rank_id.
    In the standalone scenario, rank_id may come from the env 'CUDA_VISIBLE_DEVICES';
    in the distributed scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_device_info:
                cuda_id = int(cuda_device_info.split(",")[0].strip())
                if cuda_id != rank_id:
                    rank_id = cuda_id
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario: it is better to get rank info from the environment
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                _config.set_rank_id(rank_id)
def __init__(self, network, loss_fn=None, optimizer=None, metrics=None, eval_network=None,
             eval_indexes=None, amp_level="O0", frequency=278, stop_epoch=100, **kwargs):
    self._network = network
    self._loss_fn = loss_fn
    self._optimizer = optimizer
    self._loss_scale_manager = None
    self._loss_scale_manager_set = False
    self._keep_bn_fp32 = True
    self._check_kwargs(kwargs)
    self._amp_level = amp_level
    self._process_amp_args(kwargs)
    self._parallel_mode = _get_parallel_mode()
    self._device_number = _get_device_num()
    self._global_rank = _get_global_rank()
    self._parameter_broadcast = _get_parameter_broadcast()
    self._frequency = frequency
    self._stop_epoch = stop_epoch

    self._train_network = self._build_train_network()
    self._build_eval_network(metrics, eval_network, eval_indexes)
    self._build_predict_network()
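# A minimal usage sketch for the constructor above, not taken from the source: it shows how
# network, loss_fn, optimizer, metrics and amp_level are typically wired together. The import
# path and the tiny Dense network are assumptions chosen for illustration only.
import mindspore.nn as nn
from mindspore.train.model import Model

net = nn.Dense(16, 10)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
opt = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
# amp_level="O0" keeps the mixed-precision defaults handled by _check_kwargs/_process_amp_args above.
model = Model(net, loss_fn=loss, optimizer=opt, metrics={"acc"}, amp_level="O0")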
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deep layer for numa and device_queue,
    and one process works with only one rank_id.
    In the standalone scenario, rank_id comes from the 'device_id' in context;
    in the distributed scenario, rank_id comes from _get_global_rank() (GPU)
    or the 'RANK_ID' env variable (Ascend).
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            rank_id = context.get_context("device_id")
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario: it is better to get rank info from the environment
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                _config.set_rank_id(rank_id)
            # A single process under Ascend mode doesn't support numa bind, for performance considerations.
            if _config.get_numa_enable() is True and rank_size == 1:
                raise ValueError("single process under Ascend mode doesn't support numa bind for "
                                 "performance consideration.")
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deep layer for numa and device_queue,
    and one process works with only one rank_id.
    In the standalone scenario, rank_id comes from the 'device_id' in context;
    in the distributed scenario, rank_id comes from _get_global_rank() (GPU)
    or the 'RANK_ID' env variable (Ascend).
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    numa_enable = False
    numa_enable_env = os.getenv("DATASET_ENABLE_NUMA", None)
    if numa_enable_env and numa_enable_env.strip() == 'True':
        numa_enable = True
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            rank_id = context.get_context("device_id")
        if numa_enable:
            _config.set_numa_enable(True)
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario: it is better to get rank info from the environment
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                if numa_enable:
                    _config.set_numa_enable(True)
                _config.set_rank_id(rank_id)
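# A hedged sketch of how the environment-driven branches above are exercised; the helper is
# internal, so only the environment variables it reads are user-facing. RANK_SIZE/RANK_ID are
# normally set by the Ascend launch tooling; setting them by hand here is for illustration only.
import os

os.environ["DATASET_ENABLE_NUMA"] = "True"   # picked up via os.getenv("DATASET_ENABLE_NUMA")
os.environ["RANK_SIZE"] = "8"                # rank_size > 1: multi-process Ascend scenario
os.environ["RANK_ID"] = "3"
# With these values, _init_device_info() calls _config.set_numa_enable(True) and
# _config.set_rank_id(3) on an Ascend target.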
def _use_parallel_optimizer(self):
    """Indicates whether to use the parallel optimizer."""
    if context.get_auto_parallel_context("enable_parallel_optimizer"):
        if _get_parallel_mode() == ParallelMode.DATA_PARALLEL and context.get_context("device_target") == "Ascend":
            self.use_parallel = True
        elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                and context.get_context("device_target") != "Ascend":
            raise RuntimeError("Parallel optimizer only supports Ascend in data parallel mode.")
        elif _get_parallel_mode() in (ParallelMode.STAND_ALONE, ParallelMode.HYBRID_PARALLEL):
            raise RuntimeError("Parallel optimizer is not supported in {}.".format(_get_parallel_mode()))
        else:
            self.use_parallel = False
    else:
        self.use_parallel = False

    if self.use_parallel:
        if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
            raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
        self.dev_num = _get_device_num()
        if self.dev_num > self.param_length:
            raise RuntimeError("Parallel optimizer can not be applied when the number of parameters {} is"
                               " less than the number of devices {}".format(self.param_length, self.dev_num))
        self.param_rank = self._get_parameter_group_id()
        self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
        self.param_names = []
        for param in self.parameters:
            self.param_names.append(param.name)
    else:
        self.optim_filter = (True,) * self.param_length
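# A hedged example of the preconditions checked above (a sketch, assuming an Ascend
# environment; on other targets the same configuration raises the RuntimeError above).
# The parallel optimizer only engages for Lamb/AdamWeightDecay in data parallel mode
# with enable_parallel_optimizer set in the auto-parallel context.
import mindspore.nn as nn
from mindspore import context

context.set_context(device_target="Ascend")
context.set_auto_parallel_context(parallel_mode="data_parallel",
                                  enable_parallel_optimizer=True)
net = nn.Dense(16, 8)
# AdamWeightDecay is one of the two optimizer classes admitted by the cls_name check above.
optimizer = nn.AdamWeightDecay(net.trainable_params(), learning_rate=1e-3)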
def __init__(self, dataset, first_order_order):
    self.dataset = dataset
    self.device_num = _get_device_num()
    self.global_rank = _get_global_rank()
    self.repeat_count = dataset.get_repeat_count()
    self.repeat_ind = 0
    self.loop_count = dataset.get_dataset_size()
    self.ind = 0

    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    self.need_to_full = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
    super(Optimizer, self).__init__(auto_prefix=False)
    if parameters is not None and not isinstance(parameters, list):
        parameters = list(parameters)

    if not parameters:
        raise ValueError("Optimizer got an empty parameter list.")

    if not isinstance(parameters[0], (dict, Parameter)):
        raise TypeError("Only a list of Parameter or dict can be supported.")

    if isinstance(loss_scale, int):
        loss_scale = float(loss_scale)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_positive_float(loss_scale, "loss_scale", self.cls_name)
    self.loss_scale = loss_scale

    weight_decay = self._preprocess_weight_decay(weight_decay)

    self._unique = True
    self._target = context.get_context("device_target")
    self.dynamic_lr = False
    self.assignadd = None
    self.global_step = None
    self.is_group = False
    self.is_group_lr = False
    self.is_group_params_ordered = False
    learning_rate = self._preprocess_single_lr(learning_rate)
    if isinstance(parameters[0], dict):
        self.is_group = True
        self.group_params = []
        self.group_lr = []
        self.group_weight_decay = []
        self._init_group_params(parameters, learning_rate, weight_decay)

    # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
    if self.dynamic_lr:
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')

    if self.is_group_lr:
        if self.dynamic_lr:
            self.learning_rate = CellList(self.group_lr)
        else:
            self.learning_rate = ParameterTuple(self.group_lr)
    else:
        self.learning_rate = self._build_single_lr(learning_rate, 'learning_rate')
    if self.is_group:
        self.parameters = ParameterTuple(self.group_params)
        self.weight_decay = tuple(self.group_weight_decay)
        self.weight_decay_tensor_tuple = tuple(Tensor(x, mstype.float32) for x in self.group_weight_decay)
        decay_filter = lambda x: x > 0
        self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
        self.exec_weight_decay = any(self.decay_flags)
    else:
        self.parameters = ParameterTuple(parameters)
        self.weight_decay = weight_decay * loss_scale
        self.weight_decay_tensor = Tensor(self.weight_decay, mstype.float32)
        decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.exec_weight_decay = self.weight_decay > 0
    # When a parameter has already been made unique, there is no need to do another unique in the optimizer.
    for param in self.parameters:
        if param.unique:
            self._unique = False
            break
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
    ps_cache_filter = lambda x: x.cache_enable
    self.cache_enable = tuple(ps_cache_filter(x) for x in self.parameters)
    self.reciprocal_scale = Tensor(1.0 / loss_scale, mstype.float32)
    self.need_scale = loss_scale != 1.0
    self.global_step_increase_tensor = Tensor(1, mstype.int32)
    self.param_length = len(self.parameters)
    self.map_ = C.Map()

    if context.get_auto_parallel_context("enable_parallel_optimizer"):
        if _get_parallel_mode() == ParallelMode.DATA_PARALLEL and context.get_context("device_target") == "Ascend":
            self.use_parallel = True
        elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                and context.get_context("device_target") != "Ascend":
            raise RuntimeError("Parallel optimizer only supports Ascend in data parallel mode.")
        elif _get_parallel_mode() in (ParallelMode.STAND_ALONE, ParallelMode.HYBRID_PARALLEL):
            raise RuntimeError("Parallel optimizer is not supported in {}.".format(_get_parallel_mode()))
        else:
            self.use_parallel = False
    else:
        self.use_parallel = False
    if self.use_parallel:
        if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
            raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
        self.dev_num = _get_device_num()
        if self.dev_num > self.param_length:
            raise RuntimeError("Parallel optimizer can not be applied when the number of parameters {} is"
                               " less than the number of devices {}".format(self.param_length, self.dev_num))
        self.param_rank = self._get_parameter_group_id()
        self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
        self.param_names = []
        for param in self.parameters:
            self.param_names.append(param.name)
    else:
        self.optim_filter = (True,) * self.param_length
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0):
    super(Optimizer, self).__init__(auto_prefix=False)
    if parameters is not None and not isinstance(parameters, list):
        parameters = list(parameters)

    if not parameters:
        raise ValueError("Optimizer got an empty parameter list.")

    if not isinstance(parameters[0], (dict, Parameter)):
        raise TypeError("Only a list of Parameter or dict can be supported.")

    if isinstance(loss_scale, int):
        loss_scale = float(loss_scale)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_number_range("loss_scale", loss_scale, 0.0, float("inf"), Rel.INC_NEITHER, self.cls_name)
    self.loss_scale = loss_scale

    weight_decay = self._preprocess_weight_decay(weight_decay)

    self.dynamic_lr = False
    self.assignadd = None
    self.global_step = None
    self.is_group = False
    self.is_group_lr = False
    self.is_group_params_ordered = False
    learning_rate = self._preprocess_single_lr(learning_rate)
    if isinstance(parameters[0], dict):
        self.is_group = True
        self.group_params = []
        self.group_lr = []
        self.group_weight_decay = []
        self._init_group_params(parameters, learning_rate, weight_decay)

    # The final value of dynamic_lr can be determined after the process of parse_single_lr and init_group_params
    if self.dynamic_lr:
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')

    if self.is_group_lr:
        if self.dynamic_lr:
            self.learning_rate = CellList(self.group_lr)
        else:
            self.learning_rate = ParameterTuple(self.group_lr)
    else:
        self.learning_rate = self._build_single_lr(learning_rate, 'learning_rate')
    if self.is_group:
        self.parameters = ParameterTuple(self.group_params)
        self.weight_decay = tuple(self.group_weight_decay)
        decay_filter = lambda x: x > 0
        self.decay_flags = tuple(decay_filter(x) for x in self.weight_decay)
        self.exec_weight_decay = any(self.decay_flags)
    else:
        self.parameters = ParameterTuple(parameters)
        self.weight_decay = weight_decay * loss_scale
        decay_filter = lambda x: 'beta' not in x.name and 'gamma' not in x.name
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.exec_weight_decay = self.weight_decay > 0
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in self.parameters)
    self.reciprocal_scale = 1.0 / loss_scale
    self.param_length = len(self.parameters)
    self.map_ = C.Map()

    use_parallel = context.get_auto_parallel_context("enable_parallel_optimizer")
    self.use_parallel = use_parallel
    if use_parallel:
        if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
            raise RuntimeError("Optimizer segmentation does not support optimizer {}".format(self.cls_name))
        if _get_parallel_mode() != ParallelMode.DATA_PARALLEL:
            raise RuntimeError("Optimizer segmentation does not support parallel mode {}".format(
                _get_parallel_mode()))
        self.dev_num = _get_device_num()
        if self.dev_num > self.param_length:
            raise RuntimeError("Optimizer segmentation can not be applied when the number of parameters {} is"
                               " less than the number of devices {}".format(self.param_length, self.dev_num))
        self.param_rank = self._get_parameter_group_id()
        self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
        self.param_names = []
        for param in self.parameters:
            self.param_names.append(param.name)
    else:
        self.optim_filter = (True,) * self.param_length
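# A hedged illustration of the grouped-parameter branch above (is_group=True): passing a list
# of dicts instead of a flat parameter list gives each group its own lr/weight_decay. The
# network and nn.Momentum below are placeholders chosen for the sketch, not from the source.
import mindspore.nn as nn

net = nn.SequentialCell([nn.Dense(16, 8), nn.BatchNorm1d(8)])
# Mirror the default decay_filter above: exclude 'beta'/'gamma' parameters from weight decay.
decayed = [p for p in net.trainable_params() if 'beta' not in p.name and 'gamma' not in p.name]
no_decay = [p for p in net.trainable_params() if 'beta' in p.name or 'gamma' in p.name]
group_params = [{'params': decayed, 'weight_decay': 0.01},
                {'params': no_decay, 'lr': 0.001}]
optimizer = nn.Momentum(group_params, learning_rate=0.1, momentum=0.9)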