def __init__(self, network, optimizer, sens=1.0):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
    self.hyper_map = C.HyperMap()
    self.alloc_status = NPUAllocFloatStatus()
    self.get_status = NPUGetFloatStatus()
    self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
def __init__(self, G, generator, optimizer, sens=1.0):
    super(TrainOneStepG, self).__init__(auto_prefix=False)
    self.optimizer = optimizer
    self.G = G
    self.G.set_grad()
    self.G.set_train()
    self.G.D_A.set_grad(False)
    self.G.D_A.set_train(False)
    self.G.D_B.set_grad(False)
    self.G.D_B.set_train(False)
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.weights = ms.ParameterTuple(generator.trainable_params())
    self.net = WithLossCell(G)
    self.reducer_flag = False
    self.grad_reducer = None
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
def __init__(self, network, optimizer, sens=1.0, use_global_norm=False):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = ms.ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.use_global_norm = use_global_norm
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
    self.hyper_map = C.HyperMap()
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertSquadCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
def __init__(self, network, optimizer, sens=1.0):
    super(TransformerTrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode does not support: ", self.parallel_mode)
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
def __init__(self, network, optimizer, sens=1):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.depend_network = Depend_network(network)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.print = P.Print()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
def _parallel_predict_check():
    """Validate parallel model prediction."""
    if _get_parallel_mode() in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
        if not context.get_auto_parallel_context("full_batch"):
            raise RuntimeError('Model prediction only supports full batch dataset. '
                               'Please set "full_batch" to True.')
        if context.get_auto_parallel_context("enable_parallel_optimizer"):
            raise RuntimeError('Model prediction does not support parallel optimizer. '
                               'Please set "enable_parallel_optimizer" to False.')
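# Illustrative sketch only, not part of the source above: an auto-parallel configuration
# that satisfies _parallel_predict_check() before calling prediction. full_batch,
# enable_parallel_optimizer and parallel_mode are real set_auto_parallel_context()
# options; the surrounding call site is an assumption.
context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
                                  full_batch=True,
                                  enable_parallel_optimizer=False)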
def __init__(self, network):
    super(PredictWithSigmoid, self).__init__()
    self.network = network
    self.sigmoid = P.Sigmoid()
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    full_batch = context.get_auto_parallel_context("full_batch")
    is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
    if is_auto_parallel and full_batch:
        self.sigmoid.shard(((1, 1),))
def __init__(self, network, config, sens=1000.0):
    super(TrainStepWrap, self).__init__()
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    for params in self.trainable_params:
        if 'wide' in params.name:
            weights_w.append(params)
        else:
            weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)
    self.optimizer_w = FTRL(learning_rate=config.ftrl_lr, params=self.weights_w,
                            l1=5e-4, l2=5e-4, initial_accum=0.1, loss_scale=sens)
    # self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr, loss_scale=sens)
    self.optimizer_d = Adam(self.weights_d, learning_rate=config.adam_lr, eps=1e-6, loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation('grad_w', get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation('grad_d', get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
    self.reducer_flag = False
    self.grad_reducer_w = None
    self.grad_reducer_d = None
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = context.get_auto_parallel_context("device_num")
        self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
        self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
def __init__(self, network, sens=1024.0, host_device_mix=False,
             parameter_server=False, sparse=False):
    super(TrainStepWrap, self).__init__()
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    for params in self.trainable_params:
        if 'wide' in params.name:
            weights_w.append(params)
        else:
            weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)
    if (sparse and is_auto_parallel) or parameter_server:
        self.optimizer_d = LazyAdam(self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
        self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        if host_device_mix or parameter_server:
            self.optimizer_w.target = "CPU"
            self.optimizer_d.target = "CPU"
    else:
        self.optimizer_d = Adam(self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
        self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
    self.loss_net_w.set_grad()
    self.loss_net_d.set_grad()
    self.grad_reducer_w = None
    self.grad_reducer_d = None
    self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = context.get_auto_parallel_context("device_num")
        self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
        self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_status = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
def _gpu_analyse(self):
    """Collect and analyse GPU performance data."""
    if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != get_rank():
        self._dev_id = get_rank()
        logger.error('Please check that the Profiler object is initialized after '
                     'set_auto_parallel_context() and init(). Profiler should be '
                     'initialized after these calls.')
    self._gpu_profiler.stop()
    timeline_generator = self._generate_timeline()

    # parse minddata pipeline operator and queue for GPU
    try:
        pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
        pipeline_parser.parse()
    except ProfilerException as err:
        logger.warning(err.message)

    # analyse step trace info
    try:
        self._analyse_step_trace(is_training_mode_flag=timeline_generator.check_op_name('Gradients'))
    except ProfilerException as err:
        logger.warning(err.message)
    os.environ['PROFILING_MODE'] = str("false")

    logger.warning('\nMemory Usage is not supported on GPU currently.\n'
                   'Please run on Ascend if you would like to see memory analysis; '
                   'otherwise, this warning can be ignored.')
def __init__(self, network, optimizer, sens=1.0):
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = F.identity
    self.parallel_mode = _get_parallel_mode()
    if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        self.mean = _get_gradients_mean()
        self.degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree)
    self.use_grad_accumulation = False
    if self.parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.STAND_ALONE):
        self.use_grad_accumulation = True
    if self.use_grad_accumulation:
        self.max_accumulation_step = get_auto_parallel_context("grad_accumulation_step")
        if self.max_accumulation_step <= 1:
            self.max_accumulation_step = 1
            self.use_grad_accumulation = False
    if self.use_grad_accumulation:
        self.grad_accumulation = GradientAccumulation(self.max_accumulation_step, self.optimizer)
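# A minimal sketch (an assumption, not taken from the source above) of the construct()
# that typically pairs with this kind of TrainOneStepCell __init__: run the forward pass,
# back-propagate with the fixed sensitivity, all-reduce the gradients when a
# DistributedGradReducer was created, then let the optimizer apply the update.
def construct(self, *inputs):
    loss = self.network(*inputs)
    sens = F.fill(F.dtype(loss), F.shape(loss), self.sens)
    grads = self.grad(self.network, self.weights)(*inputs, sens)
    if self.reducer_flag:
        grads = self.grad_reducer(grads)
    return F.depend(loss, self.optimizer(grads))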
def __init__(self, **kwargs):
    # get device_id and device_target
    self._get_devid_and_devtarget()
    output_path = kwargs.pop("output_path", "./data")
    self._output_path = validate_and_normalize_path(output_path)
    self._output_path = os.path.join(self._output_path, "profiler")
    if not os.path.exists(self._output_path):
        os.makedirs(self._output_path, exist_ok=True)
        os.chmod(self._output_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    else:
        logger.warning("The target dir already exists. "
                       "There may be some old profiling data, and it will be overwritten in the end.")
    os.environ['PROFILING_MODE'] = 'true'
    os.environ['MINDDATA_PROFILING_DIR'] = self._output_path

    if self._device_target and self._device_target == "GPU":
        from mindspore._c_expression import GPUProfiler
        self._gpu_profiler = GPUProfiler.get_instance()
        self._gpu_profiler.init(self._output_path)
        self._gpu_profiler.step_profiling_enable(True)
        if context.get_auto_parallel_context('device_num') > 1:
            self._dev_id = get_rank()
        os.environ['DEVICE_ID'] = str(self._dev_id)
        if kwargs:
            logger.warning("Params are not supported yet on GPU.")
    elif self._device_target and self._device_target == "Ascend":
        optypes_not_deal = kwargs.pop("optypes_not_deal", "Variable")
        if not isinstance(optypes_not_deal, str):
            raise TypeError("The parameter optypes_not_deal must be str.")
        job_id = kwargs.pop("ascend_job_id", "")
        if kwargs:
            logger.warning("There are invalid params which don't work.")
        os.environ['DEVICE_ID'] = self._dev_id
        os.environ['AICPU_PROFILING_MODE'] = 'true'

        # use context interface to open profiling, for the new mindspore version (after 2020.5.21)
        context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")

        self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
        data_path = os.path.join(self._container_path, "data")
        data_path = validate_and_normalize_path(data_path)
        if not os.path.exists(data_path):
            os.makedirs(data_path, exist_ok=True)

        self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
        self._profiling_job_id = job_id
        # add job id env through user input later
        self._job_id_env = 0
        self._start_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling start time: %d", self._start_time)
def __init__(self, parameters, mean=True, degree=None):
    super(DistributedGradReducer, self).__init__(auto_prefix=False)
    self.map_ = C.Map()
    if degree is None:
        self.degree = get_group_size()
    else:
        if not isinstance(degree, int) or degree <= 0:
            raise ValueError("Parameter 'degree' in DistributedGradReducer "
                             "should be a positive int.")
        self.degree = degree
    self.mean = mean
    self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
    is_parallel_optimizer = context.get_auto_parallel_context("enable_parallel_optimizer")
    split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices()
    if is_parallel_optimizer and split_indices:
        self.split_fusion = True
        self.op_list = _init_allreduce_operators(len(parameters), split_indices)
    else:
        self.split_fusion = False
        self.allreduce = AllReduce().add_prim_attr('fusion', 1)
    self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in parameters)
    self.enable_parameter_server = any(self.ps_parameters)
def step_end(self, run_context):
    """Monitor the loss in training."""
    cb_params = run_context.original_args()
    wide_loss, deep_loss = cb_params.net_outputs[0].asnumpy(), cb_params.net_outputs[1].asnumpy()
    cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
    cur_num = cb_params.cur_step_num
    rank_id = 0
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL,
                         ParallelMode.DATA_PARALLEL):
        rank_id = get_rank()
    print("===loss===", rank_id, cb_params.cur_epoch_num, cur_step_in_epoch,
          wide_loss, deep_loss, flush=True)

    if self._per_print_times != 0 and cur_num % self._per_print_times == 0 and self.config is not None:
        loss_file = open(self.config.loss_file_name, "a+")
        loss_file.write("epoch: %s, step: %s, wide_loss: %s, deep_loss: %s" %
                        (cb_params.cur_epoch_num, cur_step_in_epoch, wide_loss, deep_loss))
        loss_file.write("\n")
        loss_file.close()
        print("epoch: %s, step: %s, wide_loss: %s, deep_loss: %s" %
              (cb_params.cur_epoch_num, cur_step_in_epoch, wide_loss, deep_loss))
def _set_bert_all_reduce_split():
    """Set BERT all_reduce fusion split; supports num_hidden_layers of 12 and 24."""
    device_target = context.get_context('device_target')
    enable_graph_kernel = context.get_context('enable_graph_kernel')
    device_num = context.get_auto_parallel_context('device_num')
    if bert_net_cfg.num_hidden_layers == 12:
        if bert_net_cfg.use_relative_positions:
            context.set_auto_parallel_context(
                all_reduce_fusion_config=[29, 58, 87, 116, 145, 174, 203, 217])
        else:
            context.set_auto_parallel_context(
                all_reduce_fusion_config=[28, 55, 82, 109, 136, 163, 190, 205])
            if device_target == 'GPU' and enable_graph_kernel and device_num == 8:
                context.set_auto_parallel_context(all_reduce_fusion_config=[180, 205])
            elif device_target == 'GPU' and enable_graph_kernel and device_num == 16:
                context.set_auto_parallel_context(all_reduce_fusion_config=[120, 205])
    elif bert_net_cfg.num_hidden_layers == 24:
        if bert_net_cfg.use_relative_positions:
            context.set_auto_parallel_context(
                all_reduce_fusion_config=[30, 90, 150, 210, 270, 330, 390, 421])
        else:
            context.set_auto_parallel_context(
                all_reduce_fusion_config=[38, 93, 148, 203, 258, 313, 368, 397])
def __init__(self, network, optimizer, sens=1):
    super(CenterNetWithLossScaleCell, self).__init__(auto_prefix=False)
    self.image = ImagePreProcess()
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = ops.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = ops.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = ops.Cast()
    self.alloc_status = ops.NPUAllocFloatStatus()
    self.get_status = ops.NPUGetFloatStatus()
    self.clear_before_grad = ops.NPUClearFloatStatus()
    self.reduce_sum = ops.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = ops.LessEqual()
    self.grad_scale = GradScale()
    self.loss_scale = sens
def _use_parallel_optimizer(self):
    """Indicates whether to use automatic parallelism."""
    if context.get_auto_parallel_context("enable_parallel_optimizer"):
        if _get_parallel_mode() == ParallelMode.DATA_PARALLEL and \
                context.get_context("device_target") == "Ascend":
            self.use_parallel = True
        elif _get_parallel_mode() == ParallelMode.DATA_PARALLEL \
                and context.get_context("device_target") != "Ascend":
            raise RuntimeError("Parallel optimizer only supports Ascend in data parallel mode.")
        elif _get_parallel_mode() in (ParallelMode.STAND_ALONE, ParallelMode.HYBRID_PARALLEL):
            raise RuntimeError("Parallel optimizer is not supported in {}.".format(_get_parallel_mode()))
        else:
            self.use_parallel = False
    else:
        self.use_parallel = False

    if self.use_parallel:
        if self.cls_name not in ["Lamb", "AdamWeightDecay"]:
            raise RuntimeError("Parallel optimizer does not support optimizer {}".format(self.cls_name))
        self.dev_num = _get_device_num()
        if self.dev_num > self.param_length:
            raise RuntimeError("Parallel optimizer can not be applied when the number of parameters {} is"
                               " less than the number of devices {}".format(self.param_length, self.dev_num))
        self.param_rank = self._get_parameter_group_id()
        self.optim_filter = tuple(map(lambda x: x == _get_global_rank(), self.param_rank))
        self.param_names = []
        for param in self.parameters:
            self.param_names.append(param.name)
    else:
        self.optim_filter = (True,) * self.param_length
def __init__(self, network, optimizer, sens=1.0):
    super(TrainAccuStepsCell, self).__init__(network, optimizer, sens)
    self.accumulation = False
    self.accumulation_steps = context.get_auto_parallel_context("grad_accumulation_step")
    self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
    self.hyper_map = ops.HyperMap()
def _init_allreduce_operators(length):
    """Initialize allreduce communication operators."""
    is_parallel_optimizer = context.get_auto_parallel_context("enable_parallel_optimizer")
    split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices()
    if is_parallel_optimizer and split_indices:
        group = 1
        fusion = ()
        for i in range(length):
            fusion = fusion + (group,)
            if split_indices[group - 1] <= i + 1:
                if group >= len(split_indices):
                    continue
                group = group + 1
        index = tuple(range(1, length + 1))
    else:
        fusion = (1,) * length
        index = (0,) * length
    opt_list = ()
    for i in range(length):
        opt = AllReduce('sum', GlobalComm.WORLD_COMM_GROUP)
        opt.add_prim_attr('fusion', fusion[i])
        opt.add_prim_attr('index', index[i])
        opt_list = opt_list + (opt,)
    return opt_list
def epoch_end(self, run_context):
    """Epoch end."""
    self.aucMetric.clear()
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL):
        context.set_auto_parallel_context(strategy_ckpt_save_file="",
                                          strategy_ckpt_load_file="./strategy_train.ckpt")
    rank_id = 0
    if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL,
                         ParallelMode.DATA_PARALLEL):
        rank_id = get_rank()

    start_time = time.time()
    out = self.model.eval(self.eval_dataset, dataset_sink_mode=(not self.host_device_mix))
    end_time = time.time()
    eval_time = int(end_time - start_time)

    time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    out_str = "{} == Rank: {} == EvalCallBack model.eval(): {}; eval_time: {}s".\
        format(time_str, rank_id, out.values(), eval_time)
    print(out_str)
    self.eval_values = out.values()
    add_write(self.eval_file_name, out_str)
def get_event_file_name(prefix, suffix, time_second):
    """
    Create file name: file_prefix + EVENT_FILE_NAME_MARK + time(seconds) + "." + Hostname + file_suffix.

    Args:
        prefix (str): The prefix of file name.
        suffix (str): The suffix of file name.
        time_second (str): The time stamp of file name.

    Returns:
        String, the name of event log file.
    """
    Validator.check_str_by_regular(prefix)
    Validator.check_str_by_regular(suffix)
    file_name = ""
    hostname = platform.node()
    device_num = context.get_auto_parallel_context('device_num')
    device_id = context.get_context('device_id')
    if device_num > 1 or GlobalComm.WORLD_COMM_GROUP == 'nccl_world_group':
        # Notice:
        # In GPU distribute training scene, get_context('device_id') will not work,
        # so we use get_rank instead of get_context.
        device_id = get_rank()
    file_name = f'{file_name}{EVENT_FILE_NAME_MARK}{time_second}.{device_id}.{hostname}'
    if prefix is not None:
        file_name = prefix + file_name
    if suffix is not None:
        file_name = file_name + suffix
    return file_name
def __init__(self, network, optimizer, sens=1.0):
    super(BertTrainCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.sens = sens
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.clip_type = gradient_cfg.clip_type
    self.clip_value = gradient_cfg.clip_value
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.hyper_map = C.HyperMap()
    self.saved_params = self.weights.clone(prefix='saved')
    self.length = len(self.weights)
    self.quant_embedding_list = []
    self.quant_weight_list = []
    for i, key in enumerate(self.saved_params):
        if 'embedding_lookup' in key.name and 'min' not in key.name and 'max' not in key.name:
            self.quant_embedding_list.append(i)
        elif 'weight' in key.name and 'dense_1' not in key.name:
            self.quant_weight_list.append(i)
    self.quant_embedding_list_length = len(self.quant_embedding_list)
    self.quant_weight_list_length = len(self.quant_weight_list)
    self.quantize_embedding = QuantizeWeightCell(num_bits=network.embedding_bits,
                                                 compute_type=network.compute_type,
                                                 clip_value=network.weight_clip_value)
    self.quantize_weight = QuantizeWeightCell(num_bits=network.weight_bits,
                                              compute_type=network.compute_type,
                                              clip_value=network.weight_clip_value)
def __init__(self, network, optimizer, scale_update_cell=None,
             accumulation_steps=1, enable_global_norm=False):
    super(BertTrainAccumulateStepsWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.accumulation_steps = accumulation_steps
    self.enable_global_norm = enable_global_norm
    self.one = Tensor(np.array([1]).astype(np.int32))
    self.zero = Tensor(np.array([0]).astype(np.int32))
    self.local_step = Parameter(initializer(0, [1], mstype.int32), name="local_step")
    self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
    self.accu_overflow = Parameter(initializer(0, [1], mstype.int32), name="accu_overflow")
    self.loss = Parameter(initializer(0, [1], mstype.float32), name="accu_loss")
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.overflow_reducer = F.identity
    if self.is_distributed:
        self.overflow_reducer = P.AllReduce()
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.logical_or = P.LogicalOr()
    self.not_equal = P.NotEqual()
    self.select = P.Select()
    self.reshape = P.Reshape()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertTrainWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = F.identity
    self.degree = 1
    if self.reducer_flag:
        self.degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
    self.clip_type = gradient_cfg.clip_type
    self.clip_value = gradient_cfg.clip_value
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
    self.saved_params = self.weights.clone(prefix='saved')
    self.length = len(self.weights)
    self.quant_embedding_list = []
    self.quant_weight_list = []
    for i, key in enumerate(self.saved_params):
        if 'embedding_lookup' in key.name:
            self.quant_embedding_list.append(i)
        elif 'weight' in key.name and 'dense_1' not in key.name:
            self.quant_weight_list.append(i)
    self.quant_embedding_list_length = len(self.quant_embedding_list)
    self.quant_weight_list_length = len(self.quant_weight_list)
    self.quantize_embedding = QuantizeWeightCell(num_bits=network.embedding_bits,
                                                 compute_type=network.compute_type,
                                                 clip_value=network.weight_clip_value)
    self.quantize_weight = QuantizeWeightCell(num_bits=network.weight_bits,
                                              compute_type=network.compute_type,
                                              clip_value=network.weight_clip_value)
def __init__(self, network, optimizer, scale_sense):
    super(DFCNNCTCTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.optimizer = optimizer

    if isinstance(scale_sense, nn.Cell):
        self.loss_scaling_manager = scale_sense
        self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(), dtype=mstype.float32),
                                     name="scale_sense")
    elif isinstance(scale_sense, Tensor):
        if scale_sense.shape == (1,) or scale_sense.shape == ():
            self.scale_sense = Parameter(scale_sense, name='scale_sense')
        else:
            raise ValueError("The shape of scale_sense must be (1,) or (), but got {}".format(
                scale_sense.shape))
    else:
        raise TypeError("The scale_sense must be Cell or Tensor, but got {}".format(type(scale_sense)))

    self.network.set_grad()
    self.weights = ParameterTuple(network.trainable_params())
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode does not support: ", self.parallel_mode)
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
    self.addn = P.AddN()
    self.reshape = P.Reshape()
    self.hyper_map = C.HyperMap()
    self.less_equal = P.LessEqual()
    self.allreduce = P.AllReduce()
def test_reset_auto_parallel_context():
    context.reset_auto_parallel_context()
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context("gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    stage = auto_parallel_context().get_pipeline_stages()
    communi_parallel_mode = context.get_auto_parallel_context("communi_parallel_mode")

    assert device_num == 1
    assert global_rank == 0
    assert not gradients_mean
    assert gradient_fp32_sync
    assert parallel_mode == "stand_alone"
    assert not parameter_broadcast
    assert not device_num_is_set
    assert not parameter_broadcast_is_set
    assert stage == 1
    assert communi_parallel_mode == "all_group_parallel"
def __init__(self, network, optimizer, sens=1.0):
    super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.cast = ops.Cast()
    self.hyper_map = ops.HyperMap()
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertFinetuneCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.gpu_target = False
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_status = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))