def __init__(self, network, optimizer, sens=1.0, use_global_norm=False):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.weights = ms.ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.use_global_norm = use_global_norm
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
    self.hyper_map = C.HyperMap()
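# Hedged sketch (assumed, not from the original snippet) of the construct()
# that typically pairs with the __init__ above. Assumes the usual P/F aliases
# for mindspore.ops.operations/functional next to the C alias already in scope.
def construct(self, *args):
    weights = self.weights
    loss = self.network(*args)
    # sens_param=True above, so feed the gradient sensitivity explicitly.
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    grads = self.grad(self.network, weights)(*args, sens)
    if self.reducer_flag:
        # Average gradients across devices in data/hybrid parallel modes.
        grads = self.grad_reducer(grads)
    if self.use_global_norm:
        grads = C.clip_by_global_norm(grads, 1.0)
    return F.depend(loss, self.optimizer(grads))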
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into deeper layers for numa and device_queue.
    One process works with only one rank_id. In the standalone scenario,
    rank_id may come from the env 'CUDA_VISIBLE_DEVICES'; in the distributed
    scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    numa_enable = False
    numa_enable_env = os.getenv("DATASET_ENABLE_NUMA", None)
    if numa_enable_env and numa_enable_env.strip() == 'True':
        numa_enable = True
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            rank_id = context.get_context("device_id")
        if numa_enable:
            _config.set_numa_enable(True)
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario: prefer rank info from the environment.
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario.
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                if numa_enable:
                    _config.set_numa_enable(True)
                _config.set_rank_id(rank_id)
def _restore_auto_parallel_context():
    """Restore auto parallel context."""
    global _parallel_mode
    global _device_num
    global _global_rank
    global _parameter_broadcast
    global _mirror_mean
    global _cast_before_mirror
    global _loss_repeated_mean
    global _communication_backend
    global _enable_all_reduce_fusion
    _set_auto_parallel_context(parallel_mode=_parallel_mode,
                               device_num=_device_num,
                               global_rank=_global_rank,
                               parameter_broadcast=_parameter_broadcast,
                               mirror_mean=_mirror_mean,
                               cast_before_mirror=_cast_before_mirror,
                               loss_repeated_mean=_loss_repeated_mean)
    auto_parallel_context().set_communication_backend(_communication_backend)
    auto_parallel_context().set_enable_all_reduce_fusion(_enable_all_reduce_fusion)
def __init__(self, parameters, mean=True, degree=None):
    super(DistributedGradReducer, self).__init__(auto_prefix=False)
    self.map_ = C.Map()
    if degree is None:
        self.degree = get_group_size()
    else:
        if not isinstance(degree, int) or degree <= 0:
            raise ValueError("Parameter 'degree' in DistributedGradReducer "
                             "should be a positive int")
        self.degree = degree
    self.mean = mean
    self.allreduce_filter = tuple(x.layerwise_parallel is False for x in parameters)
    is_parallel_optimizer = context.get_auto_parallel_context("enable_parallel_optimizer")
    split_indices = auto_parallel_context().get_all_reduce_fusion_split_indices()
    if is_parallel_optimizer and split_indices:
        self.split_fusion = True
        self.op_list = _init_allreduce_operators(len(parameters), split_indices)
    else:
        self.split_fusion = False
        self.allreduce = AllReduce().add_prim_attr('fusion', 1)
    self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
    ps_filter = lambda x: x.is_param_ps
    self.ps_parameters = tuple(ps_filter(x) for x in parameters)
    self.enable_parameter_server = any(self.ps_parameters)
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into deeper layers for numa and device_queue.
    One process works with only one rank_id. In the standalone scenario,
    rank_id may come from the env 'CUDA_VISIBLE_DEVICES'; in the distributed
    scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            rank_id = context.get_context("device_id")
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario: prefer rank info from the environment.
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario.
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                _config.set_rank_id(rank_id)
            # A single process under Ascend mode doesn't support numa bind,
            # for performance considerations.
            if _config.get_numa_enable() is True and rank_size == 1:
                raise ValueError("A single process under Ascend mode doesn't support "
                                 "numa bind, for performance considerations.")
def __init__(self, network, optimizer, sens=1.0):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
    self.hyper_map = C.HyperMap()
    self.alloc_status = NPUAllocFloatStatus()
    self.get_status = NPUGetFloatStatus()
    self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
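# Hedged sketch (assumed pattern, not from the original snippet) of how the
# float-status primitives above are typically used in the matching construct():
# clear the NPU overflow register, run forward/backward, then read it back.
# Assumes F is the usual alias for mindspore.ops.functional.
def construct(self, *args):
    init = self.alloc_status()
    self.clear_status(init)                      # reset overflow flags
    loss = self.network(*args)
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    grads = self.grad(self.network, self.weights)(*args, sens)
    if self.reducer_flag:
        grads = self.grad_reducer(grads)
    self.get_status(init)                        # latch overflow flags
    flag_sum = self.reduce_sum(init, (0,))
    if self.is_distributed:
        flag_sum = self.allreduce(flag_sum)      # overflow on any device counts
    overflow = self.less_equal(self.base, flag_sum)  # base(1) <= sum => overflow
    return F.depend(loss, self.optimizer(grads)), overflow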
def __init__(self, G, generator, optimizer, sens=1.0):
    super(TrainOneStepG, self).__init__(auto_prefix=False)
    self.optimizer = optimizer
    self.G = G
    self.G.set_grad()
    self.G.set_train()
    self.G.D_A.set_grad(False)
    self.G.D_A.set_train(False)
    self.G.D_B.set_grad(False)
    self.G.D_B.set_train(False)
    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.weights = ms.ParameterTuple(generator.trainable_params())
    self.net = WithLossCell(G)
    self.reducer_flag = False
    self.grad_reducer = None
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
def test_lamb_split_fusion():
    """test_Lamb_split_fusion"""
    context.set_auto_parallel_context(parallel_mode="data_parallel",
                                      device_num=2,
                                      enable_parallel_optimizer=True)
    auto_parallel_context().set_all_reduce_fusion_split_indices([2, 4, 6, 8])
    inputs = Tensor(np.ones([32, 128]).astype(np.float32))
    label = Tensor(np.zeros([32, 768]).astype(np.float32))
    net = Net()
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = Lamb(net.trainable_params(), learning_rate=0.1)
    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, optimizer)
    _executor.compile(train_network, inputs, label)
    context.reset_auto_parallel_context()
def test_AdamWeightDecayDynamicLR():
    """test_AdamWeightDecayDynamicLR"""
    auto_parallel_context().set_enable_parallel_optimizer(True)
    context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=2)
    inputs = Tensor(np.ones([32, 128]).astype(np.float32))
    label = Tensor(np.zeros([32, 768]).astype(np.float32))
    net = Net()
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = AdamWeightDecayDynamicLR(net.trainable_params(),
                                         decay_steps=20,
                                         learning_rate=0.1)
    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, optimizer)
    _executor.compile(train_network, inputs, label)
def _checkpoint_auto_parallel_context():
    """Checkpoint auto parallel context."""
    global _has_checkpointed
    if _has_checkpointed is True:
        return
    global _parallel_mode
    global _device_num
    global _global_rank
    global _parameter_broadcast
    global _mirror_mean
    global _cast_before_mirror
    global _loss_repeated_mean
    global _communication_backend
    global _enable_all_reduce_fusion
    _parallel_mode = auto_parallel_context().get_parallel_mode()
    _device_num = _get_device_num()
    _global_rank = _get_global_rank()
    _parameter_broadcast = auto_parallel_context().get_parameter_broadcast()
    _mirror_mean = auto_parallel_context().get_mirror_mean()
    _cast_before_mirror = auto_parallel_context().get_cast_before_mirror()
    _loss_repeated_mean = auto_parallel_context().get_loss_repeated_mean()
    _communication_backend = auto_parallel_context().get_communication_backend()
    _enable_all_reduce_fusion = auto_parallel_context().get_enable_all_reduce_fusion()
    _has_checkpointed = True
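# Hedged usage sketch: the checkpoint/restore pair (see also
# _restore_auto_parallel_context earlier in this collection) brackets code
# that temporarily mutates the global auto parallel context.
# run_some_parallel_test is a hypothetical placeholder.
_checkpoint_auto_parallel_context()   # snapshot the current settings once
context.set_auto_parallel_context(parallel_mode="data_parallel", device_num=8)
run_some_parallel_test()              # hypothetical body that needs the context
_restore_auto_parallel_context()      # put the saved settings back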
def test_reset_auto_parallel_context():
    context.reset_auto_parallel_context()
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    mirror_mean = context.get_auto_parallel_context("mirror_mean")
    cast_before_mirror = context.get_auto_parallel_context("cast_before_mirror")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    assert device_num == 1
    assert global_rank == 0
    assert mirror_mean == False
    assert cast_before_mirror == True
    assert parallel_mode == "stand_alone"
    assert parameter_broadcast == False
    assert device_num_is_set == False
    assert parameter_broadcast_is_set == False
def init(backend_name=None):
    """
    Init distributed backend, e.g. hccl/nccl; it is required before the
    communication service can be used.

    Note:
        The full name of hccl is Huawei Collective Communication Library.
        The full name of nccl is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): Backend.

    Raises:
        TypeError: If backend_name is not a string.
        RuntimeError: If device target is invalid.
        RuntimeError: If backend is invalid or distributed init fails.
    """
    if MS_ROLE in ("MS_PSERVER", "MS_SCHED"):
        return
    if backend_name is None:
        device_target = context.get_context("device_target")
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError("Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(type(backend_name)))
    if backend_name == "hccl":
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
    else:
        raise RuntimeError("Backend name {} is not supported.".format(backend_name))
    auto_parallel_context().set_communication_backend(backend_name)
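# Hedged usage sketch (not part of the snippet above): a typical call site for
# init(). Assumes the public MindSpore 1.x API and module paths.
from mindspore import context
from mindspore.communication.management import init, get_rank, get_group_size

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init()  # device_target is GPU, so the backend defaults to "nccl"
context.set_auto_parallel_context(parallel_mode="data_parallel",
                                  device_num=get_group_size(),
                                  gradients_mean=True)
print("global rank:", get_rank())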
def test_reset_auto_parallel_context():
    context.reset_auto_parallel_context()
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context("gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    assert device_num == 1
    assert global_rank == 0
    assert not gradients_mean
    assert gradient_fp32_sync
    assert parallel_mode == "stand_alone"
    assert not parameter_broadcast
    assert not device_num_is_set
    assert not parameter_broadcast_is_set
def train_common(net):
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    device_num = 4
    auto_parallel_context().set_enable_all_reduce_fusion(enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(device_num=device_num, parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)
    predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
    allreduce_fusion_dict = _executor._get_allreduce_fusion(model._train_network)
    print(allreduce_fusion_dict)
    return allreduce_fusion_dict
def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    context.set_context(enable_hccl=enable_hccl)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    context.set_context(mode=context.GRAPH_MODE)
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
    dataset = create_dataset(epoch_size, training=True, batch_size=batch_size,
                             rank_id=device_id, rank_size=device_num,
                             enable_hccl=enable_hccl)
    batch_num = dataset.get_dataset_size()
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[loss_cb])
    q.put(loss_cb.get_loss())
def _get_pipeline_group():
    rank = get_rank()
    stage_nums = auto_parallel_context().get_pipeline_stages()
    device_nums = get_group_size()
    per_stage_device_nums = device_nums // stage_nums
    stage_id = rank // per_stage_device_nums
    local_stage_rank_id = rank % per_stage_device_nums
    group = range(0, stage_nums)
    rank_list = [local_stage_rank_id + x * per_stage_device_nums for x in group]
    rank_str_list = [str(local_stage_rank_id + x * per_stage_device_nums) for x in group]
    rank_list_str = "-".join(rank_str_list)
    return rank_list, rank_list_str
def _get_model_parallel_group(mp):
    rank = get_rank()
    stage_nums = auto_parallel_context().get_pipeline_stages()
    device_nums = get_group_size()
    per_stage_device_nums = device_nums // stage_nums
    stage_id = rank // per_stage_device_nums
    local_stage_rank_id = rank % per_stage_device_nums
    index = local_stage_rank_id // mp
    group = range(0, mp)
    rank_str_list = [str(x + index * mp + stage_id * per_stage_device_nums) for x in group]
    rank_list_str = "-".join(rank_str_list)
    rank_list = [x + index * mp + stage_id * per_stage_device_nums for x in group]
    return rank_list, rank_list_str
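# Hedged worked example (pure Python, no MindSpore needed) of the rank
# arithmetic shared by _get_pipeline_group and _get_model_parallel_group.
# The concrete numbers (8 devices, 2 pipeline stages, mp=2, rank 5) are
# illustrative assumptions.
device_nums, stage_nums, mp, rank = 8, 2, 2, 5
per_stage_device_nums = device_nums // stage_nums   # 4 devices per stage
stage_id = rank // per_stage_device_nums            # rank 5 sits in stage 1
local_stage_rank_id = rank % per_stage_device_nums  # and is rank 1 inside it
# Pipeline group: the same local rank in every stage -> [1, 5]
pipeline_group = [local_stage_rank_id + x * per_stage_device_nums
                  for x in range(stage_nums)]
# Model-parallel group: mp consecutive ranks inside this stage -> [4, 5]
index = local_stage_rank_id // mp
mp_group = [x + index * mp + stage_id * per_stage_device_nums
            for x in range(mp)]
assert pipeline_group == [1, 5] and mp_group == [4, 5]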
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into deeper layers for numa and device_queue.
    One process works with only one rank_id. In the standalone scenario,
    rank_id may come from the env 'CUDA_VISIBLE_DEVICES'; in the distributed
    scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_device_info:
                cuda_id = int(cuda_device_info.split(",")[0].strip())
                if cuda_id != rank_id:
                    rank_id = cuda_id
        _config.set_rank_id(rank_id)
def __init__(self, network, optimizer, sens=1.0):
    super(TrainingWrapper, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.grad_reducer = None
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        if auto_parallel_context().get_device_num_is_set():
            degree = context.get_auto_parallel_context("device_num")
        else:
            degree = get_group_size()
        self.grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree)
def run_pretrain(args_opt):
    """Pre-train BERT."""
    global device_id
    global device_num
    global rank_id
    global job_id
    args_opt.device_id = device_id
    args_opt.device_num = device_num
    sync_dataset(args_opt.data_url)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(rank) + '/'
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1
    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32
    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank,
                                               args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir, args_opt.schema_dir)
    if args_opt.train_steps > 0:
        new_repeat_count = min(new_repeat_count,
                               args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)
    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError("Don't support optimizer {}, only support "
                         "[Lamb, Momentum, AdamWeightDecayDynamicLR]".format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    print("Enable save checkpoint: ", args_opt.enable_save_ckpt)
    print("Rank ID: ", rank_id)
    if args_opt.enable_save_ckpt == "true" and rank_id % device_num == 0:
        print("Enable save checkpoint")
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)
    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)
    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss,
                                                         optimizer=optimizer,
                                                         scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
    model = Model(netwithgrads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
def _get_mirror_mean():
    return auto_parallel_context().get_mirror_mean()

def _get_parallel_mode():
    return auto_parallel_context().get_parallel_mode()

def _get_pipeline_stages():
    """Get pipeline stages"""
    return auto_parallel_context().get_pipeline_stages()

def _get_full_batch():
    """Get whether to use full_batch."""
    return auto_parallel_context().get_full_batch()

def _get_gradients_mean():
    """Get if using gradients_mean."""
    return auto_parallel_context().get_gradients_mean()

def _get_mirror_mean():
    """Get if using mirror_mean."""
    return auto_parallel_context().get_mirror_mean()
def test_set_auto_parallel_context():
    context.set_auto_parallel_context(device_num=4,
                                      global_rank=3,
                                      gradients_mean=True,
                                      gradient_fp32_sync=False,
                                      parallel_mode="auto_parallel",
                                      parameter_broadcast=False)
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context("gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
    assert device_num == 4
    assert global_rank == 3
    assert gradients_mean
    assert not gradient_fp32_sync
    assert parallel_mode == "auto_parallel"
    assert not parameter_broadcast

    auto_parallel_context().set_device_num(4)
    device_num = auto_parallel_context().get_device_num()
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    assert device_num == 4
    assert device_num_is_set

    auto_parallel_context().set_global_rank(4)
    global_rank = auto_parallel_context().get_global_rank()
    assert global_rank == 4

    auto_parallel_context().set_gradients_mean(True)
    gradients_mean = auto_parallel_context().get_gradients_mean()
    assert gradients_mean

    auto_parallel_context().set_gradient_fp32_sync(False)
    gradient_fp32_sync = auto_parallel_context().get_gradient_fp32_sync()
    assert not gradient_fp32_sync

    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    assert parameter_broadcast_is_set

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(device_num=0)
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(device_num=4097)
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(global_rank=-1)
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(parallel_mode="wrong_mode")
    with pytest.raises(ValueError):
        context.set_auto_parallel_context(global_rank=4096)
    with pytest.raises(ValueError):
        set_algo_parameters(tensor_slice_align_size=0)
    with pytest.raises(ValueError):
        set_algo_parameters(tensor_slice_align_size=1025)

    context.set_auto_parallel_context(enable_parallel_optimizer=True)
    assert context.get_auto_parallel_context("enable_parallel_optimizer")
    assert not auto_parallel_context().get_all_reduce_fusion_split_indices()
        damping_each_step.append(damping_here)
    current_step = global_step
    damping_each_step = np.array(damping_each_step).astype(np.float32)
    damping_now = damping_each_step[current_step:]
    return damping_now


if __name__ == '__main__':
    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(device_num=args_opt.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True,
                                          parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([107], "hccl_world_groupsum1")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum2")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum3")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum4")
        auto_parallel_context().set_all_reduce_fusion_split_indices([27], "hccl_world_groupsum5")
        init()
    epoch_size = config.epoch_size
    damping = get_model_damping(0, 0.03, 0.87, 50, 5004)
    net = resnet50(class_num=config.class_num, damping=damping,
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
context.set_context(enable_task_sink=True, device_id=device_id)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(device_num=args_opt.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 repeat_num=epoch_size,
                                 batch_size=config.batch_size)
        step_size = dataset.get_dataset_size()
        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
def _get_parallel_mode():
    """Get parallel mode."""
    return auto_parallel_context().get_parallel_mode()