Code Example #1
File: ssd.py Project: chncwang/mindspore
 def __init__(self, network, optimizer, sens=1.0, use_global_norm=False):
     super(TrainingWrapper, self).__init__(auto_prefix=False)
     self.network = network
     self.network.set_grad()
     self.weights = ms.ParameterTuple(network.trainable_params())
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     self.use_global_norm = use_global_norm
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         if auto_parallel_context().get_device_num_is_set():
             degree = context.get_auto_parallel_context("device_num")
         else:
             degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
     self.hyper_map = C.HyperMap()
Code Example #2
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deeper layers for numa and device_queue.
    One process works with only one rank_id. In the standalone scenario,
    rank_id may come from the env 'CUDA_VISIBLE_DEVICES'; in the distributed
    scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    numa_enable = False
    numa_enable_env = os.getenv("DATASET_ENABLE_NUMA", None)
    if numa_enable_env and numa_enable_env.strip() == 'True':
        numa_enable = True
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            rank_id = context.get_context("device_id")
        if numa_enable:
            _config.set_numa_enable(True)
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario, we'd better get rank info from env
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                if numa_enable:
                    _config.set_numa_enable(True)
                _config.set_rank_id(rank_id)
Code Example #3
File: _utils.py Project: zsangel378/mindspore
def _restore_auto_parallel_context():
    """restore auto parallel context"""
    global _parallel_mode
    global _device_num
    global _global_rank
    global _parameter_broadcast
    global _mirror_mean
    global _cast_before_mirror
    global _loss_repeated_mean
    global _communication_backend
    global _enable_all_reduce_fusion
    _set_auto_parallel_context(parallel_mode=_parallel_mode, device_num=_device_num, global_rank=_global_rank,
                               parameter_broadcast=_parameter_broadcast, mirror_mean=_mirror_mean,
                               cast_before_mirror=_cast_before_mirror, loss_repeated_mean=_loss_repeated_mean)
    auto_parallel_context().set_communication_backend(_communication_backend)
    auto_parallel_context().set_enable_all_reduce_fusion(_enable_all_reduce_fusion)
Code Example #4
 def __init__(self, parameters, mean=True, degree=None):
     super(DistributedGradReducer, self).__init__(auto_prefix=False)
     self.map_ = C.Map()
     if degree is None:
         self.degree = get_group_size()
     else:
         if not isinstance(degree, int) or degree <= 0:
             raise ValueError(
                 "Parameter 'degree' in DistributedGradReducer should large than 0 and be int"
             )
         self.degree = degree
     self.mean = mean
     self.allreduce_filter = tuple(x.layerwise_parallel is False
                                   for x in parameters)
     is_parallel_optimizer = context.get_auto_parallel_context(
         "enable_parallel_optimizer")
     split_indices = auto_parallel_context(
     ).get_all_reduce_fusion_split_indices()
     if is_parallel_optimizer and split_indices:
         self.split_fusion = True
         self.op_list = _init_allreduce_operators(len(parameters),
                                                  split_indices)
     else:
         self.split_fusion = False
         self.allreduce = AllReduce().add_prim_attr('fusion', 1)
     self.allgather = AllGather(GlobalComm.WORLD_COMM_GROUP)
     ps_filter = lambda x: x.is_param_ps
     self.ps_parameters = tuple(ps_filter(x) for x in parameters)
     self.enable_parameter_server = any(self.ps_parameters)
Code Example #5
File: config.py Project: chncwang/mindspore
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deeper layers for numa and device_queue.
    One process works with only one rank_id. In the standalone scenario,
    rank_id may come from the env 'CUDA_VISIBLE_DEVICES'; in the distributed
    scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            rank_id = context.get_context("device_id")
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario, we'd better get rank info from env
        env_rank_size = os.getenv("RANK_SIZE", None)
        env_rank_id = os.getenv("RANK_ID", None)
        if env_rank_size and env_rank_id:
            # Ascend only supports the multi-process scenario
            rank_size = int(env_rank_size.strip())
            rank_id = int(env_rank_id.strip())
            if rank_size > 1:
                _config.set_rank_id(rank_id)
            # Now single process under ascend mode doesn't support numa bind for performance consideration.
            if _config.get_numa_enable() is True and rank_size == 1:
                raise ValueError(
                    "single process under Ascend mode doesn't support numa bind for "
                    "performance consideration.")
Code Example #6
File: centerface.py Project: stjordanis/mindspore
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainingWrapper, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()  #False
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None

        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            if auto_parallel_context().get_device_num_is_set():
                degree = context.get_auto_parallel_context("device_num")
            else:
                degree = get_group_size()
            self.grad_reducer = nn.DistributedGradReducer(
                optimizer.parameters, mean, degree)

        self.hyper_map = C.HyperMap()
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.allreduce = P.AllReduce()
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
Code Example #7
 def __init__(self, G, generator, optimizer, sens=1.0):
     super(TrainOneStepG, self).__init__(auto_prefix=False)
     self.optimizer = optimizer
     self.G = G
     self.G.set_grad()
     self.G.set_train()
     self.G.D_A.set_grad(False)
     self.G.D_A.set_train(False)
     self.G.D_B.set_grad(False)
     self.G.D_B.set_train(False)
     self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.weights = ms.ParameterTuple(generator.trainable_params())
     self.net = WithLossCell(G)
     self.reducer_flag = False
     self.grad_reducer = None
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("gradients_mean")
         if auto_parallel_context().get_device_num_is_set():
             degree = context.get_auto_parallel_context("device_num")
         else:
             degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
Code Example #8
def test_lamb_split_fusion():
    """ test_Lamb_split_fusion """
    context.set_auto_parallel_context(parallel_mode="data_parallel",
                                      device_num=2,
                                      enable_parallel_optimizer=True)
    auto_parallel_context().set_all_reduce_fusion_split_indices([2, 4, 6, 8])
    inputs = Tensor(np.ones([32, 128]).astype(np.float32))
    label = Tensor(np.zeros([32, 768]).astype(np.float32))
    net = Net()
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = Lamb(net.trainable_params(), learning_rate=0.1)

    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, optimizer)
    _executor.compile(train_network, inputs, label)
    context.reset_auto_parallel_context()
Code Example #9
def test_AdamWeightDecayDynamicLR():
    """ test_AdamWeightDecayDynamicLR """
    auto_parallel_context().set_enable_parallel_optimizer(True)
    context.set_auto_parallel_context(parallel_mode="data_parallel",
                                      device_num=2)
    inputs = Tensor(np.ones([32, 128]).astype(np.float32))
    label = Tensor(np.zeros([32, 768]).astype(np.float32))
    net = Net()
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = AdamWeightDecayDynamicLR(net.trainable_params(),
                                         decay_steps=20,
                                         learning_rate=0.1)

    net_with_loss = WithLossCell(net, loss)
    train_network = TrainOneStepCell(net_with_loss, optimizer)
    _executor.compile(train_network, inputs, label)
Code Example #10
File: _utils.py Project: zsangel378/mindspore
def _checkpoint_auto_parallel_context():
    """checkpoint auto parallel context"""
    global _has_checkpointed
    if _has_checkpointed is True:
        return

    global _parallel_mode
    global _device_num
    global _global_rank
    global _parameter_broadcast
    global _mirror_mean
    global _cast_before_mirror
    global _loss_repeated_mean
    global _communication_backend
    global _enable_all_reduce_fusion
    _parallel_mode = auto_parallel_context().get_parallel_mode()
    _device_num = _get_device_num()
    _global_rank = _get_global_rank()
    _parameter_broadcast = auto_parallel_context().get_parameter_broadcast()
    _mirror_mean = auto_parallel_context().get_mirror_mean()
    _cast_before_mirror = auto_parallel_context().get_cast_before_mirror()
    _loss_repeated_mean = auto_parallel_context().get_loss_repeated_mean()
    _communication_backend = auto_parallel_context().get_communication_backend()
    _enable_all_reduce_fusion = auto_parallel_context().get_enable_all_reduce_fusion()
    _has_checkpointed = True
Code Example #11
def test_reset_auto_parallel_context():
    context.reset_auto_parallel_context()
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    mirror_mean = context.get_auto_parallel_context("mirror_mean")
    cast_before_mirror = context.get_auto_parallel_context("cast_before_mirror")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context("parameter_broadcast")
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    parameter_broadcast_is_set = auto_parallel_context().get_parameter_broadcast_is_set()
    assert device_num == 1
    assert global_rank == 0
    assert mirror_mean == False
    assert cast_before_mirror == True
    assert parallel_mode == "stand_alone"
    assert parameter_broadcast == False
    assert device_num_is_set == False
    assert parameter_broadcast_is_set == False
Code Example #12
File: management.py Project: zuoshou030/mindspore
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. HCCL or NCCL. It is required
    before the communication service can be used.

    Note:
        The full name of hccl is Huawei Collective Communication Library.
        The full name of nccl is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): Backend.

    Raises:
        TypeError: If backend_name is not a string.
        RuntimeError: If device target is invalid.
        RuntimeError: If backend is invalid or distributed init fails.
    """
    if MS_ROLE in ("MS_PSERVER", "MS_SCHED"):
        return
    if backend_name is None:
        device_target = context.get_context("device_target")
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))

    auto_parallel_context().set_communication_backend(backend_name)
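
Below is a minimal usage sketch for the init() function shown above. It is an illustration under stated assumptions, not part of the quoted source: it assumes a GPU target with NCCL available (e.g. the script is launched under mpirun) and that get_rank and get_group_size are imported from mindspore.communication.management.

from mindspore import context
from mindspore.communication.management import init, get_rank, get_group_size

# The backend is inferred from the device target: "GPU" -> "nccl", "Ascend" -> "hccl".
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init()
print("rank", get_rank(), "of", get_group_size())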
Code Example #13
def test_reset_auto_parallel_context():
    context.reset_auto_parallel_context()
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context(
        "gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context(
        "parameter_broadcast")
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    parameter_broadcast_is_set = auto_parallel_context(
    ).get_parameter_broadcast_is_set()
    assert device_num == 1
    assert global_rank == 0
    assert not gradients_mean
    assert gradient_fp32_sync
    assert parallel_mode == "stand_alone"
    assert not parameter_broadcast
    assert not device_num_is_set
    assert not parameter_broadcast_is_set
Code Example #14
def train_common(net):
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    device_num = 4
    auto_parallel_context().set_enable_all_reduce_fusion(enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(device_num=device_num, parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)

    predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    allreduce_fusion_dict = _executor._get_allreduce_fusion(model._train_network)

    print(allreduce_fusion_dict)
    return allreduce_fusion_dict
Code Example #15
def train_process(q, device_id, epoch_size, num_classes, device_num,
                  batch_size, enable_hccl):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    context.set_context(enable_hccl=enable_hccl)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()
    context.set_context(mode=context.GRAPH_MODE)
    net = resnet50(batch_size, num_classes)
    loss = CrossEntropyLoss()
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.01, 0.9)

    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    dataset = create_dataset(epoch_size,
                             training=True,
                             batch_size=batch_size,
                             rank_id=device_id,
                             rank_size=device_num,
                             enable_hccl=enable_hccl)
    batch_num = dataset.get_dataset_size()
    loss_cb = LossGet()
    model.train(epoch_size, dataset, callbacks=[loss_cb])
    q.put(loss_cb.get_loss())
Code Example #16
def _get_pipeline_group():
    rank = get_rank()
    stage_nums = auto_parallel_context().get_pipeline_stages()
    device_nums = get_group_size()
    per_stage_device_nums = device_nums // stage_nums
    stage_id = rank // per_stage_device_nums
    local_stage_rank_id = rank % per_stage_device_nums
    group = range(0, stage_nums)
    rank_list = [
        local_stage_rank_id + x * per_stage_device_nums for x in group
    ]
    rank_str_list = [
        str(local_stage_rank_id + x * per_stage_device_nums) for x in group
    ]
    rank_list_str = "-".join(rank_str_list)
    return rank_list, rank_list_str
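
A worked example of the index arithmetic in _get_pipeline_group, under assumed values (8 devices split into 2 pipeline stages, current rank 5); the snippet simply re-runs the math with plain integers:

# Assumed configuration: 8 devices, 2 pipeline stages -> 4 devices per stage.
rank = 5
stage_nums, device_nums = 2, 8
per_stage_device_nums = device_nums // stage_nums      # 4
local_stage_rank_id = rank % per_stage_device_nums     # 1
rank_list = [local_stage_rank_id + x * per_stage_device_nums
             for x in range(stage_nums)]                # [1, 5]
print(rank_list, "-".join(str(r) for r in rank_list))   # [1, 5] 1-5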
Code Example #17
def _get_model_parallel_group(mp):
    rank = get_rank()
    stage_nums = auto_parallel_context().get_pipeline_stages()
    device_nums = get_group_size()
    per_stage_device_nums = device_nums // stage_nums
    stage_id = rank // per_stage_device_nums
    local_stage_rank_id = rank % per_stage_device_nums
    index = local_stage_rank_id // mp
    group = range(0, mp)
    rank_str_list = [
        str(x + index * mp + stage_id * per_stage_device_nums) for x in group
    ]
    rank_list_str = "-".join(rank_str_list)
    rank_list = [
        x + index * mp + stage_id * per_stage_device_nums for x in group
    ]
    return rank_list, rank_list_str
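
Similarly, a worked example for _get_model_parallel_group under assumed values (16 devices, 2 pipeline stages, model-parallel size mp = 4, current rank 13):

# Assumed configuration: 16 devices, 2 stages (8 per stage), mp groups of 4.
rank, mp = 13, 4
stage_nums, device_nums = 2, 16
per_stage_device_nums = device_nums // stage_nums      # 8
stage_id = rank // per_stage_device_nums               # 1
local_stage_rank_id = rank % per_stage_device_nums     # 5
index = local_stage_rank_id // mp                      # 1
rank_list = [x + index * mp + stage_id * per_stage_device_nums
             for x in range(mp)]                        # [12, 13, 14, 15]
print(rank_list)  # rank 13's model-parallel group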
Code Example #18
def _init_device_info():
    """
    INTERNAL USE ONLY!
    rank_id needs to be passed into the deeper layers for numa and device_queue.
    One process works with only one rank_id. In the standalone scenario,
    rank_id may come from the env 'CUDA_VISIBLE_DEVICES'; in the distributed
    scenario, rank_id comes from _get_global_rank().
    """
    from mindspore import context
    from mindspore.parallel._auto_parallel_context import auto_parallel_context
    from mindspore.parallel._utils import _get_global_rank
    if context.get_context("device_target") == "GPU":
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_device_info:
                cuda_id = int(cuda_device_info.split(",")[0].strip())
                if cuda_id != rank_id:
                    rank_id = cuda_id
        _config.set_rank_id(rank_id)
Code Example #19
 def __init__(self, network, optimizer, sens=1.0):
     super(TrainingWrapper, self).__init__(auto_prefix=False)
     self.network = network
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.sens = sens
     self.reducer_flag = False
     self.grad_reducer = None
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
             ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     if self.reducer_flag:
         mean = context.get_auto_parallel_context("mirror_mean")
         if auto_parallel_context().get_device_num_is_set():
             degree = context.get_auto_parallel_context("device_num")
         else:
             degree = get_group_size()
         self.grad_reducer = nn.DistributedGradReducer(
             optimizer.parameters, mean, degree)
Code Example #20
def run_pretrain(args_opt):
    """pre-train bert"""
    global device_id
    global device_num
    global rank_id
    global job_id
    args_opt.device_id = device_id
    args_opt.device_num = device_num
    sync_dataset(args_opt.data_url)

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
                rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]"
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    print("Enable save checkpoint: ", args_opt.enable_save_ckpt)
    print("Rank ID: ", rank_id)
    if args_opt.enable_save_ckpt == "true" and rank_id % device_num == 0:
        print("Enable save checkpoint")
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
Code Example #21
File: _utils.py Project: zky001/mindspore
def _get_mirror_mean():
    return auto_parallel_context().get_mirror_mean()
Code Example #22
File: _utils.py Project: zky001/mindspore
def _get_parallel_mode():
    return auto_parallel_context().get_parallel_mode()
Code Example #23
File: _utils.py Project: noticeable/mindspore
def _get_pipeline_stages():
    """Get pipeline stages"""
    return auto_parallel_context().get_pipeline_stages()
Code Example #24
File: _utils.py Project: noticeable/mindspore
def _get_full_batch():
    """Get whether to use full_batch."""
    return auto_parallel_context().get_full_batch()
Code Example #25
File: _utils.py Project: noticeable/mindspore
def _get_gradients_mean():
    """Get if using gradients_mean."""
    return auto_parallel_context().get_gradients_mean()
Code Example #26
def _get_mirror_mean():
    """Get if using mirror_mean."""
    return auto_parallel_context().get_mirror_mean()
Code Example #27
def test_set_auto_parallel_context():
    context.set_auto_parallel_context(device_num=4,
                                      global_rank=3,
                                      gradients_mean=True,
                                      gradient_fp32_sync=False,
                                      parallel_mode="auto_parallel",
                                      parameter_broadcast=False)
    device_num = context.get_auto_parallel_context("device_num")
    global_rank = context.get_auto_parallel_context("global_rank")
    gradients_mean = context.get_auto_parallel_context("gradients_mean")
    gradient_fp32_sync = context.get_auto_parallel_context(
        "gradient_fp32_sync")
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    parameter_broadcast = context.get_auto_parallel_context(
        "parameter_broadcast")
    assert device_num == 4
    assert global_rank == 3
    assert gradients_mean
    assert not gradient_fp32_sync
    assert parallel_mode == "auto_parallel"
    assert not parameter_broadcast

    auto_parallel_context().set_device_num(4)
    device_num = auto_parallel_context().get_device_num()
    device_num_is_set = auto_parallel_context().get_device_num_is_set()
    assert device_num == 4
    assert device_num_is_set

    auto_parallel_context().set_global_rank(4)
    global_rank = auto_parallel_context().get_global_rank()
    assert global_rank == 4

    auto_parallel_context().set_gradients_mean(True)
    gradients_mean = auto_parallel_context().get_gradients_mean()
    assert gradients_mean

    auto_parallel_context().set_gradient_fp32_sync(False)
    gradient_fp32_sync = auto_parallel_context().get_gradient_fp32_sync()
    assert not gradient_fp32_sync

    parameter_broadcast_is_set = auto_parallel_context(
    ).get_parameter_broadcast_is_set()
    assert parameter_broadcast_is_set

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(device_num=0)

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(device_num=4097)

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(global_rank=-1)

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(parallel_mode="wrong_mode")

    with pytest.raises(ValueError):
        context.set_auto_parallel_context(global_rank=4096)

    with pytest.raises(ValueError):
        set_algo_parameters(tensor_slice_align_size=0)

    with pytest.raises(ValueError):
        set_algo_parameters(tensor_slice_align_size=1025)

    context.set_auto_parallel_context(enable_parallel_optimizer=True)
    assert context.get_auto_parallel_context("enable_parallel_optimizer")
    assert not auto_parallel_context().get_all_reduce_fusion_split_indices()
Code Example #28
File: train.py Project: wudenggang/mindspore
        damping_each_step.append(damping_here)

    current_step = global_step
    damping_each_step = np.array(damping_each_step).astype(np.float32)
    damping_now = damping_each_step[current_step:]
    return damping_now


if __name__ == '__main__':
    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(
            device_num=args_opt.device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [107], "hccl_world_groupsum1")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum2")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum3")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum4")
        auto_parallel_context().set_all_reduce_fusion_split_indices(
            [27], "hccl_world_groupsum5")

        init()

    epoch_size = config.epoch_size
    damping = get_model_damping(0, 0.03, 0.87, 50, 5004)
    net = resnet50(class_num=config.class_num,
                   damping=damping,
Code Example #29
File: train.py Project: inventionzhang/mindspore
device_id = int(os.getenv('DEVICE_ID'))

context.set_context(mode=context.GRAPH_MODE,
                    device_target="Ascend",
                    save_graphs=False)
context.set_context(enable_task_sink=True, device_id=device_id)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(
            device_num=args_opt.device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)

    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 repeat_num=epoch_size,
                                 batch_size=config.batch_size)
        step_size = dataset.get_dataset_size()

        loss_scale = FixedLossScaleManager(config.loss_scale,
                                           drop_overflow_update=False)
Code Example #30
File: _utils.py Project: noticeable/mindspore
def _get_parallel_mode():
    """Get parallel mode."""
    return auto_parallel_context().get_parallel_mode()