Code example #1
def init_resource(config=None):
    """Initialize NPU resource"""
    # Fall back to a default ConfigProto when no valid config object is supplied
    if not isinstance(config, config_pb2.ConfigProto):
        config = config_pb2.ConfigProto()

    # Reuse an existing NpuOptimizer entry in the rewriter config, or add one below
    npu_optimizer = None
    for custom_optimizer in config.graph_options.rewrite_options.custom_optimizers:
        if custom_optimizer.name == 'NpuOptimizer':
            npu_optimizer = custom_optimizer
            break
    if not npu_optimizer:
        npu_optimizer = config.graph_options.rewrite_options.custom_optimizers.add()
        npu_optimizer.name = 'NpuOptimizer'
        config.graph_options.rewrite_options.memory_optimization = RewriterConfig.OFF

    config.allow_soft_placement = True
    config.log_device_placement = False
    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
    config.graph_options.optimizer_options.global_jit_level = config_pb2.OptimizerOptions.OFF

    util.global_dict_init()
    # Ops that bring up and tear down the NPU system; the shutdown op is returned to the caller
    npu_init = npu_ops.initialize_system()
    npu_shutdown = npu_ops.shutdown_system()

    sess = session.Session(config=config)
    sess.run(npu_init)
    # Cache the rank information so it can be read back through util later
    npu_rank_id = get_rank_id()
    npu_local_rank_id = get_local_rank_id()
    npu_rank_size = get_rank_size()
    npu_local_rank_size = get_local_rank_size()
    util.set_value("npu_rank_id", npu_rank_id)
    util.set_value("npu_local_rank_id", npu_local_rank_id)
    util.set_value("npu_rank_size", npu_rank_size)
    util.set_value("npu_local_rank_size", npu_local_rank_size)
    return sess, npu_shutdown
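A minimal usage sketch for the returned pair (the training step is a placeholder; only the shutdown-then-close sequence follows from the ops created above):

# Minimal sketch, assuming init_resource() from above; the training body is a placeholder.
sess, npu_shutdown = init_resource()
try:
    pass  # run training ops with sess.run(...) here
finally:
    sess.run(npu_shutdown)  # release the NPU
    sess.close()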
Code example #2
def open(device_id=None):
    """Initiate and return a NPU device handle"""
    if device_id is None:
        device_id = int(os.getenv("ASCEND_DEVICE_ID", '0'))

    with _npu_ctx_lock:
        if not isinstance(context.context(), _ContextWithDefaultDevice):
            ctx = _ContextWithDefaultDevice()
            ctx.ensure_initialized()
            context._set_context(ctx)
            _npu_device_instances.clear()  # Global context has changed since the last NPU init

        if device_id in _npu_device_instances:
            logging.info('Npu instance on device %s already created',
                         str(device_id))
            return _npu_device_instances.get(device_id)

        if _npu_device_instances:
            raise RuntimeError(
                'Failed to create npu instance on device {} as an instance already exists on {}'.format(
                    device_id, list(_npu_device_instances.keys())))

        global_kw_options = global_options().as_dict()
        workers_num = int(os.getenv('RANK_SIZE', '1'))
        if workers_num > 1:
            env_rank_table = os.getenv("RANK_TABLE_FILE")
            env_worker_id = os.getenv('RANK_ID')
            if not env_rank_table:
                raise RuntimeError(
                    'You must specify a rank table file by setting env RANK_TABLE_FILE in distributed mode'
                )

            if not env_worker_id:
                raise RuntimeError(
                    'You must specify a rank id by setting env RANK_ID in distributed mode'
                )

            global_kw_options['_distribute.rank_table'] = env_rank_table
            global_kw_options['_distribute.rank_id'] = env_worker_id

        device_options = {}
        error_message = _npu_device_backends.Open(context.context()._handle,
                                                  NPU, device_id,
                                                  global_kw_options,
                                                  device_options)
        if error_message:
            raise RuntimeError("Failed to open npu device %s : %s" %
                               (str(device_id), error_message))

        if workers_num > 1:
            from hccl.manage.api import get_rank_id
            worker_id = get_rank_id()
        else:
            worker_id = 0

        _npu_device_instances[device_id] = NpuDeviceHandle(
            context.context(), device_id, device_options, workers_num,
            worker_id)
        return _npu_device_instances[device_id]
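A small sketch of calling open() twice; the environment values are arbitrary examples, and the caching behaviour is the one implemented above:

import os

# Arbitrary example values: open() falls back to ASCEND_DEVICE_ID when
# device_id is None, and RANK_SIZE=1 avoids the distributed branch.
os.environ.setdefault("ASCEND_DEVICE_ID", "0")
os.environ.setdefault("RANK_SIZE", "1")

handle = open()    # creates the NpuDeviceHandle for device 0
cached = open(0)   # second call returns the cached instance
assert handle is cached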
Code example #3
File: trainer_torch.py Project: ylfzr/vega
    def _init_distributed_setting(self):
        if not self.distributed:
            return
        self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
        self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
        self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
Code example #4
File: trainer.py Project: qianrenjian/xingtian
    def _init_distributed_setting(self):
        if not self.distributed:
            return
        if zeus.is_npu_device():
            self.npu_init = npu_ops.initialize_system()
            self.npu_shutdown = npu_ops.shutdown_system()
            self.sess.run(self.npu_init)
        self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
        self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
        self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
Code example #5
File: npu_init.py Project: Judithsq/tensorflow
def init_resource():
    """Initialize NPU resources and return the session together with the shutdown op"""
    util.global_dict_init()
    npu_init = npu_ops.initialize_system()
    npu_shutdown = npu_ops.shutdown_system()
    config = config_pb2.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)
    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
    custom_op.name = "NpuOptimizer"
    config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
    sess = session.Session(config=config)
    sess.run(npu_init)
    npu_rank_id = get_rank_id()
    npu_local_rank_id = get_local_rank_id()
    npu_rank_size = get_rank_size()
    util.set_value("npu_rank_id", npu_rank_id)
    util.set_value("npu_local_rank_id", npu_local_rank_id)
    util.set_value("npu_rank_size", npu_rank_size)
    return sess, npu_shutdown
Code example #6
File: npu_plugin.py Project: Ascend/tensorflow
def rdma_remote_register(remote_var_list):
    """
    remote_var_list: embedding and optimizer var list.
    """
    if not isinstance(remote_var_list, (tuple, list)):
        raise ValueError('{} should be tuple or list'.format(remote_var_list))
    var_addr_list = []
    local_rank_size = get_local_rank_size()
    rank_id = get_rank_id()
    server_id = int(rank_id / local_rank_size)
    for var in remote_var_list:
        for line in var:
            # Keep only the entries hosted on this server
            var_server_id = int(line[0] / local_rank_size)
            if server_id == var_server_id:
                host_var_info = tf_adapter.HostVarInfo()
                host_var_info.base_addr = line[1]
                host_var_info.var_size = line[2]
                var_addr_list.append(host_var_info)
    res = tf_adapter.RegistRdmaRemoteAddr(var_addr_list)
    if res != 0:
        raise RuntimeError('rdma remote register failed')
Code example #7
    def _init_distributed_setting(self):
        if not self.distributed:
            return

        if zeus.is_npu_device():
            from npu_bridge.estimator import npu_ops
            self.npu_init = npu_ops.initialize_system()
            self.npu_shutdown = npu_ops.shutdown_system()
            self.sess.run(self.npu_init)

        import horovod.tensorflow as hvd
        if zeus.is_gpu_device():
            self._world_size = hvd.size()
            self._rank_id = hvd.rank()
            self._local_rank_id = hvd.local_rank()
        elif zeus.is_npu_device():
            from hccl.manage.api import get_local_rank_id
            from hccl.manage.api import get_rank_size
            from hccl.manage.api import get_rank_id
            self._world_size = get_rank_size()
            self._rank_id = get_rank_id()
            self._local_rank_id = get_local_rank_id()
Code example #8
File: npu_plugin.py Project: Ascend/tensorflow
def rdma_remote_init(remote_var_list, mem_size):
    """
    remote_var_list: embedding and optimizer var list.
    mem_size: RDMA memory pool size to be allocated. type: int
    """
    if not isinstance(remote_var_list, (tuple, list)):
        raise ValueError('{} should be tuple or list'.format(remote_var_list))
    if not isinstance(mem_size, int):
        raise ValueError('{} should be int'.format(mem_size))
    var_addr_list = []
    local_rank_size = get_local_rank_size()
    rank_id = get_rank_id()
    server_id = int(rank_id / local_rank_size)
    for var in remote_var_list:
        # Pick this server's entry for each variable
        server_var = var[server_id]
        host_var_info = tf_adapter.HostVarInfo()
        host_var_info.base_addr = server_var[1]
        host_var_info.var_size = server_var[2]
        var_addr_list.append(host_var_info)
    res = tf_adapter.RdmaInitAndRegister(var_addr_list, mem_size)
    if res != 0:
        raise RuntimeError('rdma init and register failed')
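For reference, a hypothetical shape of remote_var_list as implied by the indexing above (var[server_id] and the [1]/[2] fields); all rank ids, addresses and sizes below are placeholder values, not real ones:

# Hypothetical layout inferred from the indexing in rdma_remote_init;
# each variable has one (rank_id, base_addr, var_size) entry per server.
embedding_var = [
    (0, 0x7f0000000000, 4096),  # entry for server 0
    (8, 0x7f0000100000, 4096),  # entry for server 1
]
opt_var = [
    (0, 0x7f0000200000, 2048),
    (8, 0x7f0000300000, 2048),
]
remote_var_list = [embedding_var, opt_var]
rdma_remote_init(remote_var_list, mem_size=16 * 1024 * 1024)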
Code example #9
File: npu_strategy.py Project: Judithsq/tensorflow
    def _experimental_distribute_dataset(self, dataset):
        # Give each rank a disjoint shard of the dataset
        return dataset.shard(get_rank_size(), get_rank_id())
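For context, Dataset.shard(num_shards, index) keeps every num_shards-th element starting at index, so each rank reads a disjoint slice of the input. A standalone tf.data illustration (the 4-way split is an arbitrary example):

import tensorflow as tf

# With 4 shards, shard index 1 keeps elements 1, 5 and 9 of range(12).
dataset = tf.data.Dataset.range(12)
worker_shard = dataset.shard(num_shards=4, index=1)
print(list(worker_shard.as_numpy_iterator()))  # [1, 5, 9]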