def _init_distributed_setting(self):
    if not self.distributed:
        return
    self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
    self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
    self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()

def _init_distributed_setting(self):
    if not self.distributed:
        return
    if zeus.is_npu_device():
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
    self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
    self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()

def run_remote_worker(worker_id, worker_path, id):
    """Run worker on remote machine."""
    from zeus.common.utils import init_log
    init_log(level="info",
             log_file=".temp_{}.log".format(worker_id),
             log_path=worker_path)
    config = _load_config(worker_id, worker_path, id)
    os.environ["LD_LIBRARY_PATH"] = config["env"]["LD_LIBRARY_PATH"]
    os.environ["PWD"] = config["env"]["PWD"]
    os.chdir(os.environ["PWD"])
    zeus.register_zeus(os.environ['BACKEND_TYPE'].lower())
    if zeus.is_gpu_device():
        sub_pid_list = call_in_gpu(config, id, worker_id, worker_path)
    elif zeus.is_npu_device():
        os.environ["PYTHONPATH"] = config["env"]["PYTHONPATH"]
        os.environ["PATH"] = config["env"]["PATH"]
        os.environ["ASCEND_OPP_PATH"] = config["env"]["ASCEND_OPP_PATH"]
        sub_pid_list = call_in_npu(config, id, worker_id, worker_path)
    logging.info("DistributedWorker finished!")
    for sub_pid in sub_pid_list:
        kill_proc_tree(pid=sub_pid)
    logging.info("DistributedWorker subprocess cleaned!")
    return 0

def _init_tf_estimator(self):
    """Init tensorflow estimator."""
    sess_config = self._init_session_config()
    if zeus.is_gpu_device():
        self._init_gpu_estimator(sess_config)
    elif zeus.is_npu_device():
        self._init_npu_estimator(sess_config)

def __call__(self, model=None, distributed=False):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param distributed: use distributed
    :return: optimizer
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
    optimizer = None
    try:
        if zeus.is_torch_backend():
            learnable_params = [param for param in model.parameters() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer,
                                                     named_parameters=model.named_parameters(),
                                                     compression=hvd.Compression.none)
        elif zeus.is_tf_backend():
            optimizer = dynamic_optimizer(self.optim_cls, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() \
                    else NPUDistributedOptimizer(optimizer)
        elif zeus.is_ms_backend():
            learnable_params = [param for param in model.trainable_params() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
        return optimizer
    except Exception as ex:
        logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
        raise ex

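# A minimal usage sketch (an assumption, not from the source): what the torch
# branch above effectively does once self.optim_cls and params have been
# resolved from config. The toy model and SGD params are illustrative stand-ins.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
params = {"lr": 0.01, "momentum": 0.9}
# Mirror of the torch branch: keep only trainable params, then build the optimizer.
learnable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(learnable_params, **params)
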
def _init_tf_estimator(self):
    """Init tensorflow estimator."""
    if not zeus.is_tf_backend():
        return
    sess_config = self._init_session_config()
    if zeus.is_gpu_device():
        self._init_gpu_estimator(sess_config)
    elif zeus.is_npu_device():
        self._init_npu_estimator(sess_config)

def exclude_ignore_index(self, logits, labels):
    """Ignore certain index."""
    logits = tf.transpose(logits, [0, 2, 3, 1])
    if zeus.is_gpu_device():
        indices = tf.where(tf.not_equal(labels, self.ignore_index))
        labels = tf.cast(tf.gather_nd(labels, indices), tf.int32)
        logits = tf.gather_nd(logits, indices)
        return logits, labels, 1.0
    elif zeus.is_npu_device():
        weights = tf.not_equal(labels, self.ignore_index)
        labels = tf.multiply(labels, tf.cast(weights, labels.dtype))
        return logits, labels, tf.to_float(weights)

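# Hedged sketch of how the (logits, labels, weights) triple returned above is
# typically consumed: the weights zero out ignored positions in the per-pixel
# loss. TF 1.x API; the helper name is an illustrative assumption.
import tensorflow as tf

def weighted_seg_loss(logits, labels, weights):
    """Cross-entropy that skips ignore-index positions via zero weights."""
    return tf.losses.sparse_softmax_cross_entropy(
        labels=labels, logits=logits, weights=weights)
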
def set_distributed(cls, optimizer, model=None):
    """Set distributed optimizer."""
    if zeus.is_torch_backend():
        optimizer = hvd.DistributedOptimizer(optimizer,
                                             named_parameters=model.named_parameters(),
                                             compression=hvd.Compression.none)
    elif zeus.is_tf_backend():
        optim_class = hvd.DistributedOptimizer if zeus.is_gpu_device() else NPUDistributedOptimizer
        optimizer = dynamic_distributed_optimizer(optim_class, optimizer)
    return optimizer

def _init_session_config(self):
    import tensorflow as tf
    if zeus.is_gpu_device():
        sess_config = tf.compat.v1.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        return sess_config
    elif zeus.is_npu_device():
        from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
        sess_config = tf.ConfigProto()
        sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
        custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
        custom_op.name = "NpuOptimizer"
        custom_op.parameter_map["use_off_line"].b = True
        return sess_config

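# Hedged sketch of where a session config like the ones above usually lands:
# passed to an Estimator through its RunConfig. The model_dir value is an
# illustrative assumption, shown here for the GPU branch.
import tensorflow as tf

sess_config = tf.compat.v1.ConfigProto()
sess_config.gpu_options.allow_growth = True
run_config = tf.estimator.RunConfig(session_config=sess_config,
                                    model_dir="/tmp/example_model_dir")
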
def _calc_workers_num(self):
    """Calculate the number of workers."""
    if not General.parallel_search:
        return 1
    if zeus.is_gpu_device():
        import torch
        world_size = General.env.world_size
        devices_per_node = torch.cuda.device_count()
        worker_num = (world_size * devices_per_node) // General.devices_per_trainer
    elif zeus.is_npu_device():
        world_devices = int(os.environ['RANK_SIZE'])
        worker_num = world_devices // General.devices_per_trainer
    return worker_num

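# Worked example of the arithmetic above, with illustrative numbers: 2 nodes
# with 8 GPUs each and 4 devices allotted per trainer gives (2 * 8) // 4 = 4
# parallel workers.
world_size = 2          # nodes in the cluster
devices_per_node = 8    # GPUs visible on each node
devices_per_trainer = 4
worker_num = (world_size * devices_per_node) // devices_per_trainer
assert worker_num == 4
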
def register_datasets(backend):
    """Import and register datasets automatically."""
    if backend == "pytorch":
        from . import pytorch
        from .common.auto_lane_datasets import AutoLaneConfig
    elif backend == "tensorflow":
        from . import tensorflow
        if zeus.is_gpu_device():
            from .common.auto_lane_datasets import AutoLaneConfig
    elif backend == "mindspore":
        import mindspore.dataset
        from . import mindspore
    from . import common
    from .transforms import register_transforms
    register_transforms(backend)

def _init_distributed_setting(self):
    if not self.distributed:
        return
    if zeus.is_npu_device():
        from npu_bridge.estimator import npu_ops
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    if zeus.is_gpu_device():
        # Import horovod only on GPU so NPU machines without it do not fail.
        import horovod.tensorflow as hvd
        self._world_size = hvd.size()
        self._rank_id = hvd.rank()
        self._local_rank_id = hvd.local_rank()
    elif zeus.is_npu_device():
        from hccl.manage.api import get_local_rank_id
        from hccl.manage.api import get_rank_size
        from hccl.manage.api import get_rank_id
        self._world_size = get_rank_size()
        self._rank_id = get_rank_id()
        self._local_rank_id = get_local_rank_id()

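# Hedged sketch (an assumption, not from the source) of the usual consumer of
# the rank fields initialized above: sharding the input pipeline so each
# worker reads a distinct slice of the data. TF Dataset API.
import tensorflow as tf

def shard_dataset(dataset, world_size, rank_id):
    """Give each of `world_size` workers its own shard of `dataset`."""
    return dataset.shard(num_shards=world_size, index=rank_id)
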
def register_transforms(backend):
    """Import and register transforms automatically."""
    import zeus
    if zeus.is_gpu_device():
        from .ImageTransform import ImageTransform
        from .Invert import Invert
        from .MaskTransform import MaskTransform
        from .Posterize import Posterize
        from .RandomCrop_pair import RandomCrop_pair
        from .RandomHorizontalFlip_pair import RandomHorizontalFlip_pair
        from .RandomMirrow_pair import RandomMirrow_pair
        from .RandomRotate90_pair import RandomRotate90_pair
        from .RandomVerticallFlip_pair import RandomVerticallFlip_pair
        from .Rotate import Rotate
        from .SegMapTransform import SegMapTransform
        from .Sharpness import Sharpness
        from .Shear_X import Shear_X
        from .Shear_Y import Shear_Y
        from .Solarize import Solarize
        from .Translate_X import Translate_X
        from .Translate_Y import Translate_Y
        from .RandomColor_pair import RandomColor_pair
        from .RandomGaussianBlur_pair import RandomGaussianBlur_pair
        from .RandomRotate_pair import RandomRotate_pair
        from .Rescale_pair import Rescale_pair
        from .Normalize_pair import Normalize_pair
        from .RandomHorizontalFlipWithBoxes import RandomHorizontalFlipWithBoxes
    if backend == "pytorch":
        from . import pytorch
        try:
            from mmdet.datasets.extra_aug import PhotoMetricDistortion, Expand, ExtraAugmentation
        except Exception:
            pass
    elif backend == "tensorflow":
        from . import tensorflow
    elif backend == "mindspore":
        pass

def _init_logging_hook(self):
    logging_hook = []
    if zeus.is_gpu_device() and self.distributed:
        import horovod.tensorflow as hvd
        logging_hook += [hvd.BroadcastGlobalVariablesHook(0)]
    return logging_hook

def _init_session_config(self):
    sess_config = self._init_gpu_session_config() if zeus.is_gpu_device() else \
        self._init_npu_session_config()
    return sess_config

def _init_logging_hook(self):
    logging_hook = []
    if zeus.is_gpu_device() and self.distributed:
        logging_hook += [hvd.BroadcastGlobalVariablesHook(0)]
    return logging_hook

from .avazu import AvazuDataset
from .cifar10 import Cifar10
from .cifar100 import Cifar100
import zeus
if zeus.is_gpu_device():
    from .cityscapes import Cityscapes
from .div2k import DIV2K
from .div2k_unpair import Div2kUnpair
from .fmnist import FashionMnist
from .imagenet import Imagenet
from .mnist import Mnist
from .sr_datasets import Set5, Set14, BSDS100
# from .auto_lane_datasets import AutoLaneDataset
from .cls_ds import ClassificationDataset
from .coco import CocoDataset
from .mrpc import MrpcDataset
# from .nasbench101 import Nasbench101
# from .nasbench201 import Nasbench201