Example 1
 def _init_distributed_setting(self):
     if not self.distributed:
         return
     self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
     self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
     self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
Example 2
 def _init_distributed_setting(self):
     if not self.distributed:
         return
     if zeus.is_npu_device():
         self.npu_init = npu_ops.initialize_system()
         self.npu_shutdown = npu_ops.shutdown_system()
         self.sess.run(self.npu_init)
     self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
     self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
     self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
Example 3
def run_remote_worker(worker_id, worker_path, id):
    """Run worker on remote mochine."""
    from zeus.common.utils import init_log
    init_log(level="info",
             log_file=".temp_{}.log".format(worker_id),
             log_path=worker_path)

    config = _load_config(worker_id, worker_path, id)
    os.environ["LD_LIBRARY_PATH"] = config["env"]["LD_LIBRARY_PATH"]
    os.environ["PWD"] = config["env"]["PWD"]
    os.chdir(os.environ["PWD"])
    zeus.register_zeus(os.environ['BACKEND_TYPE'].lower())

    if zeus.is_gpu_device():
        sub_pid_list = call_in_gpu(config, id, worker_id, worker_path)
    elif zeus.is_npu_device():
        os.environ["PYTHONPATH"] = config["env"]["PYTHONPATH"]
        os.environ["PATH"] = config["env"]["PATH"]
        os.environ["ASCEND_OPP_PATH"] = config["env"]["ASCEND_OPP_PATH"]
        sub_pid_list = call_in_npu(config, id, worker_id, worker_path)
    logging.info("DistributedWorker finished!")
    for sub_pid in sub_pid_list:
        kill_proc_tree(pid=sub_pid)
    logging.info("DistributedWorker subprocess cleaned!")
    return 0
Example 4
 def _init_tf_estimator(self):
     """Init tensorflow estimator."""
     sess_config = self._init_session_config()
     if zeus.is_gpu_device():
         self._init_gpu_estimator(sess_config)
     elif zeus.is_npu_device():
         self._init_npu_estimator(sess_config)
Example 5
    def __call__(self, model=None, distributed=False):
        """Call Optimizer class.

        :param model: model, used in torch case
        :param distributed: use distributed
        :return: optimizer
        """
        params = self.map_config.get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
        optimizer = None
        try:
            if zeus.is_torch_backend():
                learnable_params = [param for param in model.parameters() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer,
                                                         named_parameters=model.named_parameters(),
                                                         compression=hvd.Compression.none)
            elif zeus.is_tf_backend():
                optimizer = dynamic_optimizer(self.optim_cls, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                        NPUDistributedOptimizer(optimizer)
            elif zeus.is_ms_backend():
                learnable_params = [param for param in model.trainable_params() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
            raise ex
Example 6
 def _init_tf_estimator(self):
     """Init tensorflow estimator."""
     if not zeus.is_tf_backend():
         return
     sess_config = self._init_session_config()
     if zeus.is_gpu_device():
         self._init_gpu_estimator(sess_config)
     elif zeus.is_npu_device():
         self._init_npu_estimator(sess_config)
Example 7
 def exclude_ignore_index(self, logits, labels):
     """Ignore certain index."""
     logits = tf.transpose(logits, [0, 2, 3, 1])
     if zeus.is_gpu_device():
         indices = tf.where(tf.not_equal(labels, self.ignore_index))
         labels = tf.cast(tf.gather_nd(labels, indices), tf.int32)
         logits = tf.gather_nd(logits, indices)
         return logits, labels, 1.0
     elif zeus.is_npu_device():
         weights = tf.not_equal(labels, self.ignore_index)
         labels = tf.multiply(labels, tf.cast(weights, labels.dtype))
         return logits, labels, tf.to_float(weights)
Example 8
 def set_distributed(cls, optimizer, model=None):
     """Set distributed optimizer."""
     if zeus.is_torch_backend():
         optimizer = hvd.DistributedOptimizer(
             optimizer,
             named_parameters=model.named_parameters(),
             compression=hvd.Compression.none)
     elif zeus.is_tf_backend():
         optim_class = hvd.DistributedOptimizer if zeus.is_gpu_device() else NPUDistributedOptimizer
         optimizer = dynamic_distributed_optimizer(optim_class, optimizer)
     return optimizer
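For reference, a minimal standalone sketch of the same Horovod wrapping used in the torch branch, outside of Zeus (assumptions: horovod.torch is installed and the model/optimizer below are placeholders, not part of the original example):

import torch
import horovod.torch as hvd

hvd.init()  # one process per device; rank and size come from the launcher (e.g. horovodrun)
model = torch.nn.Linear(4, 2)                              # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)   # placeholder optimizer
# Wrap the optimizer so gradients are averaged across ranks before each step,
# mirroring the torch branch of set_distributed() above.
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters(),
                                     compression=hvd.Compression.none)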
Example 9
 def _init_session_config(self):
     import tensorflow as tf
     if zeus.is_gpu_device():
         sess_config = tf.compat.v1.ConfigProto()
         sess_config.gpu_options.allow_growth = True
         return sess_config
     elif zeus.is_npu_device():
         from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
         sess_config = tf.ConfigProto()
         sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
         custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
         custom_op.name = "NpuOptimizer"
         custom_op.parameter_map["use_off_line"].b = True
         return sess_config
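As a standalone illustration of the GPU branch above, a minimal sketch assuming TF 1.x-style sessions (as the example itself uses):

import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand rather than reserving it all upfront
with tf.compat.v1.Session(config=config) as sess:
    pass  # build and run graphs with this session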
Example 10
 def _calc_workers_num(self):
     """Calculate workers numbers."""
     if not General.parallel_search:
         return 1
     if zeus.is_gpu_device():
         import torch
         world_size = General.env.world_size
         devices_per_node = torch.cuda.device_count()
         worker_num = (world_size *
                       devices_per_node) // General.devices_per_trainer
     elif zeus.is_npu_device():
         world_devices = int(os.environ['RANK_SIZE'])
         worker_num = world_devices // General.devices_per_trainer
     return worker_num
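A quick worked example of the GPU arithmetic above, with purely illustrative numbers:

world_size = 2           # nodes in the job (illustrative)
devices_per_node = 8     # GPUs reported by torch.cuda.device_count() per node (illustrative)
devices_per_trainer = 2  # devices reserved for each trainer (illustrative)
worker_num = (world_size * devices_per_node) // devices_per_trainer  # -> 8 parallel workers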
Example 11
def register_datasets(backend):
    """Import and register datasets automatically."""
    if backend == "pytorch":
        from . import pytorch
        from .common.auto_lane_datasets import AutoLaneConfig
    elif backend == "tensorflow":
        from . import tensorflow
        if zeus.is_gpu_device():
            from .common.auto_lane_datasets import AutoLaneConfig
    elif backend == "mindspore":
        import mindspore.dataset
        from . import mindspore
    from . import common

    from .transforms import register_transforms
    register_transforms(backend)
Example 12
    def _init_distributed_setting(self):
        if not self.distributed:
            return

        if zeus.is_npu_device():
            from npu_bridge.estimator import npu_ops
            self.npu_init = npu_ops.initialize_system()
            self.npu_shutdown = npu_ops.shutdown_system()
            self.sess.run(self.npu_init)

        import horovod.tensorflow as hvd
        if zeus.is_gpu_device():
            self._world_size = hvd.size()
            self._rank_id = hvd.rank()
            self._local_rank_id = hvd.local_rank()
        elif zeus.is_npu_device():
            from hccl.manage.api import get_local_rank_id
            from hccl.manage.api import get_rank_size
            from hccl.manage.api import get_rank_id
            self._world_size = get_rank_size()
            self._rank_id = get_rank_id()
            self._local_rank_id = get_local_rank_id()
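For context, a short sketch of what these three values mean on the Horovod (GPU) side, assuming horovod.tensorflow is installed and the processes are launched with horovodrun:

import horovod.tensorflow as hvd

hvd.init()
world_size = hvd.size()           # total number of processes across all nodes
rank_id = hvd.rank()              # global index of this process, from 0 to size() - 1
local_rank_id = hvd.local_rank()  # index of this process on its own node, often used to pick a GPU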
Example 13
def register_transforms(backend):
    """Import and register transforms automatically."""
    import zeus
    if zeus.is_gpu_device():
        from .ImageTransform import ImageTransform
        from .Invert import Invert
        from .MaskTransform import MaskTransform
        from .Posterize import Posterize
        from .RandomCrop_pair import RandomCrop_pair
        from .RandomHorizontalFlip_pair import RandomHorizontalFlip_pair
        from .RandomMirrow_pair import RandomMirrow_pair
        from .RandomRotate90_pair import RandomRotate90_pair
        from .RandomVerticallFlip_pair import RandomVerticallFlip_pair
        from .Rotate import Rotate
        from .SegMapTransform import SegMapTransform
        from .Sharpness import Sharpness
        from .Shear_X import Shear_X
        from .Shear_Y import Shear_Y
        from .Solarize import Solarize
        from .Translate_X import Translate_X
        from .Translate_Y import Translate_Y
        from .RandomColor_pair import RandomColor_pair
        from .RandomGaussianBlur_pair import RandomGaussianBlur_pair
        from .RandomRotate_pair import RandomRotate_pair
        from .Rescale_pair import Rescale_pair
        from .Normalize_pair import Normalize_pair
        from .RandomHorizontalFlipWithBoxes import RandomHorizontalFlipWithBoxes
    if backend == "pytorch":
        from . import pytorch
        try:
            from mmdet.datasets.extra_aug import PhotoMetricDistortion, Expand, ExtraAugmentation
        except Exception:
            pass
    elif backend == "tensorflow":
        from . import tensorflow
    elif backend == "mindspore":
        pass
Example 14
 def _init_logging_hook(self):
     logging_hook = []
     if zeus.is_gpu_device() and self.distributed:
         import horovod.tensorflow as hvd
         logging_hook += [hvd.BroadcastGlobalVariablesHook(0)]
     return logging_hook
Example 15
 def _init_session_config(self):
     sess_config = self._init_gpu_session_config() if zeus.is_gpu_device() else \
         self._init_npu_session_config()
     return sess_config
Example 16
 def _init_logging_hook(self):
     logging_hook = []
     if zeus.is_gpu_device() and self.distributed:
         logging_hook += [hvd.BroadcastGlobalVariablesHook(0)]
     return logging_hook
Example 17
from .avazu import AvazuDataset
from .cifar10 import Cifar10
from .cifar100 import Cifar100
import zeus
if zeus.is_gpu_device():
    from .cityscapes import Cityscapes
    from .div2k import DIV2K
    from .div2k_unpair import Div2kUnpair
    from .fmnist import FashionMnist
    from .imagenet import Imagenet
    from .mnist import Mnist
    from .sr_datasets import Set5, Set14, BSDS100
    #   from .auto_lane_datasets import AutoLaneDataset
    from .cls_ds import ClassificationDataset
    from .coco import CocoDataset
    from .mrpc import MrpcDataset
#   from .nasbench101 import Nasbench101
#   from .nasbench201 import Nasbench201