def _init_ms_context(self):
    """Init MindSpore context for the current device."""
    if zeus.is_npu_device():
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    else:
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    # Data sinking is only enabled on NPU (Ascend) devices.
    self.dataset_sink_mode = zeus.is_npu_device()
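# Hedged usage sketch (assumption, not from this excerpt; `ms_model` and
# `train_loader` are hypothetical names): the dataset_sink_mode flag set above
# is the value MindSpore's Model.train/Model.eval expect for data sinking:
#
#   ms_model.train(self.config.epochs, train_loader,
#                  dataset_sink_mode=self.dataset_sink_mode)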
def _init_tf_estimator(self):
    """Init TensorFlow estimator."""
    sess_config = self._init_session_config()
    if zeus.is_gpu_device():
        self._init_gpu_estimator(sess_config)
    elif zeus.is_npu_device():
        self._init_npu_estimator(sess_config)
def input_fn(self):
    """Return the next `batch_size` examples from this data set."""
    if hasattr(self.dataset, "input_fn"):
        return self.dataset.input_fn()
    self._get_dateset_info()
    dataset = tf.data.Dataset.from_tensor_slices(
        (self.data_index, self.data_index))
    if self.dataset.world_size > 1:
        dataset = dataset.shard(self.dataset.world_size, self.dataset.rank)
    if self.dataset.mode == 'train':
        dataset = dataset.repeat()
    if self.args.shuffle:
        dataset = dataset.shuffle(buffer_size=self._num_examples)
    if zeus.is_npu_device():
        # ESR cannot adapt to num_parallel_calls on NPU.
        dataset = dataset.map(self.data_map_func)
        dataset = dataset.batch(batch_size=self.args.batch_size,
                                drop_remainder=self.args.drop_last)
    else:
        dataset = dataset.map(self.data_map_func,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.batch(batch_size=self.args.batch_size,
                                drop_remainder=self.args.drop_last)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
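# Hedged usage sketch (not from the source; `estimator`, `train_loader`, and
# `max_train_steps` are hypothetical): tf.estimator drives the pipeline by
# invoking input_fn itself on each train/evaluate call, so the bound method is
# passed by reference rather than called:
#
#   estimator.train(input_fn=train_loader.input_fn, max_steps=max_train_steps)
#   estimator.evaluate(input_fn=valid_loader.input_fn, steps=len(valid_loader))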
def run_remote_worker(worker_id, worker_path, id):
    """Run worker on remote machine."""
    from zeus.common.utils import init_log
    init_log(level="info",
             log_file=".temp_{}.log".format(worker_id),
             log_path=worker_path)
    config = _load_config(worker_id, worker_path, id)
    os.environ["LD_LIBRARY_PATH"] = config["env"]["LD_LIBRARY_PATH"]
    os.environ["PWD"] = config["env"]["PWD"]
    os.chdir(os.environ["PWD"])
    zeus.register_zeus(os.environ['BACKEND_TYPE'].lower())
    if zeus.is_gpu_device():
        sub_pid_list = call_in_gpu(config, id, worker_id, worker_path)
    elif zeus.is_npu_device():
        os.environ["PYTHONPATH"] = config["env"]["PYTHONPATH"]
        os.environ["PATH"] = config["env"]["PATH"]
        os.environ["ASCEND_OPP_PATH"] = config["env"]["ASCEND_OPP_PATH"]
        sub_pid_list = call_in_npu(config, id, worker_id, worker_path)
    logging.info("DistributedWorker finished!")
    for sub_pid in sub_pid_list:
        kill_proc_tree(pid=sub_pid)
    logging.info("DistributedWorker subprocess cleaned!")
    return 0
def _init_tf_estimator(self):
    """Init TensorFlow estimator."""
    if not zeus.is_tf_backend():
        return
    sess_config = self._init_session_config()
    if zeus.is_gpu_device():
        self._init_gpu_estimator(sess_config)
    elif zeus.is_npu_device():
        self._init_npu_estimator(sess_config)
def _init_distributed_setting(self):
    """Init distributed setting (world size and rank ids)."""
    if not self.distributed:
        return
    if zeus.is_npu_device():
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
    self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
    self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()
def exclude_ignore_index(self, logits, labels):
    """Ignore certain index."""
    logits = tf.transpose(logits, [0, 2, 3, 1])
    if zeus.is_gpu_device():
        # Drop the ignored positions entirely; a scalar weight of 1.0 keeps
        # the loss unweighted.
        indices = tf.where(tf.not_equal(labels, self.ignore_index))
        labels = tf.cast(tf.gather_nd(labels, indices), tf.int32)
        logits = tf.gather_nd(logits, indices)
        return logits, labels, 1.0
    elif zeus.is_npu_device():
        # On NPU, keep the full tensors and instead zero out the ignored
        # positions via per-element weights.
        weights = tf.not_equal(labels, self.ignore_index)
        labels = tf.multiply(labels, tf.cast(weights, labels.dtype))
        return logits, labels, tf.to_float(weights)
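# Hedged sketch (assumption, `_weighted_loss` is a hypothetical caller): the
# (logits, labels, weights) triple returned above lines up with TF1's weighted
# sparse cross-entropy, which accepts either the scalar 1.0 or the per-element
# mask:
def _weighted_loss(self, logits, labels):
    logits, labels, weights = self.exclude_ignore_index(logits, labels)
    return tf.losses.sparse_softmax_cross_entropy(
        labels=labels, logits=logits, weights=weights)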
def _init_distributed_setting(self):
    """Init distributed setting (world size and rank ids)."""
    if not self.distributed:
        return
    if zeus.is_npu_device():
        from npu_bridge.estimator import npu_ops
        self.npu_init = npu_ops.initialize_system()
        self.npu_shutdown = npu_ops.shutdown_system()
        self.sess.run(self.npu_init)
    if zeus.is_gpu_device():
        # Import horovod only on GPU so NPU hosts without it do not fail.
        import horovod.tensorflow as hvd
        self._world_size = hvd.size()
        self._rank_id = hvd.rank()
        self._local_rank_id = hvd.local_rank()
    elif zeus.is_npu_device():
        from hccl.manage.api import get_local_rank_id
        from hccl.manage.api import get_rank_id
        from hccl.manage.api import get_rank_size
        self._world_size = get_rank_size()
        self._rank_id = get_rank_id()
        self._local_rank_id = get_local_rank_id()
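# Note (Horovod API requirement, not shown in this excerpt): hvd.size()/rank()
# are only valid after hvd.init() has run, so process startup must include
# something like the sketch below before _init_distributed_setting is called:
#
#   import horovod.tensorflow as hvd
#   hvd.init()  # once per process, before any size()/rank() query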
def _calc_workers_num(self):
    """Calculate the number of parallel workers."""
    if not General.parallel_search:
        return 1
    if zeus.is_gpu_device():
        import torch
        world_size = General.env.world_size
        devices_per_node = torch.cuda.device_count()
        worker_num = (world_size * devices_per_node) // General.devices_per_trainer
    elif zeus.is_npu_device():
        world_devices = int(os.environ['RANK_SIZE'])
        worker_num = world_devices // General.devices_per_trainer
    return worker_num
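# Worked example (illustrative numbers only): with world_size=2 nodes, 8 CUDA
# devices per node, and General.devices_per_trainer=2, this yields
# (2 * 8) // 2 = 8 concurrent trainers.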
def _init_session_config(self):
    """Build a device-specific session config."""
    import tensorflow as tf
    if zeus.is_gpu_device():
        sess_config = tf.compat.v1.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        return sess_config
    elif zeus.is_npu_device():
        from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
        sess_config = tf.ConfigProto()
        # NPU requires graph remapping off and the NpuOptimizer registered;
        # use_off_line=True routes compute to the Ascend device.
        sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
        custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
        custom_op.name = "NpuOptimizer"
        custom_op.parameter_map["use_off_line"].b = True
        return sess_config
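# Hedged usage sketch (assumption; `_init_session` is a hypothetical helper):
# the returned ConfigProto is typically consumed when building the TF1-style
# session referenced as self.sess elsewhere in this trainer:
def _init_session(self):
    import tensorflow as tf
    self.sess = tf.compat.v1.Session(config=self._init_session_config())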
def adjust_pipeline_config(self, cfg):
    """Adjust pipeline config according to the simulated tiny pipeline."""
    cfg_cp = copy.deepcopy(cfg)
    cfg_tiny = copy.deepcopy(cfg)
    workers_num = self._calc_workers_num()
    General.parallel_search = False
    self._get_time_params(cfg_cp)
    self._simulate_tiny_pipeline(cfg_tiny)
    General.parallel_search = cfg.general.parallel_search
    self._modify_pipeline_config(workers_num, self.epoch_time, self.params_dict)
    if zeus.is_npu_device():
        # Restore the rank table stashed in the ORIGIN_* variables before the
        # tiny-pipeline simulation.
        os.environ['RANK_TABLE_FILE'] = os.environ['ORIGIN_RANK_TABLE_FILE']
        os.environ['RANK_SIZE'] = os.environ['ORIGIN_RANK_SIZE']
    logging.info('Adjusted runtime config successfully.')
def _shutdown_distributed(self):
    """Shut down NPU collective communication and close the session."""
    if zeus.is_npu_device() and self.distributed:
        self.sess.run(self.npu_shutdown)
        self.sess.close()
if zeus.is_tf_backend():
    import tensorflow as tf
    from zeus.metrics.tensorflow.metrics import Metrics
    try:
        import horovod.tensorflow as hvd
    except Exception:
        pass
elif zeus.is_ms_backend():
    from mindspore import context
    from mindspore.train import Model as MsModel
    from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
    from .callbacks.ms_callbacks import EvalCallBack
    from zeus.metrics.mindspore.metrics import Metrics

if zeus.is_npu_device() and zeus.is_tf_backend():
    from npu_bridge.estimator.npu.npu_config import NPURunConfig
    from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
    from npu_bridge.estimator import npu_ops
    from hccl.manage.api import get_local_rank_id
    from hccl.manage.api import get_rank_size
    from hccl.manage.api import get_rank_id
    from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

logger = logging.getLogger(__name__)


@ClassFactory.register(ClassType.TRAINER)
class Trainer(DistributedWorker):
    """Trainer class."""
"""TensorFlow Trainer.""" import logging import numpy as np import tensorflow as tf from tensorflow.python.estimator import estimator as est from zeus.common.general import General import zeus from zeus.metrics.tensorflow.metrics import Metrics from zeus.trainer_base import TrainerBase try: import horovod.tensorflow as hvd except Exception: pass if zeus.is_npu_device(): from npu_bridge.estimator.npu.npu_config import NPURunConfig from npu_bridge.estimator.npu.npu_estimator import NPUEstimator from npu_bridge.estimator import npu_ops from hccl.manage.api import get_local_rank_id from hccl.manage.api import get_rank_size from hccl.manage.api import get_rank_id from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig from zeus.common import FileOps, init_log from zeus.trainer.modules.losses import Loss from zeus.trainer.modules.lr_schedulers import LrScheduler from zeus.trainer.modules.optimizer import Optimizer from zeus.tf_utils import TFVariables
def valid(self, valid_loader):
    """Validate the model.

    :param valid_loader: valid data loader
    """
    if zeus.is_torch_backend():
        import torch
        from zeus.metrics.pytorch import Metrics
        metrics = Metrics(self.config.metric)
        self.model.eval()
        data_num = 0
        latency_sum = 0.0
        with torch.no_grad():
            for step, batch in enumerate(valid_loader):
                if isinstance(batch, (list, tuple)):
                    data = batch[0]
                    target = batch[1]
                else:
                    raise ValueError("The dataset format must be tuple or list, "
                                     "but got {}.".format(type(batch)))
                if self.config.cuda:
                    data, target = data.cuda(), target.cuda()
                    self.model = self.model.cuda()
                time_start = time.time()
                logits = self.model(data)
                latency_sum += time.time() - time_start
                metrics(logits, target)
                n = data.size(0)
                data_num += n
                if step % self.config.report_freq == 0:
                    logging.info("step [{}/{}], valid metric [{}]".format(
                        step + 1, len(valid_loader), str(metrics.results)))
        latency = latency_sum / data_num
    elif zeus.is_tf_backend():
        from zeus.metrics.tensorflow.metrics import Metrics
        metrics = Metrics(self.config.metric)
        estimator = self._init_tf_estimator()
        time_start = time.time()
        eval_metrics = estimator.evaluate(input_fn=valid_loader.input_fn,
                                          steps=len(valid_loader))
        latency = (time.time() - time_start) / (
            len(valid_loader) * valid_loader.args.batch_size)
        metrics.update(eval_metrics)
    elif zeus.is_ms_backend():
        from zeus.metrics.mindspore.metrics import Metrics
        from mindspore.train import Model as MsModel
        from .utils import FakeLoss
        metrics = Metrics(self.config.metric)
        metric_name = self.config.metric().type
        dataset_sink_mode = zeus.is_npu_device()
        # The loss_fn is not actually needed for eval, but MsModel cannot be
        # initialized with loss_fn=None, so a placeholder loss is passed in.
        ms_model = MsModel(network=self.model,
                           loss_fn=FakeLoss(),
                           metrics={metric_name: metrics()})
        time_start = time.time()
        eval_metrics = ms_model.eval(valid_dataset=valid_loader,
                                     callbacks=None,
                                     dataset_sink_mode=dataset_sink_mode)
        for batch in valid_loader.create_dict_iterator():
            batch_size = batch["image"].shape[0]
            break
        latency = (time.time() - time_start) / (
            valid_loader.get_dataset_size() * batch_size)
        metrics.update(eval_metrics)
    pfms = metrics.results
    if self.config.evaluate_latency:
        pfms["latency"] = latency
    logging.info("evaluate performance: {}".format(pfms))
    return pfms