def __new__(cls, model=None, id=None, hps=None, load_ckpt_flag=False, model_desc=None, lazy_build=True, **kwargs):
    """Create a backend-specific Trainer instance.

    Dispatches to the torch / tf / mindspore trainer implementation
    depending on the active backend.
    """
    if zeus.is_torch_backend():
        from zeus.trainer_torch import TrainerTorch as trainer_cls
    elif zeus.is_tf_backend():
        from zeus.trainer_tf import TrainerTf as trainer_cls
    else:
        from zeus.trainer_ms import TrainerMs as trainer_cls
    return trainer_cls(model=model, id=id, hps=hps, load_ckpt_flag=load_ckpt_flag,
                       model_desc=model_desc, lazy_build=lazy_build, **kwargs)
def update_flops_params(self, epoch=None, logs=None):
    """Calculate flops, params and forward latency and record them in logs.

    :param epoch: current epoch number; recorded in the summary when given.
    :param logs: mutable dict of callback logs, updated in place.
    """
    self.model = self.trainer.model
    try:
        if self.flops is None:
            flops_count, params_count = calc_model_flops_params(self.model, self.input)
            # store GFLOPs and K-params
            self.flops, self.params = flops_count * 1e-9, params_count * 1e-3
        if self.latency is None:
            sess_config = self.trainer._init_session_config() if zeus.is_tf_backend() else None
            # convert seconds to milliseconds
            self.latency = calc_forward_latency(self.model, self.input, sess_config) * 1000
        summary_perfs = logs.get('summary_perfs', {})
        summary_perfs.update({'flops': self.flops, 'params': self.params,
                              'latency': self.latency})
        # fix: use 'is not None' so that epoch 0 is still recorded
        # (the original truthiness test silently dropped epoch 0)
        if epoch is not None:
            summary_perfs.update({'epoch': epoch})
        logs.update({'summary_perfs': summary_perfs})
    except Exception as ex:
        # best-effort statistics: never break training over a stats failure
        logging.warning("model statics failed, ex=%s", ex)
def _init_tf_session(self):
    """Create a dedicated TF graph and session (tf backend only)."""
    if not zeus.is_tf_backend():
        return
    config = self._init_session_config()
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.sess = tf.compat.v1.Session(config=config)
def __call__(self, model=None, distributed=False, **kwargs):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param distributed: use distributed
    :return: optimizer
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(
        self.optim_cls.__name__, params))
    try:
        optimizer = None
        if zeus.is_torch_backend():
            learnable_params = [
                param for param in model.parameters() if param.requires_grad
            ]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = self.set_distributed(optimizer, model)
        elif zeus.is_tf_backend():
            optimizer = dynamic_optimizer(self.optim_cls, **params)
        elif zeus.is_ms_backend():
            if "dynamic_lr" in kwargs:
                params.update({"learning_rate": kwargs["dynamic_lr"]})
            learnable_params = [
                param for param in model.trainable_params() if param.requires_grad
            ]
            optimizer = self.optim_cls(learnable_params, **params)
        return optimizer
    except Exception:
        logging.error("Failed to call Optimizer name={}, params={}".format(
            self.optim_cls.__name__, params))
        # fix: bare 'raise' preserves the original traceback; 'raise ex'
        # re-raised from this frame and obscured the failure site
        raise
def _load_pretrained_model(cls, model, pretrained_model_file):
    """Load pretrained weights into the model (backend specific).

    :param model: network to load weights into.
    :param pretrained_model_file: path to a weights file or a directory
        containing a mindspore ``.ckpt`` file.
    :return: the model with weights loaded.
    :raises FileNotFoundError: if the weights file cannot be found.
    """
    pretrained_model_file = cls._get_abs_path(pretrained_model_file)
    logging.info("load model weights from file, weights file={}".format(pretrained_model_file))
    if zeus.is_torch_backend():
        if not os.path.isfile(pretrained_model_file):
            # fix: raising a str is a TypeError in Python 3; raise a real exception
            raise FileNotFoundError(
                "Pretrained model is not existed, model={}".format(pretrained_model_file))
        import torch
        checkpoint = torch.load(pretrained_model_file)
        model.load_state_dict(checkpoint)
    elif zeus.is_tf_backend():
        if pretrained_model_file.endswith('.pth'):
            # torch checkpoint given to a tf model: convert then load from numpy
            checkpoint = convert_checkpoint_from_pytorch(pretrained_model_file, model)
            model.load_checkpoint_from_numpy(checkpoint)
        else:
            pretrained_model_file = cls._get_tf_model_file(pretrained_model_file)
            model.load_checkpoint(pretrained_model_file)
    elif zeus.is_ms_backend():
        from mindspore.train.serialization import load_checkpoint
        if hasattr(model, "pretrained"):
            pretrained_weight = model.pretrained(pretrained_model_file)
        elif os.path.isfile(pretrained_model_file):
            pretrained_weight = pretrained_model_file
        else:
            # a directory was given: pick the first .ckpt file inside
            pretrained_weight = None
            for file in os.listdir(pretrained_model_file):
                if file.endswith(".ckpt"):
                    pretrained_weight = os.path.join(pretrained_model_file, file)
                    break
            if pretrained_weight is None:
                # fix: avoid UnboundLocalError when the directory has no .ckpt
                raise FileNotFoundError(
                    "No .ckpt file found in {}".format(pretrained_model_file))
        load_checkpoint(pretrained_weight, net=model)
    return model
def _to_tensor(self, data):
    """Convert raw data to a backend tensor (torch or tf)."""
    if zeus.is_torch_backend():
        import torch
        return torch.tensor(data)
    if zeus.is_tf_backend():
        import tensorflow as tf
        return tf.convert_to_tensor(data)
def __init__(self, desc=None, weight_file=None, pb_file=None):
    """Rebuild a model and its graph-derived description.

    :param desc: model description dict (built via ModelZoo) or a model object.
    :param weight_file: optional weights file for the rebuilt model.
    :param pb_file: optional frozen TF GraphDef file; its Const nodes supply
        the weights instead of `weight_file`.
    """
    super(GraphGetter, self).__init__()
    if isinstance(desc, dict):
        src_model = ModelZoo().get_model(desc)
    else:
        src_model = desc
    weights = OrderedDict()
    if is_tf_backend():
        import tensorflow.compat.v1 as tf
        from tensorflow.python.framework import tensor_util
        tf.reset_default_graph()
        # assumes NHWC 224x224x3 input — TODO confirm for non-imagenet shapes
        data_shape = (1, 224, 224, 3)
        x = tf.ones(data_shape)
        if pb_file:
            with tf.io.gfile.GFile(pb_file, 'rb') as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())
                graph = tf.Graph()
                with graph.as_default():
                    tf.import_graph_def(graph_def, name='')
                # weights come from the pb Const nodes, not from a checkpoint
                weight_file = None
                wts = [n for n in graph_def.node if n.op == 'Const']
                for n in wts:
                    weights[n.name] = tensor_util.MakeNdarray(
                        n.attr['value'].tensor)
        else:
            # trace the source model once so the default graph is populated
            src_model(x, self.training)
            graph = tf.get_default_graph()
        desc = graph2desc(graph)
        tf.reset_default_graph()
    self.model = ModelZoo().get_model(desc, weight_file)
    if weights:
        self.model.load_checkpoint_from_numpy(weights)
def get_cls(cls, type_name, t_cls_name=None):
    """Get class and bind config to class.

    :param type_name: type name of class registry
    :param t_cls_name: class name
    :return: t_cls
    """
    # Lazily import the class when it has not been registered yet.
    if not cls.is_exists(type_name, t_cls_name) and t_cls_name:
        cls._import_pkg(type_name, t_cls_name)
    # The class must exist in the registry after the lazy load attempt.
    if not cls.is_exists(type_name, t_cls_name):
        raise ValueError(
            "can't find class type {} class name {} in class registry".format(type_name, t_cls_name))
    # Resolve a backend/config-driven default name when none was given.
    if t_cls_name is None:
        from zeus.datasets.conf.dataset import DatasetConfig
        from zeus.evaluator.conf import EvaluatorConfig
        if type_name == ClassType.DATASET:
            t_cls_name = DatasetConfig.type
        elif type_name == ClassType.TRAINER:
            import zeus
            if zeus.is_torch_backend():
                t_cls_name = "TrainerTorch"
            elif zeus.is_tf_backend():
                t_cls_name = "TrainerTf"
            elif zeus.is_ms_backend():
                t_cls_name = "TrainerMs"
        elif type_name == ClassType.EVALUATOR:
            t_cls_name = EvaluatorConfig.type
    if t_cls_name is None:
        raise ValueError("can't find class. class type={}".format(type_name))
    return cls.__registry__.get(type_name).get(t_cls_name)
def save(self, file_name):
    """Save model weights to `file_name` + '.npz' (tf backend only).

    :return: the saved file path, or None on other backends.
    """
    if not zeus.is_tf_backend():
        return None
    target = file_name + ".npz"
    with self.graph.as_default():
        self.actor_var.save_weights(target)
    return target
def _train_loop(self): """Do the training with data, callbacks and step functions etc.""" # Allow user to build trainer in before_train() callback, but they # should set lazy_built in configuration file to True self.callbacks.before_train() if self.skip_train: return if self.use_unsupervised_pretrain and zeus.is_torch_backend(): from .simclr.transforms import TransformsSimCLR from .simclr.train import simclr_train train_loader = self._init_dataloader(mode="train", transforms=TransformsSimCLR()) self.model = simclr_train(self.model, train_loader) repeat_time = 1 if zeus.is_ms_backend() else self.epochs repeat_time = 1 if zeus.is_tf_backend( ) and self.config.train_in_once else repeat_time for epoch in range(self._start_epoch, repeat_time): epoch_logs = {'train_num_batches': self.batch_num_train} if self.do_validation: epoch_logs.update({'valid_num_batches': self.batch_num_valid}) self.callbacks.before_epoch(epoch, epoch_logs) if self.config.with_train: self._train_epoch() if self.do_validation and self._should_run_validation(epoch): self._valid_epoch() self.callbacks.after_epoch(epoch) self.callbacks.after_train() if self.distributed: self._shutdown_distributed()
def _get_data_format():
    """Return the data layout string for the active backend, or None."""
    if zeus.is_tf_backend():
        return 'channels_last'
    if zeus.is_torch_backend() or zeus.is_ms_backend():
        return 'channels_first'
    return None
def __call__(self, model=None, distributed=False):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param distributed: use distributed
    :return: optimizer
    """
    params = self.map_config.get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
    try:
        optimizer = None
        if zeus.is_torch_backend():
            learnable_params = [param for param in model.parameters() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer,
                                                     named_parameters=model.named_parameters(),
                                                     compression=hvd.Compression.none)
        elif zeus.is_tf_backend():
            optimizer = dynamic_optimizer(self.optim_cls, **params)
            if distributed:
                # horovod on GPU, NPU-specific wrapper otherwise
                optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                    NPUDistributedOptimizer(optimizer)
        elif zeus.is_ms_backend():
            learnable_params = [param for param in model.trainable_params() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
        return optimizer
    except Exception:
        logging.error("Failed to call Optimizer name={}, params={}".format(
            self.optim_cls.__name__, params))
        # fix: bare 'raise' keeps the original traceback intact ('raise ex'
        # re-raised from this frame and hid the failing line)
        raise
def is_filtered(self, desc=None):
    """Filter function of latency.

    Measures average forward latency over 100 runs and rejects models
    slower than `self.max_latency`.

    :param desc: model description to build and measure.
    :return: True if the sampled model exceeds the latency budget.
    :raises ValueError: on backends other than torch/tf (previously an
        UnboundLocalError).
    """
    if self.max_latency is None:
        return False
    model, count_input = self.get_model_input(desc)
    num = 100
    if zeus.is_torch_backend():
        start_time = time.time()
        for _ in range(num):
            model(count_input)
        latency = (time.time() - start_time) / num
    elif zeus.is_tf_backend():
        import tensorflow as tf
        # avoid shadowing the builtin 'input'
        data_holder = tf.placeholder(tf.float32, shape=count_input.get_shape().as_list())
        output = model(data_holder, training=False)
        with tf.compat.v1.Session() as sess:
            input_numpy = count_input.eval(session=sess)
            start_time = time.time()
            for _ in range(num):
                sess.run(output, feed_dict={data_holder: input_numpy})
            latency = (time.time() - start_time) / num
    else:
        # fix: 'latency' was unbound on other backends, raising NameError below
        raise ValueError("latency filter supports only torch and tf backends")
    logging.info('Sampled model\'s latency: {}'.format(latency))
    return latency > self.max_latency
def _valid_epoch(self):
    """Run one validation pass and report results through callbacks."""
    self.callbacks.before_valid()
    valid_logs = None
    if zeus.is_torch_backend():
        # eval mode + no_grad: freeze dropout/batch-norm and disable autograd
        self.model.eval()
        with torch.no_grad():
            for batch_index, batch in enumerate(self.valid_loader):
                batch = self.make_batch(batch)
                batch_logs = {'valid_batch': batch}
                self.callbacks.before_valid_step(batch_index, batch_logs)
                valid_batch_output = self.valid_step(batch)
                self.callbacks.after_valid_step(batch_index, valid_batch_output)
    elif zeus.is_tf_backend():
        eval_metrics = self.estimator.evaluate(input_fn=self.valid_input_fn,
                                               steps=len(self.valid_loader))
        self.valid_metrics.update(eval_metrics)
        valid_logs = dict()
        valid_logs['cur_valid_perfs'] = self.valid_metrics.results
    elif zeus.is_ms_backend():
        eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                          dataset_sink_mode=self.dataset_sink_mode)
        self.valid_metrics.update(eval_metrics)
        valid_logs = dict()
        valid_logs['cur_valid_perfs'] = self.valid_metrics.results
    # NOTE(review): the torch branch leaves valid_logs as None — metrics are
    # presumably aggregated inside the step callbacks; confirm in after_valid.
    self.callbacks.after_valid(valid_logs)
def _save_best_model(self):
    """Save best model."""
    if zeus.is_torch_backend():
        torch.save(self.trainer.model.state_dict(), self.trainer.weights_file)
    elif zeus.is_tf_backend():
        # copy the latest checkpoint files into a per-model weights folder
        worker_path = self.trainer.get_local_worker_path()
        model_id = "model_{}".format(self.trainer.worker_id)
        weights_folder = FileOps.join_path(worker_path, model_id)
        FileOps.make_dir(weights_folder)
        checkpoint_file = tf.train.latest_checkpoint(worker_path)
        ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
        for _file in ckpt_globs:
            # keep the checkpoint extension (.index/.meta/.data-...) while
            # renaming the stem to the model id
            dst_file = model_id + os.path.splitext(_file)[-1]
            FileOps.copy_file(_file, FileOps.join_path(weights_folder, dst_file))
        FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'), weights_folder)
    elif zeus.is_ms_backend():
        worker_path = self.trainer.get_local_worker_path()
        save_path = os.path.join(
            worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
        for file in os.listdir(worker_path):
            if file.startswith("CKP") and file.endswith(".ckpt"):
                self.weights_file = FileOps.join_path(worker_path, file)
                # NOTE(review): no break here — with several CKP*.ckpt files
                # each rename targets the same save_path; confirm intended.
                os.rename(self.weights_file, save_path)
def _train_epoch(self):
    """Run one training epoch for the active backend."""
    if zeus.is_torch_backend():
        self.model.train()
        for batch_index, batch in enumerate(self.train_loader):
            batch = self.make_batch(batch)
            batch_logs = {'train_batch': batch}
            self.callbacks.before_train_step(batch_index, batch_logs)
            train_batch_output = self.train_step(batch)
            batch_logs.update(train_batch_output)
            if self.config.is_detection_trainer:
                batch_logs.update({'is_detection_trainer': True})
            self.callbacks.after_train_step(batch_index, batch_logs)
    elif zeus.is_tf_backend():
        # estimator drives the whole epoch internally
        self.estimator.train(input_fn=self.train_input_fn,
                             steps=len(self.train_loader),
                             hooks=self._init_logging_hook())
    elif zeus.is_ms_backend():
        # mindspore Model wraps network/loss/optimizer and trains all epochs here
        self.ms_model = MsModel(network=self.model,
                                loss_fn=self.loss,
                                optimizer=self.optimizer,
                                metrics={self.metric_name: self.valid_metrics()})
        config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
        # save the network model and parameters for subsequence fine-tuning
        save_path = self.get_local_worker_path(self.step_name, self.worker_id)
        ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
        loss_cb = LossMonitor(per_print_times=self.config.report_freq)
        eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
        self.ms_model.train(epoch=self.epochs,
                            train_dataset=self.train_loader,
                            callbacks=[ckpoint_cb, loss_cb, eval_cb],
                            dataset_sink_mode=self.dataset_sink_mode)
def _init_model(self):
    """Load model desc from save path and parse to model.

    :return: the initialized (and possibly cuda/DataParallel-wrapped) model.
    :raises Exception: when neither a model nor a model description exists.
    """
    model = self.trainer.model
    if self.trainer.config.is_detection_trainer:
        model_desc = self.trainer.model_desc
    else:
        model_desc = self._get_model_desc()
    if model_desc:
        ModelConfig.model_desc = model_desc
    pretrained_model_file = self._get_pretrained_model_file()
    if not model:
        if not model_desc:
            raise Exception(
                "Failed to Init model, can not get model description.")
        model = ModelZoo.get_model(model_desc, pretrained_model_file)
    if model:
        if zeus.is_torch_backend():
            import torch
            if self.trainer.use_cuda:
                model = model.cuda()
            if General._parallel and General.devices_per_trainer > 1:
                # fix: wrap the local model (possibly freshly built / moved to
                # cuda above), not the stale self.trainer.model reference
                model = torch.nn.DataParallel(model)
        if zeus.is_tf_backend():
            if pretrained_model_file:
                # make pretrained checkpoint available in the worker directory
                model_folder = os.path.dirname(pretrained_model_file)
                FileOps.copy_folder(model_folder, self.trainer.get_local_worker_path())
    return model
def predict(self, input):
    """Inference model."""
    if not zeus.is_tf_backend():
        return None
    with self.graph.as_default():
        return self.sess.run(self.logits, {self.input: input})
def get_named_modules(layer):
    """Get named modules."""
    if zeus.is_tf_backend():
        return [(op.name, op) for op in layer]
    if zeus.is_torch_backend():
        return layer.named_modules()
    if zeus.is_ms_backend():
        return layer._children_scope_recursive()
def _init_tf_estimator(self):
    """Init tensorflow estimator."""
    if not zeus.is_tf_backend():
        return
    config = self._init_session_config()
    if zeus.is_gpu_device():
        self._init_gpu_estimator(config)
    elif zeus.is_npu_device():
        self._init_npu_estimator(config)
def _set_default_funcs(self):
    """Bind the backend-specific default step functions onto the trainer."""
    if zeus.is_torch_backend():
        defaults = {"make_batch": self._default_make_batch,
                    "train_step": self._default_train_step,
                    "valid_step": self._default_valid_step}
    elif zeus.is_tf_backend():
        defaults = {"model_fn": self._default_model_fn,
                    "train_input_fn": self._default_train_input_fn,
                    "valid_input_fn": self._default_valid_input_fn}
    else:
        defaults = {}
    for attr, func in defaults.items():
        setattr(self, attr, func)
def get_shape(layer):
    """Get weight shape."""
    if zeus.is_tf_backend():
        return layer.get_shape()
    if zeus.is_torch_backend():
        return layer.weight.data.shape
    if zeus.is_ms_backend():
        first_param = list(layer._params)[0]
        return getattr(layer, first_param).default_input.shape
def Adapter(dataset):
    """Adapter of dataset."""
    if zeus.is_torch_backend():
        from .pytorch import TorchAdapter as adapter_cls
    elif zeus.is_tf_backend():
        from .tensorflow import TfAdapter as adapter_cls
    elif zeus.is_ms_backend():
        from .mindspore import MsAdapter as adapter_cls
    else:
        raise ValueError
    return adapter_cls(dataset)
def _init_metrics(self, metrics=None):
    """Init metrics.

    :param metrics: pre-built metrics; returned unchanged when provided.
    :return: a backend-specific Metrics instance.
    :raises ValueError: when no supported backend is active (previously an
        UnboundLocalError on the `Metrics` name).
    """
    if metrics is not None:
        return metrics
    if zeus.is_torch_backend():
        from zeus.metrics.pytorch.metrics import Metrics
    elif zeus.is_tf_backend():
        from zeus.metrics.tensorflow.metrics import Metrics
    elif zeus.is_ms_backend():
        from zeus.metrics.mindspore.metrics import Metrics
    else:
        # fix: fail with a clear error instead of an unbound-name crash,
        # consistent with other backend dispatchers in this project
        raise ValueError("No supported backend found to create Metrics.")
    return Metrics()
def set_distributed(cls, optimizer, model=None):
    """Set distributed optimizer."""
    if zeus.is_torch_backend():
        return hvd.DistributedOptimizer(
            optimizer,
            named_parameters=model.named_parameters(),
            compression=hvd.Compression.none)
    if zeus.is_tf_backend():
        # horovod on GPU, NPU-specific wrapper otherwise
        optim_class = hvd.DistributedOptimizer if zeus.is_gpu_device() else NPUDistributedOptimizer
        return dynamic_distributed_optimizer(optim_class, optimizer)
    return optimizer
def _calc_forward_latency_davinci(model, input, sess_config=None, num=10, evaluate_config=None):
    """Model forward latency calculation.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
    :param sess_config: tf session config (unused on the davinci path)
    :param num: forward number
    :type num: int
    :param evaluate_config: some config for evaluate in davinci
    :type evaluate_config: dict
    :return: forward latency
    :rtype: float
    """
    from zeus.evaluator.tools.evaluate_davinci_bolt import evaluate
    from zeus.common.task_ops import TaskOps
    hardware = evaluate_config.get("hardware")
    remote_host = evaluate_config.get("remote_host")
    worker_path = TaskOps().local_base_path
    save_data_file = os.path.join(worker_path, "input.bin")
    latency = 0.
    # timestamped job id so concurrent evaluations don't collide
    now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    job_id = "pre_evaluate_" + now_time
    logging.info("The job id of evaluate service is {}.".format(job_id))
    if zeus.is_torch_backend():
        import torch
        input_shape = input.shape
        if torch.is_tensor(input):
            input = input.cpu().numpy()
        input.tofile(save_data_file)
        for index in range(num):
            # only the first run uploads the model; later runs reuse it
            reuse_model = index != 0
            results = evaluate("pytorch", hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model, job_id)
            # fix: np.float was removed in NumPy 1.24; builtin float is equivalent
            latency += float(results.get("latency"))
    elif zeus.is_tf_backend():
        input_shape = input.shape.as_list()
        test_data = np.random.random(input_shape).astype(np.float32)
        test_data.tofile(save_data_file)
        for index in range(num):
            reuse_model = index != 0
            results = evaluate("tensorflow", hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model, job_id)
            latency += float(results.get("latency"))
    return latency / num
def before_train(self, logs=None):
    """Be called before the training process."""
    self.input = None
    self.flops = None
    self.params = None
    self.calc_params_each_epoch = self.trainer.config.calc_params_each_epoch
    if zeus.is_tf_backend():
        import tensorflow as tf
        dataset = self.trainer.valid_input_fn()
        iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
        batch, _ = iterator.get_next()
        # keep one sample only: enough for flops/latency estimation
        self.input = batch[:1]
def before_train(self, logs=None):
    """Fetch trainer info before train stage."""
    self._fix_path = "_".join([self.trainer.step_name, str(self.trainer.worker_id)])
    self.summary = SummaryBoard(self._archive_root, self._fix_path)
    if zeus.is_tf_backend():
        import tensorflow as tf
        dataset = self.trainer.valid_input_fn()
        iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
        batch, _ = iterator.get_next()
        # a single sample is enough for graph tracing
        self.input = batch[:1]
def train(self, inputs, labels):
    """Train model."""
    if zeus.is_tf_backend():
        with self.graph.as_default():
            # pair each placeholder with its incoming value
            feed_dict = dict(zip(self.inputs, inputs))
            feed_dict.update(zip(self.labels, labels))
            _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
            return loss
def is_filtered(self, desc=None):
    """Filter function of latency."""
    if self.max_latency is None:
        return False
    model, count_input = self.get_model_input(desc)
    trainer = ClassFactory.get_cls(ClassType.TRAINER)(model_desc=desc)
    sess_config = trainer._init_session_config() if zeus.is_tf_backend() else None
    latency = calc_forward_latency(model, count_input, sess_config)
    logging.info('Sampled model\'s latency: {}ms'.format(latency * 1000))
    # reject models slower than the configured budget
    return latency > self.max_latency