Example #1
    def __new__(cls,
                model=None,
                id=None,
                hps=None,
                load_ckpt_flag=False,
                model_desc=None,
                lazy_build=True,
                **kwargs):
        """Create Trainer clss."""
        if zeus.is_torch_backend():
            from zeus.trainer_torch import TrainerTorch
            trainer_cls = TrainerTorch
        elif zeus.is_tf_backend():
            from zeus.trainer_tf import TrainerTf
            trainer_cls = TrainerTf
        else:
            from zeus.trainer_ms import TrainerMs
            trainer_cls = TrainerMs

        return trainer_cls(model=model,
                           id=id,
                           hps=hps,
                           load_ckpt_flag=load_ckpt_flag,
                           model_desc=model_desc,
                           lazy_build=lazy_build,
                           **kwargs)
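Because the dispatch happens in __new__, calling Trainer(...) returns an instance of the backend-specific subclass rather than of Trainer itself. A minimal usage sketch, assuming zeus is installed with a backend configured; the import path and model object are hypothetical:

    from zeus.trainer import Trainer  # import path assumed

    trainer = Trainer(model=my_model, id=0, lazy_build=True)
    # trainer is a TrainerTorch, TrainerTf, or TrainerMs instance,
    # depending on the active backend.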
Example #2
 def update_flops_params(self, epoch=None, logs=None):
     """Calculate flops and params."""
     self.model = self.trainer.model
     try:
         if self.flops is None:
             flops_count, params_count = calc_model_flops_params(
                 self.model, self.input)
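             # scale raw counts to GFLOPs and K params (units implied by the 1e-9 / 1e-3 factors)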
             self.flops, self.params = flops_count * 1e-9, params_count * 1e-3
         if self.latency is None:
             sess_config = (self.trainer._init_session_config()
                            if zeus.is_tf_backend() else None)
             self.latency = calc_forward_latency(self.model, self.input,
                                                 sess_config) * 1000
         summary_perfs = logs.get('summary_perfs', {})
         if epoch is not None:
             summary_perfs.update({
                 'flops': self.flops,
                 'params': self.params,
                 'latency': self.latency,
                 'epoch': epoch
             })
         else:
             summary_perfs.update({
                 'flops': self.flops,
                 'params': self.params,
                 'latency': self.latency
             })
         logs.update({'summary_perfs': summary_perfs})
     except Exception as ex:
         logging.warning("model statics failed, ex=%s", ex)
Example #3
 def _init_tf_session(self):
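     """Initialize a dedicated TensorFlow graph and session (tf backend only)."""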
     if not zeus.is_tf_backend():
         return
     sess_config = self._init_session_config()
     self.graph = tf.Graph()
     with self.graph.as_default():
         self.sess = tf.compat.v1.Session(config=sess_config)
Example #4
    def __call__(self, model=None, distributed=False, **kwargs):
        """Call Optimizer class.

        :param model: model, used in torch case
        :param distributed: use distributed
        :return: optimizer
        """
        params = self.map_config.get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(
            self.optim_cls.__name__, params))
        optimizer = None
        try:
            if zeus.is_torch_backend():
                learnable_params = [
                    param for param in model.parameters()
                    if param.requires_grad
                ]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = self.set_distributed(optimizer, model)
            elif zeus.is_tf_backend():
                optimizer = dynamic_optimizer(self.optim_cls, **params)
            elif zeus.is_ms_backend():
                if "dynamic_lr" in kwargs:
                    params.update({"learning_rate": kwargs["dynamic_lr"]})
                learnable_params = [
                    param for param in model.trainable_params()
                    if param.requires_grad
                ]
                optimizer = self.optim_cls(learnable_params, **params)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(
                self.optim_cls.__name__, params))
            raise ex
Example #5
 def _load_pretrained_model(cls, model, pretrained_model_file):
     pretrained_model_file = cls._get_abs_path(pretrained_model_file)
     logging.info("load model weights from file, weights file={}".format(pretrained_model_file))
     if zeus.is_torch_backend():
         if not os.path.isfile(pretrained_model_file):
             raise "Pretrained model is not existed, model={}".format(pretrained_model_file)
         import torch
         checkpoint = torch.load(pretrained_model_file)
         model.load_state_dict(checkpoint)
     elif zeus.is_tf_backend():
         if pretrained_model_file.endswith('.pth'):
             checkpoint = convert_checkpoint_from_pytorch(pretrained_model_file, model)
             model.load_checkpoint_from_numpy(checkpoint)
         else:
             pretrained_model_file = cls._get_tf_model_file(pretrained_model_file)
             model.load_checkpoint(pretrained_model_file)
     elif zeus.is_ms_backend():
         from mindspore.train.serialization import load_checkpoint
         if hasattr(model, "pretrained"):
             pretrained_weight = model.pretrained(pretrained_model_file)
         else:
             if os.path.isfile(pretrained_model_file):
                 pretrained_weight = pretrained_model_file
             else:
                 for file in os.listdir(pretrained_model_file):
                     if file.endswith(".ckpt"):
                         pretrained_weight = os.path.join(pretrained_model_file, file)
                         break
         load_checkpoint(pretrained_weight, net=model)
     return model
Example #6
 def _to_tensor(self, data):
     if zeus.is_torch_backend():
         import torch
         return torch.tensor(data)
     elif zeus.is_tf_backend():
         import tensorflow as tf
         return tf.convert_to_tensor(data)
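Note that _to_tensor falls through and returns None under any other backend (e.g. MindSpore). A caller-side guard inside the same class, sketched under that assumption:

    tensor = self._to_tensor([1.0, 2.0, 3.0])
    if tensor is None:
        raise NotImplementedError("_to_tensor supports only the torch and tf backends")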
Example #7
 def __init__(self, desc=None, weight_file=None, pb_file=None):
     super(GraphGetter, self).__init__()
     if isinstance(desc, dict):
         src_model = ModelZoo().get_model(desc)
     else:
         src_model = desc
     weights = OrderedDict()
     if is_tf_backend():
         import tensorflow.compat.v1 as tf
         from tensorflow.python.framework import tensor_util
         tf.reset_default_graph()
         data_shape = (1, 224, 224, 3)
         x = tf.ones(data_shape)
         if pb_file:
             with tf.io.gfile.GFile(pb_file, 'rb') as f:
                 graph_def = tf.GraphDef()
                 graph_def.ParseFromString(f.read())  # parse while the file handle is still open
             graph = tf.Graph()
             with graph.as_default():
                 tf.import_graph_def(graph_def, name='')
             weight_file = None
             wts = [n for n in graph_def.node if n.op == 'Const']
             for n in wts:
                 weights[n.name] = tensor_util.MakeNdarray(
                     n.attr['value'].tensor)
         else:
             src_model(x, self.training)
             graph = tf.get_default_graph()
         desc = graph2desc(graph)
         tf.reset_default_graph()
     self.model = ModelZoo().get_model(desc, weight_file)
     if weights:
         self.model.load_checkpoint_from_numpy(weights)
Example #8
    def get_cls(cls, type_name, t_cls_name=None):
        """Get class and bind config to class.

        :param type_name: type name of class registry
        :param t_cls_name: class name
        :return:t_cls
        """
        # lazy load class
        if not cls.is_exists(type_name, t_cls_name) and t_cls_name:
            cls._import_pkg(type_name, t_cls_name)
        # verify class
        if not cls.is_exists(type_name, t_cls_name):
            raise ValueError("can't find class type {} class name {} in class registry".format(type_name, t_cls_name))
        # resolve the default class name when none is specified
        if t_cls_name is None:
            from zeus.datasets.conf.dataset import DatasetConfig
            from zeus.evaluator.conf import EvaluatorConfig
            if type_name == ClassType.DATASET:
                t_cls_name = DatasetConfig.type
            elif type_name == ClassType.TRAINER:
                import zeus
                if zeus.is_torch_backend():
                    t_cls_name = "TrainerTorch"
                elif zeus.is_tf_backend():
                    t_cls_name = "TrainerTf"
                elif zeus.is_ms_backend():
                    t_cls_name = "TrainerMs"
            elif type_name == ClassType.EVALUATOR:
                t_cls_name = EvaluatorConfig.type
            else:
                pass
        if t_cls_name is None:
            raise ValueError("can't find class. class type={}".format(type_name))
        t_cls = cls.__registry__.get(type_name).get(t_cls_name)
        return t_cls
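Example #30 below shows this lookup in action. A short sketch of the same pattern, assuming ClassFactory and ClassType are importable from zeus.common (path assumed):

    from zeus.common import ClassFactory, ClassType  # import path assumed

    trainer_cls = ClassFactory.get_cls(ClassType.TRAINER)  # default name resolved per backend
    trainer = trainer_cls(model_desc=model_desc)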
Example #9
    def save(self, file_name):
        """Save model."""
        if zeus.is_tf_backend():
            with self.graph.as_default():
                self.actor_var.save_weights(file_name + ".npz")

            return file_name + ".npz"
Example #10
    def _train_loop(self):
        """Do the training with data, callbacks and step functions etc."""
        # Allow users to build the trainer inside the before_train() callback;
        # they should set lazy_built to True in the configuration file.
        self.callbacks.before_train()
        if self.skip_train:
            return

        if self.use_unsupervised_pretrain and zeus.is_torch_backend():
            from .simclr.transforms import TransformsSimCLR
            from .simclr.train import simclr_train
            train_loader = self._init_dataloader(mode="train",
                                                 transforms=TransformsSimCLR())
            self.model = simclr_train(self.model, train_loader)

        repeat_time = 1 if zeus.is_ms_backend() else self.epochs
        repeat_time = (1 if zeus.is_tf_backend() and self.config.train_in_once
                       else repeat_time)
        for epoch in range(self._start_epoch, repeat_time):
            epoch_logs = {'train_num_batches': self.batch_num_train}
            if self.do_validation:
                epoch_logs.update({'valid_num_batches': self.batch_num_valid})
            self.callbacks.before_epoch(epoch, epoch_logs)
            if self.config.with_train:
                self._train_epoch()
            if self.do_validation and self._should_run_validation(epoch):
                self._valid_epoch()
            self.callbacks.after_epoch(epoch)
        self.callbacks.after_train()
        if self.distributed:
            self._shutdown_distributed()
Example #11
def _get_data_format():
    if zeus.is_torch_backend() or zeus.is_ms_backend():
        return 'channels_first'
    elif zeus.is_tf_backend():
        return 'channels_last'
    else:
        return None
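Here 'channels_first' corresponds to NCHW tensor layout (the PyTorch/MindSpore convention) and 'channels_last' to NHWC (the TensorFlow convention). A sketch of how the value is typically consumed, with the concrete shape purely illustrative:

    data_format = _get_data_format()
    if data_format == 'channels_first':
        input_shape = (1, 3, 224, 224)  # N, C, H, W
    else:
        input_shape = (1, 224, 224, 3)  # N, H, W, C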
Example #12
    def __call__(self, model=None, distributed=False):
        """Call Optimizer class.

        :param model: model, used in torch case
        :param distributed: use distributed
        :return: optimizer
        """
        params = self.map_config.get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
        optimizer = None
        try:
            if zeus.is_torch_backend():
                learnable_params = [param for param in model.parameters() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer,
                                                         named_parameters=model.named_parameters(),
                                                         compression=hvd.Compression.none)
            elif zeus.is_tf_backend():
                optimizer = dynamic_optimizer(self.optim_cls, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                        NPUDistributedOptimizer(optimizer)
            elif zeus.is_ms_backend():
                learnable_params = [param for param in model.trainable_params() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
            raise ex
Example #13
 def is_filtered(self, desc=None):
     """Filter function of latency."""
     if self.max_latency is None:
         return False
     model, count_input = self.get_model_input(desc)
     num = 100
     if zeus.is_torch_backend():
         start_time = time.time()
         for i in range(num):
             model(count_input)
         latency = (time.time() - start_time) / num
     elif zeus.is_tf_backend():
         import tensorflow as tf
         input = tf.compat.v1.placeholder(
             tf.float32, shape=count_input.get_shape().as_list())
         output = model(input, training=False)
         with tf.compat.v1.Session() as sess:
             input_numpy = count_input.eval(session=sess)
             start_time = time.time()
             for i in range(num):
                 sess.run(output, feed_dict={input: input_numpy})
             latency = (time.time() - start_time) / num
     logging.info('Sampled model\'s latency: {}'.format(latency))
     if latency > self.max_latency:
         return True
     else:
         return False
Example #14
    def _valid_epoch(self):
        self.callbacks.before_valid()
        valid_logs = None
        if zeus.is_torch_backend():
            self.model.eval()
            with torch.no_grad():
                for batch_index, batch in enumerate(self.valid_loader):
                    batch = self.make_batch(batch)
                    batch_logs = {'valid_batch': batch}
                    self.callbacks.before_valid_step(batch_index, batch_logs)
                    valid_batch_output = self.valid_step(batch)
                    self.callbacks.after_valid_step(batch_index, valid_batch_output)
        elif zeus.is_tf_backend():
            eval_metrics = self.estimator.evaluate(input_fn=self.valid_input_fn,
                                                   steps=len(self.valid_loader))
            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        elif zeus.is_ms_backend():
            eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                              dataset_sink_mode=self.dataset_sink_mode)

            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        self.callbacks.after_valid(valid_logs)
Example #15
 def _save_best_model(self):
     """Save best model."""
     if zeus.is_torch_backend():
         torch.save(self.trainer.model.state_dict(),
                    self.trainer.weights_file)
     elif zeus.is_tf_backend():
         worker_path = self.trainer.get_local_worker_path()
         model_id = "model_{}".format(self.trainer.worker_id)
         weights_folder = FileOps.join_path(worker_path, model_id)
         FileOps.make_dir(weights_folder)
         checkpoint_file = tf.train.latest_checkpoint(worker_path)
         ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
         for _file in ckpt_globs:
             dst_file = model_id + os.path.splitext(_file)[-1]
             FileOps.copy_file(_file,
                               FileOps.join_path(weights_folder, dst_file))
         FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'),
                           weights_folder)
     elif zeus.is_ms_backend():
         worker_path = self.trainer.get_local_worker_path()
         save_path = os.path.join(
             worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
         for file in os.listdir(worker_path):
             if file.startswith("CKP") and file.endswith(".ckpt"):
                 self.weights_file = FileOps.join_path(worker_path, file)
                 os.rename(self.weights_file, save_path)
Example #16
 def _train_epoch(self):
     if zeus.is_torch_backend():
         self.model.train()
         for batch_index, batch in enumerate(self.train_loader):
             batch = self.make_batch(batch)
             batch_logs = {'train_batch': batch}
             self.callbacks.before_train_step(batch_index, batch_logs)
             train_batch_output = self.train_step(batch)
             batch_logs.update(train_batch_output)
             if self.config.is_detection_trainer:
                 batch_logs.update({'is_detection_trainer': True})
             self.callbacks.after_train_step(batch_index, batch_logs)
     elif zeus.is_tf_backend():
         self.estimator.train(input_fn=self.train_input_fn,
                              steps=len(self.train_loader),
                              hooks=self._init_logging_hook())
     elif zeus.is_ms_backend():
         self.ms_model = MsModel(network=self.model,
                                 loss_fn=self.loss,
                                 optimizer=self.optimizer,
                                 metrics={self.metric_name: self.valid_metrics()})
         config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
         # save the network model and parameters for subsequence fine-tuning
         save_path = self.get_local_worker_path(self.step_name, self.worker_id)
         ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
         loss_cb = LossMonitor(per_print_times=self.config.report_freq)
         eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
         self.ms_model.train(epoch=self.epochs,
                             train_dataset=self.train_loader,
                             callbacks=[ckpoint_cb, loss_cb, eval_cb],
                             dataset_sink_mode=self.dataset_sink_mode)
Example #17
 def _init_model(self):
     """Load model desc from save path and parse to model."""
     model = self.trainer.model
     if self.trainer.config.is_detection_trainer:
         model_desc = self.trainer.model_desc
     else:
         model_desc = self._get_model_desc()
     if model_desc:
         ModelConfig.model_desc = model_desc
     pretrained_model_file = self._get_pretrained_model_file()
     if not model:
         if not model_desc:
             raise Exception(
                 "Failed to Init model, can not get model description.")
         model = ModelZoo.get_model(model_desc, pretrained_model_file)
     if model:
         if zeus.is_torch_backend():
             import torch
             if self.trainer.use_cuda:
                 model = model.cuda()
             if General._parallel and General.devices_per_trainer > 1:
                 model = torch.nn.DataParallel(model)
         if zeus.is_tf_backend():
             if pretrained_model_file:
                 model_folder = os.path.dirname(pretrained_model_file)
                 FileOps.copy_folder(model_folder,
                                     self.trainer.get_local_worker_path())
     return model
Example #18
 def predict(self, input):
     """Inference model."""
     if zeus.is_tf_backend():
         with self.graph.as_default():
             feed_dict = {self.input: input}
             out = self.sess.run(self.logits, feed_dict)
             return out
Example #19
def get_named_modules(layer):
    """Get named modules."""
    if zeus.is_tf_backend():
        return [(op.name, op) for op in layer]
    elif zeus.is_torch_backend():
        return layer.named_modules()
    elif zeus.is_ms_backend():
        return layer._children_scope_recursive()
Example #20
 def _init_tf_estimator(self):
     """Init tensorflow estimator."""
     if not zeus.is_tf_backend():
         return
     sess_config = self._init_session_config()
     if zeus.is_gpu_device():
         self._init_gpu_estimator(sess_config)
     elif zeus.is_npu_device():
         self._init_npu_estimator(sess_config)
Example #21
 def _set_default_funcs(self):
     if zeus.is_torch_backend():
         self.make_batch = self._default_make_batch
         self.train_step = self._default_train_step
         self.valid_step = self._default_valid_step
     elif zeus.is_tf_backend():
         self.model_fn = self._default_model_fn
         self.train_input_fn = self._default_train_input_fn
         self.valid_input_fn = self._default_valid_input_fn
Example #22
def get_shape(layer):
    """Get weight shape."""
    if zeus.is_tf_backend():
        return layer.get_shape()
    elif zeus.is_torch_backend():
        return layer.weight.data.shape
    elif zeus.is_ms_backend():
        para_name = list(layer._params)[0]
        return getattr(layer, para_name).default_input.shape
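get_named_modules (Example #19) and get_shape compose naturally when walking a model's layers. A torch-backend sketch; the weight check is an assumption, since get_shape only handles layers that carry a .weight:

    for name, module in get_named_modules(model):
        if getattr(module, 'weight', None) is not None:
            print(name, tuple(get_shape(module)))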
Example #23
def Adapter(dataset):
    """Adapter of dataset."""
    if zeus.is_torch_backend():
        from .pytorch import TorchAdapter as Adapter
    elif zeus.is_tf_backend():
        from .tensorflow import TfAdapter as Adapter
    elif zeus.is_ms_backend():
        from .mindspore import MsAdapter as Adapter
    else:
        raise ValueError("unsupported backend: expected torch, tf, or ms")
    return Adapter(dataset)
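A minimal usage sketch, assuming a zeus dataset instance named dataset has already been constructed:

    adapter = Adapter(dataset)  # TorchAdapter, TfAdapter, or MsAdapter, per backend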
Example #24
 def _init_metrics(self, metrics=None):
     """Init metrics."""
     if metrics is not None:
         return metrics
     else:
         if zeus.is_torch_backend():
             from zeus.metrics.pytorch.metrics import Metrics
         elif zeus.is_tf_backend():
             from zeus.metrics.tensorflow.metrics import Metrics
         elif zeus.is_ms_backend():
             from zeus.metrics.mindspore.metrics import Metrics
         return Metrics()
Example #25
 def set_distributed(cls, optimizer, model=None):
     """Set distributed optimizer."""
     if zeus.is_torch_backend():
         optimizer = hvd.DistributedOptimizer(
             optimizer,
             named_parameters=model.named_parameters(),
             compression=hvd.Compression.none)
     elif zeus.is_tf_backend():
         optim_class = (hvd.DistributedOptimizer if zeus.is_gpu_device()
                        else NPUDistributedOptimizer)
         optimizer = dynamic_distributed_optimizer(optim_class, optimizer)
     return optimizer
Example #26
def _calc_forward_latency_davinci(model,
                                  input,
                                  sess_config=None,
                                  num=10,
                                  evaluate_config=None):
    """Model forward latency calculation.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
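    :param sess_config: tf session configuration (not used in the davinci path)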
    :param num: forward number
    :type num: int
    :param evaluate_config: some config for evaluate in davinci
    :type evaluate_config: dict
    :return: forward latency
    :rtype: float
    """
    from zeus.evaluator.tools.evaluate_davinci_bolt import evaluate
    from zeus.common.task_ops import TaskOps
    # backend = evaluate_config.get("backend")
    hardware = evaluate_config.get("hardware")
    remote_host = evaluate_config.get("remote_host")
    worker_path = TaskOps().local_base_path
    save_data_file = os.path.join(worker_path, "input.bin")

    latency = 0.
    now_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
    job_id = "pre_evaluate_" + now_time
    logging.info("The job id of evaluate service is {}.".format(job_id))
    if zeus.is_torch_backend():
        import torch
        input_shape = input.shape
        if torch.is_tensor(input):
            input = input.cpu().numpy()
        input.tofile(save_data_file)
        for index in range(num):
            reuse_model = index != 0  # convert the model on the first pass, reuse it afterwards
            results = evaluate("pytorch", hardware, remote_host, model, None,
                               save_data_file, input_shape, reuse_model,
                               job_id)
            latency += float(results.get("latency"))
    elif zeus.is_tf_backend():
        input_shape = input.shape.as_list()
        test_data = np.random.random(input_shape).astype(np.float32)
        test_data.tofile(save_data_file)
        for index in range(num):
            reuse_model = index != 0  # convert the model on the first pass, reuse it afterwards
            results = evaluate("tensorflow", hardware, remote_host, model,
                               None, save_data_file, input_shape, reuse_model,
                               job_id)
            latency += float(results.get("latency"))
    return latency / num
Example #27
 def before_train(self, logs=None):
     """Be called before the training process."""
     self.input = None
     self.flops = None
     self.params = None
     self.calc_params_each_epoch = self.trainer.config.calc_params_each_epoch
     if zeus.is_tf_backend():
         import tensorflow as tf
         datasets = self.trainer.valid_input_fn()
         data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
         input_data, _ = data_iter.get_next()
         self.input = input_data[:1]
Example #28
    def before_train(self, logs=None):
        """Fetch trainer info before train stage."""
        self._fix_path = "_".join(
            [self.trainer.step_name,
             str(self.trainer.worker_id)])
        self.summary = SummaryBoard(self._archive_root, self._fix_path)

        if zeus.is_tf_backend():
            import tensorflow as tf
            datasets = self.trainer.valid_input_fn()
            data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
            input_data, _ = data_iter.get_next()
            self.input = input_data[:1]
Example #29
    def train(self, inputs, labels):
        """Train model."""
        if zeus.is_tf_backend():
            feed_dict = {}
            with self.graph.as_default():
                for i in range(len(inputs)):
                    feed_dict.update({self.inputs[i]: inputs[i]})

                for i in range(len(labels)):
                    feed_dict.update({self.labels[i]: labels[i]})

                _, loss = self.sess.run([self.train_op, self.loss], feed_dict)
                return loss
Example #30
 def is_filtered(self, desc=None):
     """Filter function of latency."""
     if self.max_latency is None:
         return False
     model, count_input = self.get_model_input(desc)
     trainer = ClassFactory.get_cls(ClassType.TRAINER)(model_desc=desc)
     sess_config = (trainer._init_session_config()
                    if zeus.is_tf_backend() else None)
     latency = calc_forward_latency(model, count_input, sess_config)
     logging.info('Sampled model\'s latency: {}ms'.format(latency * 1000))
     if latency > self.max_latency:
         return True
     else:
         return False