Example #1
 def build(self):
     """Build the trainer by assembling the necessary components."""
     logging.debug("Trainer Config: {}".format(self.config))
     self.do_validation = self.config.with_valid
     self.use_syncbn = self.config.syncbn
     if self.use_syncbn and zeus.is_torch_backend():
         self.model = apex.parallel.convert_syncbn_model(self.model)
     self.train_loader = self._init_dataloader(mode='train')
     self.valid_loader = self._init_dataloader(mode='val')
     self.batch_num_train = (self.train_loader.get_dataset_size()
                             if zeus.is_ms_backend() else len(self.train_loader))
     self.batch_num_valid = (self.valid_loader.get_dataset_size()
                             if zeus.is_ms_backend() else len(self.valid_loader))
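The batch-count ternary recurs for both loaders; a minimal helper sketch that hides the backend difference (hypothetical, not part of the source):

def batch_count(loader):
    """Number of batches: MindSpore datasets expose get_dataset_size(), torch/tf loaders support len()."""
    import zeus
    return loader.get_dataset_size() if zeus.is_ms_backend() else len(loader)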
Example #2
def _get_data_format():
    if zeus.is_torch_backend() or zeus.is_ms_backend():
        return 'channels_first'
    elif zeus.is_tf_backend():
        return 'channels_last'
    else:
        return None
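A hedged usage sketch for _get_data_format: deriving an input shape from the returned layout string (function name and defaults are illustrative):

def make_input_shape(height, width, channels=3):
    """Build an input shape tuple matching the active backend's layout."""
    data_format = _get_data_format()
    if data_format == 'channels_first':   # torch / mindspore
        return (channels, height, width)
    if data_format == 'channels_last':    # tensorflow
        return (height, width, channels)
    raise ValueError("no known backend, data format unavailable")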
Example #3
    def _train_loop(self):
        """Do the training with data, callbacks and step functions etc."""
        # Allow users to build the trainer in the before_train() callback,
        # but they should set lazy_built to True in the configuration file.
        self.callbacks.before_train()
        if self.skip_train:
            return

        if self.use_unsupervised_pretrain and zeus.is_torch_backend():
            from .trainer.simclr.transforms import TransformsSimCLR
            from .trainer.simclr.train import simclr_train
            train_loader = self._init_dataloader(mode="train",
                                                 transforms=TransformsSimCLR())
            self.model = simclr_train(self.model, train_loader)

        repeat_time = 1 if zeus.is_ms_backend() else self.epochs
        for epoch in range(self._start_epoch, repeat_time):
            epoch_logs = {'train_num_batches': self.batch_num_train}
            if self.do_validation:
                epoch_logs.update({'valid_num_batches': self.batch_num_valid})
            self.callbacks.before_epoch(epoch, epoch_logs)
            self._train_epoch()
            if self.do_validation and self._should_run_validation(epoch):
                self._valid_epoch()
            self.callbacks.after_epoch(epoch)
        self.callbacks.after_train()
        if self.distributed:
            self._shutdown_distributed()
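_should_run_validation is referenced but not shown; a minimal sketch of such a predicate, assuming a hypothetical valid_interval config field:

    def _should_run_validation(self, epoch):
        """Validate every `valid_interval` epochs and on the final epoch (sketch)."""
        interval = getattr(self.config, 'valid_interval', 1)
        return (epoch + 1) % interval == 0 or epoch == self.epochs - 1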
Example #4
    def get_cls(cls, type_name, t_cls_name=None):
        """Get class and bind config to class.

        :param type_name: type name of class registry
        :param t_cls_name: class name
        :return:t_cls
        """
        # lazy load class
        if not cls.is_exists(type_name, t_cls_name) and t_cls_name:
            cls._import_pkg(type_name, t_cls_name)
        # verify class
        if not cls.is_exists(type_name, t_cls_name):
            raise ValueError("can't find class type {} class name {} in class registry".format(type_name, t_cls_name))
        # create instance without configs
        if t_cls_name is None:
            from zeus.datasets.conf.dataset import DatasetConfig
            from zeus.evaluator.conf import EvaluatorConfig
            if type_name == ClassType.DATASET:
                t_cls_name = DatasetConfig.type
            elif type_name == ClassType.TRAINER:
                import zeus
                if zeus.is_torch_backend():
                    t_cls_name = "TrainerTorch"
                elif zeus.is_tf_backend():
                    t_cls_name = "TrainerTf"
                elif zeus.is_ms_backend():
                    t_cls_name = "TrainerMs"
            elif type_name == ClassType.EVALUATOR:
                t_cls_name = EvaluatorConfig.type
            else:
                pass
        if t_cls_name is None:
            raise ValueError("can't find class. class type={}".format(type_name))
        t_cls = cls.__registry__.get(type_name).get(t_cls_name)
        return t_cls
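A hedged usage sketch of the registry lookup; the dataset class name is illustrative:

trainer_cls = ClassFactory.get_cls(ClassType.TRAINER)             # name inferred from backend
dataset_cls = ClassFactory.get_cls(ClassType.DATASET, "Cifar10")  # "Cifar10" is illustrative
trainer = trainer_cls()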
Example #5
    def __call__(self, model=None, distributed=False):
        """Call Optimizer class.

        :param model: model, used in the torch case
        :param distributed: whether to use distributed training
        :return: optimizer
        """
        params = self.map_config.get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(self.optim_cls.__name__, params))
        optimizer = None
        try:
            if zeus.is_torch_backend():
                learnable_params = [param for param in model.parameters() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer,
                                                         named_parameters=model.named_parameters(),
                                                         compression=hvd.Compression.none)
            elif zeus.is_tf_backend():
                optimizer = dynamic_optimizer(self.optim_cls, **params)
                if distributed:
                    optimizer = hvd.DistributedOptimizer(optimizer) if zeus.is_gpu_device() else \
                        NPUDistributedOptimizer(optimizer)
            elif zeus.is_ms_backend():
                learnable_params = [param for param in model.trainable_params() if param.requires_grad]
                optimizer = self.optim_cls(learnable_params, **params)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(self.optim_cls.__name__, params))
            raise ex
Example #6
 def _train_epoch(self):
     if zeus.is_torch_backend():
         self.model.train()
         for batch_index, batch in enumerate(self.train_loader):
             batch = self.make_batch(batch)
             batch_logs = {'train_batch': batch}
             self.callbacks.before_train_step(batch_index, batch_logs)
             train_batch_output = self.train_step(batch)
             batch_logs.update(train_batch_output)
             if self.config.is_detection_trainer:
                 batch_logs.update({'is_detection_trainer': True})
             self.callbacks.after_train_step(batch_index, batch_logs)
     elif zeus.is_tf_backend():
         self.estimator.train(input_fn=self.train_input_fn,
                              steps=len(self.train_loader),
                              hooks=self._init_logging_hook())
     elif zeus.is_ms_backend():
         self.ms_model = MsModel(network=self.model,
                                 loss_fn=self.loss,
                                 optimizer=self.optimizer,
                                 metrics={self.metric_name: self.valid_metrics()})
         config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
          # save the network model and parameters for subsequent fine-tuning
         save_path = self.get_local_worker_path(self.step_name, self.worker_id)
         ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
         loss_cb = LossMonitor(per_print_times=self.config.report_freq)
         eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
         self.ms_model.train(epoch=self.epochs,
                             train_dataset=self.train_loader,
                             callbacks=[ckpoint_cb, loss_cb, eval_cb],
                             dataset_sink_mode=self.dataset_sink_mode)
Example #7
 def _save_best_model(self):
     """Save best model."""
     if zeus.is_torch_backend():
         torch.save(self.trainer.model.state_dict(),
                    self.trainer.weights_file)
     elif zeus.is_tf_backend():
         worker_path = self.trainer.get_local_worker_path()
         model_id = "model_{}".format(self.trainer.worker_id)
         weights_folder = FileOps.join_path(worker_path, model_id)
         FileOps.make_dir(weights_folder)
         checkpoint_file = tf.train.latest_checkpoint(worker_path)
         ckpt_globs = glob.glob("{}.*".format(checkpoint_file))
         for _file in ckpt_globs:
             dst_file = model_id + os.path.splitext(_file)[-1]
             FileOps.copy_file(_file,
                               FileOps.join_path(weights_folder, dst_file))
         FileOps.copy_file(FileOps.join_path(worker_path, 'checkpoint'),
                           weights_folder)
     elif zeus.is_ms_backend():
         worker_path = self.trainer.get_local_worker_path()
         save_path = os.path.join(
             worker_path, "model_{}.ckpt".format(self.trainer.worker_id))
         for file in os.listdir(worker_path):
             if file.startswith("CKP") and file.endswith(".ckpt"):
                 self.weights_file = FileOps.join_path(worker_path, file)
                 os.rename(self.weights_file, save_path)
Example #8
    def _valid_epoch(self):
        self.callbacks.before_valid()
        valid_logs = None
        if zeus.is_torch_backend():
            self.model.eval()
            with torch.no_grad():
                for batch_index, batch in enumerate(self.valid_loader):
                    batch = self.make_batch(batch)
                    batch_logs = {'valid_batch': batch}
                    self.callbacks.before_valid_step(batch_index, batch_logs)
                    valid_batch_output = self.valid_step(batch)
                    self.callbacks.after_valid_step(batch_index, valid_batch_output)
        elif zeus.is_tf_backend():
            eval_metrics = self.estimator.evaluate(input_fn=self.valid_input_fn,
                                                   steps=len(self.valid_loader))
            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        elif zeus.is_ms_backend():
            eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                              dataset_sink_mode=self.dataset_sink_mode)

            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        self.callbacks.after_valid(valid_logs)
Example #9
    def load_records_from_model_folder(cls, model_folder):
        """Transfer json_file to records."""
        if not model_folder or not os.path.exists(model_folder):
            logging.error(
                "Failed to load records from model folder, folder={}".format(
                    model_folder))
            return []
        records = []
        pattern = FileOps.join_path(model_folder, "desc_*.json")
        files = glob.glob(pattern)
        for _file in files:
            try:
                with open(_file) as f:
                    worker_id = _file.split(".")[-2].split("_")[-1]
                    weights_file = os.path.join(os.path.dirname(_file),
                                                "model_{}".format(worker_id))
                    if zeus.is_torch_backend():
                        weights_file = '{}.pth'.format(weights_file)
                    elif zeus.is_ms_backend():
                        weights_file = '{}.ckpt'.format(weights_file)
                    if not os.path.exists(weights_file):
                        weights_file = None

                    sample = dict(worker_id=worker_id,
                                  desc=json.load(f),
                                  weights_file=weights_file)
                    record = ReportRecord().load_dict(sample)
                    records.append(record)
            except Exception as ex:
                logging.info(
                    'Cannot read records from json because {}'.format(ex))
        return records
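The worker id is recovered purely from the description file name; a standalone demonstration of the parsing expressions used above (the path is illustrative):

import os

_file = "/cache/models/desc_12.json"   # illustrative path
worker_id = _file.split(".")[-2].split("_")[-1]
assert worker_id == "12"
weights_file = os.path.join(os.path.dirname(_file), "model_{}".format(worker_id))
# -> "/cache/models/model_12", to which ".pth" (torch) or ".ckpt" (mindspore) is appended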
Example #10
 def _load_pretrained_model(cls, model, pretrained_model_file):
     pretrained_model_file = cls._get_abs_path(pretrained_model_file)
     logging.info("load model weights from file, weights file={}".format(pretrained_model_file))
     if zeus.is_torch_backend():
          if not os.path.isfile(pretrained_model_file):
              raise FileNotFoundError(
                  "Pretrained model does not exist, model={}".format(pretrained_model_file))
         import torch
         checkpoint = torch.load(pretrained_model_file)
         model.load_state_dict(checkpoint)
      elif zeus.is_tf_backend():
         if pretrained_model_file.endswith('.pth'):
             checkpoint = convert_checkpoint_from_pytorch(pretrained_model_file, model)
             model.load_checkpoint_from_numpy(checkpoint)
         else:
             pretrained_model_file = cls._get_tf_model_file(pretrained_model_file)
             model.load_checkpoint(pretrained_model_file)
     elif zeus.is_ms_backend():
         from mindspore.train.serialization import load_checkpoint
         if hasattr(model, "pretrained"):
             pretrained_weight = model.pretrained(pretrained_model_file)
         else:
             if os.path.isfile(pretrained_model_file):
                 pretrained_weight = pretrained_model_file
             else:
                 for file in os.listdir(pretrained_model_file):
                     if file.endswith(".ckpt"):
                         pretrained_weight = os.path.join(pretrained_model_file, file)
                         break
         load_checkpoint(pretrained_weight, net=model)
     return model
Example #11
    def get_model(cls, model_desc=None, pretrained_model_file=None):
        """Get model from model zoo.

        :param network_name: the name of network, eg. ResNetVariant.
        :type network_name: str or None.
        :param network_desc: the description of network.
        :type network_desc: str or None.
        :param pretrained_model_file: path of model.
        :type pretrained_model_file: str.
        :return: model.
        :rtype: model.

        """
        try:
            network = NetworkDesc(model_desc)
            model = network.to_model()
        except Exception as e:
            logging.error("Failed to get model, model_desc={}, msg={}".format(
                model_desc, str(e)))
            raise e
        logging.info("Model was created.")
        if pretrained_model_file and (zeus.is_torch_backend() or zeus.is_ms_backend()):
            model = cls._load_pretrained_model(model, pretrained_model_file)
        return model
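A hedged usage sketch for ModelZoo.get_model; the description file and weights path are illustrative:

import json

with open("desc_12.json") as f:        # illustrative description file
    model_desc = json.load(f)
model = ModelZoo.get_model(model_desc, pretrained_model_file="model_12.pth")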
Example #12
    def __call__(self, model=None, distributed=False, **kwargs):
        """Call Optimizer class.

        :param model: model, used in the torch case
        :param distributed: whether to use distributed training
        :return: optimizer
        """
        params = self.map_config.get("params", {})
        logging.debug("Call Optimizer. name={}, params={}".format(
            self.optim_cls.__name__, params))
        optimizer = None
        try:
            if zeus.is_torch_backend():
                learnable_params = [
                    param for param in model.parameters()
                    if param.requires_grad
                ]
                optimizer = self.optim_cls(learnable_params, **params)
                if distributed:
                    optimizer = self.set_distributed(optimizer, model)
            elif zeus.is_tf_backend():
                optimizer = dynamic_optimizer(self.optim_cls, **params)
            elif zeus.is_ms_backend():
                if "dynamic_lr" in kwargs:
                    params.update({"learning_rate": kwargs["dynamic_lr"]})
                learnable_params = [
                    param for param in model.trainable_params()
                    if param.requires_grad
                ]
                optimizer = self.optim_cls(learnable_params, **params)
            return optimizer
        except Exception as ex:
            logging.error("Failed to call Optimizer name={}, params={}".format(
                self.optim_cls.__name__, params))
            raise ex
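The call site stays the same across backends; a minimal usage sketch (my_model and my_lr_schedule are illustrative):

optimizer = Optimizer()(model=my_model, distributed=False)
# In the MindSpore case a precomputed dynamic learning-rate schedule can be passed:
# optimizer = Optimizer()(model=my_model, dynamic_lr=my_lr_schedule)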
Example #13
 def _init_ms_context(self):
     if not zeus.is_ms_backend():
         return
     if zeus.is_npu_device():
         context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
     else:
         context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
      self.dataset_sink_mode = bool(zeus.is_npu_device())
Example #14
def get_named_modules(layer):
    """Get named modules."""
    if zeus.is_tf_backend():
        return [(op.name, op) for op in layer]
    elif zeus.is_torch_backend():
        return layer.named_modules()
    elif zeus.is_ms_backend():
        return layer._children_scope_recursive()
Example #15
def get_shape(layer):
    """Get weight shape."""
    if zeus.is_tf_backend():
        return layer.get_shape()
    elif zeus.is_torch_backend():
        return layer.weight.data.shape
    elif zeus.is_ms_backend():
        para_name = list(layer._params)[0]
        return getattr(layer, para_name).default_input.shape
Example #16
def Adapter(dataset):
    """Adapter of dataset."""
    if zeus.is_torch_backend():
        from .pytorch import TorchAdapter as Adapter
    elif zeus.is_tf_backend():
        from .tensorflow import TfAdapter as Adapter
    elif zeus.is_ms_backend():
        from .mindspore import MsAdapter as Adapter
    else:
        raise ValueError("unknown backend, cannot create dataset adapter")
    return Adapter(dataset)
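Downstream code only touches the adapter's loader attribute (as in the latency filter of Example #22); a hedged sketch:

dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
dataloader = Adapter(dataset_cls()).loader   # backend-specific loader behind one interface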
Example #17
 def _init_metrics(self, metrics=None):
     """Init metrics."""
     if metrics is not None:
         return metrics
     else:
         if zeus.is_torch_backend():
             from zeus.metrics.pytorch.metrics import Metrics
         elif zeus.is_tf_backend():
             from zeus.metrics.tensorflow.metrics import Metrics
         elif zeus.is_ms_backend():
             from zeus.metrics.mindspore.metrics import Metrics
         return Metrics()
Example #18
 def _load_pretrained_model(cls, model, pretrained_model_file):
     if zeus.is_torch_backend():
         import torch
          if not os.path.isfile(pretrained_model_file):
              raise FileNotFoundError(
                  "Pretrained model does not exist, model={}".format(pretrained_model_file))
         logging.info("load model weights from file, weights file={}".format(pretrained_model_file))
         checkpoint = torch.load(pretrained_model_file)
         model.load_state_dict(checkpoint)
     elif zeus.is_ms_backend():
         from mindspore.train.serialization import load_checkpoint
         load_checkpoint(pretrained_model_file, net=model)
     return model
Example #19
    def build(self):
        """Build the trainer by assembling the necessary components."""
        self._init_hps(self.hps)
        logging.debug("Trainer Config: {}".format(self.config))
        self.do_validation = self.config.with_valid
        self.use_syncbn = self.config.syncbn
        if self.use_syncbn and zeus.is_torch_backend():
            self.model = apex.parallel.convert_syncbn_model(self.model)
        self.train_loader = self._init_dataloader(mode='train')
        self.valid_loader = self._init_dataloader(mode='val')
        self.batch_num_train = (self.train_loader.get_dataset_size()
                                if zeus.is_ms_backend() else len(self.train_loader))
        self.batch_num_valid = (self.valid_loader.get_dataset_size()
                                if zeus.is_ms_backend() else len(self.valid_loader))

        if zeus.is_torch_backend():
            self.optimizer = Optimizer()(model=self.model, distributed=self.distributed)
            if hasattr(self.model, 'add_loss'):
                loss_cls = Loss()()
                self.model.add_loss(loss_cls)
                self.loss = self.model.overall_loss()
            else:
                self.loss = Loss()()
            self.lr_scheduler = LrScheduler()(self.optimizer)
        elif zeus.is_ms_backend():
            self.optimizer = Optimizer()(model=self.model)
            if hasattr(self.model, 'add_loss'):
                loss_cls = Loss()()
                self.model.add_loss(loss_cls)
                self.loss = self.model.overall_loss()
            else:
                self.loss = Loss()()
            self.metric_name = self.config.metric().type
        # Some trainers use a different train batch size from the valid batch size.
        self.train_metrics = self._init_metrics() if zeus.is_torch_backend() else None
        self.valid_metrics = self._init_metrics()
        self._init_horovod_setting()
        if self.use_amp and zeus.is_torch_backend():
            self.model, self.optimizer = amp.initialize(
                self.model, self.optimizer, opt_level='O1')
Example #20
def parse_module_name(name, module):
    """Parse the module name of mindspore."""
    if zeus.is_ms_backend():
        while list(module.cells()):
            module = list(module.cells())[0]

        parts = name.split("/")[1:]
        new_name = ""
        for part in parts:
            new_name += "." + part.split("-")[0]
        return new_name[1:], module
    else:
        return name, module
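The name rewrite can be checked in isolation; a pure-Python demonstration (the scope name is illustrative):

name = "/network/layer1-Conv2d/conv-Conv2d"    # illustrative MindSpore scope name
parts = name.split("/")[1:]                    # ['network', 'layer1-Conv2d', 'conv-Conv2d']
new_name = "".join("." + part.split("-")[0] for part in parts)
print(new_name[1:])                            # network.layer1.conv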
Example #21
    def load_model(self):
        """Load model."""
        self.saved_folder = self.get_local_worker_path(self.step_name, self.worker_id)
        if not self.model_desc:
            self.model_desc = FileOps.join_path(self.saved_folder, 'desc_{}.json'.format(self.worker_id))
        if not self.weights_file:
            if zeus.is_torch_backend():
                self.weights_file = FileOps.join_path(self.saved_folder, 'model_{}.pth'.format(self.worker_id))
            elif zeus.is_ms_backend():
                for file in os.listdir(self.saved_folder):
                    if file.startswith("CKP") and file.endswith(".ckpt"):
                        self.weights_file = FileOps.join_path(self.saved_folder, file)

        if 'modules' not in self.model_desc:
            self.model_desc = ModelConfig.model_desc
        self.model = ModelZoo.get_model(self.model_desc, self.weights_file)
Example #22
 def is_filtered(self, desc=None):
     """Filter function of latency."""
     if zeus.is_ms_backend():
         return False
     try:
         if not self.dataloader:
             dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
             self.dataset = dataset_cls()
             from zeus.datasets import Adapter
             self.dataloader = Adapter(self.dataset).loader
         model, count_input = self.get_model_input(desc)
         model(count_input)
         return False
     except Exception:
         encoding = desc['backbone']['encoding']
         logging.info('Invalid encoding: {}'.format(encoding))
         return True
Example #23
def calc_model_flops_params(model, input, custom_hooks=None, verbose=False):
    """Pytorch model flops and parameters calculation.

    :param model: pytorch model
    :type model: torch.nn.Module
    :param input: pytorch input tensor
    :type input: torch.Tensor
    :param custom_hooks: hooks defined by outside customer
    :type custom_hooks: dict or None
    :param verbose: whether to print op types not in the collection
    :type verbose: bool, default False
    :return: flops and params
    :rtype: float, float
    """
    if zeus.is_torch_backend():
        from thop import profile
        try:
            _model = deepcopy(model)
        except Exception as e:
            _model = model
        if custom_hooks is None:
            custom_hooks = {}
        custom_hooks = add_new_hooks(custom_hooks)
        inputs = (input, )
        flops, params = profile(_model, inputs, custom_hooks, verbose)
        del _model
    elif zeus.is_tf_backend():
        import tensorflow.compat.v1 as tf
        with tf.Graph().as_default() as graph:
            dummy_input = tf.placeholder(dtype=tf.float32,
                                         shape=input.shape.as_list())
            model.training = False
            model(dummy_input)
            opts = tf.profiler.ProfileOptionBuilder.float_operation()
            flops = tf.profiler.profile(graph, cmd='op',
                                        options=opts).total_float_ops
            opts = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter(
            )
            params = tf.profiler.profile(graph, cmd='op',
                                         options=opts).total_parameters
            flops *= 0.5
    elif zeus.is_ms_backend():
        # TODO
        flops, params = 0, 0
    return flops, params
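A hedged torch-side usage sketch; the model and input shape are illustrative:

import torch
import torchvision

model = torchvision.models.resnet18()          # illustrative model
dummy_input = torch.randn(1, 3, 224, 224)
flops, params = calc_model_flops_params(model, dummy_input)
print("GFLOPs={:.3f}, params(M)={:.3f}".format(flops * 1e-9, params * 1e-6))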
Example #24
 def get_input_data(self):
     """Get input data."""
     count_input = None
     if zeus.is_torch_backend():
          data_iter = iter(self.dataloader)
          input_data, _ = next(data_iter)
         count_input = input_data[:1]
     elif zeus.is_tf_backend():
         import tensorflow as tf
         datasets = self.dataloader.input_fn()
         data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
         input_data, _ = data_iter.get_next()
         count_input = input_data[:1]
     elif zeus.is_ms_backend():
         data_iter = self.dataloader.create_dict_iterator()
         for batch in data_iter:
             count_input = batch['image']
             break
     return count_input
Example #25
 def apply(self, mask_code):
     """Apply mask to batchNorm."""
     end_mask = np.asarray(mask_code)
     idx = np.squeeze(
         np.argwhere(np.asarray(np.ones(end_mask.shape) -
                                end_mask))).tolist()
     self._make_mask(idx)
     if zeus.is_tf_backend():
         import tensorflow as tf
         return tf.assign(
             self.layer,
             self.layer * tf.constant(self.mask, dtype=self.layer.dtype))
     elif zeus.is_torch_backend():
         import torch
         self.layer.weight.data = self.layer.weight.data * torch.FloatTensor(
             self.mask)
         self.layer.bias.data = self.layer.bias.data * torch.FloatTensor(
             self.mask)
         self.layer.running_mean = self.layer.running_mean * torch.FloatTensor(
             self.mask)
         self.layer.running_var = self.layer.running_var * torch.FloatTensor(
             self.mask)
         self.layer.weight.data[idx].requires_grad = False
         self.layer.bias.data[idx].requires_grad = False
         self.layer.running_mean[idx].requires_grad = False
         self.layer.running_var[idx].requires_grad = False
     elif zeus.is_ms_backend():
         from mindspore import Tensor
         self.layer.moving_mean.default_input = self.layer.moving_mean.default_input * \
             Tensor(self.mask, self.layer.moving_mean.default_input.dtype)
         self.layer.moving_variance.default_input = self.layer.moving_variance.default_input * \
             Tensor(self.mask, self.layer.moving_variance.default_input.dtype)
         self.layer.gamma.default_input = self.layer.gamma.default_input * \
             Tensor(self.mask, self.layer.gamma.default_input.dtype)
         self.layer.beta.default_input = self.layer.beta.default_input * \
             Tensor(self.mask, self.layer.beta.default_input.dtype)
          for i in idx:
              self.layer.moving_mean.default_input[i].requires_grad = False
              self.layer.moving_variance.default_input[i].requires_grad = False
              self.layer.gamma.default_input[i].requires_grad = False
              self.layer.beta.default_input[i].requires_grad = False
Example #26
 def apply(self, end_mask_code, start_mask_code=None):
     """Apply mask to weight."""
     end_mask_code = np.array(end_mask_code)
     if start_mask_code is not None:
         start_mask_code = np.array(start_mask_code)
     start_channel_idx = None
     end_channel_idx = np.squeeze(
         np.argwhere(
             np.asarray(np.ones(end_mask_code.shape) -
                        end_mask_code))).tolist()
     if start_mask_code is not None:
         start_channel_idx = np.squeeze(
             np.argwhere(
                 np.asarray(
                     np.ones(start_mask_code.shape) -
                     start_mask_code))).tolist()
     self._make_mask(end_mask_code, start_mask_code)
     if zeus.is_tf_backend():
         import tensorflow as tf
         return tf.assign(
             self.layer,
             self.layer * tf.constant(self.mask, dtype=self.layer.dtype))
     elif zeus.is_torch_backend():
         import torch
         self.layer.weight.data = self.layer.weight.data * torch.FloatTensor(
             self.mask)
          self.layer.weight.data[end_channel_idx, :, :, :].requires_grad = False
          if start_channel_idx is not None:
              self.layer.weight.data[:, start_channel_idx, :, :].requires_grad = False
     elif zeus.is_ms_backend():
         from mindspore import Tensor
         self.layer.weight.default_input = self.layer.weight.default_input * \
             Tensor(self.mask, self.layer.weight.default_input.dtype)
          for i in end_channel_idx:
              self.layer.weight.default_input[i, :, :, :].requires_grad = False
          if start_channel_idx is not None:
              for i in start_channel_idx:
                  self.layer.weight.default_input[:, i, :, :].requires_grad = False
Example #27
 def _train_loop(self):
     """Do the training with data, callbacks and step functions etc."""
      # Allow users to build the trainer in the before_train() callback,
      # but they should set lazy_built to True in the configuration file.
     self.callbacks.before_train()
     if self.skip_train:
         return
     repeat_time = 1 if zeus.is_ms_backend() else self.epochs
     for epoch in range(repeat_time):
         epoch_logs = {'train_num_batches': self.batch_num_train}
         if self.do_validation:
             epoch_logs.update({'valid_num_batches': self.batch_num_valid})
         self.callbacks.before_epoch(epoch, epoch_logs)
         self._train_epoch()
         if self.do_validation and self._should_run_validation(epoch):
             self._valid_epoch()
         self.callbacks.after_epoch(epoch)
     self.callbacks.after_train()
     if self.distributed:
         self._shutdown_distributed()
Example #28
    def _get_callbacks(self, customs, disables):
        defaults = []
        if zeus.is_torch_backend():
            defaults = ["ModelStatistics", "MetricsEvaluator", "ModelCheckpoint", "ModelBuilder", "PerformanceSaver",
                        "RuntimeCallback", "LearningRateScheduler", "ProgressLogger", "ReportCallback",
                        "VisualCallBack"
                        ]
        elif zeus.is_tf_backend():
            defaults = ["ModelStatistics", "MetricsEvaluator", "ModelCheckpoint", "ModelBuilder", "PerformanceSaver",
                        "RuntimeCallback", "ProgressLogger", "ReportCallback", "VisualCallBack"]
        elif zeus.is_ms_backend():
            defaults = ["ModelStatistics", "MetricsEvaluator", "ModelCheckpoint", "ModelBuilder", "PerformanceSaver",
                        "ProgressLogger", "ReportCallback", "VisualCallBack"]

        custom_disables = []
        disables = disables if disables else []
        customs = customs if customs else []
        custom_enables = []
        if customs:
            if isinstance(customs, str):
                customs = [customs]
            for customs_name in customs:
                callback_class = ClassFactory.get_cls(ClassType.CALLBACK, customs_name)
                # Collect the callbacks each custom callback enables or disables
                if hasattr(callback_class, "disable_callbacks"):
                    _disables = callback_class.disable_callbacks
                    if not isinstance(_disables, list):
                        _disables = [_disables]
                    custom_disables += _disables
                if hasattr(callback_class, "enable_callbacks"):
                    _enables = callback_class.enable_callbacks
                    if not isinstance(_enables, list):
                        _enables = [_enables]
                    custom_enables += _enables
        if custom_enables:
            callbacks = custom_enables
        else:
            callbacks = set([_cls for _cls in defaults + customs if _cls not in disables + custom_disables])
        callbacks = [ClassFactory.get_cls(ClassType.CALLBACK, _cls)() for _cls in callbacks]
        callbacks = sorted(callbacks, key=lambda callback: callback.priority)
        return callbacks
Example #29
def _calc_forward_latency_gpu(model, input, sess_config=None, num=100):
    """Model forward latency calculation.

    :param model: network model
    :type model: torch or tf module
    :param input: input tensor
    :type input: Tensor of torch or tf
    :param sess_config: session configuration, used in the tf case
    :type sess_config: tf.ConfigProto or None
    :param num: forward number
    :type num: int
    :return: forward latency
    :rtype: float
    """
    prepare_num = int(0.05 * num)
    if zeus.is_torch_backend():
        for _ in range(prepare_num):
            model(input)
        start_time = time.time()
        for _ in range(num):
            model(input)
        latency = (time.time() - start_time) / num
    elif zeus.is_tf_backend():
        import tensorflow.compat.v1 as tf
        with tf.Graph().as_default() as graph:
            input_holder = tf.placeholder(dtype=tf.float32,
                                          shape=input.shape.as_list())
            model.training = False
            output = model(input_holder)
            with tf.Session(config=sess_config) as sess:
                sess.run(tf.global_variables_initializer())
                input = tf.random.uniform(input.shape.as_list(),
                                          dtype=input.dtype)
                input_numpy = input.eval(session=sess)
                for _ in range(prepare_num):
                    sess.run(output, feed_dict={input_holder: input_numpy})
                start_time = time.time()
                for _ in range(num):
                    sess.run(output, feed_dict={input_holder: input_numpy})
        latency = (time.time() - start_time) / num
    elif zeus.is_ms_backend():
        latency = 0.
    return latency
Example #30
    def before_train(self, logs=None):
        """Fetch trainer info before train stage."""
        self._fix_path = "_".join(
            [self.trainer.step_name,
             str(self.trainer.worker_id)])
        self.summary = SummaryBoard(self._archive_root, self._fix_path)

        # add graph only once.
        if zeus.is_tf_backend():
            import tensorflow as tf
            datasets = self.trainer.valid_input_fn()
            data_iter = tf.compat.v1.data.make_one_shot_iterator(datasets)
            input_data, _ = data_iter.get_next()
            self.input = input_data[:1]

            graph = self.trainer.graph
            _graph_name_list = [n.name for n in graph.as_graph_def().node]
            if len(_graph_name_list) < 2:
                graph = _fetch_tf_graph(self.trainer.model, self.input)

            self.summary.add_graph(graph=graph, backend="tf")
        elif zeus.is_torch_backend():
            model = self.trainer.model
            data_iter = iter(self.trainer.train_loader)
            input_batch, _ = next(data_iter)

            input_data = input_batch[:1]
            if self.trainer.use_cuda and not self.trainer.config.is_detection_trainer:
                input_data = input_data.cuda()
            try:
                self.summary.add_graph(model=model,
                                       feed_data=input_data,
                                       backend="torch")
            except BaseException as err:
                logging.warning(
                    "Dump PyTorch model failed! with: \n{}".format(err))

        elif zeus.is_ms_backend():
            logging.debug("MindSpore model dump is not supported yet.")
        else:
            logging.warning("Unknown backend.")