def __call__(self, optimizer=None, epochs=None, steps=None):
    """Call lr scheduler class."""
    params = obj2config(self.config).get("params", {})
    logging.debug("Call LrScheduler. name={}, params={}".format(
        self._cls.__name__, params))
    if self._cls.__name__ == "CosineAnnealingLR":
        # `by_epoch` only controls how T_max is derived; pop it so the flag
        # is not forwarded to the torch scheduler constructor.
        by_epoch = params.pop("by_epoch", True)
        if params.get("T_max", -1) == -1:
            # If T_max is not configured, derive it from the training length:
            # one annealing step per epoch, or per batch when by_epoch is False.
            params["T_max"] = epochs if by_epoch else epochs * steps
    if self._cls.__name__ == "WarmupScheduler":
        params["epochs"] = epochs
        params["steps"] = steps
    try:
        if params and optimizer:
            return self._cls(optimizer, **params)
        elif optimizer:
            return self._cls(optimizer)
        else:
            return self._cls(**params)
    except Exception as ex:
        logging.error("Failed to call LrScheduler name={}, params={}".format(
            self._cls.__name__, params))
        raise ex
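
# Minimal sketch of the T_max fallback above, using plain PyTorch; the model,
# optimizer, and epoch/step counts are illustrative assumptions, not Vega defaults.
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

_model = torch.nn.Linear(4, 2)
_opt = torch.optim.SGD(_model.parameters(), lr=0.1)
epochs, steps = 10, 100
# by_epoch=True  -> anneal once per epoch over `epochs` scheduler steps
_sched_epoch = CosineAnnealingLR(_opt, T_max=epochs)
# by_epoch=False -> anneal once per batch over `epochs * steps` scheduler steps
_sched_batch = CosineAnnealingLR(_opt, T_max=epochs * steps)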
def _do_horovod_fully_train(self):
    """Launch the horovod fully-train script with the pickled runtime state."""
    pwd_dir = os.path.dirname(os.path.abspath(__file__))
    cf_file = os.path.join(pwd_dir, 'cf.pickle')
    # Snapshot the registries and configs so spawned workers can rebuild
    # the same environment.
    cf_content = {'configs': ClassFactory.__configs__,
                  'registry': ClassFactory.__registry__,
                  'data': UserConfig().__data__,
                  'network_registry': NetworkFactory.__network_registry__,
                  'general': obj2config(General)}
    with open(cf_file, 'wb') as f:
        pickle.dump(cf_content, f)
    cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
    FileOps.copy_file(cf_file, cf_file_remote)
    if os.environ.get('DLS_TASK_NUMBER') is None:
        # local cluster
        worker_ips = '127.0.0.1'
        if General.cluster.master_ip is not None and General.cluster.master_ip != '127.0.0.1':
            worker_ips = General.cluster.master_ip
            for ip in General.cluster.slaves:
                worker_ips = worker_ips + ',' + ip
        cmd = ['bash', '{}/horovod/run_cluster_horovod_train.sh'.format(pwd_dir),
               str(self.world_device_size), cf_file_remote, worker_ips]
    else:
        # Roma
        cmd = ['bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
               str(self.world_device_size), cf_file_remote]
    proc = subprocess.Popen(cmd, env=os.environ)
    proc.wait()
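
# For context: each worker spawned by the script above would need to restore
# this state. A minimal sketch of the loading side (`_restore_worker_env` is a
# hypothetical helper; only the pickle keys mirror cf_content above):
import pickle

def _restore_worker_env(cf_file_remote):
    with open(cf_file_remote, 'rb') as f:
        cf_content = pickle.load(f)
    return (cf_content['configs'], cf_content['registry'],
            cf_content['data'], cf_content['network_registry'],
            cf_content['general'])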
def __call__(self):
    """Call loss cls."""
    params = obj2config(self.config).get("params", {})
    logging.debug("Call Loss. name={}, params={}".format(self._cls.__name__, params))
    try:
        # Loss classes are instantiated; plain loss functions are curried
        # with their params via functools.partial.
        if params:
            cls_obj = self._cls(**params) if isclass(self._cls) else partial(self._cls, **params)
        else:
            cls_obj = self._cls() if isclass(self._cls) else partial(self._cls)
        if vega.is_torch_backend() and TrainerConfig().cuda:
            cls_obj = cls_obj.cuda()
        return cls_obj
    except Exception as ex:
        logging.error("Failed to call Loss name={}, params={}".format(self._cls.__name__, params))
        raise ex
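
# The isclass/partial branch above handles both loss classes and bare loss
# functions. A standalone illustration with plain PyTorch; the chosen losses
# are examples, not Vega's defaults.
from functools import partial
from inspect import isclass
import torch.nn as nn
import torch.nn.functional as F

for loss_cls in (nn.CrossEntropyLoss, F.cross_entropy):
    loss_obj = loss_cls() if isclass(loss_cls) else partial(loss_cls)
    # nn.CrossEntropyLoss -> Module instance; F.cross_entropy -> partial function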
def __init__(self, **kwargs):
    """Construct the Cityscapes dataset."""
    super(Cityscapes, self).__init__(**kwargs)
    # Merge the mode-specific config with user-supplied args; args win.
    config = obj2config(getattr(self.config, self.mode))
    config.update(self.args)
    self.args = config
    self.root_dir = self.args['root_dir']
    self.image_size = self.args.Rescale.size
    self.list_file = self.args.list_file
    self.batch_size = self.args.get('batch_size', 1)
    self.num_parallel_batches = self.args.get('num_parallel_batches', 1)
    self.drop_remainder = self.args.get('drop_remainder', False)
    self.transforms = self._init_transforms()
    self.root_dir = FileOps.download_dataset(self.root_dir)
    self._init_data_files()
def __init__(self, hps=None, mode='train', **kwargs):
    """Construct method."""
    super(Dataset, self).__init__()
    if not hasattr(self, 'config'):
        raise ValueError("Dataset class should have attr config.")
    self.mode = mode
    # Fall back to the val split when no test config is defined.
    if self.mode == "test" and not hasattr(self.config, "test"):
        self.mode = "val"
    self.args = deepcopy(obj2config(getattr(self.config, self.mode)))
    self._init_hps(hps)
    self.train = self.mode in ["train", "val"]
    self.num_images = self.args.get('num_images', 0)
    self.batch_size = self.args.batch_size
    self.world_size = 1
    self.rank = 0
def __init__(self, search_space=None, **kwargs):
    """Init SearchAlgorithm."""
    super(SearchAlgorithm, self).__init__()
    # modify config by kwargs, using local scope
    if self.config and kwargs:
        self.config = self.config()
        load_conf_from_desc(self.config, kwargs)
    self.search_space = search_space
    if hasattr(self.config, 'codec'):
        self.codec = Codec(search_space, type=self.config.codec)
    else:
        self.codec = None
    logging.debug("Config=%s", obj2config(self.config))
    self.report = Report()
    self.record = ReportRecord()
    self.record.step_name = self.step_name
def __init__(self, metric_cfg=None):
    """Init Metrics."""
    self.mdict = {}
    metric_config = obj2config(self.config)
    if not isinstance(metric_config, list):
        metric_config = [metric_config]
    for metric_item in metric_config:
        metric_name = metric_item.pop('type')
        metric_class = ClassFactory.get_cls(ClassType.METRIC, metric_name)
        if isfunction(metric_class):
            # Plain metric functions are curried with their params.
            metric_class = partial(metric_class, **metric_item.get("params", {}))
        else:
            metric_class = metric_class(**metric_item.get("params", {}))
        self.mdict[metric_name] = metric_class
    self.mdict = Config(self.mdict)
    self.metric_results = dict()
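
# Standalone illustration of the isfunction/partial currying above; the metric
# function here is a toy stand-in for an entry in Vega's METRIC registry.
from functools import partial
from inspect import isfunction

def accuracy(preds, labels, top_k=1):
    ranked = [sorted(range(len(p)), key=p.__getitem__, reverse=True)[:top_k] for p in preds]
    return sum(l in r for r, l in zip(ranked, labels)) / len(labels)

item = {'type': 'accuracy', 'params': {'top_k': 5}}
metric = partial(accuracy, **item['params']) if isfunction(accuracy) else accuracy(**item['params'])
assert abs(metric([[0.1, 0.9], [0.8, 0.2]], [1, 0]) - 1.0) < 1e-9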
def __call__(self, model=None, lr_scheduler=None, epoch=None, distributed=False):
    """Call Optimizer class.

    :param model: model, used in torch case
    :param lr_scheduler: learning rate scheduler, used in tf case
    :param epoch: epoch of training, used in tf case
    :param distributed: use distributed
    :return: optimizer
    """
    params = obj2config(self.config).get("params", {})
    logging.debug("Call Optimizer. name={}, params={}".format(
        self.optim_cls.__name__, params))
    optimizer = None
    try:
        if vega.is_torch_backend():
            learnable_params = [param for param in model.parameters() if param.requires_grad]
            optimizer = self.optim_cls(learnable_params, **params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer,
                                                     named_parameters=model.named_parameters(),
                                                     compression=hvd.Compression.none)
        elif vega.is_tf_backend():
            lr_scheduler.step(epoch)
            params['learning_rate'] = lr_scheduler.get_lr()[0]
            optimizer = self.optim_cls(**params)
            if distributed:
                optimizer = hvd.DistributedOptimizer(optimizer) if vega.is_gpu_device() else \
                    NPUDistributedOptimizer(optimizer)
        return optimizer
    except Exception as ex:
        logging.error("Failed to call Optimizer name={}, params={}".format(
            self.optim_cls.__name__, params))
        raise ex
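
# The requires_grad filter above, shown in isolation with plain PyTorch;
# the model and hyperparameters are illustrative.
import torch.nn as nn
import torch.optim

model = nn.Linear(4, 2)
model.bias.requires_grad_(False)  # a frozen parameter is excluded below
learnable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(learnable_params, lr=0.01, momentum=0.9)
assert len(learnable_params) == 1  # only the weight remains trainable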
def __init__(self, hps=None, mode='train', **kwargs):
    """Construct method."""
    super(Dataset, self).__init__()
    self.args = dict()
    self.mode = mode
    if mode == "val" and not hasattr(self.config, "val"):
        self.mode = "test"
    # modify config from kwargs, `Cifar10(mode='test', data_path='/cache/datasets')`
    if kwargs:
        self.args = Config(kwargs)
    if hasattr(self, 'config'):
        config = obj2config(getattr(self.config, self.mode))
        config.update(self.args)
        self.args = config
    self._init_hps(hps)
    self.train = self.mode in ["train", "val"]
    transforms_list = self._init_transforms()
    self._transforms = Transforms(transforms_list)
    if "transforms" in kwargs.keys():
        self._transforms.__transform__ = kwargs["transforms"]
    self.dataset_init()
    self.sampler = self._init_sampler()
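
# The update order above gives kwargs precedence over the static mode config;
# the same pattern with plain dicts (values are illustrative):
config = {'data_path': '/datasets/cifar10', 'batch_size': 256}
kwargs = {'data_path': '/cache/datasets'}
config.update(kwargs)
assert config == {'data_path': '/cache/datasets', 'batch_size': 256}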
def __init__(self, args=None):
    """Init DistributedWorker."""
    super(DistributedWorker, self).__init__()
    # privates
    DistributedWorker.__worker_id__ = DistributedWorker.__worker_id__ + 1
    self._worker_id = DistributedWorker.__worker_id__
    # publics
    self.rank = 0
    self.world_size = 1
    self.worker_addr = ""
    self.worker_nccl_port = 16666
    # timeout is configured in hours; convert to seconds
    self.timeout = int(float(General.worker.timeout) * 60 * 60)
    self.__env_config__ = (copy.deepcopy(UserConfig().data),
                           copy.deepcopy(ClassFactory.__configs__),
                           copy.deepcopy(ClassFactory.__registry__))
    self.__network_config__ = copy.deepcopy(NetworkFactory.__network_registry__)
    self.__general__ = obj2config(General)
    self.__worker_device_folder__ = os.path.join(self.temp_path, '.worker_device')
    os.makedirs(self.__worker_device_folder__, exist_ok=True)
def build(self, model=None, optimizer=None, loss=None,
          lr_scheduler=None, metrics=None, hps=None, callbacks=None,
          train_loader=None, valid_loader=None, make_batch=None,
          train_step=None, valid_step=None, model_fn=None,
          train_input_fn=None, valid_input_fn=None,
          load_ckpt_flag=False,
          checkpoint_file_name="checkpoint.pth",
          model_pickle_file_name="model.pkl"):
    """Build the trainer by assembling the necessary components."""
    # Initialize hyperparameters by parameters or configurations
    self._init_hps(hps)
    logging.debug("Trainer Config: {}".format(obj2config(self.config)))
    self.checkpoint_file_name = checkpoint_file_name
    self.model_pickle_file_name = model_pickle_file_name
    if vega.is_torch_backend():
        self._init_step_functions(make_batch, train_step, valid_step)
    elif vega.is_tf_backend():
        self._init_estimator_fn(model_fn, train_input_fn, valid_input_fn)
        self._init_tf_session()
    self._init_distributed_setting()
    self._init_cuda_setting()
    self._init_tf_estimator()
    self.do_validation = self.config.with_valid
    self.model = self._init_model(model)
    self.load_ckpt_flag = load_ckpt_flag
    if self.load_ckpt_flag:
        self.load_checkpoint()
    else:
        self._load_pretrained_model()
    self.use_syncbn = self.config.syncbn
    if self.use_syncbn and vega.is_torch_backend():
        self.model = apex.parallel.convert_syncbn_model(self.model)
    self.train_loader = self._init_dataloader(mode='train', loader=train_loader)
    self.valid_loader = self._init_dataloader(mode='val', loader=valid_loader)
    if vega.is_torch_backend():
        self.optimizer = Optimizer()(model=self.model, distributed=self.distributed) \
            if optimizer is None else optimizer
        self.loss = Loss()() if loss is None else loss
        self.lr_scheduler = LrScheduler()(self.optimizer) if lr_scheduler is None else lr_scheduler
    # Some trainers use a different train batch size from the valid batch size
    self.train_metrics = self._init_metrics(metrics) if vega.is_torch_backend() else None
    self.valid_metrics = self._init_metrics(metrics)
    self._init_horovod_setting()
    if self.use_amp and vega.is_torch_backend():
        self.model, self.optimizer = amp.initialize(self.model, self.optimizer,
                                                    opt_level='O1')
    if self.callbacks is None:
        self.callbacks = callbacks
    # self.output_model_desc()
    cur_working_dir = FileOps.join_path(self.local_output_path, self.step_name)
    FileOps.make_dir(cur_working_dir)
    # Make sure Trainer has been built for training
    self.has_built = True
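
# build() follows a "None means use the config-driven factory" convention for
# every component. The core pattern reduced to a runnable sketch (names here
# are illustrative stand-ins for Optimizer()/Loss()/LrScheduler()):
def build_component(explicit, factory):
    return factory() if explicit is None else explicit

optimizer = build_component(None, lambda: "optimizer-from-config")
loss = build_component("user-supplied-loss", lambda: "loss-from-config")
assert optimizer == "optimizer-from-config" and loss == "user-supplied-loss"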