def _simulate_tiny_pipeline(self, cfg_tiny):
    """Simulate tiny pipeline by using one sample one epoch.

    Shrinks every search step of the pipeline to a minimal run (one epoch,
    no distribution, one trial), executes only the first step to measure
    per-epoch runtime, records each search algorithm's ``max_samples``,
    and removes the scratch artifacts produced along the way.
    """
    import shutil
    report = ReportServer()
    for i, step_name in enumerate(PipelineConfig.steps):
        step_cfg = cfg_tiny.get(step_name)
        if step_cfg is None:
            # Step listed in the pipeline but missing from the tiny config.
            continue
        # Only search steps are simulated.
        if step_cfg.pipe_step.type != 'SearchPipeStep':
            continue
        # Shrink the step so it costs roughly one training epoch.
        step_cfg.trainer.distributed = False
        step_cfg.trainer.epochs = 1
        self.restrict_config.trials[step_name] = 1
        General.step_name = step_name
        PipeStepConfig.from_dict(step_cfg)
        pipestep = PipeStep()
        if i == 0:
            # Execute only the first step and time it; its single epoch
            # stands in for the per-epoch cost of the whole run.
            pipestep.do()
            record = report.get_step_records(step_name)[-1]
            self.epoch_time = record.runtime
            _worker_path = TaskOps().local_base_path
            if os.path.exists(_worker_path):
                # shutil.rmtree instead of a formatted shell command:
                # safe for paths containing spaces or shell metacharacters.
                shutil.rmtree(_worker_path, ignore_errors=True)
        # The guard above already skipped non-search steps, so the original
        # re-check of pipe_step.type here was always true and is dropped.
        self.params_dict[step_name]['max_samples'] = pipestep.generator.search_alg.max_samples
        _file = os.path.join(TaskOps().step_path, ".generator")
        if os.path.exists(_file):
            os.remove(_file)
def load_config(config_file):
    """Load config from file.

    Restores the pickled worker configuration: re-exports the recorded
    environment variables, selects the vega backend, then pushes each
    config section into its corresponding config class.
    """
    import os
    import pickle
    import vega
    # NOTE(review): pickle.load is only safe here because the file is
    # produced by the master process of the same job — never feed it
    # untrusted data.
    with open(config_file, 'rb') as f:
        config = pickle.load(f)
    # Re-export only the env entries that were actually set on the master.
    for key, value in config["env"].items():
        if value is not None:
            os.environ[key] = value
    # Backend must be selected before importing the vega sub-modules below.
    vega.set_backend(os.environ['BACKEND_TYPE'].lower(), os.environ["DEVICE_CATEGORY"])
    from vega.common.class_factory import ClassFactory
    from vega.common.general import General
    from vega.datasets.conf.dataset import DatasetConfig
    from vega.networks.model_config import ModelConfig
    from vega.trainer.conf import TrainerConfig
    from vega.evaluator.conf import EvaluatorConfig
    from vega.core.pipeline.conf import PipeStepConfig
    ClassFactory.__registry__ = config["class_factory"]
    # Restore each config section into its matching config class.
    section_map = (
        (General, "general"),
        (DatasetConfig, "dataset"),
        (ModelConfig, "model"),
        (TrainerConfig, "trainer"),
        (EvaluatorConfig, "evaluator"),
        (PipeStepConfig, "pipe_step"),
    )
    for conf_cls, section in section_map:
        conf_cls.from_dict(config[section])
def _do_horovod_fully_train(self, trainer):
    """Run a fully-train job for *trainer* under horovod.

    Dumps the class registry, configs and the trainer's model description
    to a pickle consumed by the horovod worker script, then launches the
    matching shell launcher and blocks until it exits.
    """
    pwd_dir = os.path.dirname(os.path.abspath(__file__))
    cf_file = os.path.join(self.task.temp_path, 'cf.pickle')
    with open(cf_file, 'wb') as f:
        pickle.dump({
            'registry': ClassFactory.__registry__,
            'general_config': General().to_dict(),
            'pipe_step_config': PipeStepConfig().to_dict(),
            'model_desc': trainer.model_desc,
            'worker_id': trainer.worker_id,
        }, f)
    if os.environ.get('DLS_TASK_NUMBER') is None:
        # Local cluster: comma-separated ip list, master ip first.
        master_ip = General.cluster.master_ip
        if master_ip is not None and master_ip != '127.0.0.1':
            worker_ips = ','.join([master_ip] + [ip for ip in General.cluster.slaves])
        else:
            worker_ips = '127.0.0.1'
        cmd = [
            'bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
            str(self.world_device_size), cf_file, worker_ips
        ]
    else:
        # Roma: launcher script lives at a fixed path in the image.
        cmd = [
            'bash', '/home/work/run_horovod_train.sh',
            str(self.world_device_size), cf_file
        ]
    proc = subprocess.Popen(cmd, env=os.environ)
    # NOTE(review): the launcher's exit status is not checked — presumably
    # results are validated downstream; confirm.
    proc.wait()
def _do_horovod_fully_train(self):
    """Launch horovod fully-train through the cluster/Roma launcher scripts.

    Pickles the class registry plus the general and pipe-step configs,
    stages the file where the launcher can read it, spawns the shell
    launcher and blocks until it exits.
    """
    pwd_dir = os.path.dirname(os.path.abspath(__file__))
    # Stage the pickle under the task temp dir instead of the package
    # source directory: the install location may be read-only and is
    # shared between concurrent jobs (matches the sibling horovod
    # launcher, which already stages in self.task.temp_path).
    cf_file = os.path.join(self.task.temp_path, 'cf.pickle')
    cf_content = {
        'registry': ClassFactory.__registry__,
        'general_config': General().to_json(),
        'pipe_step_config': PipeStepConfig().to_json()
    }
    with open(cf_file, 'wb') as f:
        pickle.dump(cf_content, f)
    cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
    FileOps.copy_file(cf_file, cf_file_remote)
    if os.environ.get('DLS_TASK_NUMBER') is None:
        # Local cluster: pass a comma-separated ip list, master ip first.
        worker_ips = '127.0.0.1'
        if General.cluster.master_ip is not None and General.cluster.master_ip != '127.0.0.1':
            worker_ips = General.cluster.master_ip
            for ip in General.cluster.slaves:
                worker_ips = worker_ips + ',' + ip
        cmd = [
            'bash', '{}/horovod/run_cluster_horovod_train.sh'.format(pwd_dir),
            str(self.world_device_size), cf_file_remote, worker_ips
        ]
    else:
        # Roma environment: single-host launcher script.
        cmd = [
            'bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
            str(self.world_device_size), cf_file_remote
        ]
    proc = subprocess.Popen(cmd, env=os.environ)
    proc.wait()
    # NOTE(review): the launcher's exit status is ignored; a failed
    # training run is not surfaced here — confirm callers check results.
def _train_multi_task(self):
    """Train each configured task's model once per multi-task epoch.

    Iterates ``multi_task_epochs`` times over the pipe-step task list,
    pulling each task's model description out of its config and handing
    the remainder to ``_train_single_model`` as hyper-parameters.
    """
    from copy import deepcopy
    for _ in range(PipeStepConfig.pipe_step.multi_task_epochs):
        for alg in PipeStepConfig.pipe_step.tasks:
            # Work on a private copy so popping 'model' below does not
            # mutate the shared pipe-step configuration.
            task_desc = deepcopy(PipeStepConfig().to_dict()[alg])
            model_desc = task_desc.model.model_desc
            task_desc.pop('model')
            self._train_single_model(
                model_desc=model_desc, model_id=0, hps=task_desc, multi_task=alg)
def _get_worker_config(worker):
    """Save worker config.

    Snapshots everything a detached worker process needs to rebuild its
    runtime state: the class-factory registry, each config section, a
    selection of environment variables, and the worker's own settings.
    """
    from vega.common.class_factory import ClassFactory
    from vega.common.general import General
    from vega.datasets.conf.dataset import DatasetConfig
    from vega.networks.model_config import ModelConfig
    from vega.evaluator.conf import EvaluatorConfig
    from vega.core.pipeline.conf import PipeStepConfig
    # Environment variables forwarded verbatim to the spawned worker;
    # absent ones are recorded as None so the worker can skip them.
    env_keys = (
        "LOCAL_RANK", "PYTHONPATH", "LD_LIBRARY_PATH", "PWD", "DLS_JOB_ID",
        "RANK_TABLE_FILE", "RANK_SIZE", "DEVICE_ID", "RANK_ID",
        "DLS_TASK_NUMBER", "NPU-VISIBLE-DEVICES", "NPU_VISIBLE_DEVICES",
        "PATH", "ASCEND_OPP_PATH", "DEVICE_CATEGORY", "BACKEND_TYPE",
    )
    env = {key: os.environ.get(key, None) for key in env_keys}
    return {
        "class_factory": deepcopy(ClassFactory.__registry__),
        "general": General().to_dict(),
        "dataset": DatasetConfig().to_dict(),
        "model": ModelConfig().to_dict(),
        "trainer": worker.config.to_dict(),
        "evaluator": EvaluatorConfig().to_dict(),
        "worker_nccl_port": worker.worker_nccl_port,
        "world_size": worker.world_size,
        "timeout": worker.timeout,
        "env": env,
        "pipe_step": PipeStepConfig().to_dict(),
    }
# Horovod worker entry: restores the pickled master config and runs one
# trainer's train_process under an initialized horovod context.
# (argparse/os/logging/pickle are imported earlier in this file.)
import horovod.torch as hvd
from zeus.common import ClassFactory
from zeus.common.general import General
from vega.core.pipeline.conf import PipeStepConfig

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()
# Run deployment-provided bootstrap code, if any, before horovod setup.
if 'VEGA_INIT_ENV' in os.environ:
    exec(os.environ.copy()['VEGA_INIT_ENV'])
logging.info('start horovod setting')
hvd.init()
# Best-effort: moxing (OBS access) only exists on cloud deployments.
try:
    import moxing as mox
    mox.file.set_auth(obs_client_log=False)
except Exception:
    pass
# Synchronize all ranks before reading the shared config file.
hvd.join()
# NOTE(review): pickle file is produced by the master of this job — do
# not point --cf_file at untrusted data.
with open(args.cf_file, 'rb') as f:
    cf_content = pickle.load(f)
model_desc = cf_content.get('model_desc')
worker_id = cf_content.get('worker_id')
# Rebuild the master's registry and config state in this process.
ClassFactory.__registry__ = cf_content.get('registry')
General.from_dict(cf_content.get('general_config'))
PipeStepConfig.from_dict(cf_content.get('pipe_step_config'))
cls_trainer = ClassFactory.get_cls('trainer')
# for record in records:
trainer = cls_trainer(model_desc=model_desc, id=worker_id)
trainer.train_process()
# Horovod cluster-worker entry: copies the pickled master config to the
# local working directory, restores it, and runs the trainer.
# (argparse/os/pickle are imported earlier in this file.)
import logging
import horovod.torch as hvd
from zeus.common import ClassFactory
from zeus.common.general import General
from zeus.common import FileOps
from vega.core.pipeline.conf import PipeStepConfig

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()
# Run deployment-provided bootstrap code, if any, before horovod setup.
if 'VEGA_INIT_ENV' in os.environ:
    exec(os.environ.copy()['VEGA_INIT_ENV'])
logging.info('start horovod setting')
hvd.init()
# Best-effort: moxing (OBS access) only exists on cloud deployments.
try:
    import moxing as mox
    mox.file.set_auth(obs_client_log=False)
except Exception:
    pass
# Stage the config locally (the source path may be a remote/shared one).
FileOps.copy_file(args.cf_file, './cf_file.pickle')
# Synchronize all ranks before loading the config.
hvd.join()
# NOTE(review): pickle file is produced by the master of this job — do
# not point --cf_file at untrusted data.
with open('./cf_file.pickle', 'rb') as f:
    cf_content = pickle.load(f)
# Rebuild the master's registry and config state in this process.
ClassFactory.__registry__ = cf_content.get('registry')
General.from_json(cf_content.get('general_config'))
PipeStepConfig.from_json(cf_content.get('pipe_step_config'))
cls_trainer = ClassFactory.get_cls('trainer', "Trainer")
trainer = cls_trainer(None, 0)
trainer.train_process()