Example #1
0
 def _simulate_tiny_pipeline(self, cfg_tiny):
     """Simulate tiny pipeline by using one sample one epoch.

     Runs each ``SearchPipeStep`` with ``epochs=1`` and a single trial to
     estimate per-epoch runtime (from the first step) and to collect every
     search algorithm's ``max_samples``.

     :param cfg_tiny: pipeline configuration to probe, keyed by step name.
     """
     import shutil

     report = ReportServer()
     for i, step_name in enumerate(PipelineConfig.steps):
         step_cfg = cfg_tiny.get(step_name)
         # Only search steps are simulated; other step kinds are skipped.
         if step_cfg.pipe_step.type != 'SearchPipeStep':
             continue
         step_cfg.trainer.distributed = False
         step_cfg.trainer.epochs = 1
         self.restrict_config.trials[step_name] = 1
         General.step_name = step_name
         PipeStepConfig.from_dict(step_cfg)
         pipestep = PipeStep()
         if i == 0:
             # Execute the first step once to measure single-epoch runtime,
             # then discard its workspace.
             pipestep.do()
             record = report.get_step_records(step_name)[-1]
             self.epoch_time = record.runtime
             _worker_path = TaskOps().local_base_path
             if os.path.exists(_worker_path):
                 # was os.system('rm -rf ...'): shutil is portable and
                 # avoids spawning a shell.
                 shutil.rmtree(_worker_path, ignore_errors=True)
         # Every step reaching this point is a SearchPipeStep (guarded by
         # the `continue` above), so no second type check is needed.
         self.params_dict[step_name][
             'max_samples'] = pipestep.generator.search_alg.max_samples
         _file = os.path.join(TaskOps().step_path, ".generator")
         if os.path.exists(_file):
             # was os.system('rm ...')
             os.remove(_file)
Example #2
0
def load_config(config_file):
    """Load config from file.

    Restores the environment variables saved by the master, selects the
    vega backend, then hydrates every global config singleton from the
    pickled payload.

    NOTE(review): ``pickle.load`` can execute arbitrary code — this file
    must only come from the trusted master process.
    """
    import os
    import pickle
    import vega

    with open(config_file, 'rb') as fp:
        payload = pickle.load(fp)
    # Re-apply the environment first: backend selection below reads it.
    for name, val in payload["env"].items():
        if val is not None:
            os.environ[name] = val

    vega.set_backend(os.environ['BACKEND_TYPE'].lower(), os.environ["DEVICE_CATEGORY"])

    # Presumably these imports must follow set_backend(); original order kept.
    from vega.common.class_factory import ClassFactory
    from vega.common.general import General
    from vega.datasets.conf.dataset import DatasetConfig
    from vega.networks.model_config import ModelConfig
    from vega.trainer.conf import TrainerConfig
    from vega.evaluator.conf import EvaluatorConfig
    from vega.core.pipeline.conf import PipeStepConfig

    ClassFactory.__registry__ = payload["class_factory"]
    # Hydrate each config singleton from its matching payload section.
    for section, conf_cls in (
        ("general", General),
        ("dataset", DatasetConfig),
        ("model", ModelConfig),
        ("trainer", TrainerConfig),
        ("evaluator", EvaluatorConfig),
        ("pipe_step", PipeStepConfig),
    ):
        conf_cls.from_dict(payload[section])
Example #3
0
 def _do_horovod_fully_train(self, trainer):
     """Launch a horovod training run for *trainer* and wait for it.

     Serializes the class registry, the current configs, and the trainer's
     model description to a pickle, then spawns the launcher script for
     the detected environment and blocks until it exits.

     :param trainer: trainer providing ``model_desc`` and ``worker_id``.
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))
     cf_file = os.path.join(self.task.temp_path, 'cf.pickle')
     payload = {
         'registry': ClassFactory.__registry__,
         'general_config': General().to_dict(),
         'pipe_step_config': PipeStepConfig().to_dict(),
         'model_desc': trainer.model_desc,
         'worker_id': trainer.worker_id
     }
     with open(cf_file, 'wb') as fp:
         pickle.dump(payload, fp)
     if os.environ.get('DLS_TASK_NUMBER') is not None:
         # Roma cloud environment: launcher lives at a fixed path.
         cmd = [
             'bash', '/home/work/run_horovod_train.sh',
             str(self.world_device_size), cf_file
         ]
     else:
         # Local cluster: default to loopback unless a real master IP is set.
         master_ip = General.cluster.master_ip
         if master_ip is None or master_ip == '127.0.0.1':
             worker_ips = '127.0.0.1'
         else:
             worker_ips = ','.join([master_ip] + list(General.cluster.slaves))
         cmd = [
             'bash', '{}/horovod/run_horovod_train.sh'.format(script_dir),
             str(self.world_device_size), cf_file, worker_ips
         ]
     proc = subprocess.Popen(cmd, env=os.environ)
     proc.wait()
Example #4
0
 def _do_horovod_fully_train(self):
     """Kick off a horovod fully-train job and wait for completion.

     Dumps the class registry and configs (JSON form) to a pickle next to
     this module, mirrors it into the task workspace for remote workers,
     then invokes the launcher script for the detected environment.
     """
     script_dir = os.path.dirname(os.path.abspath(__file__))
     local_cf = os.path.join(script_dir, 'cf.pickle')
     payload = {
         'registry': ClassFactory.__registry__,
         'general_config': General().to_json(),
         'pipe_step_config': PipeStepConfig().to_json()
     }
     with open(local_cf, 'wb') as fp:
         pickle.dump(payload, fp)
     cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
     # Stage a copy in the shared task workspace for remote workers.
     FileOps.copy_file(local_cf, cf_file_remote)
     if os.environ.get('DLS_TASK_NUMBER') is not None:
         # Roma cloud environment.
         cmd = [
             'bash', '{}/horovod/run_horovod_train.sh'.format(script_dir),
             str(self.world_device_size), cf_file_remote
         ]
     else:
         # Local cluster: default to loopback unless a real master IP is set.
         master_ip = General.cluster.master_ip
         if master_ip is None or master_ip == '127.0.0.1':
             worker_ips = '127.0.0.1'
         else:
             worker_ips = ','.join([master_ip] + list(General.cluster.slaves))
         cmd = [
             'bash',
             '{}/horovod/run_cluster_horovod_train.sh'.format(script_dir),
             str(self.world_device_size), cf_file_remote, worker_ips
         ]
     proc = subprocess.Popen(cmd, env=os.environ)
     proc.wait()
Example #5
0
 def _train_multi_task(self):
     """Train every configured task once per multi-task epoch.

     For each epoch, walks the configured task names, pulls that task's
     model description out of the pipe-step config, and trains it as a
     single model with the remaining settings passed as hyperparameters.
     """
     from copy import deepcopy
     n_epochs = PipeStepConfig.pipe_step.multi_task_epochs
     for _ in range(n_epochs):
         for task_name in PipeStepConfig.pipe_step.tasks:
             # Work on a copy so the shared config is never mutated.
             task_desc = deepcopy(PipeStepConfig().to_dict()[task_name])
             model_desc = task_desc.model.model_desc
             task_desc.pop('model')
             self._train_single_model(
                 model_desc=model_desc, model_id=0, hps=task_desc, multi_task=task_name)
Example #6
0
def _get_worker_config(worker):
    """Save worker config.

    Snapshots the relevant environment variables, the class-factory
    registry, and every global config singleton into one plain dict that
    can be pickled and shipped to the worker process.

    :param worker: worker exposing ``config``, ``worker_nccl_port``,
        ``world_size`` and ``timeout``.
    :return: dict with the worker's full configuration.
    """
    from vega.common.class_factory import ClassFactory
    from vega.common.general import General
    from vega.datasets.conf.dataset import DatasetConfig
    from vega.networks.model_config import ModelConfig
    from vega.evaluator.conf import EvaluatorConfig
    from vega.core.pipeline.conf import PipeStepConfig

    # Environment variables forwarded to the worker; unset ones are
    # recorded as None. Note "NPU-VISIBLE-DEVICES" (dashes) is deliberate.
    env_keys = (
        "LOCAL_RANK",
        "PYTHONPATH",
        "LD_LIBRARY_PATH",
        "PWD",
        "DLS_JOB_ID",
        "RANK_TABLE_FILE",
        "RANK_SIZE",
        "DEVICE_ID",
        "RANK_ID",
        "DLS_TASK_NUMBER",
        "NPU-VISIBLE-DEVICES",
        "NPU_VISIBLE_DEVICES",
        "PATH",
        "ASCEND_OPP_PATH",
        "DEVICE_CATEGORY",
        "BACKEND_TYPE",
    )
    env = {key: os.environ.get(key, None) for key in env_keys}
    return {
        "class_factory": deepcopy(ClassFactory.__registry__),
        "general": General().to_dict(),
        "dataset": DatasetConfig().to_dict(),
        "model": ModelConfig().to_dict(),
        "trainer": worker.config.to_dict(),
        "evaluator": EvaluatorConfig().to_dict(),
        "worker_nccl_port": worker.worker_nccl_port,
        "world_size": worker.world_size,
        "timeout": worker.timeout,
        "env": env,
        "pipe_step": PipeStepConfig().to_dict(),
    }
Example #7
0
import horovod.torch as hvd
from zeus.common import ClassFactory
from zeus.common.general import General
from vega.core.pipeline.conf import PipeStepConfig

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()

if 'VEGA_INIT_ENV' in os.environ:
    # NOTE(review): exec of an env var runs arbitrary code; the variable is
    # set by the launching master, so it is assumed trusted.
    # was: os.environ.copy()['VEGA_INIT_ENV'] — copying the entire
    # environment to read a single key is wasteful; index it directly.
    exec(os.environ['VEGA_INIT_ENV'])
logging.info('start horovod setting')
hvd.init()
try:
    # moxing is only available on cloud (ModelArts) nodes; best-effort.
    import moxing as mox
    mox.file.set_auth(obs_client_log=False)
except Exception:
    pass
hvd.join()
# NOTE(review): pickle.load can execute arbitrary code; cf_file must come
# from the trusted master process only.
with open(args.cf_file, 'rb') as f:
    cf_content = pickle.load(f)
model_desc = cf_content.get('model_desc')
worker_id = cf_content.get('worker_id')
ClassFactory.__registry__ = cf_content.get('registry')
General.from_dict(cf_content.get('general_config'))
PipeStepConfig.from_dict(cf_content.get('pipe_step_config'))
cls_trainer = ClassFactory.get_cls('trainer')
trainer = cls_trainer(model_desc=model_desc, id=worker_id)
trainer.train_process()
Example #8
0
import logging
import horovod.torch as hvd
from zeus.common import ClassFactory
from zeus.common.general import General
from zeus.common import FileOps
from vega.core.pipeline.conf import PipeStepConfig

parser = argparse.ArgumentParser(description='Horovod Fully Train')
parser.add_argument('--cf_file', type=str, help='ClassFactory pickle file')
args = parser.parse_args()

if 'VEGA_INIT_ENV' in os.environ:
    # NOTE(review): exec of an env var runs arbitrary code; assumed to be
    # set only by the trusted launcher.
    # was: os.environ.copy()['VEGA_INIT_ENV'] — no need to copy the whole
    # environment just to read one key.
    exec(os.environ['VEGA_INIT_ENV'])
logging.info('start horovod setting')
hvd.init()
try:
    # moxing is only available on cloud (ModelArts) nodes; best-effort.
    import moxing as mox
    mox.file.set_auth(obs_client_log=False)
except Exception:
    pass
# Stage the config pickle locally before synchronizing the ranks.
FileOps.copy_file(args.cf_file, './cf_file.pickle')
hvd.join()
# NOTE(review): pickle.load can execute arbitrary code; the file must come
# from the trusted master process only.
with open('./cf_file.pickle', 'rb') as f:
    cf_content = pickle.load(f)
ClassFactory.__registry__ = cf_content.get('registry')
General.from_json(cf_content.get('general_config'))
PipeStepConfig.from_json(cf_content.get('pipe_step_config'))
cls_trainer = ClassFactory.get_cls('trainer', "Trainer")
trainer = cls_trainer(None, 0)
trainer.train_process()