def shutdown():
    """Shut down the distributed cluster."""
    mode = UserConfig().data.general.cluster_mode
    gpus = str(UserConfig().data.general.worker.gpus_per_job)
    if mode == ClusterMode.Single and gpus == "-1":
        return
    try:
        logging.info("Try to shutdown cluster.")
        from vega.core.trainer.utils import get_write_ip_master_local
        from distributed import Client
        ip, port = get_write_ip_master_local()
        if ip is None or port is None:
            logging.info("Stand-alone mode, no need to shut down the cluster.")
            return
        shutdown_client = Client("{}:{}".format(ip, port))
        logging.info("Cluster will be shut down.")
        shutdown_client.shutdown()
        shutdown_client.close()
        del shutdown_client
        logging.info("Cluster is shut down.")
    except Exception as e:
        logging.error("Pipeline's cluster shutdown error: {}".format(str(e)))
        logging.error(traceback.format_exc())

def _do_horovod_fully_train(self):
    """Call the horovod bash script, which loads the pickle files saved by vega."""
    pwd_dir = os.path.dirname(os.path.abspath(__file__))
    cf_file = os.path.join(pwd_dir, 'cf.pickle')
    cf_content = {'configs': ClassFactory.__configs__,
                  'registry': ClassFactory.__registry__,
                  'data': UserConfig().__data__}
    with open(cf_file, 'wb') as f:
        pickle.dump(cf_content, f)
    cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
    FileOps.copy_file(cf_file, cf_file_remote)
    if os.environ.get('DLS_TASK_NUMBER') is None:
        # local cluster
        worker_ips = '127.0.0.1'
        if UserConfig().data.general.cluster.master_ip is not None and \
                UserConfig().data.general.cluster.master_ip != '127.0.0.1':
            worker_ips = UserConfig().data.general.cluster.master_ip
            for ip in UserConfig().data.general.cluster.slaves:
                worker_ips = worker_ips + ',' + ip
        cmd = ['bash', '{}/horovod/run_cluster_horovod_train.sh'.format(pwd_dir),
               str(self.world_device_size), cf_file_remote, worker_ips]
    else:
        # Roma
        cmd = ['bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
               str(self.world_device_size), cf_file_remote]
    proc = subprocess.Popen(cmd, env=os.environ)
    proc.wait()

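# Illustrative sketch, not part of the original source: the horovod launch scripts
# invoked above are expected to restore the pickled ClassFactory/UserConfig state
# on each worker before training starts. The helper name below is an assumption;
# only the 'configs'/'registry'/'data' keys mirror the cf_content dict written above.
#
#     import pickle
#
#     def _restore_vega_state(cf_file):
#         """Hypothetical helper: reload the state dumped by _do_horovod_fully_train."""
#         with open(cf_file, 'rb') as f:
#             cf_content = pickle.load(f)
#         ClassFactory.__configs__ = cf_content['configs']
#         ClassFactory.__registry__ = cf_content['registry']
#         UserConfig().__data__ = cf_content['data']
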
def __new__(cls):
    """Return a LocalMaster instance when running locally, otherwise a Master instance."""
    mode = UserConfig().data.general.cluster_mode
    gpus = str(UserConfig().data.general.worker.gpus_per_job)
    if mode == ClusterMode.Single and gpus == "-1":
        return LocalMaster()
    else:
        return object.__new__(cls)

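# Illustrative usage sketch (assumption, mirroring how FullyTrainPipeStep.do() below
# uses the class): because of the __new__ factory above, callers simply construct
# Master() and transparently receive a LocalMaster in stand-alone mode
# (cluster_mode == Single and gpus_per_job == -1) or a dask-backed Master otherwise.
#
#     master = Master()      # LocalMaster or distributed Master, chosen by config
#     ...                    # dispatch trainer workers through the master
#     master.join()
#     master.close_client()
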
def save_results(self):
    """Save the evolution results, including the population and elitism information."""
    step_name = Config(deepcopy(UserConfig().data)).general.step_name
    _path = FileOps.join_path(self.local_output_path, step_name)
    FileOps.make_dir(_path)
    arch_file = FileOps.join_path(_path, 'arch.txt')
    arch_child = FileOps.join_path(_path, 'arch_child.txt')
    sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
    sel_arch = []
    with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
        writer_a = csv.writer(fw_a, lineterminator='\n')
        writer_ac = csv.writer(fw_ac, lineterminator='\n')
        writer_ac.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.individual_num):
            writer_ac.writerow(self._log_data(net_info_type='active_only',
                                              pop=self.pop[c],
                                              value=self.pop[c].fitness))
        writer_a.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
        for c in range(self.elitism_num):
            writer_a.writerow(self._log_data(net_info_type='active_only',
                                             pop=self.elitism[c],
                                             value=self.elit_fitness[c]))
            sel_arch.append(self.elitism[c].gene)
    sel_arch = np.stack(sel_arch)
    np.save(sel_arch_file, sel_arch)
    if self.backup_base_path is not None:
        FileOps.copy_folder(self.local_output_path, self.backup_base_path)

@property
def world_device_size(self):
    """World device size is world size * device count in each world."""
    import torch
    world_size = UserConfig().data.env.world_size
    device_nums = torch.cuda.device_count()
    num_devices = world_size * device_nums
    return num_devices

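# Worked example (comment only): with UserConfig().data.env.world_size == 2 nodes
# and torch.cuda.device_count() == 8 GPUs on each node, world_device_size is
# 2 * 8 = 16, which is the process count passed to the horovod launch scripts in
# _do_horovod_fully_train().
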
def __init__(self):
    """Init master attrs, setup and start dask distributed cluster and local multiprocess pool."""
    self.cfg = copy.deepcopy(UserConfig().data.general)
    self.task_count = 0
    self.eval_count = self.cfg.worker.eval_count
    self.dask_env = DaskEnv(UserConfig().data.env,
                            self.__master_path__,
                            self.cfg.worker.gpus_per_job,
                            TaskOps(self.cfg).temp_path)
    status = self.dask_env.start()
    if not status or not self.dask_env.is_master:
        sys.exit(0)
    self._start_cluster()
    self._start_evaluator_multiprocess()
    self.t_queue = Queue()
    # Holds GPU and Dloop evaluator results.
    self.e_queue = utils.PairDictQueue()
    return

def _save_model_desc_file(self, id, desc):
    """Save the model description of worker `id` to a json file under the nas output directory."""
    output_path = TaskOps(UserConfig().data.general).local_output_path
    desc_file = os.path.join(output_path, "nas", "model_desc_{}.json".format(id))
    FileOps.make_base_dir(desc_file)
    output = {}
    for key in desc:
        if key in ["type", "modules", "custom"]:
            output[key] = desc[key]
    with open(desc_file, "w") as f:
        json.dump(output, f)

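# Illustrative sketch of the whitelisting above (example values are assumptions):
# only the "type", "modules" and "custom" keys survive into the saved json, e.g.
#
#     desc = {"type": "Network", "modules": ["backbone"], "loss": "CrossEntropy"}
#     # -> <local_output_path>/nas/model_desc_7.json contains
#     #    {"type": "Network", "modules": ["backbone"]}
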
def do(self):
    """Start to run benchmark evaluator."""
    logger.info("BenchmarkPipeStep started...")
    cfg = Config(deepcopy(UserConfig().data))
    step_name = cfg.general.step_name
    pipe_step_cfg = cfg[step_name].pipe_step
    if "esr_models_file" in pipe_step_cfg and pipe_step_cfg.esr_models_file is not None:
        # TODO: ESR model
        self._evaluate_esr_models(pipe_step_cfg.esr_models_file, pipe_step_cfg.models_folder)
    elif "models_folder" in pipe_step_cfg and pipe_step_cfg.models_folder is not None:
        self._evaluate_multi_models(pipe_step_cfg.models_folder)
    else:
        self._evaluate_single_model()
    self._backup_output_path()
    logger.info("Complete model evaluation.")

def __init__(self, args=None):
    """Init DistributedWorker."""
    super(DistributedWorker, self).__init__()
    # privates
    DistributedWorker.__worker_id__ = DistributedWorker.__worker_id__ + 1
    self._worker_id = DistributedWorker.__worker_id__
    # publics
    self.rank = 0
    self.world_size = 1
    self.worker_addr = ""
    self.worker_nccl_port = 16666
    self.timeout = int(float(General.worker.timeout) * 60 * 60)
    self.__env_config__ = (copy.deepcopy(UserConfig().data),
                           copy.deepcopy(ClassFactory.__configs__),
                           copy.deepcopy(ClassFactory.__registry__))
    self.__network_config__ = copy.deepcopy(NetworkFactory.__network_registry__)
    self.__general__ = obj2config(General)
    self.__worker_device_folder__ = os.path.join(self.temp_path, '.worker_device')
    if not os.path.exists(self.__worker_device_folder__):
        os.makedirs(self.__worker_device_folder__, exist_ok=True)
    return

def do(self):
    """Start to run fully train with horovod or local trainer."""
    logger.info("FullyTrainPipeStep started...")
    cls_trainer = ClassFactory.get_cls('trainer')
    trainer_cfg = ClassFactory.__configs__.get('trainer')
    setattr(trainer_cfg, 'save_best_model', True)
    if cls_trainer.cfg.get('horovod', False):
        self._do_horovod_fully_train()
    else:
        cfg = Config(deepcopy(UserConfig().data))
        step_name = cfg.general.step_name
        pipe_step_cfg = cfg[step_name].pipe_step
        if "esr_models_file" in pipe_step_cfg and pipe_step_cfg.esr_models_file is not None:
            self.master = Master()
            self._train_esr_models(pipe_step_cfg.esr_models_file)
        elif "models_folder" in pipe_step_cfg and pipe_step_cfg.models_folder is not None:
            self.master = Master()
            self._train_multi_models(pipe_step_cfg.models_folder)
        else:
            self.master = LocalMaster()
            self._train_single_model()
        self.master.join()
        self.master.close_client()
        self._backup_output_path()

def __init__(self):
    """Init pipe step with task operations built from the general user config."""
    self.task = TaskOps(UserConfig().data.general)