Example #1
 def shutdown():
     """Shutdown all distributed cluster."""
     mode = UserConfig().data.general.cluster_mode
     gpus = str(UserConfig().data.general.worker.gpus_per_job)
     if mode == ClusterMode.Single and gpus == "-1":
         return
     try:
         logging.info("Try to shutdown cluster.")
         from vega.core.trainer.utils import get_write_ip_master_local
         from distributed import Client
         ip, port = get_write_ip_master_local()
         if ip is None or port is None:
             logging.info(
                 "Stand-alone mode, no need to shut down the cluster.")
             return
         shutdown_client = Client("{}:{}".format(ip, port))
         logging.info("Cluster will be shut down.")
         shutdown_client.shutdown()
         shutdown_client.close()
         del shutdown_client
         logging.info("Cluster is shut down.")
     except Exception as e:
         logging.error("Pipeline's cluster shutdown error: {}".format(
             str(e)))
         logging.error(traceback.format_exc())
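A minimal usage sketch of the same teardown pattern against a plain dask cluster; the scheduler address is an assumption, not something the snippet above hard-codes.

 # Hypothetical sketch: connect to a running Dask scheduler and shut the
 # whole cluster down, mirroring the shutdown() above.
 import logging
 from distributed import Client

 def shutdown_cluster(address="127.0.0.1:8786"):  # address is a placeholder
     try:
         client = Client(address)
         client.shutdown()  # asks the scheduler to stop itself and all workers
         client.close()     # then drop this client's own connection
     except Exception as exc:
         logging.error("Cluster shutdown failed: %s", exc)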
Example #2
 def _do_horovod_fully_train(self):
     """Call horovod bash file to load pickle files saved by vega."""
     pwd_dir = os.path.dirname(os.path.abspath(__file__))
     cf_file = os.path.join(pwd_dir, 'cf.pickle')
     cf_content = {
         'configs': ClassFactory.__configs__,
         'registry': ClassFactory.__registry__,
         'data': UserConfig().__data__
     }
     with open(cf_file, 'wb') as f:
         pickle.dump(cf_content, f)
     cf_file_remote = os.path.join(self.task.local_base_path, 'cf.pickle')
     FileOps.copy_file(cf_file, cf_file_remote)
     if os.environ.get('DLS_TASK_NUMBER') is None:
         # local cluster
         worker_ips = '127.0.0.1'
         if UserConfig().data.general.cluster.master_ip is not None and \
            UserConfig().data.general.cluster.master_ip != '127.0.0.1':
             worker_ips = UserConfig().data.general.cluster.master_ip
             for ip in UserConfig().data.general.cluster.slaves:
                 worker_ips = worker_ips + ',' + ip
         cmd = [
             'bash',
             '{}/horovod/run_cluster_horovod_train.sh'.format(pwd_dir),
             str(self.world_device_size), cf_file_remote, worker_ips
         ]
     else:
         # Roma
         cmd = [
             'bash', '{}/horovod/run_horovod_train.sh'.format(pwd_dir),
             str(self.world_device_size), cf_file_remote
         ]
     proc = subprocess.Popen(cmd, env=os.environ)
     proc.wait()
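A hypothetical sketch of the same launch pattern: serialize shared state to a pickle file, then hand its path to a shell script via subprocess. The script name and state contents are placeholders.

 import os
 import pickle
 import subprocess

 state = {'configs': {'trainer': {'epochs': 1}}}  # placeholder content
 cf_file = os.path.join('/tmp', 'cf.pickle')      # placeholder path
 with open(cf_file, 'wb') as f:
     pickle.dump(state, f)

 # run_train.sh stands in for the repo's horovod launch scripts
 cmd = ['bash', 'run_train.sh', '8', cf_file]
 proc = subprocess.Popen(cmd, env=os.environ)
 proc.wait()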
Example #3
 def __new__(cls):
     """Return a LocalMaster instance when run on local, else return a master instance."""
     mode = UserConfig().data.general.cluster_mode
     gpus = str(UserConfig().data.general.worker.gpus_per_job)
     if mode == ClusterMode.Single and gpus == "-1":
         return LocalMaster()
     else:
         return object.__new__(cls)
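A minimal sketch of the dispatch-in-__new__ pattern shown above: __new__ may return an instance of a completely different class, in which case __init__ of the requested class is skipped. The class names here are illustrative.

 class LocalMaster:
     def run(self):
         print("running locally")

 class Master:
     def __new__(cls, local=False):
         if local:
             return LocalMaster()  # bypasses Master construction entirely
         return object.__new__(cls)

     def run(self):
         print("running on the cluster")

 assert isinstance(Master(local=True), LocalMaster)
 assert isinstance(Master(), Master)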
Example #4
 def save_results(self):
     """Save the evolution results, containing information about the population and elitism."""
     step_name = Config(deepcopy(UserConfig().data)).general.step_name
     _path = FileOps.join_path(self.local_output_path, step_name)
     FileOps.make_dir(_path)
     arch_file = FileOps.join_path(_path, 'arch.txt')
     arch_child = FileOps.join_path(_path, 'arch_child.txt')
     sel_arch_file = FileOps.join_path(_path, 'selected_arch.npy')
     sel_arch = []
     with open(arch_file, 'a') as fw_a, open(arch_child, 'a') as fw_ac:
         writer_a = csv.writer(fw_a, lineterminator='\n')
         writer_ac = csv.writer(fw_ac, lineterminator='\n')
         writer_ac.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
         for c in range(self.individual_num):
             writer_ac.writerow(
                 self._log_data(net_info_type='active_only', pop=self.pop[c],
                                value=self.pop[c].fitness))

         writer_a.writerow(['Population Iteration: ' + str(self.evolution_count + 1)])
         for c in range(self.elitism_num):
             writer_a.writerow(self._log_data(net_info_type='active_only',
                                              pop=self.elitism[c],
                                              value=self.elit_fitness[c]))
             sel_arch.append(self.elitism[c].gene)
     sel_arch = np.stack(sel_arch)
     np.save(sel_arch_file, sel_arch)
     if self.backup_base_path is not None:
         FileOps.copy_folder(self.local_output_path, self.backup_base_path)
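A sketch of the same bookkeeping with the framework pieces stubbed out: append per-iteration rows with csv.writer, then stack the selected genes and save them as a .npy file. File names and gene values are placeholders.

 import csv
 import numpy as np

 genes = [np.array([0, 1, 1]), np.array([1, 0, 1])]  # placeholder genes
 with open('arch.txt', 'a') as fw:
     writer = csv.writer(fw, lineterminator='\n')
     writer.writerow(['Population Iteration: 1'])
     for gene in genes:
         writer.writerow(gene.tolist())
 np.save('selected_arch.npy', np.stack(genes))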
Example #5
 def world_device_size(self):
     """World device size is world size * device count in each world."""
     import torch
     world_size = UserConfig().data.env.world_size
     device_nums = torch.cuda.device_count()
     num_devices = world_size * device_nums
     return num_devices
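The same arithmetic as a standalone sketch; reading the world size from a WORLD_SIZE environment variable is an assumption standing in for UserConfig().data.env.world_size.

 import os
 import torch

 world_size = int(os.environ.get('WORLD_SIZE', '1'))  # assumed env variable
 # total devices across all nodes = nodes * GPUs visible on this node
 world_device_size = world_size * torch.cuda.device_count()
 print(world_device_size)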
Example #6
 def __init__(self):
     """Init master attrs, setup and start dask distributed cluster and local multiprocess pool."""
     self.cfg = copy.deepcopy(UserConfig().data.general)
     self.task_count = 0
     self.eval_count = self.cfg.worker.eval_count
     self.dask_env = DaskEnv(UserConfig().data.env, self.__master_path__,
                             self.cfg.worker.gpus_per_job,
                             TaskOps(self.cfg).temp_path)
     status = self.dask_env.start()
     if not status or not self.dask_env.is_master:
         sys.exit(0)
     self._start_cluster()
     self._start_evaluator_multiprocess()
     self.t_queue = Queue()
     # Holds GPU and Dloop Evaluator results.
     self.e_queue = utils.PairDictQueue()
     return
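A hypothetical sketch of the cluster bring-up this constructor performs, using dask.distributed's LocalCluster in place of Vega's DaskEnv helper.

 from distributed import Client, LocalCluster

 cluster = LocalCluster(n_workers=2, threads_per_worker=1)
 client = Client(cluster)  # attach a client to the freshly started cluster
 print(client.scheduler_info()['workers'].keys())
 client.close()
 cluster.close()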
Example #7
 def _save_model_desc_file(self, id, desc):
     output_path = TaskOps(UserConfig().data.general).local_output_path
     desc_file = os.path.join(output_path, "nas",
                              "model_desc_{}.json".format(id))
     FileOps.make_base_dir(desc_file)
     output = {}
     for key in desc:
         if key in ["type", "modules", "custom"]:
             output[key] = desc[key]
     with open(desc_file, "w") as f:
         json.dump(output, f)
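A sketch of the same description dump with the framework calls replaced by the standard library: keep only the whitelisted keys and write them as JSON. The path and desc contents are placeholders.

 import json
 import os

 desc = {'type': 'Sequential', 'modules': ['conv'], 'lr': 0.1}
 output = {k: v for k, v in desc.items() if k in ('type', 'modules', 'custom')}
 os.makedirs('nas', exist_ok=True)
 with open(os.path.join('nas', 'model_desc_0.json'), 'w') as f:
     json.dump(output, f)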
Example #8
 def do(self):
     """Start to run benchmark evaluator."""
     logger.info("BenchmarkPipeStep started...")
     cfg = Config(deepcopy(UserConfig().data))
     step_name = cfg.general.step_name
     pipe_step_cfg = cfg[step_name].pipe_step
     if "esr_models_file" in pipe_step_cfg and pipe_step_cfg.esr_models_file is not None:
         # TODO: ESR model
         self._evaluate_esr_models(pipe_step_cfg.esr_models_file,
                                   pipe_step_cfg.models_folder)
     elif "models_folder" in pipe_step_cfg and pipe_step_cfg.models_folder is not None:
         self._evaluate_multi_models(pipe_step_cfg.models_folder)
     else:
         self._evaluate_single_model()
     self._backup_output_path()
     logger.info("Complete model evaluation.")
Example #9
 def __init__(self, args=None):
     """Init DistributedWorker."""
     super(DistributedWorker, self).__init__()
     # privates
     DistributedWorker.__worker_id__ = DistributedWorker.__worker_id__ + 1
     self._worker_id = DistributedWorker.__worker_id__
     # publics
     self.rank = 0
     self.world_size = 1
     self.worker_addr = ""
     self.worker_nccl_port = 16666
     self.timeout = int(float(General.worker.timeout) * 60 * 60)
     self.__env_config__ = (copy.deepcopy(UserConfig().data),
                            copy.deepcopy(ClassFactory.__configs__),
                            copy.deepcopy(ClassFactory.__registry__))
     self.__network_config__ = copy.deepcopy(
         NetworkFactory.__network_registry__)
     self.__general__ = obj2config(General)
     self.__worker_device_folder__ = os.path.join(self.temp_path,
                                                  '.worker_device')
     if not os.path.exists(self.__worker_device_folder__):
         os.makedirs(self.__worker_device_folder__, exist_ok=True)
     return
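A sketch of the class-level counter used for worker ids above: each construction bumps a shared class attribute and snapshots the new value on the instance.

 class Worker:
     __worker_id__ = 0

     def __init__(self):
         Worker.__worker_id__ += 1  # shared across all instances
         self._worker_id = Worker.__worker_id__

 assert Worker()._worker_id == 1
 assert Worker()._worker_id == 2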
Example #10
 def do(self):
     """Start to run fully train with horovod or local trainer."""
     logger.info("FullyTrainPipeStep started...")
     cls_trainer = ClassFactory.get_cls('trainer')
     trainer_cfg = ClassFactory.__configs__.get('trainer')
     setattr(trainer_cfg, 'save_best_model', True)
     if cls_trainer.cfg.get('horovod', False):
         self._do_horovod_fully_train()
     else:
         cfg = Config(deepcopy(UserConfig().data))
         step_name = cfg.general.step_name
         pipe_step_cfg = cfg[step_name].pipe_step
         if "esr_models_file" in pipe_step_cfg and pipe_step_cfg.esr_models_file is not None:
             self.master = Master()
             self._train_esr_models(pipe_step_cfg.esr_models_file)
         elif "models_folder" in pipe_step_cfg and pipe_step_cfg.models_folder is not None:
             self.master = Master()
             self._train_multi_models(pipe_step_cfg.models_folder)
         else:
             self.master = LocalMaster()
             self._train_single_model()
         self.master.join()
         self.master.close_client()
     self._backup_output_path()
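A hypothetical sketch of the config toggle and dispatch above, with a SimpleNamespace standing in for the trainer config.

 from types import SimpleNamespace

 trainer_cfg = SimpleNamespace(horovod=False, save_best_model=False)
 setattr(trainer_cfg, 'save_best_model', True)  # force best-model saving

 if getattr(trainer_cfg, 'horovod', False):
     print('launch horovod fully train')
 else:
     print('train with the local master')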
Example #11
 def __init__(self):
     self.task = TaskOps(UserConfig().data.general)