def __init__(self, session_id=None, name='', tag='', processors: list = None, options: dict = None):
    """Create a session and, in standalone mode, bootstrap a local manager.

    Steps, in order:
      1. Resolve the session id (generated from the current time and the
         local IP when ``session_id`` is falsy).
      2. Load the static configuration from the ``eggroll`` section of the
         properties file.
      3. In standalone deploy mode with no ``processors`` given (and
         ``EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG`` unset or "0"), run the
         standalone boot script, read the port it reports from
         ``bootstrap-standalone-manager.out`` and register an ``atexit``
         hook that kills the bootstrap process.
      4. Create (no processors) or register (processors given) the session
         through the cluster manager client, retrying until the start
         timeout elapses.
      5. Split the returned processors into ``self._eggs`` (by server node
         id) and ``self._rolls``.

    Args:
        session_id: Session id to use; a new one is generated when falsy.
        name: Passed through to ``ErSessionMeta``.
        tag: Passed through to ``ErSessionMeta``.
        processors: Pre-existing processors; when non-empty the session is
            registered instead of created and no bootstrap is attempted.
        options: Session options. NOTE: in the bootstrap path the discovered
            cluster manager port is written back into this dict.

    Raises:
        EnvironmentError: ``EGGROLL_HOME`` is not set.
        KeyError: the configuration file has no ``eggroll`` section.
        IOError: the bootstrap output file could not be opened.
        RuntimeError: no port could be read from the bootstrap output.
        ValueError: the session contains an unsupported processor type.
    """
    if processors is None:
        processors = []
    if options is None:
        options = {}

    if not session_id:
        self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
    else:
        self.__session_id = session_id

    self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
    if not self.__eggroll_home:
        raise EnvironmentError('EGGROLL_HOME is not set')

    if "EGGROLL_DEBUG" not in os.environ:
        os.environ['EGGROLL_DEBUG'] = "0"

    conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH,
                            f"{self.__eggroll_home}/conf/eggroll.properties")
    L.info(f"static conf path: {conf_path}")
    configs = configparser.ConfigParser()
    configs.read(conf_path)
    set_static_er_conf(configs['eggroll'])
    static_er_conf = get_static_er_conf()

    self.__options = options.copy()
    self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id

    self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
    if self.__is_standalone and not processors and os.environ.get(
            "EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
        # Port 0 is passed to the boot script; the actual port is read back
        # from the bootstrap output file below.
        port = 0
        # Random tag exported to the environment; it is used below both to
        # recognise this bootstrap's line in the output file and to find
        # the process at shutdown.
        random_value = str(random.random())
        os.environ['EGGROLL_STANDALONE_TAG'] = random_value
        if os.name != 'nt':
            startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
        else:
            startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'

        print("startup_command:", startup_command)
        import subprocess
        import atexit

        bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
        os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
        with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
            L.info(f'start up command: {startup_command}')
            manager_process = subprocess.Popen(startup_command, shell=True, stdout=outfile, stderr=errfile)
            manager_process.wait()
            returncode = manager_process.returncode
            L.info(f'start up returncode: {returncode}')

        def shutdown_standalone_manager(session_id, log_dir):
            """Kill the bootstrap process started for this session (atexit hook)."""
            standalone_tag = f'eggroll.standalone.tag={random_value}'
            if os.name != 'nt':
                shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
            else:
                ret_pid = 0
                for pid in psutil.pids():
                    try:
                        p = psutil.Process(pid)
                    except Exception:
                        # The process may have exited or be inaccessible.
                        continue

                    if "java.exe" not in p.name():
                        continue
                    # if it is a system process, call p.cmdline() will dump
                    cmdline = p.cmdline()
                    # NOTE(review): membership on the argument list is an
                    # exact-element match - confirm the JVM receives the tag
                    # as a standalone argument.
                    if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                        continue

                    ret_pid = pid
                    break

                # Decide on whether a match was found, not on whether the
                # last inspected pid happened to raise; otherwise a miss
                # would fall through to "taskkill /pid 0".
                if ret_pid == 0:
                    raise RuntimeError("can not find the bootstrap process")

                shutdown_command = f"taskkill /pid {ret_pid} /f"

            L.info(f'shutdown command: {shutdown_command}')
            with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'shutdown returncode: {returncode}')

        file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
        max_retry_cnt = 100

        # Phase 1: wait for the bootstrap output file to appear.
        for i in range(max_retry_cnt):
            msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
            L.info(msg)
            if os.path.exists(file_name):
                break
            time.sleep(min(0.1 * i, 100))

        # Phase 2: re-read the file until the line carrying our tag reports
        # a non-zero port.
        key = f"{random_value} server started at port "
        try:
            for i in range(max_retry_cnt):
                with open(file_name) as fp:
                    msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                    L.info(msg)

                    for line in fp:
                        if key in line:
                            # The port is the text after the LAST 'port ';
                            # splitting once from the right stays correct
                            # even if the line has an earlier 'port '.
                            port = int(line.rsplit('port ', 1)[-1])
                            if port != 0:
                                break

                if port != 0:
                    break
                time.sleep(min(0.1 * i, 100))
        except IOError:
            L.info(f"get port from {file_name} failed!")
            raise

        if port == 0:
            raise RuntimeError(f"get port from {file_name} failed!")

        options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
        self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
        atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

    self._cluster_manager_client = ClusterManagerClient(options=options)
    session_meta = ErSessionMeta(id=self.__session_id,
                                 name=name,
                                 status=SessionStatus.NEW,
                                 tag=tag,
                                 processors=processors,
                                 options=options)

    from time import monotonic, sleep
    # Configured start timeout (milliseconds) converted to seconds, plus a
    # 2 second margin.
    timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
    endtime = monotonic() + timeout

    # TODO:0: ignores exception while starting up in standalone mod
    while True:
        try:
            if not processors:
                self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
            else:
                self.__session_meta = self._cluster_manager_client.register_session(session_meta)
            break
        except Exception:
            # Retry ordinary failures until the deadline. KeyboardInterrupt
            # and SystemExit are deliberately not caught so the user can
            # still abort a hanging start-up.
            if monotonic() < endtime:
                sleep(0.1)
            else:
                raise

    self.__exit_tasks = list()
    self.__processors = self.__session_meta._processors

    L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
    self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
    self._rolls = list()
    self._eggs = dict()

    for processor in self.__session_meta._processors:
        processor_type = processor._processor_type
        if processor_type == ProcessorTypes.EGG_PAIR:
            # Egg pairs are grouped by the server node they run on.
            self._eggs.setdefault(processor._server_node_id, list()).append(processor)
        elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
            self._rolls.append(processor)
        else:
            raise ValueError(f'processor type {processor_type} not supported in roll pair')
# Command-line entry section: parse the options, load the static
# configuration, fill in the data directory from the configuration when it
# was not given on the command line, then hand over to serve().
args_parser = argparse.ArgumentParser()
for _opt_flags, _opt_kwargs in (
        (('-d', '--data-dir'), {}),
        (('-cm', '--cluster-manager'), {}),
        (('-nm', '--node-manager'), {}),
        (('-s', '--session-id'), {}),
        (('-p', '--port'), {'default': '0'}),
        (('-t', '--transfer-port'), {'default': '0'}),
        (('-sn', '--server-node-id'), {}),
        (('-prid', '--processor-id'), {'default': '0'}),
        (('-c', '--config'), {}),
):
    args_parser.add_argument(*_opt_flags, **_opt_kwargs)
args = args_parser.parse_args()

# EGGROLL_HOME is required even when an explicit config path is supplied.
EGGROLL_HOME = os.environ['EGGROLL_HOME']
configs = configparser.ConfigParser()

if not args.config:
    conf_file = f'{EGGROLL_HOME}/conf/eggroll.properties'
    L.info(f'reading default config: {conf_file}')
else:
    conf_file = args.config
    L.info(f'reading config path: {conf_file}')

configs.read(conf_file)
set_static_er_conf(configs['eggroll'])

# A data dir given on the command line wins over the configured one.
if configs and not args.data_dir:
    args.data_dir = configs['eggroll']['eggroll.data.dir']

L.info(args)
serve(args)
def __init__(self, session_id=None, name='', tag='', processors: list = None, options: dict = None):
    """Create a session and, in standalone mode, bootstrap a local manager.

    Steps, in order:
      1. Resolve the session id (generated from the current time and the
         local IP when ``session_id`` is falsy).
      2. Load the static configuration from the ``eggroll`` section of the
         properties file and create the cluster manager client.
      3. In standalone deploy mode on a non-Windows host, with no
         ``processors`` given and ``EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP``
         unset or "1", run the standalone boot script and register an
         ``atexit`` hook that kills the bootstrap process.
      4. Create (no processors) or register (processors given) the session
         through the cluster manager client, retrying until the start
         timeout elapses.
      5. Split the returned processors into ``self._eggs`` (by server node
         id) and ``self._rolls``.

    Args:
        session_id: Session id to use; a new one is generated when falsy.
        name: Passed through to ``ErSessionMeta``.
        tag: Passed through to ``ErSessionMeta``.
        processors: Pre-existing processors; when non-empty the session is
            registered instead of created and no bootstrap is attempted.
        options: Session options.

    Raises:
        EnvironmentError: ``EGGROLL_HOME`` is not set.
        KeyError: the configuration file has no ``eggroll`` section.
        ValueError: the session contains an unsupported processor type.
    """
    if processors is None:
        processors = []
    if options is None:
        options = {}

    if not session_id:
        self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
    else:
        self.__session_id = session_id

    self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
    if not self.__eggroll_home:
        raise EnvironmentError('EGGROLL_HOME is not set')

    if "EGGROLL_DEBUG" not in os.environ:
        os.environ['EGGROLL_DEBUG'] = "0"

    conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH,
                            f"{self.__eggroll_home}/conf/eggroll.properties")
    L.info(f"static conf path: {conf_path}")
    configs = configparser.ConfigParser()
    configs.read(conf_path)
    set_static_er_conf(configs['eggroll'])
    static_er_conf = get_static_er_conf()

    self.__options = options.copy()
    self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
    self._cluster_manager_client = ClusterManagerClient(options=options)

    self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
    if self.__is_standalone and os.name != 'nt' and not processors and os.environ.get(
            "EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP", "1") == "1":
        # Explicit option first, then the static configuration, then "4670".
        # The port is only used below to pick out the process at shutdown.
        port = int(options.get(
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
            static_er_conf.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, "4670")))

        startup_command = f'bash {self.__eggroll_home}/bin/eggroll_boot_standalone.sh -c {conf_path} -s {self.__session_id}'
        import subprocess
        import atexit

        bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
        os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
        with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
            L.info(f'start up command: {startup_command}')
            manager_process = subprocess.run(startup_command, shell=True, stdout=outfile, stderr=errfile)
            returncode = manager_process.returncode
            L.info(f'start up returncode: {returncode}')

        def shutdown_standalone_manager(port, session_id, log_dir):
            """Kill the bootstrap process matching this port and session (atexit hook)."""
            shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{port}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
            L.info(f'shutdown command: {shutdown_command}')
            with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'shutdown returncode: {returncode}')

        atexit.register(shutdown_standalone_manager, port, self.__session_id, bootstrap_log_dir)

    session_meta = ErSessionMeta(id=self.__session_id,
                                 name=name,
                                 status=SessionStatus.NEW,
                                 tag=tag,
                                 processors=processors,
                                 options=options)

    from time import monotonic, sleep
    # Configured start timeout (milliseconds) converted to seconds, plus a
    # 2 second margin.
    timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
    endtime = monotonic() + timeout

    # TODO:0: ignores exception while starting up in standalone mod
    while True:
        try:
            if not processors:
                self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
            else:
                self.__session_meta = self._cluster_manager_client.register_session(session_meta)
            break
        except Exception:
            # Retry ordinary failures until the deadline. KeyboardInterrupt
            # and SystemExit are deliberately not caught so the user can
            # still abort a hanging start-up.
            if monotonic() < endtime:
                sleep(0.1)
            else:
                raise

    self.__exit_tasks = list()
    self.__processors = self.__session_meta._processors

    L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
    self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
    self._rolls = list()
    self._eggs = dict()

    for processor in self.__session_meta._processors:
        processor_type = processor._processor_type
        if processor_type == ProcessorTypes.EGG_PAIR:
            # Egg pairs are grouped by the server node they run on.
            self._eggs.setdefault(processor._server_node_id, list()).append(processor)
        elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
            self._rolls.append(processor)
        else:
            raise ValueError(f'processor type {processor_type} not supported in roll pair')