def stop_processor(cluster_manager_client: ClusterManagerClient, myself: ErProcessor):
    # Started on Windows only (see serve()): listens on a per-pid named pipe for a message
    # containing 'stop' and this pid, then reports STOPPED to the cluster manager.
    import win32file
    import win32pipe

    L.info(f"stop_processor pid:{os.getpid()}, ppid:{os.getppid()}")
    pipe_name = r'\\.\pipe\pid_pipe' + str(os.getpid())
    pipe_buffer_size = 1024
    while True:
        named_pipe = win32pipe.CreateNamedPipe(
                pipe_name,
                win32pipe.PIPE_ACCESS_DUPLEX,
                win32pipe.PIPE_TYPE_MESSAGE | win32pipe.PIPE_WAIT | win32pipe.PIPE_READMODE_MESSAGE,
                win32pipe.PIPE_UNLIMITED_INSTANCES,
                pipe_buffer_size,
                pipe_buffer_size,
                500,
                None)
        try:
            while True:
                try:
                    win32pipe.ConnectNamedPipe(named_pipe, None)
                    # ReadFile returns a (status, payload) pair; the message bytes are in data[1]
                    data = win32file.ReadFile(named_pipe, pipe_buffer_size, None)
                    if data is None or len(data) < 2:
                        continue
                    print('receive msg:', data)
                    cmd_str = data[1].decode('utf-8')
                    if 'stop' in cmd_str and str(os.getpid()) in cmd_str:
                        myself._status = ProcessorStatus.STOPPED
                        cluster_manager_client.heartbeat(myself)
                except BaseException as e:
                    print("exception:", e)
                    break
        finally:
            try:
                win32pipe.DisconnectNamedPipe(named_pipe)
            except:
                pass
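# Illustrative client-side sketch (not part of egg_pair): a controller process could ask a
# running egg_pair to stop by writing a message containing 'stop' and the target pid to the
# pipe that stop_processor() serves. The helper name and exact message format below are
# assumptions; stop_processor() only requires that the text contain 'stop' and the pid.
def request_stop(target_pid: int):
    import win32file

    handle = win32file.CreateFile(
            r'\\.\pipe\pid_pipe' + str(target_pid),
            win32file.GENERIC_READ | win32file.GENERIC_WRITE,
            0, None,
            win32file.OPEN_EXISTING,
            0, None)
    try:
        # WriteFile returns (error_code, bytes_written); the payload is what
        # stop_processor() reads back as data[1].
        win32file.WriteFile(handle, f'stop {target_pid}'.encode('utf-8'))
    finally:
        win32file.CloseHandle(handle)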
def serve(args):
    prefix = 'v1/egg-pair'
    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
            service_name=f"{prefix}/runTask",
            route_to_module_name="eggroll.roll_pair.egg_pair",
            route_to_class_name="EggPair",
            route_to_method_name="run_task")

    max_workers = int(RollPairConfKeys.EGGROLL_ROLLPAIR_EGGPAIR_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
    executor_pool_type = CoreConfKeys.EGGROLL_CORE_DEFAULT_EXECUTOR_POOL.get()
    command_server = grpc.server(
            create_executor_pool(canonical_name=executor_pool_type,
                                 max_workers=max_workers,
                                 thread_name_prefix="eggpair-command-server"),
            options=[
                ('grpc.max_metadata_size',
                 int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())),
                ('grpc.max_send_message_length',
                 int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
                ('grpc.max_receive_message_length',
                 int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
                ('grpc.keepalive_time_ms',
                 int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get()) * 1000),
                ('grpc.keepalive_timeout_ms',
                 int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) * 1000),
                ('grpc.keepalive_permit_without_calls',
                 int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.get())),
                ('grpc.per_rpc_retry_buffer_size',
                 int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.get())),
                ('grpc.so_reuseport', False)])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(transfer_servicer, transfer_server)
    else:
        transfer_server_max_workers = int(
                RollPairConfKeys.EGGROLL_ROLLPAIR_EGGPAIR_DATA_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
        transfer_server = grpc.server(
                create_executor_pool(canonical_name=executor_pool_type,
                                     max_workers=transfer_server_max_workers,
                                     thread_name_prefix="transfer_server"),
                options=[
                    ('grpc.max_metadata_size',
                     int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())),
                    ('grpc.max_send_message_length',
                     int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
                    ('grpc.max_receive_message_length',
                     int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
                    ('grpc.keepalive_time_ms',
                     int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get()) * 1000),
                    ('grpc.keepalive_timeout_ms',
                     int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) * 1000),
                    ('grpc.keepalive_permit_without_calls',
                     int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.get())),
                    ('grpc.per_rpc_retry_buffer_size',
                     int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.get())),
                    ('grpc.so_reuseport', False)])
        transfer_port = transfer_server.add_insecure_port(f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(transfer_servicer, transfer_server)
        transfer_server.start()

    pid = os.getpid()
    L.info(f"starting egg_pair service, port: {port}, transfer port: {transfer_port}, pid: {pid}")
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id
        server_node_id = int(args.server_node_id)
        static_er_conf = get_static_er_conf()
        static_er_conf['server_node_id'] = server_node_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=server_node_id,
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost', port=port),
                             transfer_endpoint=ErEndpoint(host='localhost', port=transfer_port),
                             pid=pid,
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip().split(':')

        L.info(f'egg_pair cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(options={
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST: cluster_manager_host,
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: cluster_manager_port
        })
        cluster_manager_client.heartbeat(myself)

        if platform.system() == "Windows":
            t1 = threading.Thread(target=stop_processor, args=[cluster_manager_client, myself])
            t1.start()

    L.info(f'egg_pair started at port={port}, transfer_port={transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        nonlocal run
        run = False
        L.info(f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, '
               f'pid={pid} receives signum={signal.Signals(signum).name}, stopping gracefully.')

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    L.info(f'sending exit heartbeat to cm')
    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    GrpcChannelFactory.shutdown_all_now()

    L.info(f'closing RocksDB open dbs')
    # todo:1: move to RocksdbAdapter and provide a cleanup method
    from eggroll.core.pair_store.rocksdb import RocksdbAdapter
    for path, db in RocksdbAdapter.db_dict.items():
        del db

    gc.collect()

    L.info(f'system metric at exit: {get_system_metric(1)}')
    L.info(f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, '
           f'pid={pid} stopped gracefully')
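# Illustrative sketch (not part of this module): serve() only reads attributes off `args`,
# so it can be driven directly with an argparse.Namespace. The real entry point's flag
# parsing is not shown here; every value below is a placeholder/assumption.
def _example_serve():
    import argparse

    example_args = argparse.Namespace(
            data_dir='/tmp/eggroll/data',   # passed to set_data_dir()
            port='0',                       # 0 lets grpc pick a free command port
            transfer_port='-1',             # "-1" reuses the command server for transfers
            cluster_manager=None,           # e.g. 'localhost:4670' to enable heartbeats
            session_id=None,
            server_node_id='0',
            processor_id='0')
    serve(example_args)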
class ErSession(object):
    executor = ErThreadUnpooledExecutor(
            max_workers=int(CoreConfKeys.EGGROLL_CORE_CLIENT_COMMAND_EXECUTOR_POOL_MAX_SIZE.get()),
            thread_name_prefix="session_server")

    def __init__(self,
                 session_id=None,
                 name='',
                 tag='',
                 processors: list = None,
                 options: dict = None):
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH,
                                f"{self.__eggroll_home}/conf/eggroll.properties")
        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        #self._cluster_manager_client = ClusterManagerClient(options=options)
        self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
        if self.__is_standalone and not processors and os.environ.get("EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
            #port = int(options.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
            #                       static_er_conf.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, "4689")))
            port = 0
            random_value = str(random.random())
            os.environ['EGGROLL_STANDALONE_TAG'] = random_value
            if os.name != 'nt':
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
            else:
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'
            print("startup_command:", startup_command)

            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.Popen(startup_command,
                                                   shell=True,
                                                   stdout=outfile,
                                                   stderr=errfile)
                manager_process.wait()
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(session_id, log_dir):
                standalone_tag = f'eggroll.standalone.tag={random_value}'
                if os.name != 'nt':
                    shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                else:
                    pid_list = psutil.pids()
                    ret_pid = 0
                    exception = None
                    for pid in pid_list:
                        try:
                            p = psutil.Process(pid)
                            exception = None
                        except Exception as e:
                            exception = e
                            continue

                        if "java.exe" not in p.name():
                            continue
                        # if it is a system process, call p.cmdline() will dump
                        cmdline = p.cmdline()
                        if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                            continue

                        ret_pid = pid
                        break
                    if exception:
                        raise RuntimeError("can not find the bootstrap process")
                    shutdown_command = f"taskkill /pid {ret_pid} /f"

                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, \
                        open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command,
                                                     shell=True,
                                                     stdout=outfile,
                                                     stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
            max_retry_cnt = 100
            for i in range(max_retry_cnt):
                msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
                L.info(msg)
                if os.path.exists(file_name):
                    break
                time.sleep(min(0.1 * i, 100))

            try:
                for i in range(max_retry_cnt):
                    with open(file_name) as fp:
                        msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                        L.info(msg)
                        port = 0
                        key = f"{random_value} server started at port "
                        for line in fp.readlines():
                            if key in line:
                                port = int(line.rsplit('port ', 2)[1])
                                if port != 0:
                                    break
                    if port != 0:
                        break
                    time.sleep(min(0.1 * i, 100))
            except IOError as e:
                L.info(f"get port from {file_name} failed!")
                raise e

            if port == 0:
                raise RuntimeError(f"get port from {file_name} failed!")

            options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port

            atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

        self._cluster_manager_client = ClusterManagerClient(options=options)
        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exception while starting up in standalone mod
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(session_meta)
                break
            except:
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
        self.stopped = self.__session_meta._status == SessionStatus.CLOSED \
                       or self.__session_meta._status == SessionStatus.KILLED
        self._rolls = list()
        self._eggs = dict()

        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(f'processor type {processor_type} not supported in roll pair')

    def get_rank_in_node(self, partition_id, server_node_id):
        processor_count_of_node = len(self._eggs[server_node_id])
        node_count = len(self._eggs)
        rank_in_node = (partition_id // node_count) % processor_count_of_node

        return rank_in_node

    def route_to_egg(self, partition: ErPartition):
        server_node_id = partition._processor._server_node_id
        rank_in_node = partition._rank_in_node
        if partition._rank_in_node is None or rank_in_node < 0:
            rank_in_node = self.get_rank_in_node(partition_id=partition._id,
                                                 server_node_id=server_node_id)

        result = self.route_to_egg_by_rank(server_node_id, rank_in_node)

        return result

    def route_to_egg_by_rank(self, server_node_id, rank_in_node):
        result = self._eggs[server_node_id][rank_in_node]
        if not result._command_endpoint._host or result._command_endpoint._port <= 0:
            raise ValueError(f'error routing to egg: {result} in session: {self.__session_id}')

        return result

    def populate_processor(self, store: ErStore):
        populated_partitions = list()
        for p in store._partitions:
            server_node_id = p._processor._server_node_id
            rank_in_node = self.get_rank_in_node(p._id, p._processor._server_node_id)
            pp = ErPartition(id=p._id,
                             store_locator=p._store_locator,
                             processor=self.route_to_egg_by_rank(server_node_id, rank_in_node),
                             rank_in_node=rank_in_node)
            populated_partitions.append(pp)
        return ErStore(store_locator=store._store_locator,
                       partitions=populated_partitions,
                       options=store._options)

    def submit_job(self,
                   job: ErJob,
                   output_types: list = None,
                   command_uri: CommandURI = None,
                   create_output_if_missing=True):
        if not output_types:
            output_types = [ErTask]

        final_job = self.populate_output_store(job) if create_output_if_missing else job
        tasks = self._decompose_job(final_job)
        command_client = CommandClient()
        return command_client.async_call(args=tasks,
                                         output_types=output_types,
                                         command_uri=command_uri)

    def wait_until_job_finished(self, task_futures: list, timeout=None, return_when=FIRST_EXCEPTION):
        return wait(task_futures, timeout=timeout, return_when=return_when).done

    def _decompose_job(self, job: ErJob):
        input_total_partitions = job._inputs[0]._store_locator._total_partitions
        output_total_partitions = 0 \
            if not job._outputs \
            else job._outputs[0]._store_locator._total_partitions

        larger_total_partitions = max(input_total_partitions, output_total_partitions)

        populated_input_partitions = self.populate_processor(job._inputs[0])._partitions

        if output_total_partitions > 0:
            populated_output_partitions = self.populate_processor(job._outputs[0])._partitions
        else:
            populated_output_partitions = list()

        result = list()
        for i in range(larger_total_partitions):
            input_partitions = list()
            output_partitions = list()

            if i < input_total_partitions:
                input_processor = populated_input_partitions[i]._processor
                input_server_node_id = input_processor._server_node_id
                for input_store in job._inputs:
                    input_partitions.append(ErPartition(
                            id=i,
                            store_locator=input_store._store_locator,
                            processor=input_processor))
            else:
                input_processor = None
                input_server_node_id = None

            if i < output_total_partitions:
                output_processor = populated_output_partitions[i]._processor
                output_server_node_id = output_processor._server_node_id
                for output_store in job._outputs:
                    output_partitions.append(ErPartition(
                            id=i,
                            store_locator=output_store._store_locator,
                            processor=output_processor))
            else:
                output_processor = None
                output_server_node_id = None

            tasks = [ErTask(id=generate_task_id(job._id, i),
                            name=f'{job._name}',
                            inputs=input_partitions,
                            outputs=output_partitions,
                            job=job)]
            if input_server_node_id == output_server_node_id:
                result.append((tasks, input_processor._command_endpoint))
            else:
                if input_server_node_id is not None:
                    result.append((tasks, input_processor._command_endpoint))
                if output_server_node_id is not None:
                    result.append((tasks, output_processor._command_endpoint))

        return result

    def populate_output_store(self, job: ErJob):
        is_output_blank = not job._outputs or not job._outputs[0]
        is_output_not_populated = is_output_blank or not job._outputs[0]._partitions
        if is_output_not_populated:
            if is_output_blank:
                final_output_proposal = job._inputs[0].fork()
            else:
                final_output_proposal = job._outputs[0]

            refresh_nodes = job._options.get('refresh_nodes')
            if refresh_nodes is None or refresh_nodes:
                final_output_proposal._partitions = []
            else:
                if not final_output_proposal._partitions:
                    final_output_proposal._partitions = job._inputs[0]._partitions
        else:
            final_output_proposal = job._outputs[0]

        final_output = self.populate_processor(
                self._cluster_manager_client.get_or_create_store(final_output_proposal))

        if final_output._store_locator._total_partitions != \
                final_output_proposal._store_locator._total_partitions:
            raise ValueError(f'partition count of actual output and proposed output does not match. '
                             f'actual={final_output}, proposed={final_output_proposal}')

        final_job = deepcopy(job)
        final_job._outputs = [final_output]

        return final_job

    def stop(self):
        L.info(f'stopping session (gracefully): {self.__session_id}')
        L.debug(f'stopping session (gracefully), details: {self.__session_meta}')
        L.debug(f'stopping (gracefully) for {self.__session_id} from: {get_stack()}')
        self.run_exit_tasks()
        self.stopped = True
        return self._cluster_manager_client.stop_session(self.__session_meta)

    def kill(self):
        L.info(f'killing session (forcefully): {self.__session_id}')
        L.debug(f'killing session (forcefully), details: {self.__session_meta}')
        L.debug(f'killing (forcefully) for {self.__session_id} from: {get_stack()}')
        self.stopped = True

        future = self.executor.submit(self.stop)
        done = wait([future], timeout=1, return_when=FIRST_EXCEPTION).done
        if done:
            L.info(f'stopped successfully before kill session: {self.__session_id}')
        else:
            L.warn(f'stopped timeout before kill session: {self.__session_id}')

        return self._cluster_manager_client.kill_session(self.__session_meta)

    def get_session_id(self):
        return self.__session_id

    def get_session_meta(self):
        return self.__session_meta

    # todo:1: add_exit_task? not necessarily a cleanup semantic
    def add_exit_task(self, func):
        self.__exit_tasks.append(func)

    def run_exit_tasks(self):
        L.debug(f'running exit tasks: {self.__session_id}')
        for func in self.__exit_tasks:
            func()

    def get_option(self, key, default=None):
        return self.__options.get(key, default)

    def has_option(self, key):
        return self.__options.get(key) is not None

    def get_all_options(self):
        return self.__options.copy()

    def is_stopped(self):
        return self.stopped
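# Routing arithmetic used by ErSession.get_rank_in_node(): the in-node rank advances once
# every node_count partitions, so with node_count=2 and processor_count_of_node=2:
#   partition 0 -> (0 // 2) % 2 = 0    partition 2 -> (2 // 2) % 2 = 1
#   partition 1 -> (1 // 2) % 2 = 0    partition 3 -> (3 // 2) % 2 = 1
#
# Illustrative usage sketch (assumption, not part of this module): creating a standalone
# session and stopping it. The option key and value come from the constructor logic above.
def _example_standalone_session():
    session = ErSession(options={
        SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE: DeployModes.STANDALONE})
    try:
        L.info(f'session {session.get_session_id()} stopped={session.is_stopped()}')
    finally:
        session.stop()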
def __init__(self,
             session_id=None,
             name='',
             tag='',
             processors: list = None,
             options: dict = None):
    if processors is None:
        processors = []
    if options is None:
        options = {}
    if not session_id:
        self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
    else:
        self.__session_id = session_id

    self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
    if not self.__eggroll_home:
        raise EnvironmentError('EGGROLL_HOME is not set')

    if "EGGROLL_DEBUG" not in os.environ:
        os.environ['EGGROLL_DEBUG'] = "0"

    conf_path = options.get(
            CoreConfKeys.STATIC_CONF_PATH,
            f"{self.__eggroll_home}/conf/eggroll.properties")
    L.info(f"static conf path: {conf_path}")
    configs = configparser.ConfigParser()
    configs.read(conf_path)
    set_static_er_conf(configs['eggroll'])
    static_er_conf = get_static_er_conf()

    self.__options = options.copy()
    self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
    self._cluster_manager_client = ClusterManagerClient(options=options)
    self.__is_standalone = options.get(
            SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
    if self.__is_standalone and os.name != 'nt' and not processors and os.environ.get(
            "EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP", "1") == "1":
        port = int(
                options.get(
                        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                        static_er_conf.get(
                                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                                "4670")))
        startup_command = f'bash {self.__eggroll_home}/bin/eggroll_boot_standalone.sh -c {conf_path} -s {self.__session_id}'

        import subprocess
        import atexit

        bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
        os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
        with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
            L.info(f'start up command: {startup_command}')
            manager_process = subprocess.run(startup_command,
                                             shell=True,
                                             stdout=outfile,
                                             stderr=errfile)
            returncode = manager_process.returncode
            L.info(f'start up returncode: {returncode}')

        def shutdown_standalone_manager(port, session_id, log_dir):
            shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{port}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
            L.info(f'shutdown command: {shutdown_command}')
            with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                manager_process = subprocess.run(shutdown_command,
                                                 shell=True,
                                                 stdout=outfile,
                                                 stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'shutdown returncode: {returncode}')

        atexit.register(shutdown_standalone_manager, port, self.__session_id, bootstrap_log_dir)

    session_meta = ErSessionMeta(id=self.__session_id,
                                 name=name,
                                 status=SessionStatus.NEW,
                                 tag=tag,
                                 processors=processors,
                                 options=options)

    from time import monotonic, sleep
    timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
    endtime = monotonic() + timeout

    # TODO:0: ignores exception while starting up in standalone mod
    while True:
        try:
            if not processors:
                self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
            else:
                self.__session_meta = self._cluster_manager_client.register_session(session_meta)
            break
        except:
            if monotonic() < endtime:
                sleep(0.1)
            else:
                raise

    self.__exit_tasks = list()
    self.__processors = self.__session_meta._processors

    L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
    self.stopped = self.__session_meta._status == SessionStatus.CLOSED \
                   or self.__session_meta._status == SessionStatus.KILLED
    self._rolls = list()
    self._eggs = dict()

    for processor in self.__session_meta._processors:
        processor_type = processor._processor_type
        if processor_type == ProcessorTypes.EGG_PAIR:
            server_node_id = processor._server_node_id
            if server_node_id not in self._eggs:
                self._eggs[server_node_id] = list()
            self._eggs[server_node_id].append(processor)
        elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
            self._rolls.append(processor)
        else:
            raise ValueError(f'processor type {processor_type} not supported in roll pair')
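# Illustrative sketch (assumption, not the shipped default): the static conf file read by
# both __init__ variants is an INI-style properties file with an [eggroll] section, since
# configparser requires a section header and the code accesses configs['eggroll'], e.g.
#
#   [eggroll]
#   eggroll.resourcemanager.clustermanager.host=127.0.0.1
#   eggroll.resourcemanager.clustermanager.port=4670
#
# The key names above are assumptions; set_static_er_conf() then exposes whatever
# key/value pairs the section contains through get_static_er_conf().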
def serve(args):
    prefix = 'v1/egg-pair'
    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
            service_name=f"{prefix}/runTask",
            route_to_module_name="eggroll.roll_pair.egg_pair",
            route_to_class_name="EggPair",
            route_to_method_name="run_task")

    command_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=500,
                                       thread_name_prefix="grpc_server"),
            options=[("grpc.max_metadata_size", 32 << 20),
                     (cygrpc.ChannelArgKey.max_send_message_length, 2 << 30 - 1),
                     (cygrpc.ChannelArgKey.max_receive_message_length, 2 << 30 - 1)])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(transfer_servicer, transfer_server)
    else:
        transfer_server = grpc.server(
                futures.ThreadPoolExecutor(max_workers=500,
                                           thread_name_prefix="transfer_server"),
                options=[
                    (cygrpc.ChannelArgKey.max_send_message_length, 2 << 30 - 1),
                    (cygrpc.ChannelArgKey.max_receive_message_length, 2 << 30 - 1),
                    ('grpc.max_metadata_size', 32 << 20)])
        transfer_port = transfer_server.add_insecure_port(f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(transfer_servicer, transfer_server)
        transfer_server.start()

    L.info(f"starting egg_pair service, port:{port}, transfer port: {transfer_port}")
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=int(args.server_node_id),
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost', port=port),
                             transfer_endpoint=ErEndpoint(host='localhost', port=transfer_port),
                             pid=os.getpid(),
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip().split(':')

        L.info(f'cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(options={
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST: cluster_manager_host,
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: cluster_manager_port
        })
        cluster_manager_client.heartbeat(myself)

    L.info(f'egg_pair started at port {port}, transfer_port {transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        nonlocal run
        run = False

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    import time
    while run:
        time.sleep(1)

    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    L.info(f'egg_pair at port {port}, transfer_port {transfer_port} stopped gracefully')
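# Shutdown path shared by both serve() variants: sending SIGTERM or SIGINT to the egg_pair
# process flips `run` via exit_gracefully(), the sleep loop exits within about a second,
# and (when a cluster manager is configured) a final heartbeat with ProcessorStatus.STOPPED
# is reported before the function returns. For example, from a shell:
#
#   kill -TERM <egg_pair_pid>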