def stop_processor(cluster_manager_client: ClusterManagerClient, myself: ErProcessor):
    """Listen on a per-process Windows named pipe for a stop command.

    A pipe named ``\\\\.\\pipe\\pid_pipe<pid>`` is created for the current
    process. Whenever a message containing both the word ``stop`` and this
    process's pid is read from it, ``myself`` is marked as
    ``ProcessorStatus.STOPPED`` and a heartbeat carrying that status is sent
    through ``cluster_manager_client``.

    The function loops indefinitely: after any error on a pipe instance the
    instance is disconnected, closed and a fresh one is created. It only
    returns if an exception escapes (e.g. ``CreateNamedPipe`` itself fails).

    Args:
        cluster_manager_client: client used to send the heartbeat.
        myself: processor descriptor whose ``_status`` is updated.
    """
    # pywin32 modules are imported lazily inside the function, as in the
    # original code, so merely importing this module does not require them.
    import win32file
    import win32pipe

    pid_str = str(os.getpid())
    L.info(f"stop_processor pid:{os.getpid()}, ppid:{os.getppid()}")
    pipe_name = r'\\.\pipe\pid_pipe' + pid_str
    pipe_buffer_size = 1024

    while True:
        named_pipe = win32pipe.CreateNamedPipe(
            pipe_name,
            win32pipe.PIPE_ACCESS_DUPLEX,
            win32pipe.PIPE_TYPE_MESSAGE | win32pipe.PIPE_WAIT | win32pipe.PIPE_READMODE_MESSAGE,
            win32pipe.PIPE_UNLIMITED_INSTANCES,
            pipe_buffer_size,
            pipe_buffer_size,
            500,
            None)
        try:
            while True:
                try:
                    win32pipe.ConnectNamedPipe(named_pipe, None)
                    data = win32file.ReadFile(named_pipe, pipe_buffer_size, None)

                    # ReadFile's result is indexed as data[1] below; skip
                    # anything that does not have that second element.
                    if data is None or len(data) < 2:
                        continue

                    L.info(f'receive msg: {data}')
                    cmd_str = data[1].decode('utf-8')
                    if 'stop' in cmd_str and pid_str in cmd_str:
                        myself._status = ProcessorStatus.STOPPED
                        cluster_manager_client.heartbeat(myself)
                except Exception as e:
                    # Any failure abandons this pipe instance; the outer loop
                    # then creates a new one.
                    L.info(f"exception: {e}")
                    break
        finally:
            # Best-effort cleanup: a failure here must not kill the listener.
            try:
                win32pipe.DisconnectNamedPipe(named_pipe)
            except Exception as e:
                L.info(f"failed to disconnect named pipe {pipe_name}: {e}")
            # Release the OS handle deterministically instead of relying on
            # garbage collection of the handle object.
            try:
                named_pipe.Close()
            except Exception as e:
                L.info(f"failed to close named pipe {pipe_name}: {e}")
def serve(args):
    """Run the egg_pair gRPC service until SIGTERM/SIGINT is received.

    Steps, in order:
      1. set the data dir and register the ``v1/egg-pair/runTask`` route;
      2. start a command server and a transfer server (the same server is
         reused for both when ``args.transfer_port`` is the string ``"-1"``);
      3. if ``args.cluster_manager`` is set, build this processor's
         ``ErProcessor`` descriptor and send a RUNNING heartbeat;
      4. sleep in one-second steps until a signal clears the run flag;
      5. send a STOPPED heartbeat (if a cluster manager is configured), shut
         down gRPC channels and log exit metrics.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``data_dir``, ``port``, ``transfer_port``, ``cluster_manager``,
            ``session_id``, ``server_node_id``, ``processor_id``.

    Raises:
        ValueError: if a cluster manager is given but ``session_id`` is empty.
    """
    prefix = 'v1/egg-pair'

    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    def grpc_server_options():
        """Build the gRPC option list shared by the command and transfer servers."""
        max_message_size = int(
            CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())
        return [
            ('grpc.max_metadata_size',
             int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())),
            ('grpc.max_send_message_length', max_message_size),
            ('grpc.max_receive_message_length', max_message_size),
            # Config values are in seconds; gRPC expects milliseconds.
            ('grpc.keepalive_time_ms',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get()) * 1000),
            ('grpc.keepalive_timeout_ms',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) * 1000),
            ('grpc.keepalive_permit_without_calls',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.get())),
            ('grpc.per_rpc_retry_buffer_size',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.get())),
            ('grpc.so_reuseport', False),
        ]

    max_workers = int(
        RollPairConfKeys.EGGROLL_ROLLPAIR_EGGPAIR_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
    executor_pool_type = CoreConfKeys.EGGROLL_CORE_DEFAULT_EXECUTOR_POOL.get()

    command_server = grpc.server(
        create_executor_pool(canonical_name=executor_pool_type,
                             max_workers=max_workers,
                             thread_name_prefix="eggpair-command-server"),
        options=grpc_server_options())

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    # add_insecure_port returns the port actually bound.
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # No dedicated transfer port requested: serve transfers on the
        # command server.
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server_max_workers = int(
            RollPairConfKeys.EGGROLL_ROLLPAIR_EGGPAIR_DATA_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
        transfer_server = grpc.server(
            create_executor_pool(canonical_name=executor_pool_type,
                                 max_workers=transfer_server_max_workers,
                                 thread_name_prefix="transfer_server"),
            options=grpc_server_options())
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()

    pid = os.getpid()

    L.info(
        f"starting egg_pair service, port: {port}, transfer port: {transfer_port}, pid: {pid}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id
        server_node_id = int(args.server_node_id)
        # NOTE(review): presumably get_static_er_conf() returns a shared
        # mapping, so this assignment is a deliberate side effect - confirm.
        static_er_conf = get_static_er_conf()
        static_er_conf['server_node_id'] = server_node_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=server_node_id,
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=pid,
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip(
        ).split(':')

        L.info(f'egg_pair cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                    cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                    cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

        if platform.system() == "Windows":
            # On Windows a background thread listens on a named pipe for a
            # stop command addressed to this pid.
            t1 = threading.Thread(target=stop_processor,
                                  args=[cluster_manager_client, myself])
            t1.start()

    L.info(f'egg_pair started at port={port}, transfer_port={transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        """Signal handler: clear the run flag so the wait loop below ends."""
        nonlocal run
        run = False
        L.info(
            f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} receives signum={signum}, stopping gracefully.'
        )

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    L.info('sending exit heartbeat to cm')
    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    GrpcChannelFactory.shutdown_all_now()

    L.info('closing RocksDB open dbs')
    # todo:1: move to RocksdbAdapter and provide a cleanup method
    from eggroll.core.pair_store.rocksdb import RocksdbAdapter
    # NOTE(review): `del db` only unbinds the loop variable; the objects stay
    # referenced by RocksdbAdapter.db_dict, so this loop releases nothing by
    # itself. Kept as-is until the adapter exposes a real cleanup method.
    for path, db in RocksdbAdapter.db_dict.items():
        del db

    gc.collect()

    L.info(f'system metric at exit: {get_system_metric(1)}')
    L.info(
        f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} stopped gracefully'
    )
def serve(args):
    """Run the egg_pair gRPC service until SIGTERM/SIGINT is received.

    Registers the ``v1/egg-pair/runTask`` route, starts the command server
    and the transfer server (the same server is reused for both when
    ``args.transfer_port`` is the string ``"-1"``), optionally sends a
    RUNNING heartbeat to the cluster manager, then sleeps in one-second
    steps until a signal clears the run flag. On exit a STOPPED heartbeat is
    sent if a cluster manager is configured.

    Args:
        args: parsed command-line namespace. Attributes read here:
            ``data_dir``, ``port``, ``transfer_port``, ``cluster_manager``,
            ``session_id``, ``server_node_id``, ``processor_id``.

    Raises:
        ValueError: if a cluster manager is given but ``session_id`` is empty.
    """
    import time

    prefix = 'v1/egg-pair'

    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    # `2 << 30 - 1` parses as `2 << (30 - 1)` because `-` binds tighter than
    # `<<`, which yields 1 GiB. The parentheses give 2**31 - 1, presumably
    # the intended limit (largest signed 32-bit value) - confirm.
    max_message_length = (2 << 30) - 1
    max_metadata_size = 32 << 20

    command_server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=500,
                                   thread_name_prefix="grpc_server"),
        options=[
            ("grpc.max_metadata_size", max_metadata_size),
            (cygrpc.ChannelArgKey.max_send_message_length, max_message_length),
            (cygrpc.ChannelArgKey.max_receive_message_length, max_message_length),
        ])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    # add_insecure_port returns the port actually bound.
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # No dedicated transfer port requested: serve transfers on the
        # command server.
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=500,
                                       thread_name_prefix="transfer_server"),
            options=[
                (cygrpc.ChannelArgKey.max_send_message_length, max_message_length),
                (cygrpc.ChannelArgKey.max_receive_message_length, max_message_length),
                ('grpc.max_metadata_size', max_metadata_size),
            ])
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()

    L.info(
        f"starting egg_pair service, port:{port}, transfer port: {transfer_port}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=int(args.server_node_id),
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=os.getpid(),
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip(
        ).split(':')

        L.info(f'cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                    cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                    cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

    L.info(f'egg_pair started at port {port}, transfer_port {transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        """Signal handler: clear the run flag so the wait loop below ends."""
        nonlocal run
        run = False

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    L.info(
        f'egg_pair at port {port}, transfer_port {transfer_port} stopped gracefully'
    )