コード例 #1
0
def stop_processor(cluster_manager_client: ClusterManagerClient,
                   myself: ErProcessor):
    """Serve "stop" commands for this process over a Windows named pipe.

    A message-mode named pipe called ``\\\\.\\pipe\\pid_pipe<pid>`` is created
    and read in a loop. When a received message contains both the word
    ``stop`` and this process's pid, ``myself`` is marked
    ``ProcessorStatus.STOPPED`` and a heartbeat carrying that status is sent
    through ``cluster_manager_client``.

    The function does not return on its own: whenever servicing the pipe
    fails, the pipe instance is disconnected, closed, and a fresh one is
    created.

    Args:
        cluster_manager_client: client used to send the STOPPED heartbeat.
        myself: processor descriptor whose ``_status`` is updated in place.
    """
    # pywin32 is imported lazily so the module still imports where it is
    # not installed.
    import win32file
    import win32pipe
    L.info(f"stop_processor pid:{os.getpid()}, ppid:{os.getppid()}")
    pipe_name = r'\\.\pipe\pid_pipe' + str(os.getpid())
    pipe_buffer_size = 1024
    while True:
        named_pipe = win32pipe.CreateNamedPipe(
            pipe_name, win32pipe.PIPE_ACCESS_DUPLEX,
            win32pipe.PIPE_TYPE_MESSAGE | win32pipe.PIPE_WAIT
            | win32pipe.PIPE_READMODE_MESSAGE,
            win32pipe.PIPE_UNLIMITED_INSTANCES, pipe_buffer_size,
            pipe_buffer_size, 500, None)
        try:
            while True:
                try:
                    win32pipe.ConnectNamedPipe(named_pipe, None)
                    # ReadFile returns a (result_code, data) pair.
                    data = win32file.ReadFile(named_pipe, pipe_buffer_size,
                                              None)

                    if data is None or len(data) < 2:
                        continue

                    print('receive msg:', data)
                    cmd_str = data[1].decode('utf-8')
                    if 'stop' in cmd_str and str(os.getpid()) in cmd_str:
                        myself._status = ProcessorStatus.STOPPED
                        cluster_manager_client.heartbeat(myself)

                except Exception as e:
                    # Any pipe/decode/heartbeat failure ends this pipe
                    # instance; the outer loop then creates a new one.
                    # SystemExit/KeyboardInterrupt are deliberately not
                    # caught so they can terminate the thread.
                    print("exception:", e)
                    break
        finally:
            # Best-effort teardown: the pipe may not be connected at all.
            try:
                win32pipe.DisconnectNamedPipe(named_pipe)
            except Exception as e:
                L.debug(f"stop_processor: disconnecting named pipe failed: {e}")
            # Release the handle now instead of waiting for the handle
            # object to be garbage collected.
            try:
                named_pipe.Close()
            except Exception as e:
                L.debug(f"stop_processor: closing named pipe failed: {e}")
コード例 #2
0
def _eggpair_grpc_server_options():
    """Return the gRPC server options shared by the egg_pair servers.

    Every value is read from ``CoreConfKeys`` when this function is called.
    Building the list in one place keeps the command server and the
    transfer server on identical settings.
    """
    return [
        ('grpc.max_metadata_size',
         int(CoreConfKeys.
             EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())),
        ('grpc.max_send_message_length',
         int(CoreConfKeys.
             EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
        ('grpc.max_receive_message_length',
         int(CoreConfKeys.
             EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
        # Configured in seconds; gRPC expects milliseconds.
        ('grpc.keepalive_time_ms',
         int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get())
         * 1000),
        ('grpc.keepalive_timeout_ms',
         int(CoreConfKeys.
             CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) *
         1000),
        ('grpc.keepalive_permit_without_calls',
         int(CoreConfKeys.
             CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.
             get())),
        ('grpc.per_rpc_retry_buffer_size',
         int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.
             get())),
        ('grpc.so_reuseport', False),
    ]


def serve(args):
    """Run the egg_pair gRPC service until SIGTERM or SIGINT is received.

    Steps, in order:
      1. Register the ``v1/egg-pair/runTask`` command route.
      2. Start the command server on ``args.port``. The transfer service
         shares that server when ``args.transfer_port`` is ``"-1"``;
         otherwise it gets its own server on ``args.transfer_port``.
      3. If ``args.cluster_manager`` (``host:port``) is set, send a RUNNING
         heartbeat for this processor and, on Windows, start the
         ``stop_processor`` named-pipe listener thread.
      4. Sleep in one-second steps until a signal handler clears the run
         flag, then send a STOPPED heartbeat, shut down gRPC channels and
         log exit metrics.

    Args:
        args: parsed options; the attributes read here are ``data_dir``,
            ``port``, ``transfer_port``, ``cluster_manager``, ``session_id``,
            ``server_node_id`` and ``processor_id``.

    Raises:
        ValueError: if ``args.cluster_manager`` is set but
            ``args.session_id`` is empty.
    """
    prefix = 'v1/egg-pair'

    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    max_workers = int(
        RollPairConfKeys.
        EGGROLL_ROLLPAIR_EGGPAIR_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
    executor_pool_type = CoreConfKeys.EGGROLL_CORE_DEFAULT_EXECUTOR_POOL.get()
    command_server = grpc.server(
        create_executor_pool(canonical_name=executor_pool_type,
                             max_workers=max_workers,
                             thread_name_prefix="eggpair-command-server"),
        options=_eggpair_grpc_server_options())

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    # add_insecure_port returns the port actually bound.
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # Serve transfers from the command server on the same port.
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server_max_workers = int(
            RollPairConfKeys.
            EGGROLL_ROLLPAIR_EGGPAIR_DATA_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
        transfer_server = grpc.server(
            create_executor_pool(canonical_name=executor_pool_type,
                                 max_workers=transfer_server_max_workers,
                                 thread_name_prefix="transfer_server"),
            options=_eggpair_grpc_server_options())
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()
    pid = os.getpid()

    L.info(
        f"starting egg_pair service, port: {port}, transfer port: {transfer_port}, pid: {pid}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id
        server_node_id = int(args.server_node_id)
        static_er_conf = get_static_er_conf()
        static_er_conf['server_node_id'] = server_node_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=server_node_id,
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=pid,
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip(
        ).split(':')

        L.info(f'egg_pair cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

        if platform.system() == "Windows":
            # The listener loops forever; as a daemon thread it cannot keep
            # the process alive once this function has returned.
            t1 = threading.Thread(target=stop_processor,
                                  args=[cluster_manager_client, myself],
                                  daemon=True)
            t1.start()

    L.info(f'egg_pair started at port={port}, transfer_port={transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        """Signal handler: clear the run flag so the wait loop exits."""
        nonlocal run
        run = False
        L.info(
            f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} receives signum={signum}, stopping gracefully.'
        )

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    L.info(f'sending exit heartbeat to cm')
    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    GrpcChannelFactory.shutdown_all_now()

    L.info(f'closing RocksDB open dbs')
    #todo:1: move to RocksdbAdapter and provide a cleanup method
    from eggroll.core.pair_store.rocksdb import RocksdbAdapter
    # NOTE(review): `del db` only unbinds the loop variable; db_dict keeps
    # its references, so this loop does not release the databases by itself.
    for path, db in RocksdbAdapter.db_dict.items():
        del db

    gc.collect()

    L.info(f'system metric at exit: {get_system_metric(1)}')
    L.info(
        f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} stopped gracefully'
    )
コード例 #3
0
ファイル: egg_pair.py プロジェクト: xingxingt/eggroll
def serve(args):
    """Run the egg_pair gRPC service until SIGTERM or SIGINT is received.

    Registers the ``v1/egg-pair/runTask`` command route, then starts the
    command server on ``args.port``. The transfer service shares that server
    when ``args.transfer_port`` is ``"-1"``; otherwise it gets its own server
    on ``args.transfer_port``. If ``args.cluster_manager`` (``host:port``) is
    set, a RUNNING heartbeat is sent at startup and a STOPPED heartbeat once
    a signal has ended the wait loop.

    Args:
        args: parsed options; the attributes read here are ``data_dir``,
            ``port``, ``transfer_port``, ``cluster_manager``, ``session_id``,
            ``server_node_id`` and ``processor_id``.

    Raises:
        ValueError: if ``args.cluster_manager`` is set but
            ``args.session_id`` is empty.
    """
    import time

    prefix = 'v1/egg-pair'

    # Parenthesised on purpose: `2 << 30 - 1` parses as `2 << 29` (1 GiB)
    # because `-` binds tighter than `<<`. The limit meant here is 2**31 - 1.
    max_message_length = (2 << 30) - 1
    max_metadata_size = 32 << 20

    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    command_server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=500,
                                   thread_name_prefix="grpc_server"),
        options=[
            ("grpc.max_metadata_size", max_metadata_size),
            (cygrpc.ChannelArgKey.max_send_message_length,
             max_message_length),
            (cygrpc.ChannelArgKey.max_receive_message_length,
             max_message_length),
        ])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    # add_insecure_port returns the port actually bound.
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # Serve transfers from the command server on the same port.
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=500,
                                       thread_name_prefix="transfer_server"),
            options=[
                (cygrpc.ChannelArgKey.max_send_message_length,
                 max_message_length),
                (cygrpc.ChannelArgKey.max_receive_message_length,
                 max_message_length),
                ('grpc.max_metadata_size', max_metadata_size),
            ])
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()

    L.info(
        f"starting egg_pair service, port:{port}, transfer port: {transfer_port}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=int(args.server_node_id),
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=os.getpid(),
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip(
        ).split(':')

        L.info(f'cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

    L.info(f'egg_pair started at port {port}, transfer_port {transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        """Signal handler: clear the run flag so the wait loop exits."""
        nonlocal run
        run = False

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    L.info(
        f'egg_pair at port {port}, transfer_port {transfer_port} stopped gracefully'
    )