Example #1
0
def stop_processor(cluster_manager_client: ClusterManagerClient,
                   myself: ErProcessor):
    """Windows-only watchdog loop: listen on a per-pid named pipe for a
    'stop' command and report STOPPED to the cluster manager when it arrives.

    Runs forever (intended to be started in a daemon-ish thread); each outer
    iteration creates a fresh pipe instance, serves clients until a read/
    connect error occurs, then tears the pipe down and starts over.

    :param cluster_manager_client: client used to send the heartbeat
    :param myself: this processor's descriptor; its _status is mutated
    """
    import win32file
    import win32pipe
    L.info(f"stop_processor pid:{os.getpid()}, ppid:{os.getppid()}")
    pipe_name = r'\\.\pipe\pid_pipe' + str(os.getpid())
    pipe_buffer_size = 1024
    while True:
        named_pipe = win32pipe.CreateNamedPipe(
            pipe_name, win32pipe.PIPE_ACCESS_DUPLEX,
            win32pipe.PIPE_TYPE_MESSAGE | win32pipe.PIPE_WAIT
            | win32pipe.PIPE_READMODE_MESSAGE,
            win32pipe.PIPE_UNLIMITED_INSTANCES, pipe_buffer_size,
            pipe_buffer_size, 500, None)
        try:
            while True:
                try:
                    win32pipe.ConnectNamedPipe(named_pipe, None)
                    # ReadFile returns an (error_code, data_bytes) pair
                    data = win32file.ReadFile(named_pipe, pipe_buffer_size,
                                              None)

                    if data is None or len(data) < 2:
                        continue

                    print('receive msg:', data)
                    cmd_str = data[1].decode('utf-8')
                    # message must name both the command and this exact pid
                    if 'stop' in cmd_str and str(os.getpid()) in cmd_str:
                        myself._status = ProcessorStatus.STOPPED
                        cluster_manager_client.heartbeat(myself)

                # fix: was `except BaseException`, which also swallowed
                # SystemExit/KeyboardInterrupt; Exception is sufficient here
                except Exception as e:
                    print("exception:", e)
                    break
        finally:
            try:
                win32pipe.DisconnectNamedPipe(named_pipe)
            except Exception:
                pass
            # fix: the pipe handle was never closed, leaking one handle per
            # outer-loop iteration; CloseHandle releases it
            try:
                win32file.CloseHandle(named_pipe)
            except Exception:
                pass
Example #2
0
def serve(args):
    """Start an egg_pair worker and block until signalled to stop.

    Registers the runTask command route, starts the gRPC command server
    (and a separate transfer server unless args.transfer_port == "-1", in
    which case the command server doubles as transfer server), heartbeats
    to the cluster manager when one is configured, then waits for
    SIGTERM/SIGINT before shutting everything down.

    :param args: parsed CLI namespace; uses data_dir, port, transfer_port,
        cluster_manager, session_id, server_node_id, processor_id
    :raises ValueError: when a cluster manager is given without a session id
    """
    prefix = 'v1/egg-pair'

    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    max_workers = int(
        RollPairConfKeys.
        EGGROLL_ROLLPAIR_EGGPAIR_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
    executor_pool_type = CoreConfKeys.EGGROLL_CORE_DEFAULT_EXECUTOR_POOL.get()
    command_server = grpc.server(
        create_executor_pool(canonical_name=executor_pool_type,
                             max_workers=max_workers,
                             thread_name_prefix="eggpair-command-server"),
        options=
        [("grpc.max_metadata_size",
          int(CoreConfKeys.
              EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())
          ),
         ('grpc.max_send_message_length',
          int(CoreConfKeys.
              EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())
          ),
         ('grpc.max_receive_message_length',
          int(CoreConfKeys.
              EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())
          ),
         ('grpc.keepalive_time_ms',
          int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get())
          * 1000),
         ('grpc.keepalive_timeout_ms',
          int(CoreConfKeys.
              CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) *
          1000),
         ('grpc.keepalive_permit_without_calls',
          int(CoreConfKeys.
              CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.
              get())),
         ('grpc.per_rpc_retry_buffer_size',
          int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.
              get())), ('grpc.so_reuseport', False)])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    # add_insecure_port returns the actual bound port (relevant when 0)
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # share the command server for transfers
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server_max_workers = int(
            RollPairConfKeys.
            EGGROLL_ROLLPAIR_EGGPAIR_DATA_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
        transfer_server = grpc.server(
            create_executor_pool(canonical_name=executor_pool_type,
                                 max_workers=transfer_server_max_workers,
                                 thread_name_prefix="transfer_server"),
            options=
            [('grpc.max_metadata_size',
              int(CoreConfKeys.
                  EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.
                  get())),
             ('grpc.max_send_message_length',
              int(CoreConfKeys.
                  EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.
                  get())),
             ('grpc.max_receive_message_length',
              int(CoreConfKeys.
                  EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.
                  get())),
             # fix: was reading CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_
             # WITHOUT_CALLS_ENABLED (a boolean flag) for keepalive_time_ms —
             # copy-paste error; use the same keepalive-time key as the
             # command server above
             ('grpc.keepalive_time_ms',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get())
              * 1000),
             ('grpc.keepalive_timeout_ms',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get())
              * 1000),
             ('grpc.keepalive_permit_without_calls',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED
                  .get())),
             ('grpc.per_rpc_retry_buffer_size',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.get())),
             ('grpc.so_reuseport', False)])
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()
    pid = os.getpid()

    L.info(
        f"starting egg_pair service, port: {port}, transfer port: {transfer_port}, pid: {pid}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id
        server_node_id = int(args.server_node_id)
        static_er_conf = get_static_er_conf()
        static_er_conf['server_node_id'] = server_node_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=server_node_id,
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=pid,
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip(
        ).split(':')

        L.info(f'egg_pair cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

        if platform.system() == "Windows":
            # on Windows stop requests arrive via a named pipe, see
            # stop_processor
            t1 = threading.Thread(target=stop_processor,
                                  args=[cluster_manager_client, myself])
            t1.start()

    L.info(f'egg_pair started at port={port}, transfer_port={transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        nonlocal run
        run = False
        # fix: was logging signal.getsignal(signum), which returns the
        # installed *handler* object, not the signal; log the signal name
        L.info(
            f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} receives signum={signal.Signals(signum).name}, stopping gracefully.'
        )

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    L.info(f'sending exit heartbeat to cm')
    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    GrpcChannelFactory.shutdown_all_now()

    L.info(f'closing RocksDB open dbs')
    #todo:1: move to RocksdbAdapter and provide a cleanup method
    # fix: the old `for path, db in db_dict.items(): del db` loop only
    # unbound the loop variable — db_dict kept every adapter alive. Clear
    # the dict so the gc.collect() below can actually release the open dbs.
    from eggroll.core.pair_store.rocksdb import RocksdbAdapter
    RocksdbAdapter.db_dict.clear()

    gc.collect()

    L.info(f'system metric at exit: {get_system_metric(1)}')
    L.info(
        f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} stopped gracefully'
    )
Example #3
0
    def __init__(self,
            session_id=None,
            name='',
            tag='',
            processors: list = None,
            options: dict = None):
        """Initialize an eggroll session.

        Generates a session id when none is given, loads the static
        eggroll.properties config, optionally boots a standalone
        cluster/node manager (when deploy mode is STANDALONE and no
        processors were supplied), then registers/creates the session via
        the cluster manager and indexes the resulting processors into
        roll-pair masters (self._rolls) and egg-pairs grouped by server
        node (self._eggs).

        :param session_id: existing session id to attach to; auto-generated when falsy
        :param name: human-readable session name
        :param tag: session tag
        :param processors: pre-allocated processors; empty means the cluster manager allocates
        :param options: config overrides; copied into self.__options
        :raises EnvironmentError: if EGGROLL_HOME is not set
        :raises RuntimeError: if the standalone manager's port cannot be determined
        :raises ValueError: on an unsupported processor type
        """
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH, f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        #self._cluster_manager_client = ClusterManagerClient(options=options)

        self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
        # boot a local standalone manager unless processors were supplied or
        # an externally-debugged bootstrap was requested via env var
        if self.__is_standalone and not processors and os.environ.get("EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
            #port = int(options.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
            #                      static_er_conf.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, "4689")))
            # port 0 lets the manager pick a free port; the real port is
            # read back from the bootstrap log below
            port = 0
            random_value = str(random.random())
            os.environ['EGGROLL_STANDALONE_TAG'] = random_value
            if os.name != 'nt':
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
            else:
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'

            print("startup_command:", startup_command)
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.Popen(startup_command, shell=True, stdout=outfile, stderr=errfile)
                manager_process.wait()
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(session_id, log_dir):
                # kill the bootstrap process tagged with this session's
                # random value (registered below with atexit)
                standalone_tag = f'eggroll.standalone.tag={random_value}'
                if os.name != 'nt':
                    shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                else:
                    pid_list = psutil.pids()
                    ret_pid = 0
                    exception = None
                    for pid in pid_list:
                        try:
                            p = psutil.Process(pid)
                            exception = None
                        except Exception as e:
                            exception = e
                            continue

                        if "java.exe" not in p.name():
                            continue
                        # if it is a system process, call p.cmdline() will dump
                        cmdline = p.cmdline()
                        if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                            continue

                        ret_pid = pid
                        break
                    # NOTE(review): `exception` only reflects the last pid
                    # probed; when no matching java.exe is found but the last
                    # probe succeeded, ret_pid stays 0 and taskkill targets
                    # pid 0 — confirm intended behavior
                    if exception:
                        raise RuntimeError("can not find the bootstrap process")

                    shutdown_command = f"taskkill /pid {ret_pid} /f"

                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
            max_retry_cnt = 100
            # wait (with growing sleep) for the bootstrap log file to appear
            for i in range(max_retry_cnt):
                msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
                L.info(msg)

                if os.path.exists(file_name):
                    break
                time.sleep(min(0.1 * i, 100))

            try:
                # scan the log for "<tag> server started at port <port>"
                for i in range(max_retry_cnt):
                    with open(file_name) as fp:
                        msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                        L.info(msg)

                        port = 0
                        key = f"{random_value} server started at port "
                        for line in fp.readlines():
                            if key in line:
                                port = int(line.rsplit('port ', 2)[1])
                                if port != 0:
                                    break

                        if port != 0:
                            break
                    time.sleep(min(0.1 * i, 100))
            except IOError as e:
                L.info(f"get port from {file_name} failed!")
                raise e

            if port == 0:
                raise RuntimeError(f"get port from {file_name} failed!")

            options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

        self._cluster_manager_client = ClusterManagerClient(options=options)
        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exception while starting up in standalone mod
        # retry session creation until the deadline, then re-raise
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(session_meta)
                break
            except:
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
        self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
        self._rolls = list()
        self._eggs = dict()

        # index processors: egg-pairs grouped by server node id, roll-pair
        # masters collected flat
        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(f'processor type {processor_type} not supported in roll pair')
Example #4
0
class ErSession(object):
    """Client-side handle of an eggroll session: holds the session metadata
    returned by the cluster manager and routes work to the session's
    egg-pair / roll-pair-master processors."""

    # Class-level (shared) executor; used by kill() to run stop() with a
    # one-second timeout.
    executor = ErThreadUnpooledExecutor(
        max_workers=int(CoreConfKeys.EGGROLL_CORE_CLIENT_COMMAND_EXECUTOR_POOL_MAX_SIZE.get()),
        thread_name_prefix="session_server")
    def __init__(self,
            session_id=None,
            name='',
            tag='',
            processors: list = None,
            options: dict = None):
        """Initialize an eggroll session.

        Generates a session id when none is given, loads the static
        eggroll.properties config, optionally boots a standalone
        cluster/node manager (when deploy mode is STANDALONE and no
        processors were supplied), then registers/creates the session via
        the cluster manager and indexes the resulting processors into
        roll-pair masters (self._rolls) and egg-pairs grouped by server
        node (self._eggs).

        :param session_id: existing session id to attach to; auto-generated when falsy
        :param name: human-readable session name
        :param tag: session tag
        :param processors: pre-allocated processors; empty means the cluster manager allocates
        :param options: config overrides; copied into self.__options
        :raises EnvironmentError: if EGGROLL_HOME is not set
        :raises RuntimeError: if the standalone manager's port cannot be determined
        :raises ValueError: on an unsupported processor type
        """
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH, f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        #self._cluster_manager_client = ClusterManagerClient(options=options)

        self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
        # boot a local standalone manager unless processors were supplied or
        # an externally-debugged bootstrap was requested via env var
        if self.__is_standalone and not processors and os.environ.get("EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
            #port = int(options.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
            #                      static_er_conf.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, "4689")))
            # port 0 lets the manager pick a free port; the real port is
            # read back from the bootstrap log below
            port = 0
            random_value = str(random.random())
            os.environ['EGGROLL_STANDALONE_TAG'] = random_value
            if os.name != 'nt':
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
            else:
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'

            print("startup_command:", startup_command)
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.Popen(startup_command, shell=True, stdout=outfile, stderr=errfile)
                manager_process.wait()
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(session_id, log_dir):
                # kill the bootstrap process tagged with this session's
                # random value (registered below with atexit)
                standalone_tag = f'eggroll.standalone.tag={random_value}'
                if os.name != 'nt':
                    shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                else:
                    pid_list = psutil.pids()
                    ret_pid = 0
                    exception = None
                    for pid in pid_list:
                        try:
                            p = psutil.Process(pid)
                            exception = None
                        except Exception as e:
                            exception = e
                            continue

                        if "java.exe" not in p.name():
                            continue
                        # if it is a system process, call p.cmdline() will dump
                        cmdline = p.cmdline()
                        if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                            continue

                        ret_pid = pid
                        break
                    # NOTE(review): `exception` only reflects the last pid
                    # probed; when no matching java.exe is found but the last
                    # probe succeeded, ret_pid stays 0 and taskkill targets
                    # pid 0 — confirm intended behavior
                    if exception:
                        raise RuntimeError("can not find the bootstrap process")

                    shutdown_command = f"taskkill /pid {ret_pid} /f"

                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
            max_retry_cnt = 100
            # wait (with growing sleep) for the bootstrap log file to appear
            for i in range(max_retry_cnt):
                msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
                L.info(msg)

                if os.path.exists(file_name):
                    break
                time.sleep(min(0.1 * i, 100))

            try:
                # scan the log for "<tag> server started at port <port>"
                for i in range(max_retry_cnt):
                    with open(file_name) as fp:
                        msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                        L.info(msg)

                        port = 0
                        key = f"{random_value} server started at port "
                        for line in fp.readlines():
                            if key in line:
                                port = int(line.rsplit('port ', 2)[1])
                                if port != 0:
                                    break

                        if port != 0:
                            break
                    time.sleep(min(0.1 * i, 100))
            except IOError as e:
                L.info(f"get port from {file_name} failed!")
                raise e

            if port == 0:
                raise RuntimeError(f"get port from {file_name} failed!")

            options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

        self._cluster_manager_client = ClusterManagerClient(options=options)
        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exception while starting up in standalone mod
        # retry session creation until the deadline, then re-raise
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(session_meta)
                break
            except:
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
        self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
        self._rolls = list()
        self._eggs = dict()

        # index processors: egg-pairs grouped by server node id, roll-pair
        # masters collected flat
        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(f'processor type {processor_type} not supported in roll pair')

    def get_rank_in_node(self, partition_id, server_node_id):
        """Map a partition id to the egg rank within *server_node_id*.

        Partitions rotate across nodes first and across a node's processors
        second, so the rank is (partition_id // node_count) modulo the
        processor count of the node.
        """
        total_nodes = len(self._eggs)
        eggs_on_node = len(self._eggs[server_node_id])
        return (partition_id // total_nodes) % eggs_on_node

    def route_to_egg(self, partition: ErPartition):
        """Resolve the egg processor that serves *partition*.

        Uses the partition's cached rank when it is present and
        non-negative; otherwise recomputes the rank from the partition id.
        """
        node_id = partition._processor._server_node_id
        rank = partition._rank_in_node
        if rank is None or rank < 0:
            rank = self.get_rank_in_node(partition_id=partition._id,
                                         server_node_id=node_id)
        return self.route_to_egg_by_rank(node_id, rank)

    def route_to_egg_by_rank(self, server_node_id, rank_in_node):
        """Return the egg at (server_node_id, rank_in_node).

        :raises ValueError: when the egg's command endpoint has no host or
            a non-positive port
        """
        egg = self._eggs[server_node_id][rank_in_node]
        endpoint = egg._command_endpoint
        if not endpoint._host or endpoint._port <= 0:
            raise ValueError(f'error routing to egg: {egg} in session: {self.__session_id}')
        return egg

    def populate_processor(self, store: ErStore):
        """Return a copy of *store* whose partitions carry resolved egg
        processors and ranks."""
        resolved = []
        for partition in store._partitions:
            node_id = partition._processor._server_node_id
            rank = self.get_rank_in_node(partition._id, node_id)
            resolved.append(
                    ErPartition(id=partition._id,
                                store_locator=partition._store_locator,
                                processor=self.route_to_egg_by_rank(node_id, rank),
                                rank_in_node=rank))
        return ErStore(store_locator=store._store_locator,
                       partitions=resolved,
                       options=store._options)

    def submit_job(self,
            job: ErJob,
            output_types: list = None,
            command_uri: CommandURI = None,
            create_output_if_missing=True):
        """Decompose *job* into per-partition tasks and dispatch them
        asynchronously via the command client.

        :param job: job to run
        :param output_types: expected result types; defaults to [ErTask]
        :param command_uri: command route to invoke
        :param create_output_if_missing: populate output stores before decomposition
        :return: list of futures, one per dispatched task batch
        """
        if not output_types:
            output_types = [ErTask]
        if create_output_if_missing:
            job = self.populate_output_store(job)
        tasks = self._decompose_job(job)
        return CommandClient().async_call(args=tasks,
                                          output_types=output_types,
                                          command_uri=command_uri)

    def wait_until_job_finished(self, task_futures: list, timeout=None, return_when=FIRST_EXCEPTION):
        """Block on *task_futures* and return the set of completed futures."""
        settled = wait(task_futures, timeout=timeout, return_when=return_when)
        return settled.done

    def _decompose_job(self, job: ErJob):
        """Split *job* into per-partition task batches routed to egg command
        endpoints.

        For each partition index i (up to the larger of the input/output
        partition counts), builds one ErTask covering the i-th partition of
        every input and output store, then emits (tasks, endpoint) pairs:
        one pair when input and output live on the same server node,
        otherwise one pair per side that exists at index i.

        :param job: job whose inputs (and optionally outputs) are decomposed
        :return: list of ([ErTask], command_endpoint) tuples
        """
        input_total_partitions = job._inputs[0]._store_locator._total_partitions
        output_total_partitions = 0 \
            if not job._outputs \
            else job._outputs[0]._store_locator._total_partitions

        larger_total_partitions = max(input_total_partitions, output_total_partitions)

        # resolve actual egg processors for the first input/output stores
        populated_input_partitions = self.populate_processor(job._inputs[0])._partitions

        if output_total_partitions > 0:
            populated_output_partitions = self.populate_processor(job._outputs[0])._partitions
        else:
            populated_output_partitions = list()

        result = list()
        for i in range(larger_total_partitions):
            input_partitions = list()
            output_partitions = list()

            # partition i may exist on the input side only, the output side
            # only, or both
            if i < input_total_partitions:
                input_processor = populated_input_partitions[i]._processor
                input_server_node_id = input_processor._server_node_id
                for input_store in job._inputs:
                    input_partitions.append(ErPartition(
                            id=i,
                            store_locator=input_store._store_locator,
                            processor=input_processor))
            else:
                input_processor = None
                input_server_node_id = None

            if i < output_total_partitions:
                output_processor = populated_output_partitions[i]._processor
                output_server_node_id = output_processor._server_node_id
                for output_store in job._outputs:
                    output_partitions.append(ErPartition(
                            id=i,
                            store_locator=output_store._store_locator,
                            processor=output_processor))
            else:
                output_processor = None
                output_server_node_id = None

            tasks = [ErTask(id=generate_task_id(job._id, i),
                           name=f'{job._name}',
                           inputs=input_partitions,
                           outputs=output_partitions,
                           job=job)]
            # same node for both sides: dispatch once; otherwise dispatch to
            # each side that exists
            if input_server_node_id == output_server_node_id:
                result.append(
                        (tasks, input_processor._command_endpoint))
            else:
                if input_server_node_id is not None:
                    result.append(
                            (tasks, input_processor._command_endpoint))
                if output_server_node_id is not None:
                    result.append(
                            (tasks, output_processor._command_endpoint))

        return result

    def populate_output_store(self, job: ErJob):
        """Return a copy of *job* whose output store exists and has resolved
        processors.

        Derives an output proposal (forking the first input when the job has
        no output), optionally clears its partitions so the cluster manager
        re-assigns nodes (controlled by the 'refresh_nodes' job option),
        creates the store via the cluster manager, and validates that the
        actual partition count matches the proposal.

        :param job: job whose outputs may be missing or unpopulated
        :return: deep copy of *job* with _outputs set to the populated store
        :raises ValueError: when actual and proposed partition counts differ
        """
        is_output_blank = not job._outputs or not job._outputs[0]
        is_output_not_populated = is_output_blank or not job._outputs[0]._partitions
        if is_output_not_populated:
            if is_output_blank:
                # no output store given: derive one from the first input
                final_output_proposal = job._inputs[0].fork()
            else:
                final_output_proposal = job._outputs[0]

            # refresh_nodes defaults to True when absent
            refresh_nodes = job._options.get('refresh_nodes')
            if refresh_nodes is None or refresh_nodes:
                final_output_proposal._partitions = []
            else:
                if not final_output_proposal._partitions:
                    final_output_proposal._partitions = job._inputs[0]._partitions
        else:
            final_output_proposal = job._outputs[0]

        final_output = self.populate_processor(
                self._cluster_manager_client.get_or_create_store(final_output_proposal))

        if final_output._store_locator._total_partitions != \
                final_output_proposal._store_locator._total_partitions:
            raise ValueError(f'partition count of actual output and proposed output does not match. '
                             f'actual={final_output}, proposed={final_output_proposal}')
        final_job = deepcopy(job)
        final_job._outputs = [final_output]

        return final_job

    def stop(self):
        """Gracefully stop the session: run the registered exit tasks,
        mark this object as stopped, then ask the cluster manager to
        stop the session. Returns the cluster manager's response."""
        session_id = self.__session_id
        L.info(f'stopping session (gracefully): {session_id}')
        L.debug(f'stopping session (gracefully), details: {self.__session_meta}')
        L.debug(f'stopping (gracefully) for {session_id} from: {get_stack()}')
        self.run_exit_tasks()
        self.stopped = True
        return self._cluster_manager_client.stop_session(self.__session_meta)

    def kill(self):
        """Forcefully kill the session.

        First attempts a graceful stop() on the executor with a 1-second
        deadline; whether or not that completes, the session is then
        killed via the cluster manager. Returns the cluster manager's
        kill response.
        """
        L.info(f'killing session (forcefully): {self.__session_id}')
        L.debug(f'killing session (forcefully), details: {self.__session_meta}')
        L.debug(f'killing (forcefully) for {self.__session_id} from: {get_stack()}')
        self.stopped = True

        # best-effort graceful stop; FIRST_EXCEPTION makes wait() return
        # early if stop() raises
        future = self.executor.submit(self.stop)
        done = wait([future], timeout=1, return_when=FIRST_EXCEPTION).done
        if done:
            L.info(f'stopped successfully before kill session: {self.__session_id}')
        else:
            # fix: Logger.warn() is a deprecated alias of warning()
            L.warning(f'stopped timeout before kill session: {self.__session_id}')

        return self._cluster_manager_client.kill_session(self.__session_meta)

    def get_session_id(self):
        """Return the unique id of this session."""
        return self.__session_id

    def get_session_meta(self):
        """Return the ErSessionMeta describing this session."""
        return self.__session_meta

    # todo:1: rename add_exit_task? not necessarily a cleanup semantic
    def add_exit_task(self, func):
        """Register *func* to be invoked when the session stops
        (see run_exit_tasks)."""
        self.__exit_tasks.append(func)

    def run_exit_tasks(self):
        """Invoke every registered exit task, in registration order."""
        L.debug(f'running exit tasks: {self.__session_id}')
        for task in self.__exit_tasks:
            task()

    def get_option(self, key, default=None):
        """Return the session option for *key*, or *default* if absent."""
        return self.__options.get(key, default)

    def has_option(self, key):
        """Return True iff *key* maps to a non-None value.

        Note: a key explicitly stored with value None counts as absent.
        """
        return self.__options.get(key) is not None

    def get_all_options(self):
        """Return a shallow copy of all session options."""
        return dict(self.__options)

    def is_stopped(self):
        """Return whether this session has been stopped or killed."""
        return self.stopped
# Exemple #5
# 0
    def __init__(self,
                 session_id=None,
                 name='',
                 tag='',
                 processors: list = None,
                 options: dict = None):
        """Initialise an eggroll session.

        Loads static configuration from EGGROLL_HOME, optionally boots a
        standalone cluster manager, then creates/attaches the session via
        the cluster manager (retrying until the configured start timeout)
        and indexes its processors into roll/egg lookup structures.

        :param session_id: id of an existing session to attach to; when
            falsy, a unique id is generated from timestamp and local ip.
        :param name: human-readable session name stored in the meta.
        :param tag: free-form tag stored in the meta.
        :param processors: pre-allocated processors; when non-empty the
            session is registered as-is instead of get-or-created.
        :param options: configuration overrides.
        :raises EnvironmentError: if EGGROLL_HOME is not set.
        :raises ValueError: if a processor type is not supported.
        """
        # guard against shared mutable default arguments
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        # static conf path may be overridden via options
        conf_path = options.get(
            CoreConfKeys.STATIC_CONF_PATH,
            f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        self._cluster_manager_client = ClusterManagerClient(options=options)

        # standalone mode auto-bootstrap: only on non-Windows, with no
        # pre-allocated processors, and unless disabled via env var
        self.__is_standalone = options.get(
            SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE,
            "") == DeployModes.STANDALONE
        if self.__is_standalone and os.name != 'nt' and not processors and os.environ.get(
                "EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP", "1") == "1":
            # port resolution order: options -> static conf -> 4670
            port = int(
                options.get(
                    ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                    static_er_conf.get(
                        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                        "4670")))
            startup_command = f'bash {self.__eggroll_home}/bin/eggroll_boot_standalone.sh -c {conf_path} -s {self.__session_id}'
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            # run the bootstrap synchronously, appending its output to the
            # standalone manager log files
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.run(startup_command,
                                                 shell=True,
                                                 stdout=outfile,
                                                 stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            # kills the bootstrapped manager process(es) matching this
            # port and session id when the python process exits
            def shutdown_standalone_manager(port, session_id, log_dir):
                shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{port}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out',
                          'a+') as outfile, open(
                              f'{log_dir}/standalone-manager.err',
                              'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command,
                                                     shell=True,
                                                     stdout=outfile,
                                                     stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            atexit.register(shutdown_standalone_manager, port,
                            self.__session_id, bootstrap_log_dir)

        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        # timeout in seconds plus a small grace period
        timeout = int(
            SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(
                options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exceptions while starting up in standalone mode;
        # retries every 100ms until the deadline, then re-raises.
        # NOTE(review): bare except also swallows KeyboardInterrupt /
        # SystemExit during the retry window — consider narrowing.
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(
                        session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(
                        session_meta)
                break
            except:
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise
​
        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(
            f'session init finished: {self.__session_id}, details: {self.__session_meta}'
        )
        # a session returned already CLOSED or KILLED counts as stopped
        self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
        self._rolls = list()
        self._eggs = dict()

        # index processors: egg pairs grouped by server node id, roll pair
        # masters collected into a flat list
        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(
                    f'processor type {processor_type} not supported in roll pair'
                )
# Exemple #6
# 0
def serve(args):
    """Start the egg_pair service and block until terminated.

    Boots the command gRPC server (and a separate transfer server unless
    args.transfer_port == "-1", in which case both services share one
    server/port), optionally reports itself to the cluster manager via
    heartbeat, then sleeps until SIGTERM/SIGINT, after which it reports a
    STOPPED status.

    :param args: parsed CLI args providing data_dir, port, transfer_port,
        cluster_manager, session_id, server_node_id and processor_id.
    :raises ValueError: if cluster_manager is set but session_id is missing.
    """
    prefix = 'v1/egg-pair'

    set_data_dir(args.data_dir)

    # route incoming runTask commands to EggPair.run_task
    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    # fix: the original `2 << 30 - 1` parses as `2 << 29` (1 GiB) because
    # `-` binds tighter than `<<`; the intended limit is (2 << 30) - 1
    # == 2**31 - 1 (INT32_MAX), gRPC's maximum message size.
    max_message_size = (2 << 30) - 1

    command_server = grpc.server(
        futures.ThreadPoolExecutor(max_workers=500,
                                   thread_name_prefix="grpc_server"),
        options=[("grpc.max_metadata_size", 32 << 20),
                 (cygrpc.ChannelArgKey.max_send_message_length,
                  max_message_size),
                 (cygrpc.ChannelArgKey.max_receive_message_length,
                  max_message_size)])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    # bind; add_insecure_port returns the actual port (useful for port 0)
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # share one server/port for both command and transfer services
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server = grpc.server(
            futures.ThreadPoolExecutor(max_workers=500,
                                       thread_name_prefix="transfer_server"),
            options=[
                (cygrpc.ChannelArgKey.max_send_message_length,
                 max_message_size),
                (cygrpc.ChannelArgKey.max_receive_message_length,
                 max_message_size),
                ('grpc.max_metadata_size', 32 << 20)
            ])
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()

    L.info(
        f"starting egg_pair service, port:{port}, transfer port: {transfer_port}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        # describe this process so the cluster manager can track it
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=int(args.server_node_id),
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=os.getpid(),
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip(
        ).split(':')

        L.info(f'cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

    L.info(f'egg_pair started at port {port}, transfer_port {transfer_port}')

    run = True

    # signal handler flips the flag so the sleep loop below exits
    def exit_gracefully(signum, frame):
        nonlocal run
        run = False

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    import time

    while run:
        time.sleep(1)

    if cluster_manager:
        # report STOPPED so the cluster manager stops tracking this process
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    L.info(
        f'egg_pair at port {port}, transfer_port {transfer_port} stopped gracefully'
    )