Example #1
0
    def __init__(self,
            session_id=None,
            name='',
            tag='',
            processors: list = None,
            options: dict = None):
        """Create a session, bootstrapping a standalone manager if required.

        Loads the static configuration, optionally starts a standalone
        manager process and reads its port back from the bootstrap log, then
        obtains the session metadata from the cluster manager and indexes
        the processors it returns.

        Args:
            session_id: Session identifier. When falsy, one is generated
                from the current time and the local IP.
            name: Session name placed into the session metadata.
            tag: Session tag placed into the session metadata.
            processors: Processors to register. When empty or ``None`` the
                session is obtained with ``get_or_create_session``,
                otherwise with ``register_session``.
            options: Session options; ``None`` is treated as an empty dict.
                When a standalone manager is bootstrapped, the discovered
                cluster manager port is written into this dict.

        Raises:
            EnvironmentError: ``EGGROLL_HOME`` is not set.
            IOError: The bootstrap log file could not be opened.
            RuntimeError: No port could be read from the bootstrap log.
            ValueError: The session contains an unsupported processor type.
            Exception: Whatever the cluster manager call raised last, once
                the session start timeout has elapsed.
        """
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH, f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        # Keep a private copy so that the session id is not written into the
        # caller's dict.
        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id

        self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
        if self.__is_standalone and not processors and os.environ.get("EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
            # The bootstrap is started with port 0; the real port is read
            # back from the bootstrap log further down.
            # NOTE(review): presumably port 0 lets the bootstrap pick a free
            # port - confirm against the boot script.
            port = 0
            # Random tag used both to recognise this bootstrap's log line
            # and to find its process again at shutdown.
            random_value = str(random.random())
            os.environ['EGGROLL_STANDALONE_TAG'] = random_value
            if os.name != 'nt':
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
            else:
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'

            print("startup_command:", startup_command)
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.Popen(startup_command, shell=True, stdout=outfile, stderr=errfile)
                manager_process.wait()
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(session_id, log_dir):
                """Kill the standalone bootstrap process started above."""
                standalone_tag = f'eggroll.standalone.tag={random_value}'
                if os.name != 'nt':
                    shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                else:
                    ret_pid = 0
                    for pid in psutil.pids():
                        try:
                            p = psutil.Process(pid)
                            # Filter on the name first so that cmdline() is
                            # only queried for java processes.
                            if "java.exe" not in p.name():
                                continue
                            cmdline = p.cmdline()
                        except Exception:
                            # The process vanished or cannot be inspected;
                            # it is not a candidate, so move on.
                            continue

                        # NOTE(review): membership is tested against the
                        # argument list, i.e. whole arguments, while the
                        # POSIX branch greps substrings - confirm the tag is
                        # passed as a standalone argument.
                        if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                            continue

                        ret_pid = pid
                        break

                    # Decide on whether a pid was actually matched, not on
                    # whether the last inspected process raised.
                    if not ret_pid:
                        raise RuntimeError("can not find the bootstrap process")

                    shutdown_command = f"taskkill /pid {ret_pid} /f"

                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            # Phase 1: wait for the bootstrap log file to appear.
            file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
            max_retry_cnt = 100
            for i in range(max_retry_cnt):
                msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
                L.info(msg)

                if os.path.exists(file_name):
                    break
                time.sleep(min(0.1 * i, 100))

            # Phase 2: re-read the log until the line carrying this
            # bootstrap's tag and port shows up.
            try:
                for i in range(max_retry_cnt):
                    with open(file_name) as fp:
                        msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                        L.info(msg)

                        port = 0
                        key = f"{random_value} server started at port "
                        for line in fp:
                            if key in line:
                                # Split once from the right so the text after
                                # the last 'port ' is taken even when 'port '
                                # also occurs earlier in the line.
                                port = int(line.rsplit('port ', 1)[1])
                                if port != 0:
                                    break

                        if port != 0:
                            break
                    time.sleep(min(0.1 * i, 100))
            except IOError as e:
                L.info(f"get port from {file_name} failed!")
                raise e

            if port == 0:
                raise RuntimeError(f"get port from {file_name} failed!")

            # The client below is built from `options`, so the discovered
            # port has to be visible there as well as in the private copy.
            options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

        self._cluster_manager_client = ClusterManagerClient(options=options)
        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        # Configured start timeout (milliseconds) plus two seconds of slack.
        timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exception while starting up in standalone mode
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(session_meta)
                break
            except Exception:
                # Retry ordinary failures until the deadline; KeyboardInterrupt
                # and SystemExit are deliberately not caught so the user can
                # still abort while waiting.
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
        self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
        self._rolls = list()
        self._eggs = dict()

        # Index processors: egg pairs grouped by server node id, roll pair
        # masters collected in a flat list.
        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(f'processor type {processor_type} not supported in roll pair')
Example #2
0
    # Declare the command-line options as (short flag, long flag, extra
    # keyword arguments) and register them in order.
    args_parser = argparse.ArgumentParser()
    for short_flag, long_flag, extra_kwargs in (
            ('-d', '--data-dir', {}),
            ('-cm', '--cluster-manager', {}),
            ('-nm', '--node-manager', {}),
            ('-s', '--session-id', {}),
            ('-p', '--port', {'default': '0'}),
            ('-t', '--transfer-port', {'default': '0'}),
            ('-sn', '--server-node-id', {}),
            ('-prid', '--processor-id', {'default': '0'}),
            ('-c', '--config', {}),
    ):
        args_parser.add_argument(short_flag, long_flag, **extra_kwargs)

    args = args_parser.parse_args()

    EGGROLL_HOME = os.environ['EGGROLL_HOME']
    configs = configparser.ConfigParser()

    # An explicit --config wins; otherwise fall back to the properties file
    # under EGGROLL_HOME.
    conf_file = args.config
    if conf_file:
        L.info(f'reading config path: {conf_file}')
    else:
        conf_file = f'{EGGROLL_HOME}/conf/eggroll.properties'
        L.info(f'reading default config: {conf_file}')

    configs.read(conf_file)
    set_static_er_conf(configs['eggroll'])

    # The data dir given on the command line takes precedence over the
    # configured one.
    if configs and not args.data_dir:
        args.data_dir = configs['eggroll']['eggroll.data.dir']

    L.info(args)
    serve(args)
Example #3
0
    def __init__(self,
                 session_id=None,
                 name='',
                 tag='',
                 processors: list = None,
                 options: dict = None):
        """Create a session, bootstrapping a standalone manager if required.

        Loads the static configuration, optionally starts a standalone
        manager process through the boot script, then obtains the session
        metadata from the cluster manager and indexes the processors it
        returns.

        Args:
            session_id: Session identifier. When falsy, one is generated
                from the current time and the local IP.
            name: Session name placed into the session metadata.
            tag: Session tag placed into the session metadata.
            processors: Processors to register. When empty or ``None`` the
                session is obtained with ``get_or_create_session``,
                otherwise with ``register_session``.
            options: Session options; ``None`` is treated as an empty dict.

        Raises:
            EnvironmentError: ``EGGROLL_HOME`` is not set.
            ValueError: The session contains an unsupported processor type.
            Exception: Whatever the cluster manager call raised last, once
                the session start timeout has elapsed.
        """
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(
            CoreConfKeys.STATIC_CONF_PATH,
            f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        # Keep a private copy so that the session id is not written into the
        # caller's dict.
        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        self._cluster_manager_client = ClusterManagerClient(options=options)

        self.__is_standalone = options.get(
            SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE,
            "") == DeployModes.STANDALONE
        # Auto-bootstrap only for a standalone, non-Windows session without
        # pre-supplied processors, unless disabled through the environment.
        if self.__is_standalone and os.name != 'nt' and not processors and os.environ.get(
                "EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP", "1") == "1":
            # Port precedence: session options, then static conf, then the
            # literal default. It is only used here to find the process
            # again at shutdown.
            port = int(
                options.get(
                    ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                    static_er_conf.get(
                        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                        "4670")))
            startup_command = f'bash {self.__eggroll_home}/bin/eggroll_boot_standalone.sh -c {conf_path} -s {self.__session_id}'
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.run(startup_command,
                                                 shell=True,
                                                 stdout=outfile,
                                                 stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(port, session_id, log_dir):
                """Kill the bootstrap process matching the port and session."""
                shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{port}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out',
                          'a+') as outfile, open(
                              f'{log_dir}/standalone-manager.err',
                              'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command,
                                                     shell=True,
                                                     stdout=outfile,
                                                     stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            atexit.register(shutdown_standalone_manager, port,
                            self.__session_id, bootstrap_log_dir)

        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        # Configured start timeout (milliseconds) plus two seconds of slack.
        timeout = int(
            SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(
                options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exception while starting up in standalone mode
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(
                        session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(
                        session_meta)
                break
            except Exception:
                # Retry ordinary failures until the deadline; KeyboardInterrupt
                # and SystemExit are deliberately not caught so the user can
                # still abort while waiting.
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(
            f'session init finished: {self.__session_id}, details: {self.__session_meta}'
        )
        self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED
        self._rolls = list()
        self._eggs = dict()

        # Index processors: egg pairs grouped by server node id, roll pair
        # masters collected in a flat list.
        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(
                    f'processor type {processor_type} not supported in roll pair'
                )