Example No. 1
    def __init__(self, adapter: RollSiteAdapter, options: dict = None):
        if options is None:
            options = {}
        self.adapter = adapter

        self.roll_site_header: ErRollSiteHeader = adapter.roll_site_header
        self.namespace = adapter.namespace
        self.name = create_store_name(self.roll_site_header)

        self.tagged_key = ''
        self.obj_type = adapter.obj_type

        self.proxy_endpoint = adapter.proxy_endpoint
        # grpc_channel_factory is assumed to be initialized outside this excerpt
        # (e.g. as a shared, class-level GrpcChannelFactory instance)
        channel = self.grpc_channel_factory.create_channel(self.proxy_endpoint)
        self.stub = proxy_pb2_grpc.DataTransferServiceStub(channel)

        static_er_conf = get_static_er_conf()
        self.__bin_packet_len = int(options.get(
                RollSiteConfKeys.EGGROLL_ROLLSITE_ADAPTER_SENDBUF_SIZE.key,
                static_er_conf.get(RollSiteConfKeys.EGGROLL_ROLLSITE_ADAPTER_SENDBUF_SIZE.key,
                                   RollSiteConfKeys.EGGROLL_ROLLSITE_ADAPTER_SENDBUF_SIZE.default_value)))
        self.total_written = 0

        self.ba = bytearray(self.__bin_packet_len)
        self.buffer = ArrayByteBuffer(self.ba)
        self.writer = PairBinWriter(pair_buffer=self.buffer)

        self.push_batch_cnt = 0
        self.push_pair_cnt = 0

        self.topic_src = proxy_pb2.Topic(name=self.name, partyId=self.roll_site_header._src_party_id,
                                         role=self.roll_site_header._src_role, callback=None)
        self.topic_dst = proxy_pb2.Topic(name=self.name, partyId=self.roll_site_header._dst_party_id,
                                         role=self.roll_site_header._dst_role, callback=None)
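
The send-buffer size above is resolved in three layers: the call-time `options` dict, then the static conf returned by `get_static_er_conf()`, then the key's built-in default. A minimal standalone sketch of that precedence; `resolve_conf` and the literal key/values are illustrative stand-ins, not EggRoll API:

# Minimal sketch of the layered lookup used for __bin_packet_len above.
# resolve_conf and the sample key/values are hypothetical; only the
# options -> static conf -> default precedence mirrors the snippet.
def resolve_conf(key, options, static_conf, default):
    # call-time options win, then the static conf, then the default
    return options.get(key, static_conf.get(key, default))

options = {}                                                      # nothing overridden at call time
static_conf = {'eggroll.rollsite.adapter.sendbuf.size': '65536'}  # illustrative key/value
buf_len = int(resolve_conf('eggroll.rollsite.adapter.sendbuf.size',
                           options, static_conf, default='1048576'))
print(buf_len)  # 65536: the static conf overrides the built-in default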
Example No. 2
    def __init__(self, options=None):
        if options is None:
            options = {}
        static_er_conf = get_static_er_conf()
        host = options.get(
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST,
            static_er_conf.get(
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST, None))
        port = options.get(
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
            static_er_conf.get(
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, None))

        if not host or not port:
            raise ValueError(
                f'failed to load host or port in creating cluster manager client. host: {host}, port: {port}'
            )

        self.__endpoint = ErEndpoint(host, int(port))
        self.__serdes_type = options.get('serdes_type', SerdesTypes.PROTOBUF)
        self.__command_client = CommandClient()
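
A usage sketch for the constructor above, assuming the names shown in these examples (ClusterManagerClient, ClusterManagerConfKeys, SerdesTypes) are already imported elsewhere; passing host and port explicitly in `options` means the static conf is never consulted:

# Usage sketch only: the constructor contract is taken from the snippet above,
# and the imports of ClusterManagerClient, ClusterManagerConfKeys and
# SerdesTypes are assumed to exist elsewhere. The port value is illustrative.
client = ClusterManagerClient(options={
    ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST: 'localhost',
    ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: '4670',
    'serdes_type': SerdesTypes.PROTOBUF,  # optional; PROTOBUF is the default
})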
Example No. 3
def serve(args):
    prefix = 'v1/egg-pair'

    set_data_dir(args.data_dir)

    CommandRouter.get_instance().register(
        service_name=f"{prefix}/runTask",
        route_to_module_name="eggroll.roll_pair.egg_pair",
        route_to_class_name="EggPair",
        route_to_method_name="run_task")

    max_workers = int(
        RollPairConfKeys.
        EGGROLL_ROLLPAIR_EGGPAIR_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
    executor_pool_type = CoreConfKeys.EGGROLL_CORE_DEFAULT_EXECUTOR_POOL.get()
    command_server = grpc.server(
        create_executor_pool(canonical_name=executor_pool_type,
                             max_workers=max_workers,
                             thread_name_prefix="eggpair-command-server"),
        options=
        [("grpc.max_metadata_size",
          int(CoreConfKeys.
              EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())
          ),
         ('grpc.max_send_message_length',
          int(CoreConfKeys.
              EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())
          ),
         ('grpc.max_receive_message_length',
          int(CoreConfKeys.
              EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())
          ),
         ('grpc.keepalive_time_ms',
          int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get())
          * 1000),
         ('grpc.keepalive_timeout_ms',
          int(CoreConfKeys.
              CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) *
          1000),
         ('grpc.keepalive_permit_without_calls',
          int(CoreConfKeys.
              CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.
              get())),
         ('grpc.per_rpc_retry_buffer_size',
          int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.
              get())), ('grpc.so_reuseport', False)])

    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(
        command_servicer, command_server)

    transfer_servicer = GrpcTransferServicer()

    port = args.port
    transfer_port = args.transfer_port

    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
    else:
        transfer_server_max_workers = int(
            RollPairConfKeys.
            EGGROLL_ROLLPAIR_EGGPAIR_DATA_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
        transfer_server = grpc.server(
            create_executor_pool(canonical_name=executor_pool_type,
                                 max_workers=transfer_server_max_workers,
                                 thread_name_prefix="transfer_server"),
            options=
            [('grpc.max_metadata_size',
              int(CoreConfKeys.
                  EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.
                  get())),
             ('grpc.max_send_message_length',
              int(CoreConfKeys.
                  EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.
                  get())),
             ('grpc.max_receive_message_length',
              int(CoreConfKeys.
                  EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.
                  get())),
             ('grpc.keepalive_time_ms',
              int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get())
              * 1000),
             ('grpc.keepalive_timeout_ms',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get())
              * 1000),
             ('grpc.keepalive_permit_without_calls',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED
                  .get())),
             ('grpc.per_rpc_retry_buffer_size',
              int(CoreConfKeys.
                  CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.get())),
             ('grpc.so_reuseport', False)])
        transfer_port = transfer_server.add_insecure_port(
            f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(
            transfer_servicer, transfer_server)
        transfer_server.start()
    pid = os.getpid()

    L.info(
        f"starting egg_pair service, port: {port}, transfer port: {transfer_port}, pid: {pid}"
    )
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id
        server_node_id = int(args.server_node_id)
        static_er_conf = get_static_er_conf()
        static_er_conf['server_node_id'] = server_node_id

        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=server_node_id,
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost',
                                                         port=port),
                             transfer_endpoint=ErEndpoint(host='localhost',
                                                          port=transfer_port),
                             pid=pid,
                             options=options,
                             status=ProcessorStatus.RUNNING)

        cluster_manager_host, cluster_manager_port = cluster_manager.strip().split(':')

        L.info(f'egg_pair cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(
            options={
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST:
                cluster_manager_host,
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT:
                cluster_manager_port
            })
        cluster_manager_client.heartbeat(myself)

        if platform.system() == "Windows":
            t1 = threading.Thread(target=stop_processor,
                                  args=[cluster_manager_client, myself])
            t1.start()

    L.info(f'egg_pair started at port={port}, transfer_port={transfer_port}')

    run = True

    def exit_gracefully(signum, frame):
        nonlocal run
        run = False
        L.info(
            f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} receives signal {signal.Signals(signum).name}, stopping gracefully.'
        )

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    L.info('sending exit heartbeat to cluster manager')
    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    GrpcChannelFactory.shutdown_all_now()

    L.info('closing open RocksDB dbs')
    #todo:1: move to RocksdbAdapter and provide a cleanup method
    from eggroll.core.pair_store.rocksdb import RocksdbAdapter
    for path, db in RocksdbAdapter.db_dict.items():
        # del only drops the loop-local name; the real cleanup belongs in RocksdbAdapter (see todo)
        del db

    gc.collect()

    L.info(f'system metric at exit: {get_system_metric(1)}')
    L.info(
        f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} stopped gracefully'
    )
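
The shutdown handling in serve() boils down to a flag flipped by a signal handler and polled by the main loop, while the gRPC servers keep running on their own threads. A standalone, standard-library-only sketch of that pattern:

# Standalone sketch of the graceful-shutdown loop used by serve();
# standard library only, no EggRoll code involved.
import signal
import time

def main():
    run = True

    def exit_gracefully(signum, frame):
        nonlocal run
        run = False
        print(f'received {signal.Signals(signum).name}, stopping gracefully')

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)  # the real work happens on the gRPC server threads

    # cleanup (final heartbeat, channel shutdown, ...) would go here

if __name__ == '__main__':
    main()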
Example No. 4
    def _run_unary(self, func, task, shuffle=False, reduce_op=None):
        input_store_head = task._job._inputs[0]
        output_store_head = task._job._outputs[0]
        input_key_serdes = create_serdes(
            input_store_head._store_locator._serdes)
        input_value_serdes = create_serdes(
            input_store_head._store_locator._serdes)
        output_key_serdes = create_serdes(
            output_store_head._store_locator._serdes)
        output_value_serdes = create_serdes(
            output_store_head._store_locator._serdes)

        if input_key_serdes != output_key_serdes or \
                input_value_serdes != output_value_serdes:
            raise ValueError(
                f"input key-value serdes:{(input_key_serdes, input_value_serdes)}"
                f"differ from output key-value serdes:{(output_key_serdes, output_value_serdes)}"
            )

        if shuffle:
            from eggroll.roll_pair.transfer_pair import TransferPair
            input_total_partitions = input_store_head._store_locator._total_partitions
            output_total_partitions = output_store_head._store_locator._total_partitions
            output_store = output_store_head

            my_server_node_id = get_static_er_conf().get(
                'server_node_id', None)
            shuffler = TransferPair(transfer_id=task._job._id)
            if not task._outputs or \
                    (my_server_node_id is not None
                     and my_server_node_id != task._outputs[0]._processor._server_node_id):
                store_future = None
            else:
                store_future = shuffler.store_broker(
                    store_partition=task._outputs[0],
                    is_shuffle=True,
                    total_writers=input_total_partitions,
                    reduce_op=reduce_op)

            if not task._inputs or \
                    (my_server_node_id is not None
                     and my_server_node_id != task._inputs[0]._processor._server_node_id):
                scatter_future = None
            else:
                shuffle_broker = FifoBroker()
                write_bb = BatchBroker(shuffle_broker)
                try:
                    scatter_future = shuffler.scatter(
                        input_broker=shuffle_broker,
                        partition_function=partitioner(
                            hash_func=hash_code,
                            total_partitions=output_total_partitions),
                        output_store=output_store)
                    with create_adapter(task._inputs[0]) as input_db, \
                        input_db.iteritems() as rb:
                        func(rb, input_key_serdes, input_value_serdes,
                             write_bb)
                finally:
                    write_bb.signal_write_finish()

            if scatter_future:
                scatter_results = scatter_future.result()
            else:
                scatter_results = 'no scatter for this partition'
            if store_future:
                store_results = store_future.result()
            else:
                store_results = 'no store for this partition'
        else:  # no shuffle
            with create_adapter(task._inputs[0]) as input_db, \
                    input_db.iteritems() as rb, \
                    create_adapter(task._outputs[0], options=task._job._options) as db, \
                    db.new_batch() as wb:
                func(rb, input_key_serdes, input_value_serdes, wb)
            L.trace(f"close_store_adatper:{task._inputs[0]}")
Example No. 5
    def __init__(self,
            session_id=None,
            name='',
            tag='',
            processors: list = None,
            options: dict = None):
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH, f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        #self._cluster_manager_client = ClusterManagerClient(options=options)

        self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE
        if self.__is_standalone and not processors and os.environ.get("EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
            #port = int(options.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
            #                      static_er_conf.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, "4689")))
            port = 0
            random_value = str(random.random())
            os.environ['EGGROLL_STANDALONE_TAG'] = random_value
            if os.name != 'nt':
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
            else:
                startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'

            print("startup_command:", startup_command)
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.Popen(startup_command, shell=True, stdout=outfile, stderr=errfile)
                manager_process.wait()
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(session_id, log_dir):
                standalone_tag = f'eggroll.standalone.tag={random_value}'
                if os.name != 'nt':
                    shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                else:
                    pid_list = psutil.pids()
                    ret_pid = 0
                    for pid in pid_list:
                        try:
                            p = psutil.Process(pid)
                        except Exception:
                            # the pid may have exited or be inaccessible; skip it
                            continue

                        if "java.exe" not in p.name():
                            continue
                        # p.cmdline() can fail for system processes, hence the name filter above
                        cmdline = p.cmdline()
                        if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                            continue

                        ret_pid = pid
                        break
                    if ret_pid == 0:
                        raise RuntimeError("cannot find the bootstrap process")

                    shutdown_command = f"taskkill /pid {ret_pid} /f"

                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
            max_retry_cnt = 100
            for i in range(max_retry_cnt):
                msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
                L.info(msg)

                if os.path.exists(file_name):
                    break
                time.sleep(min(0.1 * i, 100))

            try:
                for i in range(max_retry_cnt):
                    with open(file_name) as fp:
                        msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                        L.info(msg)

                        port = 0
                        key = f"{random_value} server started at port "
                        for line in fp.readlines():
                            if key in line:
                                port = int(line.rsplit('port ', 2)[1])
                                if port != 0:
                                    break

                        if port != 0:
                            break
                    time.sleep(min(0.1 * i, 100))
            except IOError as e:
                L.info(f"get port from {file_name} failed!")
                raise e

            if port == 0:
                raise RuntimeError(f"get port from {file_name} failed!")

            options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
            atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

        self._cluster_manager_client = ClusterManagerClient(options=options)
        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exceptions while starting up in standalone mode
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(session_meta)
                break
            except Exception:
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
        self.stopped = self.__session_meta._status in (SessionStatus.CLOSED, SessionStatus.KILLED)
        self._rolls = list()
        self._eggs = dict()

        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(f'processor type {processor_type} not supported in roll pair')
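
The static conf loaded at the top of this constructor is an INI-style read of eggroll.properties whose [eggroll] section is handed to set_static_er_conf(). A standard-library-only sketch of that step; the path and the looked-up key are illustrative:

# Standalone sketch of the static-conf load above; standard library only.
# The path and the looked-up key are illustrative, not authoritative.
import configparser

configs = configparser.ConfigParser()
configs.read('conf/eggroll.properties')        # would be $EGGROLL_HOME/conf/eggroll.properties
if 'eggroll' in configs:
    static_er_conf = dict(configs['eggroll'])  # what set_static_er_conf() receives
    print(static_er_conf.get('eggroll.cluster.manager.port', '4670'))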
Example No. 6
    def __init__(self,
                 session_id=None,
                 name='',
                 tag='',
                 processors: list = None,
                 options: dict = None):
        if processors is None:
            processors = []
        if options is None:
            options = {}
        if not session_id:
            self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
        else:
            self.__session_id = session_id

        self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
        if not self.__eggroll_home:
            raise EnvironmentError('EGGROLL_HOME is not set')

        if "EGGROLL_DEBUG" not in os.environ:
            os.environ['EGGROLL_DEBUG'] = "0"

        conf_path = options.get(
            CoreConfKeys.STATIC_CONF_PATH,
            f"{self.__eggroll_home}/conf/eggroll.properties")

        L.info(f"static conf path: {conf_path}")
        configs = configparser.ConfigParser()
        configs.read(conf_path)
        set_static_er_conf(configs['eggroll'])
        static_er_conf = get_static_er_conf()

        self.__options = options.copy()
        self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
        self._cluster_manager_client = ClusterManagerClient(options=options)

        self.__is_standalone = options.get(
            SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE,
            "") == DeployModes.STANDALONE
        if self.__is_standalone and os.name != 'nt' and not processors and os.environ.get(
                "EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP", "1") == "1":
            port = int(
                options.get(
                    ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                    static_er_conf.get(
                        ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                        "4670")))
            startup_command = f'bash {self.__eggroll_home}/bin/eggroll_boot_standalone.sh -c {conf_path} -s {self.__session_id}'
            import subprocess
            import atexit

            bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
            os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
            with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
                L.info(f'start up command: {startup_command}')
                manager_process = subprocess.run(startup_command,
                                                 shell=True,
                                                 stdout=outfile,
                                                 stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'start up returncode: {returncode}')

            def shutdown_standalone_manager(port, session_id, log_dir):
                shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{port}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
                L.info(f'shutdown command: {shutdown_command}')
                with open(f'{log_dir}/standalone-manager.out',
                          'a+') as outfile, open(
                              f'{log_dir}/standalone-manager.err',
                              'a+') as errfile:
                    manager_process = subprocess.run(shutdown_command,
                                                     shell=True,
                                                     stdout=outfile,
                                                     stderr=errfile)
                    returncode = manager_process.returncode
                    L.info(f'shutdown returncode: {returncode}')

            atexit.register(shutdown_standalone_manager, port,
                            self.__session_id, bootstrap_log_dir)

        session_meta = ErSessionMeta(id=self.__session_id,
                                     name=name,
                                     status=SessionStatus.NEW,
                                     tag=tag,
                                     processors=processors,
                                     options=options)

        from time import monotonic, sleep
        timeout = int(
            SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(
                options)) / 1000 + 2
        endtime = monotonic() + timeout

        # TODO:0: ignores exceptions while starting up in standalone mode
        while True:
            try:
                if not processors:
                    self.__session_meta = self._cluster_manager_client.get_or_create_session(
                        session_meta)
                else:
                    self.__session_meta = self._cluster_manager_client.register_session(
                        session_meta)
                break
            except Exception:
                if monotonic() < endtime:
                    sleep(0.1)
                else:
                    raise

        self.__exit_tasks = list()
        self.__processors = self.__session_meta._processors

        L.info(
            f'session init finished: {self.__session_id}, details: {self.__session_meta}'
        )
        self.stopped = self.__session_meta._status in (SessionStatus.CLOSED, SessionStatus.KILLED)
        self._rolls = list()
        self._eggs = dict()

        for processor in self.__session_meta._processors:
            processor_type = processor._processor_type
            if processor_type == ProcessorTypes.EGG_PAIR:
                server_node_id = processor._server_node_id
                if server_node_id not in self._eggs:
                    self._eggs[server_node_id] = list()
                self._eggs[server_node_id].append(processor)
            elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
                self._rolls.append(processor)
            else:
                raise ValueError(
                    f'processor type {processor_type} not supported in roll pair'
                )
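
Both session constructors end with the same deadline-based retry: keep calling the cluster manager until it answers or the timeout derived from EGGROLL_SESSION_START_TIMEOUT_MS passes, then re-raise. A generic, standard-library sketch of that loop; `retry_until` is a name introduced here, not EggRoll API:

# Generic sketch of the deadline-based retry used when registering the session;
# retry_until is a hypothetical helper, not part of EggRoll.
from time import monotonic, sleep

def retry_until(fn, timeout_sec, interval=0.1):
    endtime = monotonic() + timeout_sec
    while True:
        try:
            return fn()
        except Exception:
            if monotonic() < endtime:
                sleep(interval)
            else:
                raise

# e.g. retry_until(lambda: cluster_manager_client.get_or_create_session(session_meta),
#                  timeout_sec=22)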