def __init__(self, adapter: RollSiteAdapter, options: dict = None):
    """Bind a push writer to *adapter*'s roll-site routing info.

    Sets up the gRPC data-transfer stub for the proxy endpoint, a
    reusable send buffer sized from configuration, and the src/dst
    topics derived from the adapter's roll-site header.
    """
    if options is None:
        options = {}

    self.adapter = adapter
    self.roll_site_header: ErRollSiteHeader = adapter.roll_site_header
    self.namespace = adapter.namespace
    self.name = create_store_name(self.roll_site_header)
    self.tagged_key = ''
    self.obj_type = adapter.obj_type
    self.proxy_endpoint = adapter.proxy_endpoint

    # one channel/stub pair per writer, pointed at the proxy endpoint
    proxy_channel = self.grpc_channel_factory.create_channel(self.proxy_endpoint)
    self.stub = proxy_pb2_grpc.DataTransferServiceStub(proxy_channel)

    # send-buffer length: options override static conf, which overrides the default
    sendbuf_conf = RollSiteConfKeys.EGGROLL_ROLLSITE_ADAPTER_SENDBUF_SIZE
    static_conf = get_static_er_conf()
    self.__bin_packet_len = int(
            options.get(sendbuf_conf.key,
                        static_conf.get(sendbuf_conf.key,
                                        sendbuf_conf.default_value)))

    # reusable binary pair buffer and counters for pushed batches/pairs
    self.total_written = 0
    self.ba = bytearray(self.__bin_packet_len)
    self.buffer = ArrayByteBuffer(self.ba)
    self.writer = PairBinWriter(pair_buffer=self.buffer)
    self.push_batch_cnt = 0
    self.push_pair_cnt = 0

    # source / destination topics taken from the roll-site header
    header = self.roll_site_header
    self.topic_src = proxy_pb2.Topic(name=self.name,
                                     partyId=header._src_party_id,
                                     role=header._src_role,
                                     callback=None)
    self.topic_dst = proxy_pb2.Topic(name=self.name,
                                     partyId=header._dst_party_id,
                                     role=header._dst_role,
                                     callback=None)
def __init__(self, options=None):
    """Create a cluster-manager client from *options* / static conf.

    Host and port are looked up first in *options*, then in the static
    eggroll conf; both must resolve or a ValueError is raised.
    """
    if options is None:
        options = {}

    static_conf = get_static_er_conf()
    host_key = ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST
    port_key = ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT
    host = options.get(host_key, static_conf.get(host_key, None))
    port = options.get(port_key, static_conf.get(port_key, None))

    if not host or not port:
        raise ValueError(
                f'failed to load host or port in creating cluster manager client. host: {host}, port: {port}')

    self.__endpoint = ErEndpoint(host, int(port))
    # serdes defaults to protobuf unless explicitly supplied
    self.__serdes_type = options.get('serdes_type', SerdesTypes.PROTOBUF)
    self.__command_client = CommandClient()
def serve(args):
    """Start the egg_pair service and block until SIGTERM/SIGINT.

    Brings up a gRPC command server and either a shared or dedicated
    transfer server, optionally registers this processor with the
    cluster manager via heartbeat, then loops until a signal flips the
    run flag, finally sending a STOPPED heartbeat and cleaning up.

    :param args: parsed CLI args; uses data_dir, port, transfer_port,
        cluster_manager, session_id, server_node_id and processor_id.
    """
    prefix = 'v1/egg-pair'
    set_data_dir(args.data_dir)

    # route incoming runTask commands to EggPair.run_task
    CommandRouter.get_instance().register(
            service_name=f"{prefix}/runTask",
            route_to_module_name="eggroll.roll_pair.egg_pair",
            route_to_class_name="EggPair",
            route_to_method_name="run_task")

    max_workers = int(
            RollPairConfKeys.EGGROLL_ROLLPAIR_EGGPAIR_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
    executor_pool_type = CoreConfKeys.EGGROLL_CORE_DEFAULT_EXECUTOR_POOL.get()

    def _grpc_server_options():
        # gRPC channel options shared by the command and transfer servers.
        # FIX: the transfer server previously passed the boolean
        # KEEPALIVE_WITHOUT_CALLS_ENABLED flag * 1000 as
        # 'grpc.keepalive_time_ms'; both servers now use the actual
        # keepalive-time setting, matching the command server's
        # original configuration.
        return [
            ('grpc.max_metadata_size',
             int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_METADATA_SIZE.get())),
            ('grpc.max_send_message_length',
             int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
            ('grpc.max_receive_message_length',
             int(CoreConfKeys.EGGROLL_CORE_GRPC_SERVER_CHANNEL_MAX_INBOUND_MESSAGE_SIZE.get())),
            ('grpc.keepalive_time_ms',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_CHANNEL_KEEPALIVE_TIME_SEC.get()) * 1000),
            ('grpc.keepalive_timeout_ms',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_TIMEOUT_SEC.get()) * 1000),
            ('grpc.keepalive_permit_without_calls',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_KEEPALIVE_WITHOUT_CALLS_ENABLED.get())),
            ('grpc.per_rpc_retry_buffer_size',
             int(CoreConfKeys.CONFKEY_CORE_GRPC_SERVER_CHANNEL_RETRY_BUFFER_SIZE.get())),
            ('grpc.so_reuseport', False),
        ]

    command_server = grpc.server(
            create_executor_pool(canonical_name=executor_pool_type,
                                 max_workers=max_workers,
                                 thread_name_prefix="eggpair-command-server"),
            options=_grpc_server_options())
    command_servicer = CommandServicer()
    command_pb2_grpc.add_CommandServiceServicer_to_server(command_servicer,
                                                          command_server)

    transfer_servicer = GrpcTransferServicer()
    port = args.port
    transfer_port = args.transfer_port
    port = command_server.add_insecure_port(f'[::]:{port}')

    if transfer_port == "-1":
        # transfer traffic shares the command server
        transfer_server = command_server
        transfer_port = port
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(transfer_servicer,
                                                                transfer_server)
    else:
        transfer_server_max_workers = int(
                RollPairConfKeys.EGGROLL_ROLLPAIR_EGGPAIR_DATA_SERVER_EXECUTOR_POOL_MAX_SIZE.get())
        transfer_server = grpc.server(
                create_executor_pool(canonical_name=executor_pool_type,
                                     max_workers=transfer_server_max_workers,
                                     thread_name_prefix="transfer_server"),
                options=_grpc_server_options())
        transfer_port = transfer_server.add_insecure_port(f'[::]:{transfer_port}')
        transfer_pb2_grpc.add_TransferServiceServicer_to_server(transfer_servicer,
                                                                transfer_server)
        transfer_server.start()

    pid = os.getpid()
    L.info(f"starting egg_pair service, port: {port}, transfer port: {transfer_port}, pid: {pid}")
    command_server.start()

    cluster_manager = args.cluster_manager
    myself = None
    cluster_manager_client = None
    if cluster_manager:
        session_id = args.session_id
        server_node_id = int(args.server_node_id)
        static_er_conf = get_static_er_conf()
        static_er_conf['server_node_id'] = server_node_id
        if not session_id:
            raise ValueError('session id is missing')
        options = {SessionConfKeys.CONFKEY_SESSION_ID: args.session_id}
        # describe this processor so the cluster manager can track it
        myself = ErProcessor(id=int(args.processor_id),
                             server_node_id=server_node_id,
                             processor_type=ProcessorTypes.EGG_PAIR,
                             command_endpoint=ErEndpoint(host='localhost', port=port),
                             transfer_endpoint=ErEndpoint(host='localhost', port=transfer_port),
                             pid=pid,
                             options=options,
                             status=ProcessorStatus.RUNNING)
        cluster_manager_host, cluster_manager_port = cluster_manager.strip().split(':')
        L.info(f'egg_pair cluster_manager: {cluster_manager}')
        cluster_manager_client = ClusterManagerClient(options={
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST: cluster_manager_host,
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: cluster_manager_port})
        cluster_manager_client.heartbeat(myself)

        if platform.system() == "Windows":
            # NOTE(review): presumably stop_processor watches for shutdown on
            # Windows, where the signal handling below is limited -- confirm.
            t1 = threading.Thread(target=stop_processor,
                                  args=[cluster_manager_client, myself])
            t1.start()

    L.info(f'egg_pair started at port={port}, transfer_port={transfer_port}')
    run = True

    def exit_gracefully(signum, frame):
        # only flips the main-loop flag; teardown happens after the loop
        nonlocal run
        run = False
        L.info(f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} receives signum={signal.getsignal(signum)}, stopping gracefully.')

    signal.signal(signal.SIGTERM, exit_gracefully)
    signal.signal(signal.SIGINT, exit_gracefully)

    while run:
        time.sleep(1)

    L.info(f'sending exit heartbeat to cm')
    if cluster_manager:
        myself._status = ProcessorStatus.STOPPED
        cluster_manager_client.heartbeat(myself)

    GrpcChannelFactory.shutdown_all_now()

    L.info(f'closing RocksDB open dbs')
    # todo:1: move to RocksdbAdapter and provide a cleanup method
    # NOTE(review): `del db` only drops the loop-local reference; the open
    # adapters in db_dict are not explicitly closed -- relies on gc below.
    from eggroll.core.pair_store.rocksdb import RocksdbAdapter
    for path, db in RocksdbAdapter.db_dict.items():
        del db
    gc.collect()

    L.info(f'system metric at exit: {get_system_metric(1)}')
    L.info(f'egg_pair {args.processor_id} at port={port}, transfer_port={transfer_port}, pid={pid} stopped gracefully')
def _run_unary(self, func, task, shuffle=False, reduce_op=None):
    """Run a unary (single-input, single-output) task partition.

    :param func: callable(pair_iter, key_serdes, value_serdes, writer)
        that consumes the input partition and emits pairs to the writer.
    :param task: ErTask carrying the input/output partitions and job info.
    :param shuffle: when True, repartition via TransferPair scatter/store
        brokers instead of writing straight to the local output store.
    :param reduce_op: optional reducer forwarded to store_broker when
        shuffling.
    :raises ValueError: if input and output serdes configurations differ.
    """
    input_store_head = task._job._inputs[0]
    output_store_head = task._job._outputs[0]
    input_key_serdes = create_serdes(
            input_store_head._store_locator._serdes)
    input_value_serdes = create_serdes(
            input_store_head._store_locator._serdes)
    output_key_serdes = create_serdes(
            output_store_head._store_locator._serdes)
    output_value_serdes = create_serdes(
            output_store_head._store_locator._serdes)

    # serdes must match end to end; a mismatch would corrupt stored pairs
    if input_key_serdes != output_key_serdes or \
            input_value_serdes != output_value_serdes:
        raise ValueError(
                f"input key-value serdes:{(input_key_serdes, input_value_serdes)}"
                f"differ from output key-value serdes:{(output_key_serdes, output_value_serdes)}")

    if shuffle:
        from eggroll.roll_pair.transfer_pair import TransferPair
        input_total_partitions = input_store_head._store_locator._total_partitions
        output_total_partitions = output_store_head._store_locator._total_partitions
        output_store = output_store_head
        my_server_node_id = get_static_er_conf().get(
                'server_node_id', None)
        shuffler = TransferPair(transfer_id=task._job._id)

        # only store locally when this node owns the output partition
        if not task._outputs or \
                (my_server_node_id is not None
                 and my_server_node_id != task._outputs[0]._processor._server_node_id):
            store_future = None
        else:
            store_future = shuffler.store_broker(
                    store_partition=task._outputs[0],
                    is_shuffle=True,
                    total_writers=input_total_partitions,
                    reduce_op=reduce_op)

        # only scatter when this node owns the input partition
        if not task._inputs or \
                (my_server_node_id is not None
                 and my_server_node_id != task._inputs[0]._processor._server_node_id):
            scatter_future = None
        else:
            shuffle_broker = FifoBroker()
            write_bb = BatchBroker(shuffle_broker)
            try:
                scatter_future = shuffler.scatter(
                        input_broker=shuffle_broker,
                        partition_function=partitioner(
                                hash_func=hash_code,
                                total_partitions=output_total_partitions),
                        output_store=output_store)
                # feed the local input partition through func into the broker
                with create_adapter(task._inputs[0]) as input_db, \
                        input_db.iteritems() as rb:
                    func(rb, input_key_serdes, input_value_serdes, write_bb)
            finally:
                # must always signal so downstream readers can terminate
                write_bb.signal_write_finish()

        # wait for both sides of the shuffle to finish
        if scatter_future:
            scatter_results = scatter_future.result()
        else:
            scatter_results = 'no scatter for this partition'
        if store_future:
            store_results = store_future.result()
        else:
            store_results = 'no store for this partition'
    else:  # no shuffle: stream input pairs straight into the output batch
        with create_adapter(task._inputs[0]) as input_db, \
                input_db.iteritems() as rb, \
                create_adapter(task._outputs[0], options=task._job._options) as db, \
                db.new_batch() as wb:
            func(rb, input_key_serdes, input_value_serdes, wb)
        L.trace(f"close_store_adatper:{task._inputs[0]}")
def __init__(self, session_id=None, name='', tag='', processors: list = None, options: dict = None):
    """Create (or attach to) an eggroll session.

    In standalone mode (and unless bootstrap debug is enabled) this also
    launches the standalone manager process, scrapes its listening port
    from the bootstrap log, and registers an atexit hook to kill it.

    :param session_id: existing session id; generated when falsy.
    :param name: session name forwarded to the session meta.
    :param tag: session tag forwarded to the session meta.
    :param processors: pre-allocated processors; empty list means ask
        the cluster manager to create the session.
    :param options: session options; also consulted for conf path,
        deploy mode and cluster-manager port.
    :raises EnvironmentError: if EGGROLL_HOME is not set.
    :raises RuntimeError: if the standalone manager port cannot be read.
    """
    if processors is None:
        processors = []
    if options is None:
        options = {}
    if not session_id:
        self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
    else:
        self.__session_id = session_id

    self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
    if not self.__eggroll_home:
        raise EnvironmentError('EGGROLL_HOME is not set')

    if "EGGROLL_DEBUG" not in os.environ:
        os.environ['EGGROLL_DEBUG'] = "0"

    # load static conf from options-specified path or the default location
    conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH,
                            f"{self.__eggroll_home}/conf/eggroll.properties")
    L.info(f"static conf path: {conf_path}")
    configs = configparser.ConfigParser()
    configs.read(conf_path)
    set_static_er_conf(configs['eggroll'])
    static_er_conf = get_static_er_conf()

    self.__options = options.copy()
    self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
    self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE

    if self.__is_standalone and not processors and os.environ.get("EGGROLL_RESOURCE_MANAGER_BOOTSTRAP_DEBUG", "0") == "0":
        # port 0 lets the standalone manager pick one; the chosen port is
        # scraped from the bootstrap log below
        port = 0
        random_value = str(random.random())
        os.environ['EGGROLL_STANDALONE_TAG'] = random_value
        if os.name != 'nt':
            startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.sh -p {port} -s {self.__session_id}'
        else:
            startup_command = f'{self.__eggroll_home}/bin/eggroll_boot_standalone.py -p {port} -s {self.__session_id}'
        print("startup_command:", startup_command)
        import subprocess
        import atexit
        bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
        os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
        with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
            L.info(f'start up command: {startup_command}')
            manager_process = subprocess.Popen(startup_command, shell=True, stdout=outfile, stderr=errfile)
            manager_process.wait()
            returncode = manager_process.returncode
            L.info(f'start up returncode: {returncode}')

        def shutdown_standalone_manager(session_id, log_dir):
            # kill the bootstrap process tagged with this session's random tag
            standalone_tag = f'eggroll.standalone.tag={random_value}'
            if os.name != 'nt':
                shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{standalone_tag}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
            else:
                pid_list = psutil.pids()
                ret_pid = 0
                exception = None
                for pid in pid_list:
                    try:
                        p = psutil.Process(pid)
                        exception = None
                    except Exception as e:
                        exception = e
                        continue
                    if "java.exe" not in p.name():
                        continue
                    # if it is a system process, call p.cmdline() will dump
                    cmdline = p.cmdline()
                    if standalone_tag not in cmdline or '--bootstraps' not in cmdline:
                        continue
                    ret_pid = pid
                    break
                if exception:
                    raise RuntimeError("can not find the bootstrap process")
                shutdown_command = f"taskkill /pid {ret_pid} /f"
            L.info(f'shutdown command: {shutdown_command}')
            with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'shutdown returncode: {returncode}')

        # wait for the bootstrap log to appear, then scrape the port from it
        file_name = f'{self.__eggroll_home}/logs/eggroll/bootstrap-standalone-manager.out'
        max_retry_cnt = 100
        for i in range(max_retry_cnt):
            msg = f"retry get port from bootstrap-standalone-manager.out: retry_cnt: {i},"
            L.info(msg)
            if os.path.exists(file_name):
                break
            time.sleep(min(0.1 * i, 100))
        try:
            for i in range(max_retry_cnt):
                with open(file_name) as fp:
                    msg = f"retry get port of ClusterManager and NodeManager: retry_cnt: {i},"
                    L.info(msg)
                    port = 0
                    key = f"{random_value} server started at port "
                    for line in fp.readlines():
                        if key in line:
                            port = int(line.rsplit('port ', 2)[1])
                            if port != 0:
                                break
                if port != 0:
                    break
                time.sleep(min(0.1 * i, 100))
        except IOError as e:
            L.info(f"get port from {file_name} failed!")
            raise e
        if port == 0:
            raise RuntimeError(f"get port from {file_name} failed!")
        options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
        self.__options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = port
        atexit.register(shutdown_standalone_manager, self.__session_id, bootstrap_log_dir)

    self._cluster_manager_client = ClusterManagerClient(options=options)
    session_meta = ErSessionMeta(id=self.__session_id,
                                 name=name,
                                 status=SessionStatus.NEW,
                                 tag=tag,
                                 processors=processors,
                                 options=options)

    from time import monotonic, sleep
    timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
    endtime = monotonic() + timeout
    # TODO:0: ignores exception while starting up in standalone mod
    while True:
        try:
            if not processors:
                self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
            else:
                self.__session_meta = self._cluster_manager_client.register_session(session_meta)
            break
        # FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit during the startup retry loop
        except Exception:
            if monotonic() < endtime:
                sleep(0.1)
            else:
                raise

    self.__exit_tasks = list()
    self.__processors = self.__session_meta._processors
    L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
    self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED

    # index processors by role: eggs grouped per server node, rolls flat
    self._rolls = list()
    self._eggs = dict()
    for processor in self.__session_meta._processors:
        processor_type = processor._processor_type
        if processor_type == ProcessorTypes.EGG_PAIR:
            server_node_id = processor._server_node_id
            if server_node_id not in self._eggs:
                self._eggs[server_node_id] = list()
            self._eggs[server_node_id].append(processor)
        elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
            self._rolls.append(processor)
        else:
            raise ValueError(f'processor type {processor_type} not supported in roll pair')
def __init__(self, session_id=None, name='', tag='', processors: list = None, options: dict = None):
    """Create (or attach to) an eggroll session.

    In standalone mode on non-Windows hosts (with auto bootstrap
    enabled) this also launches the standalone manager via the boot
    script and registers an atexit hook that kills it by port and
    session id.

    :param session_id: existing session id; generated when falsy.
    :param name: session name forwarded to the session meta.
    :param tag: session tag forwarded to the session meta.
    :param processors: pre-allocated processors; empty list means ask
        the cluster manager to create the session.
    :param options: session options; also consulted for conf path,
        deploy mode and cluster-manager port.
    :raises EnvironmentError: if EGGROLL_HOME is not set.
    """
    if processors is None:
        processors = []
    if options is None:
        options = {}
    if not session_id:
        self.__session_id = f'er_session_py_{time_now(format=DEFAULT_DATETIME_FORMAT)}_{get_self_ip()}'
    else:
        self.__session_id = session_id

    self.__eggroll_home = os.getenv('EGGROLL_HOME', None)
    if not self.__eggroll_home:
        raise EnvironmentError('EGGROLL_HOME is not set')

    if "EGGROLL_DEBUG" not in os.environ:
        os.environ['EGGROLL_DEBUG'] = "0"

    # load static conf from options-specified path or the default location
    conf_path = options.get(CoreConfKeys.STATIC_CONF_PATH,
                            f"{self.__eggroll_home}/conf/eggroll.properties")
    L.info(f"static conf path: {conf_path}")
    configs = configparser.ConfigParser()
    configs.read(conf_path)
    set_static_er_conf(configs['eggroll'])
    static_er_conf = get_static_er_conf()

    self.__options = options.copy()
    self.__options[SessionConfKeys.CONFKEY_SESSION_ID] = self.__session_id
    self._cluster_manager_client = ClusterManagerClient(options=options)
    self.__is_standalone = options.get(SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE, "") == DeployModes.STANDALONE

    if self.__is_standalone and os.name != 'nt' and not processors and os.environ.get(
            "EGGROLL_RESOURCE_MANAGER_AUTO_BOOTSTRAP", "1") == "1":
        # fixed port: options override static conf, default 4670
        port = int(options.get(
                ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT,
                static_er_conf.get(ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT, "4670")))
        startup_command = f'bash {self.__eggroll_home}/bin/eggroll_boot_standalone.sh -c {conf_path} -s {self.__session_id}'
        import subprocess
        import atexit
        bootstrap_log_dir = f'{self.__eggroll_home}/logs/eggroll/'
        os.makedirs(bootstrap_log_dir, mode=0o755, exist_ok=True)
        with open(f'{bootstrap_log_dir}/standalone-manager.out', 'a+') as outfile, \
                open(f'{bootstrap_log_dir}/standalone-manager.err', 'a+') as errfile:
            L.info(f'start up command: {startup_command}')
            manager_process = subprocess.run(startup_command, shell=True, stdout=outfile, stderr=errfile)
            returncode = manager_process.returncode
            L.info(f'start up returncode: {returncode}')

        def shutdown_standalone_manager(port, session_id, log_dir):
            # kill the bootstrap process matching this port and session id
            shutdown_command = f"ps aux | grep eggroll | grep Bootstrap | grep '{port}' | grep '{session_id}' | grep -v grep | awk '{{print $2}}' | xargs kill"
            L.info(f'shutdown command: {shutdown_command}')
            with open(f'{log_dir}/standalone-manager.out', 'a+') as outfile, \
                    open(f'{log_dir}/standalone-manager.err', 'a+') as errfile:
                manager_process = subprocess.run(shutdown_command, shell=True, stdout=outfile, stderr=errfile)
                returncode = manager_process.returncode
                L.info(f'shutdown returncode: {returncode}')

        atexit.register(shutdown_standalone_manager, port, self.__session_id, bootstrap_log_dir)

    session_meta = ErSessionMeta(id=self.__session_id,
                                 name=name,
                                 status=SessionStatus.NEW,
                                 tag=tag,
                                 processors=processors,
                                 options=options)

    from time import monotonic, sleep
    timeout = int(SessionConfKeys.EGGROLL_SESSION_START_TIMEOUT_MS.get_with(options)) / 1000 + 2
    endtime = monotonic() + timeout
    # TODO:0: ignores exception while starting up in standalone mod
    while True:
        try:
            if not processors:
                self.__session_meta = self._cluster_manager_client.get_or_create_session(session_meta)
            else:
                self.__session_meta = self._cluster_manager_client.register_session(session_meta)
            break
        # FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit during the startup retry loop
        except Exception:
            if monotonic() < endtime:
                sleep(0.1)
            else:
                raise

    self.__exit_tasks = list()
    self.__processors = self.__session_meta._processors
    L.info(f'session init finished: {self.__session_id}, details: {self.__session_meta}')
    self.stopped = self.__session_meta._status == SessionStatus.CLOSED or self.__session_meta._status == SessionStatus.KILLED

    # index processors by role: eggs grouped per server node, rolls flat
    self._rolls = list()
    self._eggs = dict()
    for processor in self.__session_meta._processors:
        processor_type = processor._processor_type
        if processor_type == ProcessorTypes.EGG_PAIR:
            server_node_id = processor._server_node_id
            if server_node_id not in self._eggs:
                self._eggs[server_node_id] = list()
            self._eggs[server_node_id].append(processor)
        elif processor_type == ProcessorTypes.ROLL_PAIR_MASTER:
            self._rolls.append(processor)
        else:
            raise ValueError(f'processor type {processor_type} not supported in roll pair')