def test_init_cluster(self):
        """Round-trip one key through a roll pair on a cluster-mode session.

        Every deploy path is derived from the ``EGGROLL_HOME`` environment
        variable, so a missing variable raises ``KeyError`` before any session
        is created. The value read back from the store is printed.
        """
        base_dir = os.environ['EGGROLL_HOME']

        options = {
            # Previously written as `os.environ['EGGROLL_HOME'] / venv`, which
            # divides a str by an undefined name; build the path like the
            # other EGGROLL_HOME-relative entries below.
            DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_VENV_PATH:
                f'{base_dir}/venv',
            DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_DATA_DIR_PATH:
                '/tmp/eggroll',
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_HOST: 'localhost',
            ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT: '4670',
            DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_PYTHON_PATH:
                f'{base_dir}/python',
            DeployConfKeys.CONFKEY_DEPLOY_ROLLPAIR_EGGPAIR_PATH:
                f'{base_dir}/python/eggroll/roll_pair/egg_pair.py',
            DeployConfKeys.CONFKEY_DEPLOY_JVM_MAINCLASS:
                'com.webank.eggroll.rollpair.Main',
            DeployConfKeys.CONFKEY_DEPLOY_JVM_CLASSPATH:
                f'{base_dir}/jvm/roll_pair/target/lib/*:{base_dir}/jvm/roll_pair/target/eggroll-roll-pair-2.0.jar:{base_dir}/jvm/roll_pair/main/resources',
            SessionConfKeys.CONFKEY_SESSION_ID: 'testing',
            SessionConfKeys.CONFKEY_SESSION_PROCESSORS_PER_NODE: '1',
        }

        session = ErSession(session_id='test_init', options=options)
        context = RollPairContext(session)

        # Write one entry, then load the same store again and read it back.
        context.load("ns1", "n21").put("k1", "v1")
        print(context.load("ns1", "n21").get("k1"))
Beispiel #2
0
    def __init__(self,
                 roll_site_session_id,
                 rp_ctx: RollPairContext,
                 options: dict = None):
        """Set up a roll-site context on top of an existing roll-pair context.

        Args:
            roll_site_session_id: id of this roll-site session; the optional
                push session is named ``roll_site_session_id + "_push"``.
            rp_ctx: roll-pair context whose session receives the exit tasks
                registered here.
            options: configuration dict. ``"self_role"``, ``"self_party_id"``
                and ``"proxy_endpoint"`` are read unconditionally.

        Raises:
            KeyError: one of the unconditionally-read options is missing.
            ValueError: ``"proxy_endpoint"`` is neither a ``str`` nor an
                ``ErEndpoint`` (or its port part is not an integer).
        """
        options = {} if options is None else options
        self.roll_site_session_id = roll_site_session_id
        self.rp_ctx = rp_ctx

        push_enabled = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_SESSION_ENABLED.get_with(
            options)
        self.push_session_enabled = push_enabled
        if not push_enabled:
            self._push_session = None
            self._push_rp_ctx = None
        else:
            # Dedicated session used for pushing roll pairs and objects; it
            # inherits every option of the main session.
            self._push_session = ErSession(
                session_id=roll_site_session_id + "_push",
                options=rp_ctx.get_session().get_all_options())
            self._push_rp_ctx = RollPairContext(session=self._push_session)
            L.info(
                f"push_session={self._push_session.get_session_id()} enabled")

        self.role = options["self_role"]
        self.party_id = str(options["self_party_id"])
        self._options = options

        self._registered_comm_types = dict()
        self.register_comm_type('grpc', RollSiteGrpc)

        # The proxy endpoint may be given as "host:port" text or as a
        # ready-made ErEndpoint.
        endpoint = options["proxy_endpoint"]
        if isinstance(endpoint, str):
            pieces = endpoint.split(':')
            host_text = pieces[0].strip()
            port_number = int(pieces[1].strip())
            self.proxy_endpoint = ErEndpoint(host=host_text, port=port_number)
        elif isinstance(endpoint, ErEndpoint):
            self.proxy_endpoint = endpoint
        else:
            raise ValueError("endpoint only support str and ErEndpoint type")

        deploy_mode = RollSiteConfKeys.EGGROLL_ROLLSITE_DEPLOY_MODE.get_with(
            options)
        self.is_standalone = deploy_mode == "standalone"

        self.pushing_latch = CountDownLatch(0)
        # Exit tasks are registered in this order: wait for pending pushes
        # first, then (if enabled) stop the push session.
        self.rp_ctx.get_session().add_exit_task(self._wait_push_complete)
        if push_enabled:

            def stop_push_session():
                self._push_session.stop()

            self.rp_ctx.get_session().add_exit_task(stop_push_session)

        overall_timeout = RollSiteConfKeys.EGGROLL_ROLLSITE_PUSH_OVERALL_TIMEOUT_SEC.get_with(
            options)
        self._wait_push_exit_timeout = int(overall_timeout)

        L.info(f"inited RollSiteContext: {self.__dict__}")
Beispiel #3
0
 def create(self):
     """Start the eggroll session in cluster mode and attach a RollPairContext.

     Forces ``eggroll.session.deploy.mode`` to ``"cluster"`` in the stored
     options, then replaces ``self._session_id`` with the id reported by the
     session that was actually created.
     """
     from eggroll.core.session import session_init
     from eggroll.roll_pair.roll_pair import RollPairContext

     self._options['eggroll.session.deploy.mode'] = "cluster"

     rp_session = session_init(session_id=self._session_id,
                               options=self._options)
     self._rp_session = rp_session
     self._rpc = RollPairContext(session=rp_session)
     self._session_id = rp_session.get_session_id()
Beispiel #4
0
 def __init__(self, session_id, options=None):
     """Create the storage session and immediately start its eggroll session.

     Args:
         session_id: requested session id; it is replaced afterwards by the
             id the eggroll session reports.
         options: eggroll options. A non-empty dict is used as-is (not
             copied) and gains the cluster deploy-mode entry; a falsy value
             is replaced by a fresh dict.
     """
     super(StorageSession, self).__init__(session_id=session_id,
                                          engine=StorageEngine.EGGROLL)
     if not options:
         options = {}
     self._options = options
     self._options['eggroll.session.deploy.mode'] = "cluster"

     rp_session = session_init(session_id=self._session_id,
                               options=self._options)
     self._rp_session = rp_session
     self._rpc = RollPairContext(session=rp_session)
     self._session_id = rp_session.get_session_id()
Beispiel #5
0
def get_debug_test_context(is_standalone=False, manager_port=4670, egg_port=20001, transfer_port=20002, session_id='testing'):
    """Build a RollPairContext wired to hand-made local processors.

    Args:
        is_standalone: when true, the session deploy mode is set to
            ``"standalone"``.
        manager_port: port used for the cluster manager option, the node
            manager option and the roll processor's command endpoint.
        egg_port: command port of the single egg processor.
        transfer_port: transfer port of the egg processor and of the
            transfer-service option.
        session_id: id passed to ``ErSession``.

    Returns:
        A ``RollPairContext`` over a session holding one egg and one roll
        processor, both on 127.0.0.1.
    """
    loopback = "127.0.0.1"
    node_id = 2
    manager_port_text = str(manager_port)

    options = {}
    if is_standalone:
        options[SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE] = "standalone"
    options[TransferConfKeys.CONFKEY_TRANSFER_SERVICE_HOST] = loopback
    options[TransferConfKeys.CONFKEY_TRANSFER_SERVICE_PORT] = str(transfer_port)
    options[ClusterManagerConfKeys.CONFKEY_CLUSTER_MANAGER_PORT] = manager_port_text
    options[NodeManagerConfKeys.CONFKEY_NODE_MANAGER_PORT] = manager_port_text

    egg_processor = ErProcessor(
        id=1,
        server_node_id=node_id,
        processor_type=ProcessorTypes.EGG_PAIR,
        status=ProcessorStatus.RUNNING,
        command_endpoint=ErEndpoint(loopback, egg_port),
        transfer_endpoint=ErEndpoint(loopback, transfer_port))

    roll_processor = ErProcessor(
        id=1,
        server_node_id=node_id,
        processor_type=ProcessorTypes.ROLL_PAIR_MASTER,
        status=ProcessorStatus.RUNNING,
        command_endpoint=ErEndpoint(loopback, manager_port))

    debug_session = ErSession(session_id,
                              processors=[egg_processor, roll_processor],
                              options=options)
    return RollPairContext(debug_session)
Beispiel #6
0
def init_roll_site_context(runtime_conf, session_id):
    """Create the roll-pair and roll-site contexts for the local party.

    Args:
        runtime_conf: mapping whose ``"local"`` entry provides ``"role"`` and
            ``"party_id"``.
        session_id: id handed to ``RollSiteContext``.

    Returns:
        tuple: ``(rp_context, rs_context)``.
    """
    from eggroll.roll_site.roll_site import RollSiteContext
    from eggroll.roll_pair.roll_pair import RollPairContext

    LOGGER.info("init_roll_site_context runtime_conf: {}".format(runtime_conf))
    eggroll_session = FateSession.get_instance()._eggroll.get_session()
    rp_context = RollPairContext(eggroll_session)

    local_conf = runtime_conf.get("local")
    role = local_conf.get("role")
    party_id = str(local_conf.get("party_id"))

    # The proxy address comes from the project's server_conf.json.
    conf_path = (file_utils.get_project_base_directory() +
                 "/arch/conf/server_conf.json")
    proxy_conf = file_utils.load_json_conf(conf_path).get('servers').get('proxy')
    host = proxy_conf.get("host")
    port = proxy_conf.get("port")

    rs_context = RollSiteContext(
        session_id,
        rp_ctx=rp_context,
        options={
            'self_role': role,
            'self_party_id': party_id,
            'proxy_endpoint': ErEndpoint(host, int(port)),
        })
    LOGGER.info("init_roll_site_context done: {}".format(rs_context.__dict__))
    return rp_context, rs_context
Beispiel #7
0
class StorageSession(StorageSessionBase):
    """Storage session backed by an eggroll roll-pair session.

    The eggroll session is not started by the constructor; ``create()`` must
    run before ``table``, ``cleanup``, ``stop`` or ``kill`` can use it.
    """

    def __init__(self, session_id, options=None):
        """Record the session id and options without contacting eggroll."""
        super().__init__(session_id=session_id,
                         engine_name=StorageEngine.EGGROLL)
        if not options:
            options = {}
        self._options = options
        self._rp_session = None
        self._rpc = None

    def create(self):
        """Start the eggroll session in cluster mode and build its context."""
        from eggroll.core.session import session_init
        from eggroll.roll_pair.roll_pair import RollPairContext

        self._options['eggroll.session.deploy.mode'] = "cluster"
        rp_session = session_init(session_id=self._session_id,
                                  options=self._options)
        self._rp_session = rp_session
        self._rpc = RollPairContext(session=rp_session)
        # Keep the id the session reports rather than the requested one.
        self._session_id = rp_session.get_session_id()

    def table(self,
              name,
              namespace,
              address: AddressABC,
              partitions,
              storage_type: EggRollStorageType = EggRollStorageType.
              ROLLPAIR_LMDB,
              options=None,
              **kwargs):
        """Return a ``StorageTable`` for an eggroll address.

        Raises:
            NotImplementedError: ``address`` is not an ``EggRollAddress``.
        """
        if not isinstance(address, EggRollAddress):
            raise NotImplementedError(
                f"address type {type(address)} not supported with eggroll storage")

        from fate_arch.storage.eggroll._table import StorageTable
        return StorageTable(context=self._rpc,
                            name=name,
                            namespace=namespace,
                            address=address,
                            partitions=partitions,
                            storage_type=storage_type,
                            options=options)

    def cleanup(self, name, namespace):
        """Delegate cleanup of ``name``/``namespace`` to the roll-pair context."""
        self._rpc.cleanup(name=name, namespace=namespace)

    def stop(self):
        """Stop the underlying eggroll session and return its result."""
        return self._rp_session.stop()

    def kill(self):
        """Kill the underlying eggroll session and return its result."""
        return self._rp_session.kill()
Beispiel #8
0
def get_standalone_context(options=None):
    """Create a standalone-mode session, print its id and return its context.

    Args:
        options: optional session options; the dict is updated in place with
            the standalone deploy mode.

    Returns:
        A ``RollPairContext`` bound to the new session.
    """
    options = {} if options is None else options
    options[SessionConfKeys.CONFKEY_SESSION_DEPLOY_MODE] = DeployModes.STANDALONE

    standalone_session = ErSession(options=options)
    print(standalone_session.get_session_id())
    return RollPairContext(standalone_session)
Beispiel #9
0
def get_cluster_context(options=None):
    """Create a session from ``options``, print its id and return its context.

    Args:
        options: optional session options. If it contains ``'session_id'``
            that value is used as the session id; otherwise ``None`` is
            passed.

    Returns:
        A ``RollPairContext`` bound to the new session.
    """
    options = {} if options is None else options
    requested_id = options['session_id'] if 'session_id' in options else None

    cluster_session = ErSession(session_id=requested_id, options=options)
    print(cluster_session.get_session_id())
    return RollPairContext(cluster_session)
Beispiel #10
0
def build_eggroll_runtime(work_mode: WorkMode, eggroll_session):
    """Wrap an ``ErSession`` in a ``RollPairContext``.

    Args:
        work_mode: must report standalone or cluster mode.
        eggroll_session: the ``ErSession`` to wrap.

    Raises:
        ValueError: ``eggroll_session`` is not an ``ErSession``, or
            ``work_mode`` is neither standalone nor cluster.
    """
    # The session type is validated first, then the work mode.
    if not isinstance(eggroll_session, ErSession):
        raise ValueError(f"eggroll_session: {type(eggroll_session)} not supported!")

    if work_mode.is_standalone() or work_mode.is_cluster():
        return RollPairContext(eggroll_session)

    raise ValueError(f"work_mode: {work_mode} not supported!")
Beispiel #11
0
def build_eggroll_runtime(eggroll_session):
    """Return a ``RollPairContext`` built on ``eggroll_session``."""
    return RollPairContext(session=eggroll_session)
Beispiel #12
0
#  limitations under the License.

from eggroll.core.session import ErSession
from eggroll.roll_paillier_tensor.roll_paillier_tensor import RptContext
from eggroll.roll_pair.roll_pair import RollPairContext

import roll_paillier_tensor as rpt_engine
import unittest
import pandas as pd
from eggroll.core.io.kv_adapter import RocksdbSortedKvAdapter

#mat = pd.read_csv("/data/home/qijunhuang/czn/code/Python_C_Paillier/pData/testMat_mpi.csv").values


# Module-level fixtures: a session and the roll-paillier-tensor context built
# on top of it. Both are created at import time.
# NOTE(review): confirm "eggroll.deploy.mode" is the option key this eggroll
# version reads for the deploy mode.
session = ErSession(options={"eggroll.deploy.mode": "standalone"})
rptc = RptContext(RollPairContext(session))


# Matrix and vector operands, loaded as NumPy arrays from absolute paths.
# Importing this module fails if any of these CSV files is missing.
mat = pd.read_csv("/data/czn/data/testGemmMat.csv").values
vec = pd.read_csv("/data/czn/data/testGemmVec.csv").values
# Inputs for the "lr" test (presumably logistic regression -- confirm).
brest_G = pd.read_csv("/data/czn/data/breast_a_egr.csv").values
brest_H = pd.read_csv("/data/czn/data/breast_b_egr.csv").values
brest_Y = pd.read_csv("/data/czn/data/breast_b_y_egr.csv").values

# "mini" and "py" variants of the inputs above, read from separate files.
brest_G_mini = pd.read_csv("/data/czn/data/breast_a_egr_mini.csv").values
brest_H_mini = pd.read_csv("/data/czn/data/breast_b_egr_mini.csv").values
brest_Y_mini = pd.read_csv("/data/czn/data/breast_b_y_egr_mini.csv").values
brest_G_py = pd.read_csv("/data/czn/data/breast_a_egr_py.csv").values
brest_H_py = pd.read_csv("/data/czn/data/breast_b_egr_py.csv").values
Beispiel #13
0
def check_actual_max_threads():
    """Collect resource and limit information on every node and print a report.

    ``getMemInfo`` is handed to ``with_stores`` and gathers memory, swap, disk,
    kernel/file limits, thread counts and rollsite memory usage. The driver
    then prints one section per returned node. The session is killed on exit,
    whether or not the report succeeds.
    """
    def getMemInfo(fn):
        """Gather the metrics for one node; ``fn`` is supplied by with_stores and unused."""
        def query_cmd(cmd):
            # Run `cmd` through the shell and keep only the first stdout line.
            raw = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]
            return raw.decode().strip().split('\n')[0]

        def get_host_ip():
            # Local address of a UDP socket connected towards 8.8.8.8. The
            # context manager closes the socket and, unlike the former
            # try/finally, cannot hit an unbound name if creation fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        mem = psutil.virtual_memory()
        mem_total = round2(mem.total)
        mem_used = round2(mem.used)
        mem_used_per = str(round(mem.percent)) + '%'

        swap_mem = psutil.swap_memory()
        swap_total = round2(swap_mem.total)
        swap_used = round2(swap_mem.used)
        swap_use_per = str(round(swap_mem.percent)) + '%'

        data_disk = psutil.disk_usage('/data')
        disk_total = round2(data_disk.total)
        disk_used = round2(data_disk.used)
        disk_per = str(round(data_disk.percent)) + '%'

        mem_info = {}
        mem_info["Ip"] = get_host_ip()
        mem_info["MemTotal"] = mem_total
        mem_info["MemUsed"] = mem_used
        mem_info["MemUsedPCT"] = mem_used_per

        mem_info["SwapTotal"] = swap_total
        mem_info["SwapUsed"] = swap_used
        mem_info["SwapUsePCT"] = swap_use_per

        mem_info["DiskTotal"] = disk_total
        mem_info["DiskUsed"] = disk_used
        mem_info["DiskUsedPCT"] = disk_per

        # /etc/sysctl.conf is queried for two different settings. Each gets
        # its own key; sharing "/etc/sysctl.conf" made the fs.file-max value
        # overwrite the kernel.pid_max value.
        mem_info["/proc/sys/kernel/threads-max"] = query_cmd("cat /proc/sys/kernel/threads-max")
        mem_info["/etc/sysctl.conf kernel.pid_max"] = query_cmd("grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd("cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd("cat /proc/sys/vm/max_map_count")

        mem_info["/etc/security/limits.conf"] = query_cmd("cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd("cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/sysctl.conf fs.file-max"] = query_cmd("grep fs.file-max  /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd("cat /proc/sys/fs/file-max")

        mem_info["CurrentUseProcesses"] = query_cmd("pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l")
        mem_info["NodeProcessors"] = query_cmd("grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
        mem_info["PoolSize"] = query_cmd("grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")

        rollsite_pid = query_cmd("ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'")
        if rollsite_pid:
            rollsite_used_memory = psutil.Process(int(rollsite_pid)).memory_info().rss
            # `with` guarantees the properties file is closed even if
            # reading it raises.
            with open(sys.path[1] + '/../../../conf/eggroll.properties') as myfile:
                properties = myfile.read()
            # Heap size in GB taken from a "MaxHeapSize=<n>G" entry; falls
            # back to the machine's total memory when no entry is found.
            jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties)
            if jvm_options:
                rollsite_total_memory = int(jvm_options[0]) * 1024 * 1024 * 1024
            else:
                rollsite_total_memory = mem.total

            mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(rollsite_used_memory / (rollsite_total_memory * 4))
        else:
            mem_info["RollsiteUsedPercent"] = 0

        return mem_info

    limit_keys = [
        "/proc/sys/kernel/threads-max",
        "/etc/sysctl.conf kernel.pid_max",
        "/proc/sys/kernel/pid_max",
        "/proc/sys/vm/max_map_count",
        "/etc/security/limits.conf",
        "/etc/security/limits.d/80-nofile.conf",
        "/etc/sysctl.conf fs.file-max",
        "/proc/sys/fs/file-max",
    ]

    session = ErSession(options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000), options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" + info["Ip"] + "===========================================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] + "G, MemUsed:" + info["MemUsed"] + "G, MemUsedPCT:" + info["MemUsedPCT"])
            if float(info["SwapTotal"]) < 128:
                print_red("[ERROR] The swap memory is:" + info["SwapTotal"] + "G, no less than 128G.")
            else:
                print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] + "G, SwapUsed:" + info["SwapUsed"] + "G, SwapUsePCT:" + info["SwapUsePCT"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] + "G, DiskUsed:" + info["DiskUsed"] + "G, DiskUsedPCT:" + info["DiskUsedPCT"])
            print_green("--------------Max user processes and max file count----------------------------------------")
            for key in limit_keys:
                value = info[key]
                # A setting that is absent (empty output) or non-numeric is
                # reported as an error instead of crashing the whole report.
                try:
                    limit_ok = int(value) > 65535
                except ValueError:
                    limit_ok = False
                if limit_ok:
                    print_green("[OK] " + key + " = " + value)
                else:
                    print_red("[ERROR] please check " + key + " = " + value + ", no less than 65535.")
            print_green("--------------Thread count check-----------------------------------------------------------")
            if len(info["PoolSize"]) == 0:
                # No pool size configured: assume 500.
                info["PoolSize"] = 500
            capacity = int(info["NodeProcessors"]) * int(info["PoolSize"])
            if int(info["CurrentUseProcesses"]) < capacity:
                print_green("[OK] The thread count = %s, the total processes = %s * %s = %i" % (info["CurrentUseProcesses"], info["NodeProcessors"], info["PoolSize"], capacity))
            else:
                print_red("[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up." % (info["CurrentUseProcesses"], info["NodeProcessors"], info["PoolSize"], capacity))
            if info["RollsiteUsedPercent"] != 0:
                print_green("----------Rollsite memory use percent--------------------------------------------------")
                print_yellow("[WARNING] rollsite memory use: " + info["RollsiteUsedPercent"])
            print("\n")
    finally:
        session.kill()
 def test_init(self):
     """Print the value stored under "k1" in store ns1/n21 of a standalone session."""
     standalone_session = ErSession(options={"eggroll.deploy.mode": "standalone"})
     rp_context = RollPairContext(standalone_session)
     # No put() is issued here, so this prints whatever the store already
     # holds for "k1".
     store = rp_context.load("ns1", "n21")
     print(store.get("k1"))
Beispiel #15
0
def check_actual_max_threads():
    """Check memory, swap, disk and kernel/file limits on every node.

    ``getMemInfo`` is handed to ``with_stores``; for each limit it records
    whether every reported value is at least 65535. The driver prints one
    section per returned node. The session is killed on exit, whether or not
    the report succeeds.
    """
    def getMemInfo(fn):
        """Gather the metrics for one node; ``fn`` is supplied by with_stores and unused."""
        def query_cmd(cmd):
            # True only if every output line is an integer >= 65535. Empty
            # or non-numeric output (e.g. the setting is not configured)
            # counts as a failed check instead of raising ValueError.
            lines = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode().strip().split('\n')
            print(lines)
            for line in lines:
                try:
                    if int(line) < 65535:
                        return False
                except ValueError:
                    return False
            return True

        mem = psutil.virtual_memory()
        mem_total = round2(mem.total)
        mem_used = round2(mem.used)
        mem_used_per = str(round(mem.percent)) + '%'

        swap_mem = psutil.swap_memory()
        swap_total = round2(swap_mem.total)
        swap_used = round2(swap_mem.used)
        swap_use_per = str(round(swap_mem.percent)) + '%'

        data_disk = psutil.disk_usage('/data')
        disk_total = round2(data_disk.total)
        disk_used = round2(data_disk.used)
        disk_per = str(round(data_disk.percent)) + '%'

        mem_info = {}
        mem_info["MemTotal"] = mem_total
        mem_info["MemUsed"] = mem_used
        mem_info["MemUsedPer"] = mem_used_per

        mem_info["SwapTotal"] = swap_total
        mem_info["SwapUsed"] = swap_used
        mem_info["SwapUsePer"] = swap_use_per

        mem_info["DiskTotal"] = disk_total
        mem_info["DiskUsed"] = disk_used
        mem_info["DiskPer"] = disk_per

        # /etc/sysctl.conf is checked for two different settings. Each gets
        # its own key; sharing "/etc/sysctl.conf" made the fs.file-max result
        # overwrite the kernel.pid_max result.
        mem_info["/proc/sys/kernel/threads-max"] = query_cmd(
            "cat /proc/sys/kernel/threads-max")
        mem_info["/etc/sysctl.conf kernel.pid_max"] = query_cmd(
            "grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/kernel/pid_max"] = query_cmd(
            "cat /proc/sys/kernel/pid_max")
        mem_info["/proc/sys/vm/max_map_count"] = query_cmd(
            "cat /proc/sys/vm/max_map_count")

        mem_info["/etc/security/limits.conf"] = query_cmd(
            "cat /etc/security/limits.conf | grep nofile | awk '{print $4}'")
        mem_info["/etc/security/limits.d/80-nofile.conf"] = query_cmd(
            "cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'"
        )
        mem_info["/etc/sysctl.conf fs.file-max"] = query_cmd(
            "grep fs.file-max  /etc/sysctl.conf | awk -F= '{print $2}'")
        mem_info["/proc/sys/fs/file-max"] = query_cmd(
            "cat /proc/sys/fs/file-max")

        return mem_info

    limit_keys = [
        "/proc/sys/kernel/threads-max",
        "/etc/sysctl.conf kernel.pid_max",
        "/proc/sys/kernel/pid_max",
        "/proc/sys/vm/max_map_count",
        "/etc/security/limits.conf",
        "/etc/security/limits.d/80-nofile.conf",
        "/etc/sysctl.conf fs.file-max",
        "/proc/sys/fs/file-max",
    ]

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node :" + str(node[0]) +
                        "================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] +
                         ", MemUsed:" + info["MemUsed"] + ", MemUsedPer:" +
                         info["MemUsedPer"])
            print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] +
                         ", SwapUsed:" + info["SwapUsed"] +
                         ", SwapUsePer:" + info["SwapUsePer"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] +
                         ", DiskUsed:" + info["DiskUsed"] + ", DiskPer:" +
                         info["DiskPer"])
            print_green(
                "--------Max user processes and max file count--------")
            for key in limit_keys:
                if info[key]:
                    print_green("[OK] " + key + " is ok.")
                else:
                    print_red("[ERROR] please check " + key +
                              ", no less than 65535.")
            print("\n")
    finally:
        session.kill()
Beispiel #16
0
def check_actual_max_threads():
    """Report route configuration, service status and running jobs per node.

    ``getMemInfo`` is handed to ``with_stores`` and collects the route table,
    process counts and memory of the known services, and process counts and
    memory of each running fate_flow job. The driver prints one section per
    returned node. The session is killed on exit, whether or not the report
    succeeds.
    """
    def getMemInfo(fn):
        """Gather the data for one node; ``fn`` is supplied by with_stores and unused."""
        def query_cmd(cmd):
            # Run `cmd` through the shell and keep only the first stdout line.
            raw = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, shell=True).communicate()[0]
            return raw.decode().strip().split('\n')[0]

        def get_host_ip():
            # Local address of a UDP socket connected towards 8.8.8.8. The
            # context manager closes the socket and, unlike the former
            # try/finally, cannot hit an unbound name if creation fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                s.connect(('8.8.8.8', 80))
                return s.getsockname()[0]

        fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py"
        mem_info = {}
        mem_info["Ip"] = get_host_ip()
        eggroll_home = query_cmd("echo $EGGROLL_HOME")
        route_file = eggroll_home + "/conf/route_table.json"
        # `with` closes the route table file; it used to be left open.
        with open(route_file, encoding='utf-8') as f:
            mem_info["route_table"] = json.load(f)
        mem_info["services"] = [
            'ClusterManagerBootstrap', 'NodeManagerBootstrap', 'rollsite',
            'fate_flow_server.py', 'fateboard', 'mysql'
        ]
        # Both job counters are "-1" when fate_flow_client.py does not exist.
        mem_info["job_run"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        mem_info["job_wait"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        # Space-separated ids of the running jobs, on a single line.
        mem_info["jobs"] = query_cmd(
            "array=(`python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'`);echo ${array[@]}"
            % (fate_flow_client))
        # Per-job results keyed by job id. The old loop iterated over the
        # characters of the "jobs" string and kept only the last result, so
        # every job was reported with the same numbers.
        mem_info["job_thread"] = {}
        mem_info["job_mem"] = {}
        for job_id in mem_info["jobs"].split():
            mem_info["job_thread"][job_id] = query_cmd(
                "ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %
                (job_id))
            mem_info["job_mem"][job_id] = query_cmd(
                "ps aux |grep egg_pair |grep %s |awk '{sum+=$6};END {print sum}'"
                % (job_id))
        mem_info["server_mem"] = {}
        mem_info["thread"] = {}
        for service in mem_info["services"]:
            mem_info["thread"][service] = query_cmd(
                "ps -ef |grep %s |grep -v grep |wc -l" % (service))
            mem_info["server_mem"][service] = str(
                query_cmd(
                    "ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'"
                    % (service)))
        return mem_info

    route_error = "[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!"

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" +
                        info["Ip"] +
                        "===========================================")
            print_green(
                "-------------default route check-------------------------------------------------------"
            )
            route_table_dict = info["route_table"]
            if 'default' not in route_table_dict['route_table']:
                print_red(route_error)
            else:
                try:
                    default_route = route_table_dict['route_table']['default'][
                        'default'][0]
                    ip = default_route['ip']
                    port = default_route['port']
                    print_green("[OK] eggroll route configured!")
                    print_green("exchange ip:{}, exchange port:{}".format(
                        ip, port))
                except (KeyError, IndexError):
                    # IndexError: the default route list exists but is empty.
                    print_red(route_error)

            print_green(
                "--------------fate service check-------------------------------------------------------"
            )
            for server in info["services"]:
                if int(info["thread"][server]) > 0:
                    print_green(
                        "[OK] the " + server.ljust(23) +
                        " service is running , number of processes is : " +
                        str(info["thread"][server]) + "; used memory : " +
                        str(info["server_mem"][server]) + "KB.")
                else:
                    print_yellow(
                        "[WARNING] the " + server +
                        " service not running, please check service status.")

            print_green(
                "--------------fate_flow jobs process and mem info check--------------------------------------------------"
            )
            if int(info["job_run"]) == -1:
                print_red(
                    "[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!"
                )
            else:
                print_green("[OK] Number of tasks running is " +
                            info["job_run"])
                print_green("[OK] Number of tasks waiting is " +
                            info["job_wait"])
                if int(info["job_run"]) > 0:
                    # Same split as on the node, so every id has an entry in
                    # both per-job dicts.
                    for job_id in info["jobs"].split():
                        print_green("[OK] running task job_id : " + job_id +
                                    ", number of egg_pair processes is : " +
                                    str(info["job_thread"][job_id]) +
                                    "; used memory : " +
                                    str(info["job_mem"][job_id]) + "KB.")

            print("\n")
    finally:
        session.kill()