def test_get_and_stop_and_kill_session(self):
    """Stop a live session, then stop and kill it again via fresh handles.

    The session id is captured from the session returned by ``self.ctx``.
    Two new ``ErSession`` handles are then built from that id: the first is
    stopped, the second is killed. The test contains no assertions; it
    passes when none of the calls raises.
    """
    live_session = self.ctx.get_session()
    session_id = live_session.get_session_id()
    live_session.stop()

    # Function-scope import: resolved only after the live session is stopped.
    from eggroll.core.session import ErSession

    stopped_handle = ErSession(session_id)
    stopped_handle.stop()

    killed_handle = ErSession(session_id)
    killed_handle.kill()
def check_actual_max_threads():
    """Print memory/swap/disk usage and kernel/file limit checks per node.

    A session is created with ``args.nodes`` processors per node, a small
    data set is parallelized over ``args.partitions`` partitions and
    ``getMemInfo`` is handed to ``with_stores``. Each item of the result is
    indexed as ``(node id, info dict)`` and rendered with the ``print_*``
    helpers. The session is always killed on exit.
    """

    def getMemInfo(fn):
        """Collect usage figures and limit checks on the executing machine.

        ``fn`` is the argument supplied by ``with_stores``; it is not used.

        Returns a dict with the ``Mem*``/``Swap*``/``Disk*`` entries plus
        ``"limits"``: an ordered list of ``(label, passed)`` tuples.
        """
        threshold = 65535

        def query_cmd(cmd):
            """Return True when ``cmd`` prints integers that are all >= threshold.

            Non-numeric tokens (for example fields taken from comment lines)
            are ignored. If the command prints no integer at all -- missing
            file, no matching line -- the check fails instead of raising.
            """
            output = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode()
            values = []
            for token in output.split():
                try:
                    values.append(int(token))
                except ValueError:
                    continue
            return bool(values) and all(v >= threshold for v in values)

        mem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        # NOTE(review): the data disk mount point is hard-coded to /data.
        data_disk = psutil.disk_usage('/data')

        # round2 is defined elsewhere in the file; its results are later
        # concatenated with strings, so it presumably returns str.
        mem_info = {
            "MemTotal": round2(mem.total),
            "MemUsed": round2(mem.used),
            "MemUsedPer": str(round(mem.percent)) + '%',
            "SwapTotal": round2(swap_mem.total),
            "SwapUsed": round2(swap_mem.used),
            "SwapUsePer": str(round(swap_mem.percent)) + '%',
            "DiskTotal": round2(data_disk.total),
            "DiskUsed": round2(data_disk.used),
            "DiskPer": str(round(data_disk.percent)) + '%',
        }

        # /etc/sysctl.conf is inspected for two different settings, so each
        # check carries its own label; sharing one dict key would make the
        # second result overwrite the first.
        limit_checks = [
            ("/proc/sys/kernel/threads-max",
             "cat /proc/sys/kernel/threads-max"),
            ("/etc/sysctl.conf kernel.pid_max",
             "grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'"),
            ("/proc/sys/kernel/pid_max",
             "cat /proc/sys/kernel/pid_max"),
            ("/proc/sys/vm/max_map_count",
             "cat /proc/sys/vm/max_map_count"),
            ("/etc/security/limits.conf",
             "cat /etc/security/limits.conf | grep nofile | awk '{print $4}'"),
            ("/etc/security/limits.d/80-nofile.conf",
             "cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'"),
            ("/etc/sysctl.conf fs.file-max",
             "grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'"),
            ("/proc/sys/fs/file-max",
             "cat /proc/sys/fs/file-max"),
        ]
        mem_info["limits"] = [(label, query_cmd(cmd))
                              for label, cmd in limit_checks]
        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node :" + str(node[0]) +
                        "================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] +
                         ", MemUsed:" + info["MemUsed"] + ", MemUsedPer:" +
                         info["MemUsedPer"])
            print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] +
                         ", SwapUsed:" + info["SwapUsed"] + ", SwapUsePer:" +
                         info["SwapUsePer"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] +
                         ", DiskUsed:" + info["DiskUsed"] + ", DiskPer:" +
                         info["DiskPer"])
            print_green(
                "--------Max user processes and max file count--------")
            for label, passed in info["limits"]:
                if passed:
                    print_green("[OK] " + label + " is ok.")
                else:
                    print_red("[ERROR] please check " + label +
                              ", no less than 65535.")
            print("\n")
    finally:
        session.kill()
def check_actual_max_threads():
    """Print resource usage, system limits and thread head-room per node.

    ``getMemInfo`` is handed to ``with_stores``; every item of the result is
    indexed as ``(node id, info dict)``. For each one the function prints
    memory/swap/disk usage, checks kernel and file limits against 65535,
    compares the egg_pair thread count with
    ``NodeProcessors * PoolSize`` and, when a rollsite process was found,
    prints its memory usage. The session is always killed on exit.
    """
    threshold = 65535
    default_pool_size = 500

    def to_int(text):
        """Return ``int(text)``, or None when ``text`` is not an integer."""
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    def getMemInfo(fn):
        """Collect the raw figures on the executing machine.

        ``fn`` is the argument supplied by ``with_stores``; it is not used.
        Limit values are returned as the raw command output (strings) under
        ``"limits"`` as an ordered list of ``(label, value)`` tuples.
        """

        def query_cmd(cmd):
            """Run ``cmd`` in a shell and return the first line it printed."""
            lines = subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode().strip().split('\n')
            return lines[0]

        def get_host_ip():
            """Return the local address used to reach 8.8.8.8, or "unknown".

            Connecting a UDP socket only selects a route; a machine without
            such a route must not abort the whole report.
            """
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                    s.connect(('8.8.8.8', 80))
                    return s.getsockname()[0]
            except OSError:
                return "unknown"

        mem = psutil.virtual_memory()
        swap_mem = psutil.swap_memory()
        # NOTE(review): the data disk mount point is hard-coded to /data.
        data_disk = psutil.disk_usage('/data')

        # round2 is defined elsewhere in the file; its results are later
        # concatenated with strings, so it presumably returns str.
        mem_info = {
            "Ip": get_host_ip(),
            "MemTotal": round2(mem.total),
            "MemUsed": round2(mem.used),
            "MemUsedPCT": str(round(mem.percent)) + '%',
            "SwapTotal": round2(swap_mem.total),
            "SwapUsed": round2(swap_mem.used),
            "SwapUsePCT": str(round(swap_mem.percent)) + '%',
            "DiskTotal": round2(data_disk.total),
            "DiskUsed": round2(data_disk.used),
            "DiskUsedPCT": str(round(data_disk.percent)) + '%',
        }

        # /etc/sysctl.conf is inspected for two different settings, so each
        # check carries its own label; sharing one dict key would make the
        # second value overwrite the first.
        limit_checks = [
            ("/proc/sys/kernel/threads-max",
             "cat /proc/sys/kernel/threads-max"),
            ("/etc/sysctl.conf kernel.pid_max",
             "grep kernel.pid_max /etc/sysctl.conf | awk -F= '{print $2}'"),
            ("/proc/sys/kernel/pid_max",
             "cat /proc/sys/kernel/pid_max"),
            ("/proc/sys/vm/max_map_count",
             "cat /proc/sys/vm/max_map_count"),
            ("/etc/security/limits.conf",
             "cat /etc/security/limits.conf | grep nofile | awk '{print $4}'"),
            ("/etc/security/limits.d/80-nofile.conf",
             "cat /etc/security/limits.d/80-nofile.conf | grep nofile | awk '{print $4}'"),
            ("/etc/sysctl.conf fs.file-max",
             "grep fs.file-max /etc/sysctl.conf | awk -F= '{print $2}'"),
            ("/proc/sys/fs/file-max",
             "cat /proc/sys/fs/file-max"),
        ]
        mem_info["limits"] = [(label, query_cmd(cmd))
                              for label, cmd in limit_checks]

        mem_info["CurrentUseProcesses"] = query_cmd(
            "pstree -p `ps -e |grep egg_pair |awk '{print $1}'` |wc -l")
        mem_info["NodeProcessors"] = query_cmd(
            "grep eggroll.session.processors.per.node ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")
        mem_info["PoolSize"] = query_cmd(
            "grep eggroll.rollpair.eggpair.server.executor.pool.max.size ${EGGROLL_HOME}/conf/eggroll.properties | awk -F= '{print $2}'")

        rollsite_pid = query_cmd(
            "ps aux | grep ${EGGROLL_HOME} | grep com.webank.eggroll.rollsite.Proxy | grep -v grep | awk '{print $2}'")
        if rollsite_pid:
            rollsite_used_memory = psutil.Process(
                int(rollsite_pid)).memory_info().rss
            # NOTE(review): the properties file is located relative to
            # sys.path[1] here, while the shell commands above use
            # ${EGGROLL_HOME}; confirm both point at the same file.
            with open(sys.path[1] +
                      '/../../../conf/eggroll.properties') as properties_file:
                properties = properties_file.read()
            jvm_options = re.findall(r"(?<=MaxHeapSize=).*?(?=G)", properties)
            if jvm_options:
                rollsite_total_memory = int(
                    jvm_options[0]) * 1024 * 1024 * 1024
            else:
                # No MaxHeapSize configured: fall back to physical memory.
                rollsite_total_memory = mem.total
            # NOTE(review): the factor 4 in the divisor is kept as found;
            # its rationale is not visible in this file.
            mem_info["RollsiteUsedPercent"] = '{:.2%}'.format(
                rollsite_used_memory / (rollsite_total_memory * 4))
        else:
            # The integer 0 marks "no rollsite process found".
            mem_info["RollsiteUsedPercent"] = 0
        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" +
                        info["Ip"] +
                        "===========================================")
            print_yellow("[WARNING] MemTotal:" + info["MemTotal"] +
                         "G, MemUsed:" + info["MemUsed"] + "G, MemUsedPCT:" +
                         info["MemUsedPCT"])
            if float(info["SwapTotal"]) < 128:
                print_red("[ERROR] The swap memory is:" + info["SwapTotal"] +
                          "G, no less than 128G.")
            else:
                print_yellow("[WARNING] SwapTotal:" + info["SwapTotal"] +
                             "G, SwapUsed:" + info["SwapUsed"] +
                             "G, SwapUsePCT:" + info["SwapUsePCT"])
            print_yellow("[WARNING] DiskTotal:" + info["DiskTotal"] +
                         "G, DiskUsed:" + info["DiskUsed"] +
                         "G, DiskUsedPCT:" + info["DiskUsedPCT"])

            print_green(
                "--------------Max user processes and max file count----------------------------------------"
            )
            for label, value in info["limits"]:
                limit = to_int(value)
                # "no less than 65535" means 65535 itself is acceptable; an
                # empty or non-numeric value is reported as an error.
                if limit is not None and limit >= threshold:
                    print_green("[OK] " + label + " = " + value)
                else:
                    print_red("[ERROR] please check " + label + " = " +
                              value + ", no less than 65535.")

            print_green(
                "--------------Thread count check-----------------------------------------------------------"
            )
            # An unset pool size falls back to 500.
            pool_text = info["PoolSize"] or default_pool_size
            used_threads = to_int(info["CurrentUseProcesses"])
            processors = to_int(info["NodeProcessors"])
            pool_size = to_int(pool_text)
            if used_threads is None or processors is None or pool_size is None:
                print_yellow(
                    "[WARNING] thread count check skipped: CurrentUseProcesses, "
                    "NodeProcessors or PoolSize could not be read as an integer.")
            else:
                capacity = processors * pool_size
                summary = (info["CurrentUseProcesses"],
                           info["NodeProcessors"], pool_text, capacity)
                if used_threads < capacity:
                    print_green(
                        "[OK] The thread count = %s, the total processes = %s * %s = %i"
                        % summary)
                else:
                    print_red(
                        "[ERROR] The thread count = %s, the total processes = %s * %s = %i. eggroll.rollpair.eggpair.server.executor.pool.max.size is not enough, turn it up."
                        % summary)

            if info["RollsiteUsedPercent"] != 0:
                print_green(
                    "----------Rollsite memory use percent--------------------------------------------------"
                )
                print_yellow("[WARNING] rollsite memory use: " +
                             info["RollsiteUsedPercent"])
            print("\n")
    finally:
        session.kill()
def check_actual_max_threads():
    """Print route, service and fate_flow job status for every node.

    ``getMemInfo`` is handed to ``with_stores``; every item of the result is
    indexed as ``(node id, info dict)``. For each one the function reports
    whether a default exchange route exists in ``route_table.json``, which
    services have running processes (with summed RSS), and the process
    count and memory of every running fate_flow job. The session is always
    killed on exit.
    """
    route_error = "[ERROR] eggroll exchange route is not configured, please check data/projects/fate/eggroll/conf/route_table.json file if it is existed!"

    def getMemInfo(fn):
        """Collect route, service and job figures on the executing machine.

        ``fn`` is the argument supplied by ``with_stores``; it is not used.
        ``job_thread`` and ``job_mem`` are dicts keyed by job id; ``jobs``
        is the list of running job ids.
        """

        def run_shell(cmd):
            """Run ``cmd`` in a shell and return its decoded stdout."""
            return subprocess.Popen(
                cmd, stdout=subprocess.PIPE,
                shell=True).communicate()[0].decode()

        def query_cmd(cmd):
            """Return the first line printed by ``cmd`` ('' if none)."""
            return run_shell(cmd).strip().split('\n')[0]

        def query_lines(cmd):
            """Return every non-empty line printed by ``cmd``, stripped."""
            return [line.strip() for line in run_shell(cmd).splitlines()
                    if line.strip()]

        def get_host_ip():
            """Return the local address used to reach 8.8.8.8, or "unknown".

            Connecting a UDP socket only selects a route; a machine without
            such a route must not abort the whole report.
            """
            try:
                with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
                    s.connect(('8.8.8.8', 80))
                    return s.getsockname()[0]
            except OSError:
                return "unknown"

        fate_flow_client = "/data/projects/fate/python/fate_flow/fate_flow_client.py"
        mem_info = {}
        mem_info["Ip"] = get_host_ip()

        eggroll_home = query_cmd("echo $EGGROLL_HOME")
        route_file = eggroll_home + "/conf/route_table.json"
        try:
            with open(route_file, encoding='utf-8') as f:
                mem_info["route_table"] = json.load(f)
        except (OSError, ValueError):
            # Missing or malformed file: report it as "route not configured"
            # on the driver instead of failing the whole collection.
            mem_info["route_table"] = {}

        mem_info["services"] = [
            'ClusterManagerBootstrap', 'NodeManagerBootstrap', 'rollsite',
            'fate_flow_server.py', 'fateboard', 'mysql'
        ]

        # "-1" signals that the fate_flow client script does not exist.
        mem_info["job_run"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))
        mem_info["job_wait"] = query_cmd(
            "if [ -f %s ];then python %s -f query_job -s waiting | grep f_job_id |wc -l; else echo -1; fi"
            % (fate_flow_client, fate_flow_client))

        # One job id per output line. No bash arrays: shell=True runs
        # /bin/sh, which need not be bash. Skipped when the client is absent.
        mem_info["jobs"] = query_lines(
            "if [ -f %s ];then python %s -f query_job -s running | grep f_job_id |awk -F: '{print $2}' |awk -F '\"' '{print $2}'; fi"
            % (fate_flow_client, fate_flow_client))

        mem_info["job_thread"] = {}
        mem_info["job_mem"] = {}
        for job_id in mem_info["jobs"]:
            mem_info["job_thread"][job_id] = query_cmd(
                "ps -ef |grep egg_pair |grep -v grep |grep %s |wc -l" %
                (job_id))
            # "grep -v grep" also drops the invoking shell, whose command
            # line contains both "egg_pair" and the job id.
            mem_info["job_mem"][job_id] = query_cmd(
                "ps aux |grep egg_pair |grep -v grep |grep %s |awk '{sum+=$6};END {print sum}'"
                % (job_id))

        mem_info["server_mem"] = {}
        mem_info["thread"] = {}
        for service in mem_info["services"]:
            mem_info["thread"][service] = query_cmd(
                "ps -ef |grep %s |grep -v grep |wc -l" % (service))
            mem_info["server_mem"][service] = str(
                query_cmd(
                    "ps aux |grep %s |grep -v grep |awk '{sum+=$6};END {print sum}'"
                    % (service)))
        return mem_info

    session = ErSession(
        options={"eggroll.session.processors.per.node": args.nodes})
    try:
        ctx = RollPairContext(session)
        rp = ctx.parallelize(str_generator(row_limit=1000),
                             options={'total_partitions': args.partitions})
        result = rp.with_stores(func=getMemInfo)
        print_green(str(datetime.datetime.now()))
        for node in result:
            info = node[1]
            print_green("==============This is node " + str(node[0]) + ":" +
                        info["Ip"] +
                        "===========================================")

            print_green(
                "-------------default route check-------------------------------------------------------"
            )
            try:
                default_route = info["route_table"]['route_table']['default'][
                    'default'][0]
                ip = default_route['ip']
                port = default_route['port']
            except (KeyError, IndexError, TypeError):
                # Any missing level, empty list or wrong type means there is
                # no usable default route.
                print_red(route_error)
            else:
                print_green("[OK] eggroll route configured!")
                print_green("exchange ip:{}, exchange port:{}".format(
                    ip, port))

            print_green(
                "--------------fate service check-------------------------------------------------------"
            )
            for server in info["services"]:
                if int(info["thread"][server]) > 0:
                    print_green(
                        "[OK] the " + server.ljust(23) +
                        " service is running , number of processes is : " +
                        str(info["thread"][server]) + "; used memory : " +
                        str(info["server_mem"][server]) + "KB.")
                else:
                    print_yellow(
                        "[WARNING] the " + server +
                        " service not running, please check service status.")

            print_green(
                "--------------fate_flow jobs process and mem info check--------------------------------------------------"
            )
            if int(info["job_run"]) == -1:
                print_red(
                    "[ERROR] There is no such fate_flow_client.py file, please check fate_flow server if it is running!"
                )
            else:
                print_green("[OK] Number of tasks running is " +
                            info["job_run"])
                print_green("[OK] Number of tasks waiting is " +
                            info["job_wait"])
                if int(info["job_run"]) > 0:
                    for job_id in info["jobs"]:
                        print_green(
                            "[OK] running task job_id : " + job_id +
                            ", number of egg_pair processes is : " +
                            str(info["job_thread"].get(job_id, "")) +
                            "; used memory : " +
                            str(info["job_mem"].get(job_id, "")) + "KB.")
            print("\n")
    finally:
        session.kill()