#!/usr/bin/python
# Command-line helper: delete a single ZooKeeper node on the local server.
#
# Usage: <script> <znode-path>
import zkpython
import sys
import os

if __name__ == '__main__':
    zkc = zkpython.ZKClient('localhost:2181')
    # sys.argv[0] is always the script name, so a path argument is present
    # only when there are at least two entries.  The previous check
    # (len(sys.argv) > 0) was always true and raised IndexError when the
    # script was started without a path.
    if len(sys.argv) > 1:
        zkc.delete(sys.argv[1])
def main(args,loger): # try: # opts, arg = getopt.getopt(args, "h", ["help"]) # except getopt.GetoptError: # usage() # sys.exit(2) # # for opt, argitem in opts: # if opt in ("-h", "help"): # usage() # sys.exit(0) # else: # usage() # sys.exit(2) try : pid = 0 bRun = False bFileExits = False runtm = time.time() zombiestatustm = 0 if (os.path.exists(filelock)): bFileExits = True # 检证pid是不是在运行 prepid = 0 prepid = readpidfile(pidfile) if prepid != 0 and prepid != None: cmd = "ps %d" % prepid a = os.system(cmd) if a == 0: bRun = True else: bRun = False # 如果锁文件存在并/var/run/imonitor_master.pid中记录的pid在运行,则不运行些脚本 if bFileExits and bRun: print "Service Is Running<pid:%d>!" % prepid return # 成为守护进程 #daemonize() # 将此进程的pid写到/var/run/oristar_sms_server.pid writepidfile(pidfile) os.system('touch %s' % filelock) # 加载配置文件 log = 'Monitor Master Start\n' log +='Loading config file' loger.info(log) cfg = {} with open('/usr/local/imonitor2/config.json', 'r') as config: cfg = json.load(config) agent_node = cfg['AgentNode'] sms_run_host = cfg['RunHost'] ingest_sms = cfg['Ingest_SMS'] timeout = cfg['TimeOut'] agent_ls = agent_node.keys() log = "Load config successful,AgentNode %s"%agent_node loger.info(log) location_mgr = LocationMgr(agent_ls, sms_run_host,agent_node,loger) zkctl = zkpython.ZKClient(zk_server_ip) zkctl.async() localhost_name = cfg['NodeName'] # 获取sms的重启时间点 sms_reboot_time = cfg['sms_reboot_time'] reboot_cfg = [] reboot_cfg.append(sms_reboot_time['first']) reboot_cfg.append(sms_reboot_time['second']) conf_txt = '{"sms_reboot_time":%s}'%reboot_cfg zkctl.set('/scheduler/task',conf_txt) # 根据NodeName创建元服务节点 server_node_name = '/scheduler/server/%s' %localhost_name loger.info( server_node_name) if not zkctl.exists(server_node_name): zkctl.create(server_node_name, "", 1) else: while zkctl.exists(server_node_name): time.sleep(2) log = '%s has exist, wait for it deleted'%server_node_name zkctl.create(server_node_name, "", 1) # 创建master管理实例,并启动/scheduler/server节点的监测 master_node = 
cfg['MasterNode'] other_master_host = '' for host in master_node: if host == localhost_name: continue else: other_master_host = host regist_timeout = timeout['slaver_regist'] master2_stat = '' if localhost_name=='master1': if zkctl.exists('/scheduler/server/master2'): master2_stat = zkctl.get('/scheduler/server/master2') for i in range(10): zkctl.async('/scheduler/task') child_ls = zkctl.get_children('/scheduler/task') if not child_ls: break time.sleep(3) log = 'waiting for master2 delete /scheduler/task/slaveX ' loger.info(log) master_mgr = MasterMgr(localhost_name, other_master_host, zkctl, regist_timeout, ingest_sms, loger) master_mgr.start_monitor() # 创建agent管理实例,并启动/scheduler/agent节点的监控 agent_mgr = AgentMgr(agent_ls,localhost_name,location_mgr, zkctl, timeout['slaver_regist'],loger) # 如果master2已经接管,则恢复接管 if len(master2_stat) != 0 and master2_stat[0]: cur_master_stat = json.loads(master2_stat[0]) agent_mgr.set_stat_matrix(cur_master_stat) log = 'backing take over,set stat of master2 to master1 ' loger.info(log) master_mgr.set_setstatmatrix_fun(agent_mgr.set_stat_matrix)#注册设置节点状态回调 master_mgr.set_startagentmonitor_fun(agent_mgr.start_agent_monitor)#注册重启agent_monitor回调 master_mgr.set_deltasknode_fun(agent_mgr.del_task_node)#注册删除task回调 agent_mgr.start_monitor() # 开启webservice spyne_webservice.setMainObj(agent_mgr) webservice_th = Thread(target=spyne_webservice.webserverFun) webservice_th.start() try: wsclient = suds.client.Client('http://127.0.0.1/sms/webservice/wsnotice?wsdl') except: log = 'open http://127.0.0.1/sms/webservice/wsnotice?wsdl failed' loger.info(log) cnt = 0 boot_ingest_sms_delay = 0 while True: # 定时更新sms的状�? 
time.sleep(2) zkctl.async('/scheduler') # 获取所有sms的运行状态和位置 all_run_sms = agent_mgr.get_all_run_sms() if not all_run_sms: continue is_take_over = False for sms_id in all_run_sms: if spyne_webservice.g_sms_stat[sms_id][1] != all_run_sms[sms_id][1]: pre_loc = spyne_webservice.g_sms_stat[sms_id][1] spyne_webservice.g_sms_stat[sms_id][1] = all_run_sms[sms_id][1]#1 为sms的运行位置 log = '%s from %s to %s'%(sms_id,pre_loc,spyne_webservice.g_sms_stat[sms_id][1]) loger.info(log) is_take_over = True # 位置发生改变则通知web if is_take_over: try: if wsclient: wsclient.service.isSwitchHall(True) else: wsclient = suds.client.Client('http://127.0.0.1/sms/webservice/wsnotice?wsdl') wsclient.service.isSwitchHall(True) log = 'sms location change and notice web' loger.info(log) except: log = 'call webserivce isSwitchHall failed' loger.info(log) # 每隔30秒输出一次排序后的状态 cnt += 1 if cnt % 3 == 0: db_stat = check_db_sync_stat() spyne_webservice.setDBSyncStat(db_stat) deadnode = checkhdfs_stat() deadname=[] if deadnode: for name in deadnode: deadname.append(name.split(' ')[1][:-1]) log = 'check hdfs dead stat:%s(%s:%d)'%(deadnode,deadname,len(deadname)) loger.info(log) spyne_webservice.setHDFSStat(len(deadname)) spyne_webservice.setHDFSDeadNode(deadname) if (agent_mgr.getNodeHealthStat() and check_db_sync_second() == 0 ) or boot_ingest_sms_delay > 10: master_mgr.start_ingest_sms() else: health = agent_mgr.getNodeHealthStat() second = check_db_sync_second() log = 'node health:%s db sync second:%d delay cnt:%d,so delay boot ingest sms'%(health,second,boot_ingest_sms_delay) loger.info(log) boot_ingest_sms_delay += 1 if cnt % 15 == 0: log = 'all sms run stat:%s'%[(k,all_run_sms[k]) for k in sorted(all_run_sms.keys())] loger.info(log) cnt = 0 except: fp = StringIO.StringIO() traceback.print_exc(file=fp) message = fp.getvalue() loger.info(message)
# 如果锁文件存在并且记录的pid在运行,则不运行些脚本 if bFileExits and bRun: log = "Service Is Running<pid:%d>!" % prepid loger.info(log) exit(0) # 装此进程的pid写到pid file中 writepidfile(filelock) os.system('touch %s' % filelock) loger.info('Monitor Agent Start....') hostname = getHostName() name_ls = hostname.split('-') hostname = name_ls[0][:-1] zkclt = zkpython.ZKClient(zk_server_ip) if not init(zkclt, loger): exit(0) #开启事件处理 MainMgr = EventMgr(hostname, 10, zkclt, loger) setLogAndMgr(loger, MainMgr) MainMgr.start() # 读取配置 cfg_txt = zkclt.get('/scheduler/task', watcher) log = 'read confg form /scheduler/task(%s)' % cfg_txt[0] loger.info(log) if len(cfg_txt[0]) > 1: cfg = json.loads(cfg_txt[0]) rt = cfg['sms_reboot_time']