def resourcemanager_web(host, port):
    '''
    Check the ResourceManager at host:port for unhealthy and lost nodes.

    Issues two requests against /ws/v1/cluster/nodes (state=unhealthy, then
    state=lost), each through util.get_http_json(url, 5).

    Returns a (state, msg) tuple:
      * ("OK", u"") when neither query lists any node;
      * ("ERROR", text) with one sentence per unhealthy node (host name and
        health report) and one per lost node (host name and its last health
        update rendered in local time);
      * ("ERROR", cannot-connect text) as soon as either query yields None.
        Sentences collected before that point are discarded, exactly as the
        previous implementation did.
    '''
    def _nodes_in(node_state):
        # None -> util.get_http_json returned None (reported as "cannot
        #         connect" by the caller).
        # list -> the node entries of the reply; empty when the reply has no
        #         usable "nodes" / "node" member.
        url = "http://%s:%s/ws/v1/cluster/nodes?state=%s" % (host, port, node_state)
        reply = util.get_http_json(url, 5)
        if reply is None:
            return None
        holder = reply.get("nodes")
        if not holder:
            return []
        return holder.get("node") or []

    # state list: DECOMMISSIONED, LOST, NEW, REBOOTED, RUNNING, UNHEALTHY
    sentences = []

    unhealthy = _nodes_in("unhealthy")
    if unhealthy is None:
        return ("ERROR", u"不能连接到RM %s:%s" % (host, port))
    for node in unhealthy:
        sentences.append(
            u"检测到不健康机器 %s 健康报告 %s." % (node["nodeHostName"], node["healthReport"])
        )

    lost = _nodes_in("lost")
    if lost is None:
        return ("ERROR", u"不能连接到RM %s:%s" % (host, port))
    for node in lost:
        # lastHealthUpdate is divided by 1000 before being handed to
        # time.localtime -- presumably milliseconds since the epoch.
        ts = time.localtime(node['lastHealthUpdate'] / 1000)
        date_time = time.strftime("%Y-%m-%d %H:%M:%S", ts)
        sentences.append(
            u"检测到丢失的机器 %s 丢失时间 %s." % (node["nodeHostName"], date_time)
        )

    # Any reported node at all turns the overall state into ERROR.
    state = "ERROR" if sentences else "OK"
    return (state, u"".join(sentences))
def getJobAllTask(self, jobid):
    '''
    Walk every task attempt of one MapReduce job and record its counters.

    Requests, from the REST service at self.hshost:self.hsport:
      1. the task list of job `jobid`,
      2. the attempt list of each task,
      3. the counters of each attempt,
    and calls self.updateWithAttempt(attempt, counters) once per attempt.

    Returns None. The walk stops silently as soon as a task-list or
    attempt-list reply is empty or lacks the expected members.
    '''
    url = "http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks" % (
        self.hshost,
        self.hsport,
        jobid,
    )
    tasks = util.get_http_json(url)
    # Membership is tested with `in` / .get(): dict.has_key() does not exist
    # on Python 3, and a null "tasks" member must not raise.
    if not tasks or not tasks.get("tasks") or "task" not in tasks["tasks"]:
        return

    for task in tasks["tasks"]["task"]:
        taskId = task["id"]
        url = "http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts" % (
            self.hshost,
            self.hsport,
            jobid,
            taskId,
        )
        attempts = util.get_http_json(url)
        if (
            not attempts
            or not attempts.get("taskAttempts")
            or "taskAttempt" not in attempts["taskAttempts"]
        ):
            # NOTE(review): this abandons the remaining tasks as well, not
            # only the current one. Kept as in the original; confirm that
            # `continue` was not intended.
            return

        for attempt in attempts["taskAttempts"]["taskAttempt"]:
            attemptId = attempt["id"]
            url = "http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts/%s/counters" % (
                self.hshost,
                self.hsport,
                jobid,
                taskId,
                attemptId,
            )
            # The counters reply is forwarded unchecked, whatever it is.
            attemptCounter = util.get_http_json(url)
            self.updateWithAttempt(attempt, attemptCounter)
def getAppList(self):
    '''
    Query /ws/v1/cluster/apps on self.rmhost:self.rmport for applications
    that finished inside the current window and return whatever
    util.get_http_json gives back.

    The window runs from self.recordTime to self.recordTime + self.interval;
    both bounds are multiplied by 1000 before being placed in the query
    string (presumably seconds converted to milliseconds -- confirm).
    '''
    host, port = self.rmhost, self.rmport
    window_begin = self.recordTime * 1000
    window_end = (self.recordTime + self.interval) * 1000
    template = "http://%s:%s/ws/v1/cluster/apps?finishedTimeBegin=%d&finishedTimeEnd=%d"
    return util.get_http_json(template % (host, port, window_begin, window_end))
def getJobAllTask(self, jobid):
    '''
    Collect the counters of every task attempt belonging to job `jobid`.

    For each task returned by .../jobs/<jobid>/tasks, fetches
    .../tasks/<taskId>/attempts, and for each attempt fetches
    .../attempts/<attemptId>/counters, then passes the attempt together with
    its counters reply to self.updateWithAttempt. All requests go to
    self.hshost:self.hsport through util.get_http_json.

    Returns None; gives up quietly when a task list or an attempt list is
    missing from a reply.
    '''
    url = ("http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks"
           % (self.hshost, self.hsport, jobid))
    tasks = util.get_http_json(url)
    # dict.has_key() was removed in Python 3; .get() / `in` behave the same
    # on Python 2 and additionally tolerate a null "tasks" member.
    task_holder = tasks.get('tasks') if tasks else None
    if not task_holder or 'task' not in task_holder:
        return

    for task in task_holder['task']:
        taskId = task['id']
        url = ("http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts"
               % (self.hshost, self.hsport, jobid, taskId))
        attempts = util.get_http_json(url)
        attempt_holder = attempts.get('taskAttempts') if attempts else None
        if not attempt_holder or 'taskAttempt' not in attempt_holder:
            # NOTE(review): returning here also skips every task that has
            # not been visited yet. Behaviour kept from the original --
            # verify it is intended.
            return

        for attempt in attempt_holder['taskAttempt']:
            attemptId = attempt['id']
            url = ("http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts/%s/counters"
                   % (self.hshost, self.hsport, jobid, taskId, attemptId))
            attemptCounter = util.get_http_json(url)
            self.updateWithAttempt(attempt, attemptCounter)
def resourcemanager_web(host, port):
    '''
    Build the alarm list for the ResourceManager at host:port.

    Queries /ws/v1/cluster/nodes for state=unhealthy and then state=lost,
    each through util.get_http_json(url, 5).

    Returns a list of {"key_word": ..., "msg": ...} dicts:
      * one entry per unhealthy node (host name and health report),
      * one entry per lost node (host name and its last health update
        rendered in local time),
      * a single "connect rm error" entry, appended after whatever was
        collected so far, when a query yields None; the function returns
        immediately in that case.
    An empty list means nothing was reported.
    '''
    alarm_list = []

    def _alarm(key_word, msg):
        alarm_list.append({"key_word": key_word, "msg": msg})

    def _connect_error():
        _alarm("%s(connect rm error)" % host, u"不能连接到RM %s:%s" % (host, port))

    def _nodes_in(node_state):
        # None when util.get_http_json returned None, otherwise the list of
        # node entries (empty if the reply has no usable "nodes"/"node").
        url = "http://%s:%s/ws/v1/cluster/nodes?state=%s" % (host, port, node_state)
        reply = util.get_http_json(url, 5)
        if reply is None:
            return None
        holder = reply.get("nodes")
        if not holder:
            return []
        return holder.get("node") or []

    # state list: DECOMMISSIONED, LOST, NEW, REBOOTED, RUNNING, UNHEALTHY
    unhealthy = _nodes_in("unhealthy")
    if unhealthy is None:
        _connect_error()
        return alarm_list
    for node in unhealthy:
        # The "unhealty" spelling is kept on purpose: key_word is runtime
        # data and may be matched by whatever consumes these alarms.
        key_word = "%s(rm unhealty node)" % node["nodeHostName"]
        msg = u"检测到不健康机器 %s 健康报告 %s." % (node["nodeHostName"], node["healthReport"])
        _alarm(key_word, msg)

    lost = _nodes_in("lost")
    if lost is None:
        _connect_error()
        return alarm_list
    for node in lost:
        # lastHealthUpdate is divided by 1000 before time.localtime --
        # presumably milliseconds since the epoch.
        ts = time.localtime(node['lastHealthUpdate'] / 1000)
        date_time = time.strftime("%Y-%m-%d %H:%M:%S", ts)
        msg = u"检测到丢失的机器 %s 丢失时间 %s." % (node["nodeHostName"], date_time)
        key_word = "%s(rm lost node)" % node["nodeHostName"]
        _alarm(key_word, msg)

    return alarm_list
def getJobAllTask(self, jobid):
    '''
    Visit all tasks of job `jobid`, all attempts of each task, and feed each
    attempt plus its counters reply to self.updateWithAttempt.

    Three kinds of URL under
    http://<self.hshost>:<self.hsport>/ws/v1/history/mapreduce/jobs/<jobid>
    are fetched with util.get_http_json: /tasks, /tasks/<id>/attempts and
    /tasks/<id>/attempts/<id>/counters.

    Returns None. Processing ends without error when the task listing or an
    attempt listing cannot be found in a reply.
    '''
    def _listing(reply, outer, inner):
        # Returns reply[outer][inner], or None when the reply is empty or
        # either level is missing/null. Uses .get()/`in` rather than
        # dict.has_key(), which Python 3 no longer provides.
        if not reply:
            return None
        holder = reply.get(outer)
        if not holder or inner not in holder:
            return None
        return holder[inner]

    url = ("http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks" %
           (self.hshost, self.hsport, jobid))
    task_list = _listing(util.get_http_json(url), 'tasks', 'task')
    if task_list is None:
        return

    for task in task_list:
        taskId = task['id']
        url = (
            "http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts" %
            (self.hshost, self.hsport, jobid, taskId))
        attempt_list = _listing(util.get_http_json(url),
                                'taskAttempts', 'taskAttempt')
        if attempt_list is None:
            # NOTE(review): stops the whole walk, including tasks not yet
            # visited. Same as the original; confirm `continue` is not what
            # was meant.
            return

        for attempt in attempt_list:
            attemptId = attempt['id']
            url = (
                "http://%s:%s/ws/v1/history/mapreduce/jobs/%s/tasks/%s/attempts/%s/counters"
                % (self.hshost, self.hsport, jobid, taskId, attemptId))
            attemptCounter = util.get_http_json(url)
            self.updateWithAttempt(attempt, attemptCounter)
def getMetrics(self):
    '''
    Fetch /ws/v1/cluster/metrics from self.rmhost:self.rmport and return the
    result of util.get_http_json unchanged.
    '''
    endpoint = (self.rmhost, self.rmport)
    return util.get_http_json("http://%s:%s/ws/v1/cluster/metrics" % endpoint)
def getJobHistory(self, jobid):
    '''
    Fetch /ws/v1/history/mapreduce/jobs/<jobid> from
    self.hshost:self.hsport and return the result of util.get_http_json
    unchanged.
    '''
    url_parts = (self.hshost, self.hsport, jobid)
    job_url = "http://%s:%s/ws/v1/history/mapreduce/jobs/%s" % url_parts
    return util.get_http_json(job_url)
def getAppList(self):
    '''
    Return util.get_http_json's reply for the cluster applications whose
    finish time falls between self.recordTime and
    self.recordTime + self.interval.

    Both bounds are scaled by 1000 before being formatted into the
    finishedTimeBegin / finishedTimeEnd query parameters (presumably a
    seconds-to-milliseconds conversion -- confirm against the caller).
    '''
    fields = (
        self.rmhost,
        self.rmport,
        self.recordTime * 1000,
        (self.recordTime + self.interval) * 1000,
    )
    apps_url = "http://%s:%s/ws/v1/cluster/apps?finishedTimeBegin=%d&finishedTimeEnd=%d" % fields
    return util.get_http_json(apps_url)
def getMetrics(self):
    '''
    Request the cluster metrics document (/ws/v1/cluster/metrics) from the
    service at self.rmhost:self.rmport.

    Returns whatever util.get_http_json returns for that URL.
    '''
    host = self.rmhost
    port = self.rmport
    metrics_url = "http://%s:%s/ws/v1/cluster/metrics" % (host, port)
    response = util.get_http_json(metrics_url)
    return response
def getJobHistory(self, jobid):
    '''
    Request the history record of one MapReduce job
    (/ws/v1/history/mapreduce/jobs/<jobid>) from the service at
    self.hshost:self.hsport.

    Returns whatever util.get_http_json returns for that URL.
    '''
    host = self.hshost
    port = self.hsport
    return util.get_http_json(
        "http://%s:%s/ws/v1/history/mapreduce/jobs/%s" % (host, port, jobid)
    )