def get_nm_and_check(self, host, port):
    """Inspect a NameNode through its web UI and report every problem found.

    Two pages are fetched from ``http://host:port``:

    * ``dfshealth.jsp`` -- parsed by ``self.get_ha_state``, ``self.get_tid``
      and ``self.get_storge_dir``.
    * ``dfsnodelist.jsp?whatNodes=LIVE`` -- parsed by
      ``self.get_live_dn_table``.

    Each problem is reported with ``self.update_result("ERROR", message)``.

    Args:
        host: Host of the NameNode web UI.
        port: Port of the NameNode web UI.

    Returns:
        A ``(ha_state, tid)`` tuple.  ``("", 0)`` is returned when the health
        page cannot be fetched; when only the live-datanode page cannot be
        fetched, the values already parsed from the health page are returned.
    """
    # The same "cannot connect" message is used for both page fetches.
    unreachable_msg = u"不能连接到%s:%s。" % (host, port)

    # Second argument of util.get_http is presumably a timeout in seconds
    # -- TODO confirm against util.
    health_url = "http://%s:%s/dfshealth.jsp" % (host, port)
    health_html = util.get_http(health_url, 5)
    if health_html is None:
        self.update_result("ERROR", unreachable_msg)
        return ("", 0)

    # HA state and transaction id come from the health page.
    ha_state = self.get_ha_state(health_html)
    tid = self.get_tid(health_html)

    # Every storage directory is expected to be in the "Active" state.
    storage_dirs = self.get_storge_dir(health_html)
    for dir_name, dir_state in storage_dirs.items():
        if dir_state != "Active":
            self.update_result(
                "ERROR", u"目录 %s 状态为 %s。" % (dir_name, dir_state))

    # Live datanode status; rows are read by the keys "name", "volfails"
    # and "last_contact".
    nodes_url = "http://%s:%s/dfsnodelist.jsp?whatNodes=LIVE" % (host, port)
    nodes_html = util.get_http(nodes_url, 5)
    if nodes_html is None:
        self.update_result("ERROR", unreachable_msg)
        return (ha_state, tid)

    for row in self.get_live_dn_table(nodes_html):
        # Any failed volume on a datanode is reported.
        if row["volfails"] > 0:
            self.update_result(
                "ERROR",
                u"机器 %s 检测到 %d 个损坏的卷。" % (row["name"], row["volfails"]))
        # A last-contact value above 30 (seconds, per the message text) is
        # reported as a lost heartbeat.
        if row["last_contact"] > 30:
            self.update_result(
                "ERROR",
                u"机器 %s 检测丢失心跳 %d秒。" % (row["name"], row["last_contact"]))

    return (ha_state, tid)
def check_nn_active(host, port):
    """Tell whether the NameNode at ``host:port`` reports the HA state 'active'.

    The state is taken from the ``<h1>`` heading of ``dfshealth.jsp``.

    Args:
        host: Host of the NameNode web UI.
        port: Port of the NameNode web UI.

    Returns:
        True when the heading is found and its parenthesised state equals
        ``'active'``; False when the page is empty/unavailable, the heading is
        missing, or the state is anything else.
    """
    url = "http://%s:%s/dfshealth.jsp" % (host, port)
    # Second argument of util.get_http is presumably a timeout in seconds
    # -- TODO confirm against util.
    html = util.get_http(url, 5)
    if not html:
        return False

    # Raw string so that "\(" and "\)" reach the regex engine untouched
    # instead of being treated as (invalid) string escapes.
    # The pattern expects a heading of the form:
    #     <h1>NameNode '<anything>' (<state>)</h1>
    match = re.search(r"<h1>NameNode '.*?' \((.*?)\)</h1>", html, re.S)
    return match is not None and match.group(1) == 'active'
def get_nm_and_check(self, host, port):
    """Inspect a NameNode through its web UI and report every problem found.

    The health page (``dfshealth.jsp``) supplies the HA state, the
    transaction id and the storage-directory states; the live-datanode page
    (``dfsnodelist.jsp?whatNodes=LIVE``) supplies per-datanode volume
    failures and last-contact values.

    Problems are reported through ``self.update_result(key, message)`` where
    ``key`` has the shape ``"<subject>(<problem kind>)"``.

    Args:
        host: Host of the NameNode web UI.
        port: Port of the NameNode web UI.

    Returns:
        A ``(ha_state, tid)`` tuple.  ``("", 0)`` is returned when the health
        page cannot be fetched; when only the live-datanode page cannot be
        fetched, the values already parsed from the health page are returned.
    """
    base_url = "http://%s:%s" % (host, port)

    def report_unreachable():
        # Shared by both page fetches: same key and same message.
        self.update_result("%s(connect nmweb error)" % host,
                           u"不能连接到%s:%s。" % (host, port))

    # Second argument of util.get_http is presumably a timeout in seconds
    # -- TODO confirm against util.
    health_html = util.get_http(base_url + "/dfshealth.jsp", 5)
    if health_html is None:
        report_unreachable()
        return ("", 0)

    # HA state and transaction id come from the health page.
    ha_state = self.get_ha_state(health_html)
    tid = self.get_tid(health_html)

    # Every storage directory is expected to be in the "Active" state.
    for dir_name, dir_state in self.get_storge_dir(health_html).items():
        if dir_state != "Active":
            # The key previously formatted two values into a single "%s",
            # which raised TypeError as soon as a directory was not Active.
            # Both values now have a placeholder, giving one key per
            # (host, directory) pair.
            self.update_result(
                "%s:%s(storge error)" % (host, dir_name),
                u"目录 %s 状态为 %s。" % (dir_name, dir_state))

    # Live datanode status; rows are read by the keys "name", "volfails"
    # and "last_contact".
    nodes_html = util.get_http(base_url + "/dfsnodelist.jsp?whatNodes=LIVE", 5)
    if nodes_html is None:
        report_unreachable()
        return (ha_state, tid)

    for row in self.get_live_dn_table(nodes_html):
        node = row["name"]
        # Any failed volume on a datanode is reported.
        if row["volfails"] > 0:
            self.update_result(
                "%s(datanode error Vol)" % node,
                u"机器 %s 检测到 %d 个损坏的卷。" % (node, row["volfails"]))
        # A last-contact value above 30 (seconds, per the message text) is
        # reported as a lost heartbeat.
        if row["last_contact"] > 30:
            self.update_result(
                "%s(datanode lost heartbeat)" % node,
                u"机器 %s 检测丢失心跳 %d秒。" % (node, row["last_contact"]))

    return (ha_state, tid)
def getJobCounter(self,jobid):
    """
    Scrape selected counters from the job's counter web page.

    The page at ``/jobhistory/jobcounters/<jobid>`` on
    ``self.hshost:self.hsport`` is scraped because the REST API result is
    incomplete (it lacks data-local and similar information).

    Returns None when the page fetch yields nothing; otherwise a dict mapping
    each counter name to ``self.getCounterFromHtml(html, name)``.
    """
    page = util.get_http(
        "http://%s:%s/jobhistory/jobcounters/%s"
        % (self.hshost, self.hsport, jobid))
    if not page:
        return None

    # Counters looked up on the page, in lookup order.
    wanted = (
        "DATA_LOCAL_MAPS",
        "RACK_LOCAL_MAPS",
        "FILE_BYTES_READ",
        "FILE_BYTES_WRITTEN",
        "HDFS_BYTES_READ",
        "HDFS_BYTES_WRITTEN",
    )
    return {name: self.getCounterFromHtml(page, name) for name in wanted}
def getJobCounter(self, jobid):
    """
    Collect a fixed set of counters for ``jobid`` from its counter web page.

    The information is scraped from the HTML page because what the REST API
    returns is incomplete (data-local and similar information is missing).

    Returns None if the page could not be retrieved (falsy fetch result),
    else a dict keyed by counter name whose values come from
    ``self.getCounterFromHtml``.
    """
    endpoint = "%s:%s" % (self.hshost, self.hsport)
    counters_url = "http://%s/jobhistory/jobcounters/%s" % (endpoint, jobid)

    body = util.get_http(counters_url)
    if not body:
        return None

    # Locality counters first, then local-file and HDFS byte counters.
    counter_names = ["DATA_LOCAL_MAPS", "RACK_LOCAL_MAPS"]
    counter_names += ["FILE_BYTES_READ", "FILE_BYTES_WRITTEN"]
    counter_names += ["HDFS_BYTES_READ", "HDFS_BYTES_WRITTEN"]

    return dict(
        (counter, self.getCounterFromHtml(body, counter))
        for counter in counter_names
    )