def run():
    """Report whether HOST is powered on according to IPMI.

    Returns:
        True when ``IPMIManager.getPowerStatus(HOST)`` returns ``"OK"``;
        False for any other status or when the query raises.
    """
    ipmi_manager = IPMIManager()
    try:
        return ipmi_manager.getPowerStatus(HOST) == "OK"
    except Exception:
        # Best-effort probe: a failed query counts as "not OK". Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being reported as False.
        return False
def run(check_timeout=60):
    """Shut HOST off through IPMI and wait for the power status to change.

    Args:
        check_timeout: number of one-second polling rounds to wait for the
            power status to read ``"Error"`` after the shut-off request.

    Returns:
        True when the shut-off request reported ``"succeed"`` and the power
        status became ``"Error"`` within the timeout; False otherwise.
    """
    ipmi_manager = IPMIManager()
    result = ipmi_manager.shutOffNode(HOST)

    # The request outcome cannot change while polling, so a failed request
    # is rejected immediately instead of polling for the whole timeout.
    if result.code != "succeed":
        return False

    while check_timeout > 0:
        if ipmi_manager.getPowerStatus(HOST) == "Error":
            # NOTE(review): fixed 60 s pause before reporting success;
            # presumably a settle time -- confirm the intended purpose.
            time.sleep(60)
            return True
        check_timeout -= 1
        time.sleep(1)
    return False
class Detector(object):
    """Run health checks (network, service, power, OS, sensor) against one node.

    The service checks talk to a UDP endpoint on the node at ``port``; the
    power/OS/sensor checks go through ``IPMIManager`` and are skipped
    (reported healthy) when the node has no IPMI support.
    """

    def __init__(self, node, port):
        """Store the node identity, load the HASS config and open the socket.

        :param node: object exposing ``name`` and ``ipmi_status``.
        :param port: UDP port of the polling endpoint on the node.
        """
        self.node = node.name
        self.ipmi_status = node.ipmi_status
        self.ipmi_manager = IPMIManager()
        self.port = port
        self.sock = None
        self.config = ConfigParser.RawConfigParser()
        self.config.read('/home/localadmin/HASS/hass.conf')
        self.connect()

    def connect(self):
        """Create the UDP socket and bind it to (node, port).

        Failures are logged and printed, never raised; ``self.sock`` may
        therefore still be None (or unconnected) afterwards.
        """
        # connect to FA
        try:
            print("[" + self.node + "] create socket connection")
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            self.sock.setblocking(0)
            # settimeout() after setblocking(0) leaves the socket in
            # timeout mode: recv waits up to 10 s before raising.
            self.sock.settimeout(10)
            self.sock.connect((self.node, self.port))
        except Exception as e:
            logging.error("detector connect error %s" % str(e))
            print(str(e))
            print("Init [" + self.node + "] connection failed")

    def _reconnect(self):
        """Re-establish the polling socket without ever raising."""
        if self.sock is None:
            # The initial connect never produced a socket; build a new one.
            self.connect()
            return
        try:
            self.sock.connect((self.node, self.port))
        except Exception as e:
            logging.error("detector reconnect error %s" % str(e))

    def checkNetworkStatus(self):
        """Ping the node once per second for ``heartbeat_time`` rounds.

        :return: ``State.HEALTH`` when the last ping succeeded (or no round
            ran), otherwise ``State.NETWORK_FAIL``.
        """
        # NOTE(review): the health decision is taken after the loop, so only
        # the outcome of the final ping counts -- confirm this is intended.
        heartbeat_time = int(self.config.get("default", "heartbeat_time"))
        fail = False
        while heartbeat_time > 0:
            try:
                # check_output raises on a non-zero exit status, including
                # the one produced by `timeout` after 0.2 s.
                subprocess.check_output(
                    ['timeout', '0.2', 'ping', '-c', '1', self.node],
                    stderr=subprocess.STDOUT, universal_newlines=True)
                fail = False
            except Exception:
                logging.error("transient network fail")
                fail = True
            finally:
                time.sleep(1)
                heartbeat_time -= 1
        if not fail:
            return State.HEALTH
        return State.NETWORK_FAIL

    def checkServiceStatus(self):
        """Send a polling request to the node and interpret the reply.

        :return: ``State.HEALTH`` when the reply is exactly ``"OK"``,
            otherwise ``State.SERVICE_FAIL`` (including on socket errors).
        """
        try:
            self.sock.sendall("polling request")
            data = self.sock.recvfrom(1024)[0]
            if data == "OK":
                return State.HEALTH
            elif "error" in data:
                print(data)
                print("[" + self.node + "]service Failed")
            elif not data:
                print("[" + self.node + "]no ACK")
            else:
                print("[" + self.node + "]Receive:" + data)
            return State.SERVICE_FAIL
        except Exception as e:
            logging.error(str(e))
            print("[" + self.node + "] connection failed")
            # A failing reconnect must not escape the handler: the caller
            # expects a State value, not an exception.
            self._reconnect()
            return State.SERVICE_FAIL

    def checkPowerStatus(self):
        """Return ``State.HEALTH`` unless IPMI reports a non-"OK" power status."""
        if not self.ipmi_status:
            return State.HEALTH
        if self.ipmi_manager.getPowerStatus(self.node) == "OK":
            return State.HEALTH
        return State.POWER_FAIL

    def checkOSStatus(self):
        """Return ``State.HEALTH`` unless IPMI reports a non-"OK" OS status."""
        if not self.ipmi_status:
            return State.HEALTH
        if self.ipmi_manager.getOSStatus(self.node) == "OK":
            return State.HEALTH
        return State.OS_FAIL

    def checkSensorStatus(self):
        """Return ``State.HEALTH`` unless IPMI reports a non-"OK" sensor status."""
        if not self.ipmi_status:
            return State.HEALTH
        if self.ipmi_manager.getSensorStatus(self.node) == "OK":
            return State.HEALTH
        return State.SENSOR_FAIL

    def getFailServices(self):
        """Poll the node and return the raw reply when it is not ``"OK"``.

        :return: the reply data when it differs from ``"OK"``, ``"agents"``
            when the poll itself raises, and None when the reply is ``"OK"``.
        """
        try:
            self.sock.sendall("polling request")
            data = self.sock.recvfrom(1024)[0]
            if data != "OK":
                return data
        except Exception:
            return "agents"
        return None
class Operator(object):
    """Power operations (start / shut off / reboot) on compute nodes via IPMI.

    Every public power operation returns a ``Response`` whose ``code`` is
    ``"succeed"`` or ``"failed"`` and whose ``data`` carries the node name.
    """

    def __init__(self):
        self.nova_client = NovaClient.getInstance()
        self.ipmi_module = IPMIManager()
        self.cluster_list = ClusterManager.getClusterList()
        config = ConfigParser.RawConfigParser()
        config.read('hass.conf')
        self.port = int(config.get("detection", "polling_port"))

    def _response(self, code, message, node_name):
        """Build the Response shared by all power operations."""
        return Response(code=code, message=message,
                        data={"node_name": node_name})

    def startNode(self, node_name, default_wait_time=180):
        """Power a node on and wait for it to boot and answer polling.

        :param node_name: name of the node to start.
        :param default_wait_time: seconds to wait for boot, and again for
            the detection agent.
        :return: Response with code "succeed" or "failed". A detection agent
            that does not answer only adds a note to the message.
        """
        message = ""
        if not self._checkNodeIPMI(node_name):
            message += " IPMIOperator--node is not in compute pool or is not a IPMI PC . The node is %s." % node_name
            logging.error(message)
            return self._response("failed", message, node_name)

        message += " IPMIOperator--node is in compute pool . The node is %s." % node_name
        try:
            ipmi_result = self.ipmi_module.startNode(node_name)
            if ipmi_result.code != "succeed":
                raise Exception("IpmiModule start node fail")
            if not self._checkNodeBootSuccess(node_name, default_wait_time):
                raise Exception("check node boot fail")
            message += "start node success.The node is %s." % node_name
            logging.info(message)
            if not self._checkDetectionAgent(node_name, default_wait_time):
                message += "detectionagent in computing node is fail."
            result = self._response("succeed", message, node_name)
        except Exception as e:
            # start fail
            message += "IPMIOperator--start node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            result = self._response("failed", message, node_name)
        return result

    def shutOffNode(self, node_name):
        """Power a node off; refused for nodes that belong to a cluster.

        :param node_name: name of the node to shut off.
        :return: Response with code "succeed" or "failed".
        """
        message = ""
        if not (self._checkNodeIPMI(node_name) and
                self._checkNodeNotInCluster(node_name)):
            message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
            logging.error(message)
            return self._response("failed", message, node_name)

        try:
            ipmi_result = self.ipmi_module.shutOffNode(node_name)
            # check power status in IPMIModule
            if ipmi_result.code != "succeed":
                raise Exception("IpmiModule shut off node fail")
            message += "shut off node success.The node is %s." % node_name
            logging.info(message)
            result = self._response("succeed", message, node_name)
        except Exception as e:
            # shut off fail
            message += "IPMIOperator--shut off node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            result = self._response("failed", message, node_name)
        return result

    def rebootNode(self, node_name, default_wait_time=180):
        """Reboot a node; refused for nodes that belong to a cluster.

        :param node_name: name of the node to reboot.
        :param default_wait_time: seconds to wait for the detection agent.
        :return: Response with code "succeed" or "failed". A detection agent
            that does not answer only adds a note to the message.
        """
        message = ""
        if not (self._checkNodeIPMI(node_name) and
                self._checkNodeNotInCluster(node_name)):
            message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
            logging.error(message)
            return self._response("failed", message, node_name)

        try:
            ipmi_result = self.ipmi_module.rebootNode(node_name)
            if ipmi_result.code != "succeed":
                raise Exception("IpmiModule reboot node fail")
            message += "reboot node success.The node is %s." % node_name
            logging.info(message)
            if not self._checkDetectionAgent(node_name, default_wait_time):
                message += "detectionagent in computing node is fail."
            result = self._response("succeed", message, node_name)
        except Exception as e:
            # reboot fail
            message += "IPMIOperator--reboot node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            result = self._response("failed", message, node_name)
        return result

    def getAllInfoByNode(self, node_name):
        """Return whatever ``IPMIManager.getAllInfoByNode`` yields for the node."""
        return self.ipmi_module.getAllInfoByNode(node_name)

    def getNodeInfoByType(self, node_name, sensor_type):
        """Return ``IPMIManager.getNodeInfoByType`` for the node and sensor type."""
        return self.ipmi_module.getNodeInfoByType(node_name, sensor_type)

    def _checkNodeIPMI(self, node_name):
        """Return True when the node has IPMI and is in the nova compute pool."""
        # is IPMI PC
        if not self.ipmi_module._getIPMIStatus(node_name):
            return False
        # is in computing pool
        if node_name in self.nova_client.getComputePool():
            logging.info(
                " node is in compute pool . The node is %s." % node_name)
            return True
        logging.error(
            " node is not in compute pool please check again! The node is %s." % node_name)
        return False

    def _checkNodeNotInCluster(self, node_name):
        """Return True when no known cluster lists the node."""
        # A missing cluster list is treated as "no clusters".
        for cluster_id in self.cluster_list or ():
            cluster = ClusterManager.getCluster(cluster_id)
            if node_name in cluster.getAllNodeStr():
                return False
        return True

    def _checkNodeBootSuccess(self, nodeName, check_timeout):
        """Poll the IPMI power status once per second until it reads "OK".

        :return: True once the status is "OK", False when ``check_timeout``
            rounds pass without it.
        """
        # check power status in IPMIModule
        while check_timeout > 0:
            result = self.ipmi_module.getPowerStatus(nodeName)
            print("%s %s" % (result, check_timeout))
            if result == "OK":
                return True
            time.sleep(1)
            check_timeout -= 1
        return False

    def _checkDetectionAgent(self, nodeName, check_timeout):
        """Poll the node's detection agent over UDP until it answers "OK".

        A polling request is sent every 5 seconds while ``check_timeout``
        (decremented by 5 per round) stays positive.

        :return: True when a reply containing "OK" arrives, False on timeout
            or when the socket cannot be set up.
        """
        # not be protect(not connect socket)
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        except Exception as e:
            print("create socket fail %s" % str(e))
            return False

        try:
            try:
                sock.setblocking(0)
                sock.settimeout(0.5)
                sock.connect((nodeName, self.port))
            except Exception as e:
                # An unconnected socket can never deliver a reply, so give
                # up now instead of polling until the timeout expires.
                print("create socket fail %s" % str(e))
                return False

            while check_timeout > 0:
                time.sleep(5)
                data = ""
                try:
                    sock.sendall("polling request")
                    data = sock.recvfrom(2048)[0]
                except Exception as e:
                    print(str(e))
                if "OK" in data:
                    return True
                print("wating: %s" % check_timeout)
                check_timeout -= 5
            # timeout
            return False
        finally:
            # Release the socket on every exit path, not only on success.
            sock.close()
class Operator(object):
    """Power operations (start / shut off / reboot) on compute nodes via IPMI.

    Every public method returns a ``Response``: code "succeed" or "failed",
    a human-readable message, and a data payload.
    """

    def __init__(self):
        self.ipmi_module = IPMIManager()
        self.cluster_list = ClusterManager.getClusterList()
        self.config = ConfigParser.RawConfigParser()
        self.config.read('hass.conf')
        self.port = int(self.config.get("detection", "polling_port"))

    def _logged(self, succeeded, message, data):
        """Build the Response for an outcome and log its message at the matching level."""
        if succeeded:
            result = self.successResult(message, data)
            logging.info(message)
        else:
            result = self.failResult(message, data)
            logging.error(message)
        return result

    def _detectionMessage(self, node_name, wait_time):
        """Return the message fragment describing the detection agent state."""
        if self._checkDetectionAgent(node_name, wait_time):
            return "DetectionAgent in computing is running!"
        return "DetectionAgent in computing node is fail."

    def startNode(self, node_name, default_wait_time=180):
        """
        Power a node on, wait for it to boot, then probe its detection agent.

        :param node_name: name of the node to start.
        :param default_wait_time: seconds to wait for boot, and again for
            the detection agent.
        :return: Response with code "succeed" or "failed".
        """
        message = ""
        data = {"node_name": node_name}
        # The result is returned from the try/except bodies rather than from
        # a `finally`, so KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            if not (self._checkNodeIPMI(node_name) and
                    self._checkNodeInComputePool(node_name)):
                # node is not ipmi node
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC . The node is %s." % node_name
                return self._logged(False, message, data)

            message += " IPMIOperator--node is in compute pool . The node is %s." % node_name
            ipmi_result = self.ipmi_module.startNode(node_name)
            if ipmi_result.code != "succeed":
                message += "IpmiModule start node fail"
                return self._logged(False, message, data)

            if not self._checkNodeBootSuccess(node_name, default_wait_time):
                message += "check node boot fail"
                return self._logged(False, message, data)

            message += "start node success.The node is %s." % node_name
            # Exactly one of "running" / "fail" is reported for the agent.
            message += self._detectionMessage(node_name, default_wait_time)
            return self._logged(True, message, data)
        except Exception as e:
            message += "IPMIOperator--start node fail.The node is %s.%s" % (
                node_name, str(e))
            return self._logged(False, message, data)

    def shutOffNode(self, node_name):
        """
        Power a node off; refused for nodes that belong to an HA cluster.

        :param node_name: name of the node to shut off.
        :return: Response with code "succeed" or "failed".
        """
        message = ""
        data = {"node_name": node_name}
        try:
            if not (self._checkNodeIPMI(node_name) and
                    self._checkNodeInComputePool(node_name) and
                    self._checkNodeNotInCluster(node_name)):
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
                return self._logged(False, message, data)

            ipmi_result = self.ipmi_module.shutOffNode(node_name)
            # check power status in IPMIModule
            if ipmi_result.code != "succeed":
                message += "IpmiModule shut off node fail"
                return self._logged(False, message, data)

            message += "shut off node success.The node is %s." % node_name
            return self._logged(True, message, data)
        except Exception as e:
            # shut off fail
            message += "IPMIOperator--shut off node fail.The node is %s.%s" % (
                node_name, str(e))
            return self._logged(False, message, data)

    def rebootNode(self, node_name, default_wait_time=180):
        """
        Reboot a node; refused for nodes that belong to an HA cluster.

        :param node_name: name of the node to reboot.
        :param default_wait_time: seconds to wait for the detection agent.
        :return: Response with code "succeed" or "failed".
        """
        message = ""
        data = {"node_name": node_name}
        try:
            if not (self._checkNodeIPMI(node_name) and
                    self._checkNodeInComputePool(node_name) and
                    self._checkNodeNotInCluster(node_name)):
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
                return self._logged(False, message, data)

            ipmi_result = self.ipmi_module.rebootNode(node_name)
            if ipmi_result.code != "succeed":
                message += "IpmiModule reboot node fail"
                return self._logged(False, message, data)

            message += "reboot node success.The node is %s." % node_name
            message += self._detectionMessage(node_name, default_wait_time)
            return self._logged(True, message, data)
        except Exception as e:
            message += "IPMIOperator--reboot node fail.The node is %s.%s" % (
                node_name, str(e))
            return self._logged(False, message, data)

    def getAllInfoByNode(self, node_name):
        """
        Fetch all sensor information of a node through the IPMI module.

        :param node_name: name of the node to query.
        :return: the IPMI module's result, or a "failed" Response with an
            empty list as data when the query raises.
        """
        # A local result is used; nothing is written to module globals.
        try:
            return self.ipmi_module.getAllInfoByNode(node_name)
        except Exception as e:
            message = " IPMIOperator--get node info bt type fail. The node is %s." % node_name
            result = self.failResult(message, [])
            logging.error(
                "IPMIOperator get all sensor info of node fail.%s" % str(e))
            return result

    def getNodeInfoByType(self, node_name, sensor_type):
        """
        Fetch one sensor type's information of a node through the IPMI module.

        :param node_name: name of the node to query.
        :param sensor_type: sensor type passed through to the IPMI module.
        :return: the IPMI module's result, or a "failed" Response with an
            empty list as data when the query raises.
        """
        try:
            return self.ipmi_module.getNodeInfoByType(node_name, sensor_type)
        except Exception as e:
            message = " IPMIOperator--get node info bt type fail. The node is %s,sensor type is %s ." % (
                node_name, sensor_type)
            result = self.failResult(message, [])
            logging.error("IPMIOperator get %s sensor info of node fail.%s" % (
                sensor_type, str(e)))
            return result

    def _checkNodeIPMI(self, node_name):
        """Log and return the IPMI status the IPMI module reports for the node."""
        ipmistatus = self.ipmi_module._getIPMIStatus(node_name)
        if not ipmistatus:
            logging.error(
                " Node is not IPMI PC please check again! The node is %s." % node_name)
        else:
            logging.info(" Node is IPMI PC. node is %s." % node_name)
        return ipmistatus

    def _checkNodeInComputePool(self, node_name):
        """Log and return whether nova lists the node in the compute pool."""
        result = ClusterManager.nova.isInComputePool(node_name)
        if result:
            logging.info(
                " Node is in compute pool . The node is %s." % node_name)
        else:
            logging.error(
                " Node is not in compute pool please check again! The node is %s." % node_name)
        return result

    def _checkNodeNotInCluster(self, node_name):
        """Return True when no cluster in ``cluster_list`` contains the node.

        Every cluster that does contain the node is logged.
        """
        if self.cluster_list is None:
            return True
        result = True
        # items() works on both Python 2 and 3 dicts (iteritems() is 2-only).
        for cluster_id, cluster in self.cluster_list.items():
            if node_name in cluster.getAllNodeStr():
                logging.error(
                    " Node is in HA cluster. The node is %s, cluster id is %s" % (node_name, cluster_id))
                result = False
        return result

    def _checkNodeBootSuccess(self, nodeName, check_timeout):
        """Poll the IPMI power status once per second until it reads "OK".

        :return: True once the status is "OK", False when ``check_timeout``
            rounds pass without it.
        """
        # check power status in IPMIModule
        while check_timeout > 0:
            result = self.ipmi_module.getPowerStatus(nodeName)
            # Formatted explicitly so Python 2 does not print a tuple.
            print("%s %s" % (result, check_timeout))
            if result == "OK":
                return True
            time.sleep(1)
            check_timeout -= 1
        return False

    def _checkDetectionAgent(self, nodeName, check_timeout):
        """Poll the node's detection agent over UDP until it answers "OK".

        A polling request is sent every 5 seconds while ``check_timeout``
        (decremented by 5 per round) stays positive.

        :return: True when a reply containing "OK" arrives, False on timeout
            or when the socket cannot be set up.
        """
        # not be protect(not connect socket)
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        except Exception as e:
            print("create socket fail %s" % str(e))
            return False

        try:
            try:
                sock.setblocking(0)
                sock.settimeout(0.5)
                sock.connect((nodeName, self.port))
            except Exception as e:
                # An unconnected socket can never deliver a reply, so give
                # up now instead of polling until the timeout expires.
                print("create socket fail %s" % str(e))
                return False

            while check_timeout > 0:
                time.sleep(5)
                data = ""
                try:
                    sock.sendall("polling request")
                    data = sock.recvfrom(2048)[0]
                except Exception as e:
                    print(str(e))
                if "OK" in data:
                    return True
                print("waiting: %s" % check_timeout)
                check_timeout -= 5
            # timeout
            return False
        finally:
            # Release the socket on every exit path, not only on success.
            sock.close()

    def successResult(self, message, data):
        """
        Build a Response with code "succeed".

        :param message: text stored in the Response.
        :param data: payload stored in the Response.
        :return: the Response object.
        """
        return Response(code="succeed", message=message, data=data)

    def failResult(self, message, data):
        """
        Build a Response with code "failed".

        :param message: text stored in the Response.
        :param data: payload stored in the Response.
        :return: the Response object.
        """
        return Response(code="failed", message=message, data=data)