def run():
    """Request sensor information of type ``TYPE`` for ``HOST``.

    Returns:
        bool: True when the response code equals "succeed", otherwise False.
    """
    response = IPMIManager().getNodeInfoByType(HOST, TYPE)
    return response.code == "succeed"
def __init__(self):
    """Collect the collaborators and the polling port used by the operator.

    The polling port is read from the "detection" section of 'hass.conf'
    (path relative to the current working directory).
    """
    self.nova_client = NovaClient.getInstance()
    self.ipmi_module = IPMIManager()
    self.cluster_list = ClusterManager.getClusterList()
    parser = ConfigParser.RawConfigParser()
    parser.read('hass.conf')
    polling_port = parser.get("detection", "polling_port")
    self.port = int(polling_port)
def __init__(self, name, cluster_id):
    """Set up a node record and prepare (but do not start) its detection thread.

    :param name: node name; also passed to the IPMI status lookup.
    :param cluster_id: identifier of the cluster the node belongs to.
    """
    self.name, self.cluster_id = name, cluster_id
    ipmi = IPMIManager()
    self.ipmi = ipmi
    self.ipmi_status = ipmi._getIPMIStatus(self.name)
    self.nova_client = NovaClient.getInstance()
    # Placeholder until initDetectionThread() assigns the real thread object.
    self.detection_thread = None
    self.initDetectionThread()
def __init__(self, node, port):
    """Store the node's name/IPMI flag, load the config and open the socket.

    :param node: object exposing ``name`` and ``ipmi_status`` attributes.
    :param port: port handed to ``connect()`` for the node connection.
    """
    self.node, self.ipmi_status = node.name, node.ipmi_status
    self.ipmi_manager = IPMIManager()
    self.port = port
    self.sock = None
    parser = ConfigParser.RawConfigParser()
    self.config = parser
    parser.read('/home/localadmin/HASS/hass.conf')
    self.connect()
def run():
    """Check whether the power status reported for ``HOST`` is "OK".

    Returns:
        bool: True when ``getPowerStatus(HOST)`` returns "OK"; False when it
        returns anything else or raises an ordinary exception.
    """
    ipmi_manager = IPMIManager()
    try:
        return ipmi_manager.getPowerStatus(HOST) == "OK"
    except Exception:
        # A failing query counts as "not OK". Catching Exception instead of
        # using a bare ``except`` lets KeyboardInterrupt / SystemExit
        # propagate, so the check can still be interrupted.
        return False
def run(check_timeout=300):
    """Reboot ``HOST`` and report whether it came back up.

    :param check_timeout: value forwarded to ``_check_boot_up``.
    :return: True only when the boot check answers "OK" and the reboot
             response code is "succeed"; False otherwise.
    """
    manager = IPMIManager()
    reboot = manager.rebootNode(HOST)
    print("wait to %s boot up" % HOST)
    time.sleep(150)  # wait node to reboot
    boot_state = _check_boot_up(check_timeout)
    print(boot_state)
    if boot_state != "OK":
        return False
    return reboot.code == "succeed"
def __init__(self, name, cluster_id):
    """Set up a node record, load the HASS config and prepare its detection thread.

    :param name: node name; also passed to the IPMI status lookup.
    :param cluster_id: identifier of the cluster the node belongs to.
    """
    self.name, self.cluster_id = name, cluster_id
    ipmi = IPMIManager()
    self.ipmi = ipmi
    self.ipmi_status = ipmi._getIPMIStatus(self.name)
    self.nova_client = NovaClient.getInstance()
    # Placeholder until initDetectionThread() assigns the real thread object.
    self.detection_thread = None
    parser = ConfigParser.RawConfigParser()
    self.config = parser
    parser.read('/home/localadmin/HASS/hass.conf')
    self.initDetectionThread()
def run(check_timeout=60):
    """Shut off ``HOST`` and poll its power status until it reads "Error".

    The power status is polled about once per second, at most
    ``check_timeout`` times.

    :param check_timeout: number of polling rounds before giving up.
    :return: True once the power status is "Error" and the shut-off response
             code is "succeed" (after an additional 60 second pause);
             False when the polling budget is exhausted.
    """
    manager = IPMIManager()
    shutoff = manager.shutOffNode(HOST)
    remaining = check_timeout
    while remaining > 0:
        power_reads_error = manager.getPowerStatus(HOST) == "Error"
        if power_reads_error and shutoff.code == "succeed":
            time.sleep(60)
            return True
        remaining -= 1
        time.sleep(1)
    return False
class NodeInterface(object):
    """A compute node together with its IPMI flag and detection thread."""

    def __init__(self, name, cluster_id):
        """Record the node identity and build (but do not start) its detection thread.

        :param name: node name; also used as the host for socket connections.
        :param cluster_id: identifier of the cluster the node belongs to.
        """
        self.name = name
        self.cluster_id = cluster_id
        self.ipmi = IPMIManager()
        self.ipmi_status = self.ipmi._getIPMIStatus(self.name)
        self.nova_client = NovaClient.getInstance()
        self.detection_thread = None
        self.initDetectionThread()

    def setNodeName(self, name):
        """Replace the node name."""
        self.name = name

    def getNodeName(self):
        """Return the node name."""
        return self.name

    def setClusterId(self, cluster_id):
        """Replace the cluster id."""
        self.cluster_id = cluster_id

    def getClusterId(self, cluster_id=None):
        """Return the cluster id.

        :param cluster_id: ignored. It was a required positional argument of
            this getter; it is kept (now optional) so existing callers that
            pass a value keep working.
        """
        return self.cluster_id

    def initDetectionThread(self):
        """Create the DetectionThread from the "detection" section of 'hass.conf'."""
        config = ConfigParser.RawConfigParser()
        config.read('hass.conf')
        polling_port = int(config.get("detection", "polling_port"))
        polling_interval = float(config.get("detection", "polling_interval"))
        self.detection_thread = DetectionThread(self.cluster_id, self,
                                                polling_port,
                                                polling_interval)

    def startDetectionThread(self):
        """Mark the detection thread as a daemon and start it."""
        self.detection_thread.daemon = True
        self.detection_thread.start()

    def deleteDetectionThread(self):
        """Ask the detection thread to stop."""
        self.detection_thread.stop()

    def getInfo(self):
        """Return ``[name, cluster_id, ipmi_status]``."""
        return [self.name, self.cluster_id, self.ipmi_status]

    def sendUpdateInstance(self):
        """Send the text "update instance" to port 5001 of this node over TCP.

        :raises socket.error: when the connection or the send fails.
        """
        so = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            so.connect((self.name, 5001))
            # sendall() keeps writing until the whole message is out, whereas
            # send() may transmit only part of it.
            so.sendall("update instance")
        finally:
            # Close the socket even when connect/send raises, so a failed
            # notification does not leak a file descriptor.
            so.close()
import socket import sys import time import paramiko sys.path.insert(0, '/home/controller/Desktop/MOST/HASS') from IPMIModule import IPMIManager from Node import Node CLUSTER_ID = "clusterid" HOST = "compute4" PORT = 2468 ipmi_manager = IPMIManager() def run(): try: client = _create_ssh_client(HOST) cmd = "sudo sh /home/" + HOST + "/Desktop/MOST/HASS/compute_node/os_hang.sh" #cmd = "kill -SEGV 1 & ; kill -SEGV 1" #cmd = "sudo sh /home/compute4/Desktop/test.sh" stdin, stdout, stderr = _remote_exec(client, cmd) # print stdout.read() result = detection_OS_fail(20) if result: print "detect os successfuly" recover = recover_os_fail(180) if recover: return True else:
class Detector(object):
    """Health probes (network, service, power, OS, sensor) for one node."""

    def __init__(self, node, port):
        """Store the node's name/IPMI flag, load the config and open the socket.

        :param node: object exposing ``name`` and ``ipmi_status`` attributes.
        :param port: UDP port used for the polling connection to the node.
        """
        self.node = node.name
        self.ipmi_status = node.ipmi_status
        self.ipmi_manager = IPMIManager()
        self.port = port
        self.sock = None
        self.config = ConfigParser.RawConfigParser()
        self.config.read('/home/localadmin/HASS/hass.conf')
        self.connect()

    def connect(self):
        """Create the UDP socket (10 s timeout) and associate it with the node.

        Failures are logged and printed rather than raised; ``self.sock`` may
        therefore stay ``None`` or be left unconnected.
        """
        # connect to FA
        try:
            print("[" + self.node + "] create socket connection")
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            self.sock.setblocking(0)
            self.sock.settimeout(10)
            self.sock.connect((self.node, self.port))
        except Exception as e:
            logging.error("detector connect error %s" % str(e))
            print(str(e))
            print("Init [" + self.node + "] connection failed")

    def _reconnect(self):
        """Best-effort re-association of the polling socket with the node.

        Never raises: this runs inside an exception handler, where a second
        failure would otherwise escape from the status check.
        """
        if self.sock is None:
            # The initial connect() never produced a socket; build one now.
            self.connect()
            return
        try:
            self.sock.connect((self.node, self.port))
        except Exception as e:
            logging.error("detector reconnect error %s" % str(e))

    def checkNetworkStatus(self):
        """Ping the node once per second for "heartbeat_time" rounds.

        The number of rounds comes from the "default" section of the config.
        Only the outcome of the last round decides the result.

        :return: ``State.HEALTH`` when the last ping succeeded (or no round
                 ran), otherwise ``State.NETWORK_FAIL``.
        """
        heartbeat_time = int(self.config.get("default", "heartbeat_time"))
        fail = False
        while heartbeat_time > 0:
            try:
                subprocess.check_output(
                    ['timeout', '0.2', 'ping', '-c', '1', self.node],
                    stderr=subprocess.STDOUT,
                    universal_newlines=True)
                fail = False
            except Exception:
                logging.error("transient network fail")
                fail = True
            finally:
                time.sleep(1)
                heartbeat_time -= 1
        if not fail:
            return State.HEALTH
        return State.NETWORK_FAIL

    def checkServiceStatus(self):
        """Send "polling request" to the node and interpret the reply.

        :return: ``State.HEALTH`` when the reply is exactly "OK"; otherwise
                 ``State.SERVICE_FAIL`` (including when the exchange raises).
        """
        try:
            line = "polling request"
            self.sock.sendall(line)
            data, addr = self.sock.recvfrom(1024)
            if data == "OK":
                return State.HEALTH
            elif "error" in data:
                print(data)
                print("[" + self.node + "]service Failed")
            elif not data:
                print("[" + self.node + "]no ACK")
            else:
                print("[" + self.node + "]Receive:" + data)
            return State.SERVICE_FAIL
        except Exception as e:
            logging.error(str(e))
            print("[" + self.node + "] connection failed")
            # Reconnecting can fail too (no socket yet, node unreachable);
            # _reconnect() logs that instead of letting it escape the probe.
            self._reconnect()
            return State.SERVICE_FAIL

    def checkPowerStatus(self):
        """Return ``State.HEALTH`` if IPMI is off or power status is "OK", else ``State.POWER_FAIL``."""
        if not self.ipmi_status:
            return State.HEALTH
        status = self.ipmi_manager.getPowerStatus(self.node)
        if status == "OK":
            return State.HEALTH
        return State.POWER_FAIL

    def checkOSStatus(self):
        """Return ``State.HEALTH`` if IPMI is off or OS status is "OK", else ``State.OS_FAIL``."""
        if not self.ipmi_status:
            return State.HEALTH
        status = self.ipmi_manager.getOSStatus(self.node)
        if status == "OK":
            return State.HEALTH
        return State.OS_FAIL

    def checkSensorStatus(self):
        """Return ``State.HEALTH`` if IPMI is off or sensor status is "OK", else ``State.SENSOR_FAIL``."""
        if not self.ipmi_status:
            return State.HEALTH
        status = self.ipmi_manager.getSensorStatus(self.node)
        if status == "OK":
            return State.HEALTH
        return State.SENSOR_FAIL

    def getFailServices(self):
        """Poll the node and return its reply when that reply is not "OK".

        :return: the raw reply when it differs from "OK"; ``None`` when the
                 reply is "OK"; the string "agents" when the exchange raises.
        """
        try:
            line = "polling request"
            self.sock.sendall(line)
            data, addr = self.sock.recvfrom(1024)
            if data != "OK":
                return data
        except Exception:
            return "agents"
def __init__(self, id, name):
    """Initialise the base class, then attach an IPMI manager and the HASS config.

    :param id: cluster identifier, forwarded to the parent constructor.
    :param name: cluster name, forwarded to the parent constructor.
    """
    super(Cluster, self).__init__(id, name)
    self.ipmi = IPMIManager()
    parser = ConfigParser.RawConfigParser()
    self.config = parser
    parser.read('/home/localadmin/HASS/hass.conf')
class Operator(object):
    """Start, shut off and reboot compute nodes through the IPMI module.

    Every public power operation returns a ``Response`` whose ``code`` is
    "succeed" or "failed" and whose ``data`` is ``{"node_name": ...}``.
    """

    def __init__(self):
        """Collect collaborators and read the polling port from 'hass.conf'."""
        self.nova_client = NovaClient.getInstance()
        self.ipmi_module = IPMIManager()
        self.cluster_list = ClusterManager.getClusterList()
        config = ConfigParser.RawConfigParser()
        config.read('hass.conf')
        self.port = int(config.get("detection", "polling_port"))

    def startNode(self, node_name, default_wait_time=180):
        """Power on a node and wait for it to boot.

        :param node_name: name of the node to start.
        :param default_wait_time: budget (seconds) used both for the boot
            check and for the detection-agent check.
        :return: ``Response``; "succeed" once the node booted (an unreachable
            detection agent is only noted in the message), else "failed".
        """
        message = ""
        if not self._checkNodeIPMI(node_name):
            message += " IPMIOperator--node is not in compute pool or is not a IPMI PC . The node is %s." % node_name
            logging.error(message)
            return Response(code="failed", message=message,
                            data={"node_name": node_name})

        message += " IPMIOperator--node is in compute pool . The node is %s." % node_name
        try:
            ipmi_result = self.ipmi_module.startNode(node_name)
            if ipmi_result.code != "succeed":
                raise Exception("IpmiModule start node fail")
            if not self._checkNodeBootSuccess(node_name, default_wait_time):
                raise Exception("check node boot fail")
            message += "start node success.The node is %s." % node_name
            logging.info(message)
            if not self._checkDetectionAgent(node_name, default_wait_time):
                message += "detectionagent in computing node is fail."
            result = Response(code="succeed", message=message,
                              data={"node_name": node_name})
        except Exception as e:
            # start fail
            message += "IPMIOperator--start node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            result = Response(code="failed", message=message,
                              data={"node_name": node_name})
        return result

    def shutOffNode(self, node_name):
        """Power off a node that is IPMI-capable and not part of any cluster.

        :param node_name: name of the node to shut off.
        :return: ``Response`` with code "succeed" or "failed".
        """
        message = ""
        if not (self._checkNodeIPMI(node_name)
                and self._checkNodeNotInCluster(node_name)):
            message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
            logging.error(message)
            return Response(code="failed", message=message,
                            data={"node_name": node_name})

        try:
            ipmi_result = self.ipmi_module.shutOffNode(node_name)
            # check power status in IPMIModule
            if ipmi_result.code != "succeed":
                raise Exception("IpmiModule shut off node fail")
            message += "shut off node success.The node is %s." % node_name
            logging.info(message)
            result = Response(code="succeed", message=message,
                              data={"node_name": node_name})
        except Exception as e:
            # shut off fail
            message += "IPMIOperator--shut off node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            result = Response(code="failed", message=message,
                              data={"node_name": node_name})
        return result

    def rebootNode(self, node_name, default_wait_time=180):
        """Reboot a node that is IPMI-capable and not part of any cluster.

        :param node_name: name of the node to reboot.
        :param default_wait_time: budget (seconds) for the detection-agent
            check after the reboot command succeeded.
        :return: ``Response`` with code "succeed" or "failed".
        """
        message = ""
        if not (self._checkNodeIPMI(node_name)
                and self._checkNodeNotInCluster(node_name)):
            message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
            logging.error(message)
            return Response(code="failed", message=message,
                            data={"node_name": node_name})

        try:
            ipmi_result = self.ipmi_module.rebootNode(node_name)
            if ipmi_result.code != "succeed":
                raise Exception("IpmiModule reboot node fail")
            message += "reboot node success.The node is %s." % node_name
            logging.info(message)
            if not self._checkDetectionAgent(node_name, default_wait_time):
                message += "detectionagent in computing node is fail."
            result = Response(code="succeed", message=message,
                              data={"node_name": node_name})
        except Exception as e:
            # reboot fail
            message += "IPMIOperator--reboot node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            result = Response(code="failed", message=message,
                              data={"node_name": node_name})
        return result

    def getAllInfoByNode(self, node_name):
        """Return whatever the IPMI module reports for all sensors of the node."""
        return self.ipmi_module.getAllInfoByNode(node_name)

    def getNodeInfoByType(self, node_name, sensor_type):
        """Return whatever the IPMI module reports for one sensor type of the node."""
        return self.ipmi_module.getNodeInfoByType(node_name, sensor_type)

    def _checkNodeIPMI(self, node_name):
        """Return True when the node has IPMI and is listed in the compute pool."""
        # is IPMI PC
        if not self.ipmi_module._getIPMIStatus(node_name):
            return False
        # is in computing pool
        if node_name in self.nova_client.getComputePool():
            message = " node is in compute pool . The node is %s." % node_name
            logging.info(message)
            return True
        message = " node is not in compute pool please check again! The node is %s." % node_name
        logging.error(message)
        return False

    def _checkNodeNotInCluster(self, node_name):
        """Return False as soon as any known cluster lists the node, else True."""
        for cluster_id in self.cluster_list:
            cluster = ClusterManager.getCluster(cluster_id)
            if node_name in cluster.getAllNodeStr():
                return False
        return True

    def _checkNodeBootSuccess(self, nodeName, check_timeout):
        """Poll the power status once per second until it is "OK".

        :param nodeName: node to poll.
        :param check_timeout: maximum number of polls.
        :return: True once the status is "OK", False when the budget runs out.
        """
        # check power status in IPMIModule
        while check_timeout > 0:
            result = self.ipmi_module.getPowerStatus(nodeName)
            print("%s %s" % (result, check_timeout))
            if result == "OK":
                return True
            time.sleep(1)
            check_timeout -= 1
        return False

    def _checkDetectionAgent(self, nodeName, check_timeout):
        """Poll the node's detection agent over UDP until it answers "OK".

        A "polling request" datagram is sent every 5 seconds; each round
        costs 5 from ``check_timeout``.

        :param nodeName: node whose detection agent is polled.
        :param check_timeout: polling budget in seconds.
        :return: True when a reply containing "OK" arrived, False on timeout.
        """
        # not be protect(not connect socket)
        # check detection agent
        sock = None
        data = ""
        try:
            try:
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                sock.setblocking(0)
                sock.settimeout(0.5)
                sock.connect((nodeName, self.port))
            except Exception as e:
                print("create socket fail %s" % str(e))
            while True:
                time.sleep(5)
                if check_timeout <= 0:
                    # timeout
                    return False
                try:
                    sock.sendall("polling request")
                    data, addr = sock.recvfrom(2048)
                except Exception as e:
                    print(str(e))
                if "OK" in data:
                    return True
                print("wating: %s" % check_timeout)
                check_timeout -= 5
        finally:
            # Release the socket on every exit path; it used to be closed
            # only after a successful reply, leaking a descriptor per timeout.
            if sock is not None:
                sock.close()
def __init__(self):
    """Attach the IPMI manager, the cluster list and the parsed 'hass.conf'.

    The polling port is taken from the "detection" section of the config.
    """
    self.ipmi_module = IPMIManager()
    self.cluster_list = ClusterManager.getClusterList()
    parser = ConfigParser.RawConfigParser()
    self.config = parser
    parser.read('hass.conf')
    polling_port = parser.get("detection", "polling_port")
    self.port = int(polling_port)
class Operator(object):
    """Start, shut off and reboot compute nodes through the IPMI module.

    Public operations return a ``Response`` built by ``successResult`` /
    ``failResult``; ordinary exceptions are converted into a failed
    ``Response`` instead of being raised.
    """

    def __init__(self):
        """Attach collaborators and read the polling port from 'hass.conf'."""
        self.ipmi_module = IPMIManager()
        self.cluster_list = ClusterManager.getClusterList()
        self.config = ConfigParser.RawConfigParser()
        self.config.read('hass.conf')
        self.port = int(self.config.get("detection", "polling_port"))

    def startNode(self, node_name, default_wait_time=180):
        """Power on a node and wait for it to boot.

        :param node_name: name of the node to start.
        :param default_wait_time: budget (seconds) used both for the boot
            check and for the detection-agent check.
        :return: ``Response``; "succeed" once the node booted (the message
            states whether the detection agent answered), else "failed".
        """
        message = ""
        data = {"node_name": node_name}
        try:
            if not (self._checkNodeIPMI(node_name)
                    and self._checkNodeInComputePool(node_name)):
                # node is not ipmi node
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC . The node is %s." % node_name
                result = self.failResult(message, data)
                logging.error(message)
                return result

            message += " IPMIOperator--node is in compute pool . The node is %s." % node_name
            ipmi_result = self.ipmi_module.startNode(node_name)
            if ipmi_result.code != "succeed":
                # ipmi_result is fail
                message += "IpmiModule start node fail"
                result = self.failResult(message, data)
                logging.error(message)
                return result

            if not self._checkNodeBootSuccess(node_name, default_wait_time):
                # boot_up is fail
                message += "check node boot fail"
                result = self.failResult(message, data)
                logging.error(message)
                return result

            message += "start node success.The node is %s." % node_name
            # Report exactly one detection-agent outcome; the "running" text
            # used to be appended even right after the "fail" text.
            if self._checkDetectionAgent(node_name, default_wait_time):
                message += "DetectionAgent in computing is running!"
            else:
                message += "DetectionAgent in computing node is fail."
            result = self.successResult(message, data)
            logging.info(message)
            return result
        except Exception as e:
            message += "IPMIOperator--start node fail.The node is %s.%s" % (
                node_name, str(e))
            result = self.failResult(message, data)
            logging.error(message)
            return result

    def shutOffNode(self, node_name):
        """Power off a node that is IPMI-capable, in the compute pool and unclustered.

        :param node_name: name of the node to shut off.
        :return: ``Response`` with code "succeed" or "failed".
        """
        message = ""
        data = {"node_name": node_name}
        try:
            if not (self._checkNodeIPMI(node_name)
                    and self._checkNodeInComputePool(node_name)
                    and self._checkNodeNotInCluster(node_name)):
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be " \
                           "protected. The node is %s." % node_name
                result = self.failResult(message, data)
                logging.error(message)
                return result

            ipmi_result = self.ipmi_module.shutOffNode(node_name)
            # check power status in IPMIModule
            if ipmi_result.code == "succeed":
                message += "shut off node success.The node is %s." % node_name
                result = self.successResult(message, data)
                logging.info(message)
            else:
                message += "IpmiModule shut off node fail"
                result = self.failResult(message, data)
                logging.error(message)
            return result
        except Exception as e:
            # shut off fail
            message += "IPMIOperator--shut off node fail.The node is %s.%s" % (
                node_name, str(e))
            result = self.failResult(message, data)
            logging.error(message)
            return result

    def rebootNode(self, node_name, default_wait_time=180):
        """Reboot a node that is IPMI-capable, in the compute pool and unclustered.

        :param node_name: name of the node to reboot.
        :param default_wait_time: budget (seconds) for the detection-agent
            check after the reboot command succeeded.
        :return: ``Response`` with code "succeed" or "failed".
        """
        message = ""
        data = {"node_name": node_name}
        try:
            if not (self._checkNodeIPMI(node_name)
                    and self._checkNodeInComputePool(node_name)
                    and self._checkNodeNotInCluster(node_name)):
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be " \
                           "protected. The node is %s." % node_name
                result = self.failResult(message, data)
                logging.error(message)
                return result

            ipmi_result = self.ipmi_module.rebootNode(node_name)
            if ipmi_result.code != "succeed":
                message += "IpmiModule reboot node fail"
                result = self.failResult(message, data)
                logging.error(message)
                return result

            message += "reboot node success.The node is %s." % node_name
            # Report exactly one detection-agent outcome (see startNode).
            if self._checkDetectionAgent(node_name, default_wait_time):
                message += "DetectionAgent in computing is running!"
            else:
                message += "DetectionAgent in computing node is fail."
            result = self.successResult(message, data)
            logging.info(message)
            return result
        except Exception as e:
            message += "IPMIOperator--reboot node fail.The node is %s.%s" % (
                node_name, str(e))
            result = self.failResult(message, data)
            logging.error(message)
            return result

    def getAllInfoByNode(self, node_name):
        """Return the IPMI module's full sensor report for a node.

        :param node_name: node to query.
        :return: the IPMI module's result, or a failed ``Response`` with
            empty data when the query raises.
        """
        try:
            return self.ipmi_module.getAllInfoByNode(node_name)
        except Exception as e:
            message = " IPMIOperator--get node info bt type fail. The node is %s." % node_name
            result = self.failResult(message, [])
            logging.error("IPMIOperator get all sensor info of node fail.%s" % str(e))
            return result

    def getNodeInfoByType(self, node_name, sensor_type):
        """Return the IPMI module's report for one sensor type of a node.

        :param node_name: node to query.
        :param sensor_type: sensor type forwarded to the IPMI module.
        :return: the IPMI module's result, or a failed ``Response`` with
            empty data when the query raises.
        """
        try:
            return self.ipmi_module.getNodeInfoByType(node_name, sensor_type)
        except Exception as e:
            message = " IPMIOperator--get node info bt type fail. The node is %s,sensor type is %s ." % (
                node_name, sensor_type)
            result = self.failResult(message, [])
            logging.error("IPMIOperator get %s sensor info of node fail.%s" % (sensor_type, str(e)))
            return result

    def _checkNodeIPMI(self, node_name):
        """Log and return the IPMI status the IPMI module reports for the node."""
        ipmistatus = self.ipmi_module._getIPMIStatus(node_name)
        if not ipmistatus:
            message = " Node is not IPMI PC please check again! The node is %s." % node_name
            logging.error(message)
        else:
            message = " Node is IPMI PC. node is %s." % node_name
            logging.info(message)
        return ipmistatus

    def _checkNodeInComputePool(self, node_name):
        """Log and return ``ClusterManager.nova.isInComputePool(node_name)``."""
        result = ClusterManager.nova.isInComputePool(node_name)
        if result:
            message = " Node is in compute pool . The node is %s." % node_name
            logging.info(message)
        else:
            message = " Node is not in compute pool please check again! The node is %s." % node_name
            logging.error(message)
        return result

    def _checkNodeNotInCluster(self, node_name):
        """Return True when no known cluster lists the node.

        Every cluster containing the node is logged, so the loop does not
        stop at the first hit.
        """
        if self.cluster_list is None:
            return True
        result = True
        # items() instead of iteritems(): same pairs, and it also exists on
        # Python 3 dictionaries.
        for cluster_id, cluster in self.cluster_list.items():
            if node_name in cluster.getAllNodeStr():
                logging.error(
                    " Node is in HA cluster. The node is %s, cluster id is %s"
                    % (node_name, cluster_id))
                result = False
        return result

    def _checkNodeBootSuccess(self, nodeName, check_timeout):
        """Poll the power status once per second until it is "OK".

        :param nodeName: node to poll.
        :param check_timeout: maximum number of polls.
        :return: True once the status is "OK", False when the budget runs out.
        """
        # check power status in IPMIModule
        while check_timeout > 0:
            result = self.ipmi_module.getPowerStatus(nodeName)
            print(result, check_timeout)
            if result == "OK":
                return True
            time.sleep(1)
            check_timeout -= 1
        return False

    def _checkDetectionAgent(self, nodeName, check_timeout):
        """Poll the node's detection agent over UDP until it answers "OK".

        A "polling request" datagram is sent every 5 seconds; each round
        costs 5 from ``check_timeout``.

        :param nodeName: node whose detection agent is polled.
        :param check_timeout: polling budget in seconds.
        :return: True when a reply containing "OK" arrived, False on timeout.
        """
        # not be protect(not connect socket)
        # check detection agent
        sock = None
        data = ""
        try:
            try:
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                sock.setblocking(0)
                sock.settimeout(0.5)
                sock.connect((nodeName, self.port))
            except Exception as e:
                print("create socket fail", str(e))
            while True:
                time.sleep(5)
                if check_timeout <= 0:
                    # timeout
                    return False
                try:
                    sock.sendall("polling request")
                    data, addr = sock.recvfrom(2048)
                except Exception as e:
                    print(str(e))
                if "OK" in data:
                    return True
                print("waiting:", check_timeout)
                check_timeout -= 5
        finally:
            # Release the socket on every exit path; it used to be closed
            # only after a successful reply, leaking a descriptor per timeout.
            if sock is not None:
                sock.close()

    def successResult(self, message, data):
        """Build a ``Response`` with code "succeed".

        :param message: human-readable outcome text.
        :param data: payload attached to the response.
        :return: the new ``Response``.
        """
        return Response(code="succeed", message=message, data=data)

    def failResult(self, message, data):
        """Build a ``Response`` with code "failed".

        :param message: human-readable outcome text.
        :param data: payload attached to the response.
        :return: the new ``Response``.
        """
        return Response(code="failed", message=message, data=data)
class NodeInterface(object):
    """A compute node with its IPMI flag, detection list and detection thread."""

    def __init__(self, name, cluster_id, detection_list):
        """Record the node identity and build (but do not start) its detection thread.

        :param name: node name; also used as the host for socket connections.
        :param cluster_id: identifier of the cluster the node belongs to.
        :param detection_list: detection codes (keys '1'..'7', see
            ``getDetectionList``) enabled for this node.
        """
        self.name = name
        self.cluster_id = cluster_id
        self.detection_list = detection_list
        self.ipmi = IPMIManager()
        self.ipmi_status = self.ipmi._getIPMIStatus(self.name)
        self.nova_client = NovaClient.getInstance()
        self.detection_thread = None
        self.config = ConfigParser.RawConfigParser()
        self.config.read('/home/localadmin/HASS/hass.conf')
        self.initDetectionThread()

    def setNodeName(self, name):
        """Replace the node name."""
        self.name = name

    def getNodeName(self):
        """Return the node name."""
        return self.name

    def setClusterId(self, cluster_id):
        """Replace the cluster id."""
        self.cluster_id = cluster_id

    def getClusterId(self, cluster_id=None):
        """Return the cluster id.

        :param cluster_id: ignored. It was a required positional argument of
            this getter; it is kept (now optional) so existing callers that
            pass a value keep working.
        """
        return self.cluster_id

    def initDetectionThread(self):
        """Create the DetectionThread from the "detection" section of the config."""
        polling_port = int(self.config.get("detection", "polling_port"))
        polling_interval = float(
            self.config.get("detection", "polling_interval"))
        self.detection_thread = DetectionThread(self.cluster_id, self,
                                                self.detection_list,
                                                polling_port,
                                                polling_interval)

    def startDetectionThread(self):
        """Mark the detection thread as a daemon and start it."""
        self.detection_thread.daemon = True
        self.detection_thread.start()

    def deleteDetectionThread(self):
        """Ask the detection thread to stop."""
        self.detection_thread.stop()

    def getDetectionList(self):
        """Translate the node's detection codes into display names.

        :return: list of names in the order of ``self.detection_list``; when
            that list is empty, every known name in code order.
        :raises KeyError: when a code is not one of '1'..'7'.
        """
        detection_list_map = {
            '1': 'Host Power',
            '2': 'Host OS',
            '3': 'Host Network',
            '4': 'Host Service',
            '5': 'VM Power',
            '6': 'VM OS',
            '7': 'VM Network'
        }
        if self.detection_list != []:
            codes = self.detection_list
        else:
            # The empty-list branch used to read ``self.detection_list_map[:]``,
            # which does not exist (and a dict cannot be sliced), so it always
            # raised. Listing every detection is presumably what was meant --
            # NOTE(review): confirm with the callers.
            codes = sorted(detection_list_map)
        return [detection_list_map[code] for code in codes]

    def getAllDetectionStr(self):
        """Return the detection names joined with commas."""
        return ",".join(str(x) for x in self.getDetectionList())

    def getInfo(self):
        """Return the node's name, cluster id, IPMI flag and detection list as a dict."""
        return {
            "node_name": self.name,
            "below_cluster_id": self.cluster_id,
            "ipmi_enable": self.ipmi_status,
            "detection_list": self.detection_list
        }

    def sendUpdateInstance(self):
        """Do nothing; the former TCP notification is disabled (kept below)."""
        # try:
        #     logging.info("Init update instance socket to %s" % self.name)
        #     so = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        #     so.settimeout(10)
        #     so.connect((self.name, 5001))
        #     so.send("update instance")
        #     so.close()
        # except Exception as e:
        #     logging.error("send updata instance fail %s" % str(e))
        pass

    def undefine_instance_via_socket(self, instance):
        """Ask the node, over UDP, to undefine an instance.

        Sends "undefine <instance.name>" to the polling port and waits up to
        3 seconds for a reply. Any reply other than "OK" and any exception is
        logged; nothing is raised or returned.

        :param instance: object exposing a ``name`` attribute.
        """
        port = int(self.config.get("detection", "polling_port"))
        # Bound before the try so the finally block can test it even when
        # socket creation itself raises.
        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            sock.setblocking(0)
            sock.settimeout(3)
            sock.connect((self.name, port))
            msg = "undefine %s" % instance.name
            sock.sendall(msg)
            data, addr = sock.recvfrom(1024)
            if data != "OK":
                logging.error("undefine instance fail msg %s" % data)
        except Exception as e:
            logging.error("socket send undefine instance fail %s" % str(e))
        finally:
            if sock:
                sock.close()
                print("sock close")