def run():
    """Report whether HOST answers "OK" to an IPMI power-status query.

    :return: True when ``IPMIManager().getPowerStatus(HOST)`` returns "OK";
        False for any other answer or when the query raises.
    """
    ipmi_manager = IPMIManager()
    try:
        return ipmi_manager.getPowerStatus(HOST) == "OK"
    except Exception:
        # Best-effort probe: a failed query counts as "not OK".  Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # still stop the process.
        return False
def run(check_timeout=60, settle_time=60):
    """Shut HOST off through IPMI and wait for its power status to change.

    :param check_timeout: maximum number of power-status polls; one second
        is slept after each unsuccessful poll.
    :param settle_time: seconds to sleep after the power status first reads
        "Error" and before reporting success (the original hard-coded 60).
    :return: True when the shut-off request reported "succeed" and a later
        power-status poll returned "Error"; False otherwise.
    """
    ipmi_manager = IPMIManager()
    result = ipmi_manager.shutOffNode(HOST)

    # The request outcome cannot change while polling, so a rejected request
    # is reported immediately instead of polling for the whole timeout.
    if result.code != "succeed":
        return False

    while check_timeout > 0:
        # NOTE(review): "Error" is treated as "no longer powered on" here;
        # confirm against IPMIManager.getPowerStatus.
        if ipmi_manager.getPowerStatus(HOST) == "Error":
            time.sleep(settle_time)
            return True
        check_timeout -= 1
        time.sleep(1)
    return False
Ejemplo n.º 3
0
class Detector(object):
    """Probe one node over ping, a UDP polling socket and IPMI.

    Each ``check*`` method returns ``State.HEALTH`` when its probe passes and
    the matching ``State.*_FAIL`` member otherwise.
    """

    def __init__(self, node, port):
        """Record the node's identity, load the config and open the socket.

        :param node: object exposing ``name`` and ``ipmi_status``.
        :param port: UDP port the polling socket connects to on the node.
        """
        self.node = node.name
        self.ipmi_status = node.ipmi_status
        self.ipmi_manager = IPMIManager()
        self.port = port
        self.sock = None
        self.config = ConfigParser.RawConfigParser()
        self.config.read('/home/localadmin/HASS/hass.conf')
        self.connect()

    def connect(self):
        """Create the UDP polling socket and connect it to the node.

        Failures are logged and printed, never raised; ``self.sock`` may then
        be None or an unconnected socket.
        """
        # connect to FA (NOTE(review): presumably the detection agent on the
        # node -- confirm)
        try:
            print("[" + self.node + "] create socket connection")
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            # settimeout() alone fixes the blocking mode; a preceding
            # setblocking(0) would be overridden immediately.
            self.sock.settimeout(10)
            self.sock.connect((self.node, self.port))
        except Exception as e:
            logging.error("detector connect error %s" % str(e))
            print(str(e))
            print("Init [" + self.node + "] connection failed")

    def _reconnect(self):
        """Re-associate the polling socket with the node; never raises."""
        try:
            if self.sock is None:
                # The initial connect() never produced a socket: start over.
                self.connect()
            else:
                self.sock.connect((self.node, self.port))
        except Exception as e:
            logging.error("detector reconnect error %s" % str(e))

    def _poll(self):
        """Send one "polling request" datagram and return the reply payload."""
        self.sock.sendall("polling request")
        data, _ = self.sock.recvfrom(1024)
        return data

    def _checkIPMI(self, probe, failure):
        """Run one IPMI probe for this node.

        :param probe: bound ``IPMIManager`` getter taking the node name.
        :param failure: ``State`` member returned when the probe is not "OK".
        :return: ``State.HEALTH`` when IPMI is disabled for the node or the
            probe answers "OK"; ``failure`` otherwise.
        """
        if not self.ipmi_status:
            return State.HEALTH
        if probe(self.node) == "OK":
            return State.HEALTH
        return failure

    def checkNetworkStatus(self):
        """Ping the node once per second for ``heartbeat_time`` seconds.

        Only the outcome of the last ping decides the result, so earlier
        failures are tolerated as transient.  The loop always runs for the
        full ``heartbeat_time`` read from the "default" config section.

        :return: ``State.HEALTH`` or ``State.NETWORK_FAIL``.
        """
        heartbeat_time = int(self.config.get("default", "heartbeat_time"))
        fail = False
        for _ in range(heartbeat_time):
            try:
                subprocess.check_output(
                    ['timeout', '0.2', 'ping', '-c', '1', self.node],
                    stderr=subprocess.STDOUT,
                    universal_newlines=True)
                fail = False
            except Exception:
                logging.error("transient network fail")
                fail = True
            finally:
                time.sleep(1)
        if not fail:
            return State.HEALTH
        return State.NETWORK_FAIL

    def checkServiceStatus(self):
        """Poll the node over the UDP socket and classify the reply.

        :return: ``State.HEALTH`` when the reply is exactly "OK";
            ``State.SERVICE_FAIL`` for any other reply or any socket error.
        """
        try:
            data = self._poll()
            if data == "OK":
                return State.HEALTH
            elif "error" in data:
                print(data)
                print("[" + self.node + "]service Failed")
            elif not data:
                print("[" + self.node + "]no ACK")
            else:
                print("[" + self.node + "]Receive:" + data)
            return State.SERVICE_FAIL
        except Exception as e:
            logging.error(str(e))
            print("[" + self.node + "] connection failed")
            # Guarded reconnect: a failure here (or a socket that was never
            # created) must not escape the handler and kill the caller.
            self._reconnect()
            return State.SERVICE_FAIL

    def checkPowerStatus(self):
        """Return ``State.HEALTH`` or ``State.POWER_FAIL`` from IPMI."""
        return self._checkIPMI(self.ipmi_manager.getPowerStatus,
                               State.POWER_FAIL)

    def checkOSStatus(self):
        """Return ``State.HEALTH`` or ``State.OS_FAIL`` from IPMI."""
        return self._checkIPMI(self.ipmi_manager.getOSStatus,
                               State.OS_FAIL)

    def checkSensorStatus(self):
        """Return ``State.HEALTH`` or ``State.SENSOR_FAIL`` from IPMI."""
        return self._checkIPMI(self.ipmi_manager.getSensorStatus,
                               State.SENSOR_FAIL)

    def getFailServices(self):
        """Poll the node and return the raw reply naming what failed.

        :return: the reply payload when it is not "OK"; None when it is "OK";
            the literal "agents" when the poll itself raises.
        """
        try:
            data = self._poll()
        except Exception:
            return "agents"
        if data != "OK":
            return data
        return None
class Operator(object):
    """Start, shut off and reboot nodes through ``IPMIManager``.

    Every public power operation returns a ``Response`` whose ``code`` is
    "succeed" or "failed" and whose ``data`` is ``{"node_name": ...}``.
    """

    def __init__(self):
        """Bind the Nova client, IPMI manager, cluster list and polling port.

        The port is read from ``[detection] polling_port`` in ``hass.conf``.
        """
        self.nova_client = NovaClient.getInstance()
        self.ipmi_module = IPMIManager()
        self.cluster_list = ClusterManager.getClusterList()
        config = ConfigParser.RawConfigParser()
        config.read('hass.conf')
        self.port = int(config.get("detection", "polling_port"))

    def _succeed(self, node_name, message):
        """Build the "succeed" Response for ``node_name``."""
        return Response(code="succeed",
                        message=message,
                        data={"node_name": node_name})

    def _fail(self, node_name, message):
        """Build the "failed" Response for ``node_name``."""
        return Response(code="failed",
                        message=message,
                        data={"node_name": node_name})

    def startNode(self, node_name, default_wait_time=180):
        """Power a node on, then wait for it to boot and for its agent.

        :param node_name: name of the node to start.
        :param default_wait_time: budget, in seconds, passed both to the boot
            check and to the detection-agent check.
        :return: "succeed" Response when the IPMI start and the boot check
            pass (an unreachable detection agent is only noted in the
            message); "failed" Response otherwise.
        """
        if not self._checkNodeIPMI(node_name):
            message = " IPMIOperator--node is not in compute pool or is not a IPMI PC . The node is %s." % node_name
            logging.error(message)
            return self._fail(node_name, message)

        message = " IPMIOperator--node is in compute pool . The node is %s." % node_name
        try:
            if self.ipmi_module.startNode(node_name).code != "succeed":
                raise Exception("IpmiModule start node fail")
            if not self._checkNodeBootSuccess(node_name, default_wait_time):
                raise Exception("check node boot fail")
            message += "start node success.The node is %s." % node_name
            logging.info(message)
            if not self._checkDetectionAgent(node_name, default_wait_time):
                message += "detectionagent in computing node is fail."
            return self._succeed(node_name, message)
        except Exception as e:
            # start fail
            message += "IPMIOperator--start node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            return self._fail(node_name, message)

    def shutOffNode(self, node_name):
        """Power a node off, provided it is eligible and not in a cluster.

        :param node_name: name of the node to shut off.
        :return: "succeed" Response when the IPMI shut-off reports success;
            "failed" Response otherwise.
        """
        if not (self._checkNodeIPMI(node_name)
                and self._checkNodeNotInCluster(node_name)):
            message = " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
            logging.error(message)
            return self._fail(node_name, message)

        message = ""
        try:
            # check power status in IPMIModule
            if self.ipmi_module.shutOffNode(node_name).code != "succeed":
                raise Exception("IpmiModule shut off node fail")
            message += "shut off node success.The node is %s." % node_name
            logging.info(message)
            return self._succeed(node_name, message)
        except Exception as e:
            # shut off fail
            message += "IPMIOperator--shut off node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            return self._fail(node_name, message)

    def rebootNode(self, node_name, default_wait_time=180):
        """Reboot a node, provided it is eligible and not in a cluster.

        :param node_name: name of the node to reboot.
        :param default_wait_time: budget, in seconds, for the detection-agent
            check after the reboot.
        :return: "succeed" Response when the IPMI reboot reports success (an
            unreachable detection agent is only noted in the message);
            "failed" Response otherwise.
        """
        if not (self._checkNodeIPMI(node_name)
                and self._checkNodeNotInCluster(node_name)):
            message = " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be protected. The node is %s." % node_name
            logging.error(message)
            return self._fail(node_name, message)

        message = ""
        try:
            if self.ipmi_module.rebootNode(node_name).code != "succeed":
                raise Exception("IpmiModule reboot node fail")
            message += "reboot node success.The node is %s." % node_name
            logging.info(message)
            if not self._checkDetectionAgent(node_name, default_wait_time):
                message += "detectionagent in computing node is fail."
            return self._succeed(node_name, message)
        except Exception as e:
            # reboot fail
            message += "IPMIOperator--reboot node fail.The node is %s.%s" % (
                node_name, e)
            logging.error(message)
            return self._fail(node_name, message)

    def getAllInfoByNode(self, node_name):
        """Return ``IPMIManager.getAllInfoByNode(node_name)`` unchanged."""
        return self.ipmi_module.getAllInfoByNode(node_name)

    def getNodeInfoByType(self, node_name, sensor_type):
        """Return ``IPMIManager.getNodeInfoByType(...)`` unchanged."""
        return self.ipmi_module.getNodeInfoByType(node_name, sensor_type)

    def _checkNodeIPMI(self, node_name):
        """Tell whether the node is IPMI-capable and in the compute pool.

        :return: True only when ``_getIPMIStatus`` is truthy and the name is
            in ``nova_client.getComputePool()``.
        """
        # is IPMI PC
        if not self.ipmi_module._getIPMIStatus(node_name):
            return False
        # is in computing pool
        if node_name in self.nova_client.getComputePool():
            logging.info(
                " node is in compute pool . The node is %s." % node_name)
            return True
        logging.error(
            " node is not in compute pool please check again! The node is %s."
            % node_name)
        return False

    def _checkNodeNotInCluster(self, node_name):
        """Return False when any known cluster lists ``node_name``."""
        for cluster_id in self.cluster_list:
            cluster = ClusterManager.getCluster(cluster_id)
            if node_name in cluster.getAllNodeStr():
                return False
        return True

    def _checkNodeBootSuccess(self, nodeName, check_timeout):
        """Poll the IPMI power status once per second until it reads "OK".

        :param check_timeout: maximum number of polls.
        :return: True as soon as a poll returns "OK"; False when the budget
            is used up (or was not positive to begin with).
        """
        # check power status in IPMIModule
        while check_timeout > 0:
            result = self.ipmi_module.getPowerStatus(nodeName)
            print("%s %s" % (result, check_timeout))
            if result == "OK":
                return True
            time.sleep(1)
            check_timeout -= 1
        return False

    def _openAgentSocket(self, nodeName):
        """Return a connected UDP socket to the node, or None on failure."""
        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            sock.settimeout(0.5)
            sock.connect((nodeName, self.port))
            return sock
        except Exception as e:
            print("create socket fail %s" % str(e))
            if sock is not None:
                # Do not leak a socket that was created but never connected.
                sock.close()
            return None

    def _checkDetectionAgent(self, nodeName, check_timeout):
        """Poll the node's detection agent every 5 s until it answers "OK".

        :param check_timeout: budget in seconds; 5 is deducted per attempt.
        :return: True once a reply contains "OK"; False when the budget is
            used up.  The socket is always closed before returning.
        """
        # not be protect(not connect socket)
        # check detection agent
        sock = None
        try:
            while check_timeout > 0:
                time.sleep(5)
                data = ""
                if sock is None:
                    # Retried on every attempt so a failed first connect does
                    # not doom the whole wait.
                    sock = self._openAgentSocket(nodeName)
                if sock is not None:
                    try:
                        sock.sendall("polling request")
                        data, _ = sock.recvfrom(2048)
                    except Exception as e:
                        print(str(e))
                if "OK" in data:
                    return True
                print("wating: %s" % check_timeout)
                check_timeout -= 5
            # timeout
            return False
        finally:
            if sock is not None:
                sock.close()
Ejemplo n.º 5
0
class Operator(object):
    def __init__(self):
        self.ipmi_module = IPMIManager()
        self.cluster_list = ClusterManager.getClusterList()
        self.config = ConfigParser.RawConfigParser()
        self.config.read('hass.conf')
        self.port = int(self.config.get("detection", "polling_port"))

    def startNode(self, node_name, default_wait_time=180):
        """

        :param node_name: 
        :param default_wait_time: 
        :return: 
        """
        message = ""
        data = {"node_name": node_name}
        result = None
        try:
            if self._checkNodeIPMI(node_name) and self._checkNodeInComputePool(
                    node_name):
                message += " IPMIOperator--node is in compute pool . The node is %s." % node_name
                ipmi_result = self.ipmi_module.startNode(node_name)
                if ipmi_result.code == "succeed":
                    boot_up = self._checkNodeBootSuccess(
                        node_name, default_wait_time)
                    if boot_up:
                        message += "start node success.The node is %s." % node_name
                        detection = self._checkDetectionAgent(
                            node_name, default_wait_time)
                        if not detection:
                            message += "DetectionAgent in computing node is fail."
                        message += "DetectionAgent in computing is running!"
                        result = self.successResult(message, data)
                        logging.info(message)
                    else:
                        # boot_up is fail
                        message += "check node boot fail"
                        result = self.failResult(message, data)
                        logging.error(message)
                else:
                    # ipmi_result is fail
                    message += "IpmiModule start node fail"
                    result = self.failResult(message, data)
                    logging.error(message)
            else:
                # node is not ipmi node
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC . The node is %s." % \
                           node_name
                result = self.failResult(message, data)
                logging.error(message)
        except Exception as e:
            message += "IPMIOperator--start node fail.The node is %s.%s" % (
                node_name, str(e))
            result = self.failResult(message, data)
            logging.error(message)
        finally:
            return result

    def shutOffNode(self, node_name):
        """

        :param node_name: 
        :return: 
        """
        message = ""
        data = {"node_name": node_name}
        result = None
        try:
            if self._checkNodeIPMI(node_name) and self._checkNodeInComputePool(
                    node_name) and self._checkNodeNotInCluster(node_name):
                ipmi_result = self.ipmi_module.shutOffNode(node_name)
                # check power status in IPMIModule
                if ipmi_result.code == "succeed":
                    message += "shut off node success.The node is %s." % node_name
                    result = self.successResult(message, data)
                    logging.info(message)
                else:
                    message += "IpmiModule shut off node fail"
                    result = self.failResult(message, data)
                    logging.error(message)
            else:
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be " \
                           "protected. The node is %s." % node_name
                result = self.failResult(message, data)
                logging.error(message)
        except Exception as e:
            # shut off fail
            message += "IPMIOperator--shut off node fail.The node is %s.%s" % (
                node_name, str(e))
            result = self.failResult(message, data)
            logging.error(message)
        finally:
            return result

    def rebootNode(self, node_name, default_wait_time=180):
        """

        :param node_name: 
        :param default_wait_time: 
        :return: 
        """
        result = None
        data = {"node_name": node_name}
        message = ""
        try:
            if self._checkNodeIPMI(node_name) and self._checkNodeInComputePool(
                    node_name) and self._checkNodeNotInCluster(node_name):
                ipmi_result = self.ipmi_module.rebootNode(node_name)
                if ipmi_result.code == "succeed":
                    message += "reboot node success.The node is %s." % node_name
                    detection = self._checkDetectionAgent(
                        node_name, default_wait_time)
                    if not detection:
                        message += "DetectionAgent in computing node is fail."
                    message += "DetectionAgent in computing is running!"
                    result = self.successResult(message, data)
                    logging.info(message)
                else:
                    message += "IpmiModule reboot node fail"
                    result = self.failResult(message, data)
                    logging.error(message)
            else:
                message += " IPMIOperator--node is not in compute pool or is not a IPMI PC or is already be " \
                           "protected. The node is %s." % node_name
                result = self.failResult(message, data)
                logging.error(message)
        except Exception as e:
            message += "IPMIOperator--reboot node fail.The node is %s.%s" % (
                node_name, str(e))
            result = self.failResult(message, data)
            logging.error(message)
        finally:
            return result

    def getAllInfoByNode(self, node_name):
        """

        :param node_name: 
        :return: 
        """
        global result
        try:
            result = self.ipmi_module.getAllInfoByNode(node_name)
        except Exception as e:
            message = " IPMIOperator--get node info bt type fail. The node is %s." % node_name
            result = self.failResult(message, [])
            logging.error("IPMIOperator get all sensor info of node fail.%s" %
                          str(e))
        finally:
            return result

    def getNodeInfoByType(self, node_name, sensor_type):
        """

        :param node_name: 
        :param sensor_type: 
        :return: 
        """
        global result
        try:
            result = self.ipmi_module.getNodeInfoByType(node_name, sensor_type)
        except Exception as e:
            message = " IPMIOperator--get node info bt type fail. The node is %s,sensor type is %s ." % (
                node_name, sensor_type)
            result = self.failResult(message, [])
            logging.error("IPMIOperator get %s sensor info of node fail.%s" %
                          (sensor_type, str(e)))
        finally:
            return result

    def _checkNodeIPMI(self, node_name):
        ipmistatus = self.ipmi_module._getIPMIStatus(node_name)
        if not ipmistatus:
            message = " Node is not IPMI PC please check again! The node is %s." % node_name
            logging.error(message)
        else:
            message = " Node is IPMI PC. node is %s." % node_name
            logging.info(message)
        return ipmistatus

    def _checkNodeInComputePool(self, node_name):
        result = ClusterManager.nova.isInComputePool(node_name)
        if result:
            message = " Node is in compute pool . The node is %s." % node_name
            logging.info(message)
        else:
            message = " Node is not in compute pool please check again! The node is %s." % node_name
            logging.error(message)
        return result

    def _checkNodeNotInCluster(self, node_name):
        result = True
        if self.cluster_list is None:
            pass
        else:
            for cluster_id, cluster in self.cluster_list.iteritems():
                node_list = cluster.getAllNodeStr()
                if node_name in node_list:
                    logging.error(
                        " Node is in HA cluster. The node is %s, cluster id is %s"
                        % (node_name, cluster_id))
                    result = False
        return result

    def _checkNodeBootSuccess(self, nodeName, check_timeout):
        # check power statue in IPMIModule
        status = False
        while not status:
            if check_timeout > 0:
                result = self.ipmi_module.getPowerStatus(nodeName)
                print(result, check_timeout)
                if result == "OK":
                    status = True
                else:
                    time.sleep(1)
                    check_timeout -= 1
            else:
                return status
        return status

    def _checkDetectionAgent(self, nodeName, check_timeout):
        # not be protect(not connect socket)
        # check detection agent
        status = False
        data = ""
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            sock.setblocking(0)
            sock.settimeout(0.5)
            sock.connect((nodeName, self.port))
        except Exception as e:
            print("create socket fail", str(e))
        while not status:
            time.sleep(5)
            if check_timeout > 0:
                try:
                    sock.sendall("polling request")
                    data, addr = sock.recvfrom(2048)
                except Exception as e:
                    print(str(e))
                if "OK" in data:
                    status = True
                    sock.close()
                else:
                    # time.sleep(1)
                    print("waiting:", check_timeout)
                    check_timeout -= 5
            else:
                # timeout
                return status
        # status is True
        return status

    def successResult(self, message, data):
        """

        :param message: 
        :param data: 
        :return: 
        """
        result = Response(code="succeed", message=message, data=data)
        return result

    def failResult(self, message, data):
        """

        :param message: 
        :param data: 
        :return: 
        """
        result = Response(code="failed", message=message, data=data)
        return result