Example #1
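All of the examples on this page assume the same module-level preamble. A minimal sketch follows; the import names and the shared helper module come from the code itself, while the concrete values of caseDescription, imageNum, poolName and timeOut are placeholders, not taken from the original:

import os
import inspect
import logging
import random
from time import sleep

import base  # shared test-framework helpers: getClusterObj, startRBDIO, stopIO, ...

logging.basicConfig(level=logging.INFO)  # assumed logging setup

caseDescription = "Ceph cluster reliability test case"  # placeholder description
imageNum = 2       # number of RBD images per client (assumed value)
poolName = "rbd"   # pool that the RBD IO runs against (assumed value)
timeOut = 600      # seconds to wait when polling cluster status (assumed value)

# Start RBD IO on every client of the cluster.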
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    for client in clusterObj.getClients():
        base.startRBDIO(caseName, client, imageNum, poolName)
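
# Cycle every OSD out of and back into the cluster (five times per OSD) while
# client RBD IO keeps running, then verify that no IO errors occurred.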
def main(args):
    caseName = os.path.basename(inspect.getfile(inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    
    logging.getLogger(caseName).info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName, clusterObj.getFirstAvaNode(caseName), timeOut)
    if(status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
        
    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():         
        if(client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)
            
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: Out the osd and check IO")
    for nodeObj in nodeList:
        for osdObj in nodeObj.getOsds():
            #out the osd
            for i in range(5):
                logging.getLogger(caseName).info("\nNow operate "+nodeObj.gethostName())
                logging.getLogger(caseName).info("osd count on this node: %d" % len(nodeObj.getOsds()))
                logging.getLogger(caseName).info("\nNow operate "+osdObj.getid())
                logging.getLogger(caseName).info("out "+osdObj.getid())
                osdObj.outCluster(caseName, nodeObj)
                logging.getLogger(caseName).info("check if IO error")
                sleep(15)
                #add osd in cluster
                #logging.getLogger(caseName).info("add in "+osdObj.getid()+" to cluster")
                osdObj.inCluster(caseName, nodeObj)
                #check ceph health
                sleep(15)
            logging.getLogger(caseName).info("Now check if there is any IO error")   
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            for client in clusterObj.getClients():         
                if(client.checkIOProcess(caseName) == "error"):
                    base.startRBDIO(caseName, client, imageNum, poolName)


    for client in clusterObj.getClients():
        client.checkIOError(caseName) 
    logging.getLogger(caseName).info("%s runs complete"%caseName)      
    
    
                
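# Pause all OSDs while client IO runs, verify cluster health and client IO,
# then resume the OSDs and re-check IO.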
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)

    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")

    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)

    logging.getLogger(caseName).info("\nStep2: pause all osds")
    clusterObj.pauseOsd(caseName)
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("pause cluster successfully")
    else:
        logging.getLogger(caseName).error("status is %s" % status)
        logging.getLogger(caseName).error("print log for another 10 minutes")
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if (status == 'HEALTH_OK'):
            logging.getLogger(caseName).info("pause cluster successfully")

        else:
            logging.getLogger(caseName).error("%s  runs failed" % caseName)
            exit(-1)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("\nStep3: resume all osds")
    clusterObj.resumeOsd(caseName)

    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    '''
    for client in clusterObj.getClients():             
        base.stopIO(caseName, client) 
    '''
    logging.getLogger(caseName).info("\nCase runs successfully")
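
# Force-kill every OSD, restart and then force-kill every monitor, bring OSDs
# and monitors back up, and give the cluster ten minutes to recover before
# re-checking client IO and cluster health.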
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    clusterObj.initOsdProcess(caseName)
    #client = clusterObj.getClients()[0]
    #nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    for nodeObj in clusterObj.getNodes():
        nodeObj.setOsdPid(caseName)
        for osdObj in nodeObj.getOsds():
            osdObj.forceKill(caseName, nodeObj)

    for monObj in clusterObj.getMonitors():
        monObj.shutdown(caseName)
        monObj.start(caseName)
    for monObj in clusterObj.getMonitors():
        monObj.setMonPid(caseName)
        monObj.forceKill(caseName)

    #TBD:check IO
    for nodeObj in clusterObj.getNodes():
        for osdObj in nodeObj.getOsds():
            osdObj.start(caseName, nodeObj)

    for monObj in clusterObj.getMonitors():
        monObj.start(caseName)

    logging.getLogger(caseName).info("sleep 10 mins to wait for the cluster to recover")
    sleep(600)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    # re-check cluster health after recovery; the earlier status value is stale
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    ''' 
    for client in clusterObj.getClients():
        base.stopIO(caseName, client) 
    '''
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
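
# Stop each OSD in turn by killing its process, restart it, and verify client
# IO and cluster health after every restart.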
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    logging.getLogger(caseName).info("the timeout is %d" % timeOut)
    clusterObj = base.getClusterObj(caseName, args)
    clusterObj.initOsdProcess(caseName)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]

    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)

    sleep(60)

    logging.getLogger(caseName).info("\nStep 2: stop osd and check IO")
    #logging.getLogger(caseName).info("\n%d"%len(nodeList))
    for nodeObj in nodeList:
        logging.getLogger(caseName).info("\nNow operate osd on %s" %
                                         (nodeObj.gethostName()))
        for osdObj in nodeObj.getOsds():
            #out the osd
            logging.getLogger(caseName).info("\nNow operate " + osdObj.getid())
            #stop osd service

            logging.getLogger(caseName).info("Set the " + osdObj.getid() +
                                             " pid for kill")
            nodeObj.setOsdPid(caseName)
            logging.getLogger(caseName).info("shutdown " + osdObj.getid() +
                                             " by kill")
            osdObj.shutdown(caseName, nodeObj)
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            #start osd service
            sleep(5)
            logging.getLogger(caseName).info("start " + osdObj.getid())
            osdObj.start(caseName, nodeObj)
            returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
            tryCount = 0
            while (returnCode == 0 and tryCount < 10):
                returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
                tryCount = tryCount + 1
            if (tryCount == 10):
                logging.getLogger(caseName).error("%s cannot start" %
                                                  osdObj.getid())

            #check ceph health
            sleep(30)
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if (status == 'HEALTH_OK'):
                logging.getLogger(caseName).info(
                    "stop %s in cluster successfully" % osdObj.getid())
            else:
                logging.getLogger(caseName).error("status is %s" % status)
                logging.getLogger(caseName).error(
                    "print log for another 10 minutes")
                status = clusterObj.getStatus(caseName, nodeObj, timeOut)
                if (status == 'HEALTH_OK'):
                    logging.getLogger(caseName).info(
                        "stop %s in cluster successfully" % osdObj.getid())
                    break
                else:
                    logging.getLogger(caseName).error("%s  runs failed" %
                                                      caseName)
                    exit(-1)
    '''        
    logging.getLogger(caseName).info("\nStep3:stop IO from clients") 
    sleep(60) 
    for client in clusterObj.getClients():           
        base.stopIO(caseName, client)  
    '''
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("%s runs complete" % caseName)
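
# Force-kill three OSDs on each node at once, restart them, and verify client
# IO and cluster health.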
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    #stop osd process and start with ceph-osd -i
    clusterObj.initOsdProcess(caseName)

    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)

    sleep(60)
    logging.getLogger(caseName).info("\nStep2: kill three osds ")
    for nodeObj in nodeList:
        osdObjList = nodeObj.getOsds()
        #out the osd
        logging.getLogger(caseName).info("\nNow operate " +
                                         nodeObj.gethostName())
        #stop osd service
        #logging.getLogger(caseName).info("Set the "+osdObj.getid()+" pid for kill")
        nodeObj.setOsdPid(caseName)
        logging.getLogger(caseName).info("shutdown three osds on node " +
                                         nodeObj.gethostName())

        osdObjList[0].forceKill(caseName, nodeObj)
        osdObjList[1].forceKill(caseName, nodeObj)
        osdObjList[2].forceKill(caseName, nodeObj)
        #start osd service
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        logging.getLogger(caseName).info("start osd on node " +
                                         nodeObj.gethostName())
        osdObjList[0].start(caseName, nodeObj)
        osdObjList[1].start(caseName, nodeObj)
        osdObjList[2].start(caseName, nodeObj)
        returnCode = osdObjList[0].checkIfOsdStart(caseName, nodeObj)
        tryCount = 0
        while (returnCode == 0 and tryCount < 10):
            returnCode = osdObjList[0].checkIfOsdStart(caseName, nodeObj)
            tryCount = tryCount + 1
        if (tryCount == 10):
            logging.getLogger(caseName).error("%s cannot start" %
                                              osdObjList[0].getid())

        returnCode = osdObjList[1].checkIfOsdStart(caseName, nodeObj)
        tryCount = 0
        while (returnCode == 0 and tryCount < 10):
            returnCode = osdObjList[1].checkIfOsdStart(caseName, nodeObj)
            tryCount = tryCount + 1
        if (tryCount == 10):
            logging.getLogger(caseName).error("%s cannot start" %
                                              osdObjList[1].getid())

        returnCode = osdObjList[2].checkIfOsdStart(caseName, nodeObj)
        tryCount = 0
        while (returnCode == 0 and tryCount < 10):
            returnCode = osdObjList[2].checkIfOsdStart(caseName, nodeObj)
            tryCount = tryCount + 1
        if (tryCount == 10):
            logging.getLogger(caseName).error("%s cannot start" %
                                              osdObjList[2].getid())
        #check ceph health
        sleep(30)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if (status == 'HEALTH_OK'):
            logging.getLogger(caseName).info(
                "stop three osds in cluster successfully")
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s  runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if (status == 'HEALTH_OK'):
                logging.getLogger(caseName).info(
                    "kill in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s  runs failed" % caseName)
                exit(-1)
    '''
    logging.getLogger(caseName).info("\nStep3: stop IO from clients")
    logging.getLogger(caseName).info("\nstop IO from clients") 
    #sleep(60) 
    for client in clusterObj.getClients():           
        base.stopIO(caseName, client)  
    '''
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("%s runs complete" % caseName)
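
# On the first available node, delete every OSD and re-create it on the same
# disk, ten times over, checking cluster health and client IO throughout.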
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    avaiNode = clusterObj.getFirstAvaNode(caseName)
    #client = clusterObj.getClients()[0]
    clusterObj.initOsdProcess(caseName)

    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)

    avaiNode.uploadScript(caseName)
    osdlist = avaiNode.getOsds()
    for osdObj in osdlist:
        osdObj.forceKill(caseName, avaiNode)
        osdObj.userStart(caseName, avaiNode)

    sleep(60)
    logging.getLogger(caseName).info(
        "\nStep2: remove osd and create them 10 times")
    for i in range(10):
        avaiNode.setOsdDisk(caseName)
        disks = []
        logging.getLogger(caseName).info("start to delete osd on node %s " %
                                         avaiNode.gethostName())
        for osdObj in osdlist:
            disks.append(osdObj.getDisk())
            osdObj.delete(caseName, avaiNode)
            status = clusterObj.getStatus(caseName, avaiNode, timeOut)

            if (status == 'HEALTH_OK'):
                logging.getLogger(caseName).info("%s deleted successfully" %
                                                 osdObj.getid())
            else:
                logging.getLogger(caseName).error("status is %s" % status)
                logging.getLogger(caseName).error("%s  runs failed" % caseName)
                status = clusterObj.getStatus(caseName, avaiNode, timeOut)
                if (status == 'HEALTH_OK'):
                    logging.getLogger(caseName).info(
                        "stop in cluster successfully")
                else:
                    logging.getLogger(caseName).error("%s  runs failed" %
                                                      caseName)
                    exit(-1)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        clusterObj.updateCluster(avaiNode)
        logging.getLogger(caseName).info(
            "all osds on node %s deleted successfully" % avaiNode.gethostName())

        logging.getLogger(caseName).info("start to create osd on node %s " %
                                         avaiNode.gethostName())
        for disk in disks:
            avaiNode.createOsd(caseName, disk)
            status = clusterObj.getStatus(caseName, avaiNode, timeOut)
            if (status == 'HEALTH_OK'):
                logging.getLogger(caseName).info(
                    "osd on disk %s created successfully" % disk)
            else:
                logging.getLogger(caseName).error("status is %s" % status)
                logging.getLogger(caseName).error("%s  runs failed" % caseName)
                status = clusterObj.getStatus(caseName, avaiNode, timeOut)
                if (status == 'HEALTH_OK'):
                    logging.getLogger(caseName).info(
                        "stop in cluster successfully")
                else:
                    logging.getLogger(caseName).error("%s  runs failed" %
                                                      caseName)
                    exit(-1)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        clusterObj.updateCluster(avaiNode)
        logging.getLogger(caseName).info(
            "all osds on node %s created successfully" %
            avaiNode.gethostName())
        for client in clusterObj.getClients():
            if (client.checkIOProcess(caseName) == "error"):
                base.startRBDIO(caseName, client, imageNum, poolName)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
    return 1
Example #8
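# Shut down one randomly chosen OSD on each of two nodes, ten times over,
# restart them, and verify cluster health and client IO after every round.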
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    #stop osd process and start with ceph-osd -i
    clusterObj.initOsdProcess(caseName)

    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)

    sleep(60)
    logging.getLogger(caseName).info("\nStep2: kill osd on two nodes 10 times")
    for i in range(10):
        firOsdList = nodeList[0].getOsds()
        secOsdList = nodeList[1].getOsds()
        firOsdId = random.randint(0, len(firOsdList) - 1)
        secOsdId = random.randint(0, len(secOsdList) - 1)
        firOsdList[firOsdId].shutdown(caseName, nodeList[0])
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        status = clusterObj.getStatus(caseName, nodeList[0], timeOut)
        if (status == 'HEALTH_OK'):
            secOsdList[secOsdId].shutdown(caseName, nodeList[1])
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            status = clusterObj.getStatus(caseName, nodeList[0], timeOut)
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s  runs failed" % caseName)
            exit(-1)

        if (status == 'HEALTH_OK'):
            logging.getLogger(caseName).info(
                "shutdown osd on two nodes successfully")
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s  runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeList[0], timeOut)
            if (status == 'HEALTH_OK'):
                logging.getLogger(caseName).info(
                    "kill in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s  runs failed" % caseName)
                exit(-1)
        firOsdList[firOsdId].start(caseName, nodeList[0])
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        secOsdList[secOsdId].start(caseName, nodeList[1])
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        returnCode = firOsdList[firOsdId].checkIfOsdStart(
            caseName, nodeList[0])
        tryCount = 0
        while (returnCode == 0 and tryCount < 10):
            returnCode = firOsdList[firOsdId].checkIfOsdStart(
                caseName, nodeList[0])
            tryCount = tryCount + 1
        if (tryCount == 10):
            logging.getLogger(caseName).error("%s cannot start" %
                                              firOsdList[firOsdId].getid())

        returnCode = secOsdList[secOsdId].checkIfOsdStart(
            caseName, nodeList[1])
        tryCount = 0
        while (returnCode == 0 and tryCount < 10):
            returnCode = secOsdList[secOsdId].checkIfOsdStart(
                caseName, nodeList[1])
            tryCount = tryCount + 1
        if (tryCount == 10):
            logging.getLogger(caseName).error("%s cannot start" %
                                              secOsdList[secOsdId].getid())

        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        for client in clusterObj.getClients():
            if (client.checkIOProcess(caseName) == "error"):
                base.startRBDIO(caseName, client, imageNum, poolName)

    logging.getLogger(caseName).info("%s runs complete" % caseName)
Example #9
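# Shut down the first monitor, restart it, and verify cluster health and
# client IO.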
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)

    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")

    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)

    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if (client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)

    sleep(60)

    monitors = clusterObj.getMonitors()
    monitors[0].setMonPid(caseName)
    monitors[0].shutdown(caseName)
    sleep(30)
    #TBD:check if io process is still exist
    '''
    if(client.checkIOProcess(caseName, pidList) == 'Error') :
        logging.getLogger(caseName).error("some process is wrong")
    '''
    monitors[0].start(caseName)
    monitors[0].checkIfMonStart(caseName)
    sleep(30)
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if (status == 'HEALTH_OK'):
        logging.getLogger(caseName).info(
            "stop mon service on %s in cluster successfully" %
            nodeObj.gethostName())
    else:
        logging.getLogger(caseName).error("status is %s" % status)
        logging.getLogger(caseName).error("%s  runs failed" % caseName)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if (status == 'HEALTH_OK'):
            logging.getLogger(caseName).info("stop in cluster successfully")
        else:
            logging.getLogger(caseName).error("%s  runs failed" % caseName)
            exit(-1)

    #logging.getLogger(caseName).info("\nstop IO from clients")
    #sleep(60)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    '''
    for client in clusterObj.getClients():          
        base.stopIO(caseName, client) 
    '''
    logging.getLogger(caseName).info("\ncase runs complete")
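
# Force-kill a non-leader monitor ten times; after each restart verify that
# the quorum leader is unchanged, the cluster is healthy, and client IO shows
# no errors.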
def main(args):
    caseName = os.path.basename(inspect.getfile(inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():         
        if(client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    
    nonLeaderMon = clusterObj.getFirstNonLeaderMon()
    nonLeaderMon.shutdown(caseName)
    nonLeaderMon.start(caseName)
    logging.getLogger(caseName).info("\nStep2: kill non-leader mon 10 times")
    for i in range(10):
        nonLeaderMon.setMonPid(caseName)
        nonLeaderMon.forceKill(caseName)
        sleep(30)
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName) 
        #check monitor quorum status 
        leaderIdBefore = nonLeaderMon.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s" % leaderIdBefore)
        #create another rbd and start IO
        #clusterObj.createImg(caseName,size = '10G', pool = poolName, imageName = newImageName)
        #pid = client.writeRbdFio(caseName, newImageName, poolName)
        '''
        pidList = []
        pidList.append(pid)
        if (client.checkIOProcess(caseName, pidList) == 'Error'):
            logging.getLogger(caseName).error("IO cannot start")
            exit
        '''
        #start leader mon again
        nonLeaderMon.start(caseName)
        nonLeaderMon.checkIfMonStart(caseName)
        sleep(30)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if(status == 'HEALTH_OK'):
            logging.getLogger(caseName).info("stop mon service on %s in cluster successfully"%nonLeaderMon.gethostName())
        else:
            logging.getLogger(caseName).error("status is %s"%status)
            logging.getLogger(caseName).error("%s  runs failed"%caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if(status == 'HEALTH_OK'):
                logging.getLogger(caseName).info("stop in cluster successfully")
                
            else:
                logging.getLogger(caseName).error("%s  runs failed"%caseName)
                exit(-1)
        #check IO status
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName)    
        #check monitor quorum status
        leaderIdAfter = nonLeaderMon.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s"%leaderIdAfter)
        if(leaderIdBefore == leaderIdAfter):
            logging.getLogger(caseName).info("the leader mon is not impacted")
        else:
            logging.getLogger(caseName).error("the leader mon is not the initial one")
            exit(-1)
        for client in clusterObj.getClients():         
            if(client.checkIOProcess(caseName) == "error"):
                base.startRBDIO(caseName, client, imageNum, poolName)   
                 
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
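
# Force-kill the leader monitor ten times, restart it, and after each round
# verify quorum leadership, cluster health, and client IO.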
def main(args):
    caseName = os.path.basename(inspect.getfile(inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    
    logging.getLogger(caseName).info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName, clusterObj.getFirstAvaNode(caseName), timeOut)
    if(status == 'HEALTH_OK'):
        logging.getLogger(caseName).info("health status is OK")
        
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
        
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():         
        if(client.checkIOProcess(caseName) == "error"):
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    
    logging.getLogger(caseName).info("\nStep2: kill leader mon 10 times")    
    leaderMonFir = clusterObj.getLeaderMon()
    leaderMonSec = clusterObj.getLeaderMon()
    leaderMonFir.shutdown(caseName)
    leaderMonFir.start(caseName)
    leaderMonSec.shutdown(caseName)
    leaderMonSec.start(caseName)
    for i in range(10):        
        leaderMonFir.setMonPid(caseName)
        leaderMonFir.forceKill(caseName)
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName)
        #TBD: add try
        sleep(30)
        #check monitor quorum status 
        leaderId = leaderMonFir.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s"%leaderId)
        clusterObj.setLeaderMon(leaderId)
        
        leaderMonSec.setMonPid(caseName)
        leaderMonSec.forceKill(caseName)
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName)
        #TBD: add try
        sleep(30)
        #check monitor quorum status 
        leaderId = leaderMonSec.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s"%leaderId)
        clusterObj.setLeaderMon(leaderId)
        
        #start leader second mon again
        leaderMonSec.start(caseName)
        leaderMonSec.checkIfMonStart(caseName)
        #TBD: add try
        sleep(60)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if(status == 'HEALTH_OK'):
            logging.getLogger(caseName).info("stop mon service on %s in cluster successfully"%nodeObj.gethostName())
        else:
            logging.getLogger(caseName).error("status is %s"%status)
            logging.getLogger(caseName).error("%s  runs failed"%caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if(status == 'HEALTH_OK'):
                logging.getLogger(caseName).info("stop in cluster successfully")
                
            else:
                logging.getLogger(caseName).error("%s  runs failed"%caseName)
                exit(-1)
        #check IO status
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName)    
        #check monitor quorum status
        leaderId = leaderMonSec.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s"%leaderId)
        if(leaderId == leaderMonSec.gethostName()):
            logging.getLogger(caseName).info("%s is back"%leaderId)
        else:
            logging.getLogger(caseName).error("leader monitor %s is not back"%leaderId)
            exit(-1)
        clusterObj.setLeaderMon(leaderId)
        
        #start leader first mon again
        leaderMonFir.start(caseName)
        leaderMonFir.checkIfMonStart(caseName)
        #TBD: add try
        sleep(60)
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if(status == 'HEALTH_OK'):
            logging.getLogger(caseName).info("stop mon service on %s in cluster successfully"%nodeObj.gethostName())
        else:
            logging.getLogger(caseName).error("status is %s"%status)
            logging.getLogger(caseName).error("%s  runs failed"%caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if(status == 'HEALTH_OK'):
                logging.getLogger(caseName).info("stop in cluster successfully")
                
            else:
                logging.getLogger(caseName).error("%s  runs failed"%caseName)
                exit(-1)

        #check IO status
        for client in clusterObj.getClients(): 
            client.checkIOError(caseName)    
        #check monitor quorum status
        leaderId = leaderMonFir.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s"%leaderId)
        if(leaderId == leaderMonFir.gethostName()):
            logging.getLogger(caseName).info("%s is back"%leaderId)
        else:
            logging.getLogger(caseName).error("leader monitor %s is not back"%leaderId)
            exit(-1)
        clusterObj.setLeaderMon(leaderId) 
           
        for client in clusterObj.getClients():         
            if(client.checkIOProcess(caseName) == "error"):
                base.startRBDIO(caseName, client, imageNum, poolName)   
                
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")