def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    for client in clusterObj.getClients():
        base.startRBDIO(caseName, client, imageNum, poolName)
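# ----------------------------------------------------------------------
# A minimal sketch of the module-level preamble each of these case
# scripts assumes. The concrete values are assumptions for illustration;
# each real case file defines its own caseDescription and tuning
# constants, and `base` is the project-local helper module.
import os
import inspect
import logging
import random
from time import sleep

import base  # project-local helpers: getClusterObj, startRBDIO, stopIO (assumed)

caseDescription = "case description goes here"  # assumed placeholder
timeOut = 600     # seconds to wait for the cluster to report HEALTH_OK (assumed)
imageNum = 4      # number of rbd images per client (assumed)
poolName = "rbd"  # pool the test images live in (assumed)
# ----------------------------------------------------------------------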
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: Out the osd and check IO")
    for nodeObj in nodeList:
        for osdObj in nodeObj.getOsds():
            # out the osd, then add it back, five times
            for i in range(5):
                logging.getLogger(caseName).info("\nNow operate "
                                                 + nodeObj.gethostName())
                logging.getLogger(caseName).info("osd count on node: %d"
                                                 % len(nodeObj.getOsds()))
                logging.getLogger(caseName).info("\nNow operate " + osdObj.getid())
                logging.getLogger(caseName).info("out " + osdObj.getid())
                osdObj.outCluster(caseName, nodeObj)
                logging.getLogger(caseName).info("check if IO error")
                sleep(15)
                # add the osd back into the cluster
                #logging.getLogger(caseName).info("add in " + osdObj.getid()
                #                                 + " to cluster")
                osdObj.inCluster(caseName, nodeObj)
                # check ceph health
                sleep(15)
                logging.getLogger(caseName).info(
                    "Now check if there is any IO error")
                for client in clusterObj.getClients():
                    client.checkIOError(caseName)
                for client in clusterObj.getClients():
                    if client.checkIOProcess(caseName) == "error":
                        base.startRBDIO(caseName, client, imageNum, poolName)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("%s runs complete" % caseName)
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: pause all osds")
    clusterObj.pauseOsd(caseName)
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("pause cluster successfully")
    else:
        logging.getLogger(caseName).error("status is %s" % status)
        logging.getLogger(caseName).error(
            "poll the status for another 10 minutes")
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info("pause cluster successfully")
        else:
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            exit(-1)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("\nStep 3: resume all osds")
    clusterObj.resumeOsd(caseName)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    '''
    for client in clusterObj.getClients():
        base.stopIO(caseName, client)
    '''
    logging.getLogger(caseName).info("\nCase runs successfully")
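# The "check status, and if not HEALTH_OK poll once more before failing"
# pattern above recurs throughout these cases. A minimal helper sketch
# that wraps it (waitHealthOk is hypothetical, not part of the original
# suite):
def waitHealthOk(caseName, clusterObj, nodeObj, timeOut, okMsg, failMsg):
    """Return True if the cluster reaches HEALTH_OK within two polls."""
    for attempt in range(2):
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info(okMsg)
            return True
        logging.getLogger(caseName).error("status is %s" % status)
    logging.getLogger(caseName).error(failMsg)
    return False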
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    clusterObj.initOsdProcess(caseName)
    #client = clusterObj.getClients()[0]
    #nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    # force-kill every osd on every node
    for nodeObj in clusterObj.getNodes():
        nodeObj.setOsdPid(caseName)
        for osdObj in nodeObj.getOsds():
            osdObj.forceKill(caseName, nodeObj)
    # restart each monitor gracefully, then force-kill it
    for monObj in clusterObj.getMonitors():
        monObj.shutdown(caseName)
        monObj.start(caseName)
    for monObj in clusterObj.getMonitors():
        monObj.setMonPid(caseName)
        monObj.forceKill(caseName)
    #TBD: check IO
    # bring all osds and monitors back
    for nodeObj in clusterObj.getNodes():
        for osdObj in nodeObj.getOsds():
            osdObj.start(caseName, nodeObj)
    for monObj in clusterObj.getMonitors():
        monObj.start(caseName)
    logging.getLogger(caseName).info("sleep 10 mins to wait cluster recover")
    sleep(600)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    # re-check the cluster health after recovery
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    '''
    for client in clusterObj.getClients():
        base.stopIO(caseName, client)
    '''
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    logging.getLogger(caseName).info("the timeout is %d" % timeOut)
    clusterObj = base.getClusterObj(caseName, args)
    clusterObj.initOsdProcess(caseName)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: stop osd and check IO")
    #logging.getLogger(caseName).info("\n%d" % len(nodeList))
    for nodeObj in nodeList:
        logging.getLogger(caseName).info("\nNow operate osd on %s"
                                         % nodeObj.gethostName())
        for osdObj in nodeObj.getOsds():
            # stop the osd service, then restart it
            logging.getLogger(caseName).info("\nNow operate " + osdObj.getid())
            logging.getLogger(caseName).info("Set the " + osdObj.getid()
                                             + " pid for kill")
            nodeObj.setOsdPid(caseName)
            logging.getLogger(caseName).info("shutdown " + osdObj.getid()
                                             + " by kill")
            osdObj.shutdown(caseName, nodeObj)
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            # start osd service
            sleep(5)
            logging.getLogger(caseName).info("start " + osdObj.getid())
            osdObj.start(caseName, nodeObj)
            returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
            tryCount = 0
            while returnCode == 0 and tryCount < 10:
                returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
                tryCount += 1
            if tryCount == 10:
                logging.getLogger(caseName).error("%s cannot start"
                                                  % osdObj.getid())
            # check ceph health
            sleep(30)
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info(
                    "stop %s in cluster successfully" % osdObj.getid())
            else:
                logging.getLogger(caseName).error("status is %s" % status)
                logging.getLogger(caseName).error(
                    "poll the status for another 10 minutes")
                status = clusterObj.getStatus(caseName, nodeObj, timeOut)
                if status == 'HEALTH_OK':
                    logging.getLogger(caseName).info(
                        "stop %s in cluster successfully" % osdObj.getid())
                    break
                else:
                    logging.getLogger(caseName).error("%s runs failed"
                                                      % caseName)
                    exit(-1)
    '''
    logging.getLogger(caseName).info("\nStep 3: stop IO from clients")
    sleep(60)
    for client in clusterObj.getClients():
        base.stopIO(caseName, client)
    '''
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("%s runs complete" % caseName)
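# The start-and-poll loop above recurs in several cases. A minimal helper
# sketch that wraps it (waitForOsdStart is hypothetical, not part of the
# original suite); checkIfOsdStart returning 0 is taken to mean "not
# started yet", as in the loops above.
def waitForOsdStart(caseName, osdObj, nodeObj, retries=10):
    """Poll checkIfOsdStart until the osd reports started or retries run out."""
    for _ in range(retries):
        if osdObj.checkIfOsdStart(caseName, nodeObj) != 0:
            return True
    logging.getLogger(caseName).error("%s cannot start" % osdObj.getid())
    return False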
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    # stop osd process and start with ceph-osd -i
    clusterObj.initOsdProcess(caseName)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: kill three osds")
    for nodeObj in nodeList:
        osdObjList = nodeObj.getOsds()
        logging.getLogger(caseName).info("\nNow operate "
                                         + nodeObj.gethostName())
        nodeObj.setOsdPid(caseName)
        logging.getLogger(caseName).info("shutdown three osds on node "
                                         + nodeObj.gethostName())
        # force-kill the first three osds on the node
        for osdObj in osdObjList[:3]:
            osdObj.forceKill(caseName, nodeObj)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        # start osd service
        logging.getLogger(caseName).info("start osd on node "
                                         + nodeObj.gethostName())
        for osdObj in osdObjList[:3]:
            osdObj.start(caseName, nodeObj)
        for osdObj in osdObjList[:3]:
            returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
            tryCount = 0
            while returnCode == 0 and tryCount < 10:
                returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
                tryCount += 1
            if tryCount == 10:
                logging.getLogger(caseName).error("%s cannot start"
                                                  % osdObj.getid())
        # check ceph health
        sleep(30)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info(
                "stop three osds in cluster successfully")
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info("kill in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                exit(-1)
    '''
    logging.getLogger(caseName).info("\nStep 3: stop IO from clients")
    #sleep(60)
    for client in clusterObj.getClients():
        base.stopIO(caseName, client)
    '''
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("%s runs complete" % caseName)
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    avaiNode = clusterObj.getFirstAvaNode(caseName)
    #client = clusterObj.getClients()[0]
    clusterObj.initOsdProcess(caseName)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName, avaiNode, timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    avaiNode.uploadScript(caseName)
    osdlist = avaiNode.getOsds()
    for osdObj in osdlist:
        osdObj.forceKill(caseName, avaiNode)
        osdObj.userStart(caseName, avaiNode)
    sleep(60)
    logging.getLogger(caseName).info(
        "\nStep 2: remove osds and recreate them 10 times")
    for i in range(10):
        avaiNode.setOsdDisk(caseName)
        # refresh the osd list so recreated osds from the previous
        # cycle are picked up
        osdlist = avaiNode.getOsds()
        disks = []
        logging.getLogger(caseName).info("start to delete osd on node %s"
                                         % avaiNode.gethostName())
        for osdObj in osdlist:
            disks.append(osdObj.getDisk())
            osdObj.delete(caseName, avaiNode)
            status = clusterObj.getStatus(caseName, avaiNode, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info("%s deleted successfully"
                                                 % osdObj.getid())
            else:
                logging.getLogger(caseName).error("status is %s" % status)
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                status = clusterObj.getStatus(caseName, avaiNode, timeOut)
                if status == 'HEALTH_OK':
                    logging.getLogger(caseName).info(
                        "cluster recovered to HEALTH_OK")
                else:
                    logging.getLogger(caseName).error("%s runs failed"
                                                      % caseName)
                    exit(-1)
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
        clusterObj.updateCluster(avaiNode)
        logging.getLogger(caseName).info(
            "all osds on node %s deleted successfully"
            % avaiNode.gethostName())
        logging.getLogger(caseName).info("start to create osd on node %s"
                                         % avaiNode.gethostName())
        for disk in disks:
            avaiNode.createOsd(caseName, disk)
            status = clusterObj.getStatus(caseName, avaiNode, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info(
                    "osd on %s created successfully" % disk)
            else:
                logging.getLogger(caseName).error("status is %s" % status)
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                status = clusterObj.getStatus(caseName, avaiNode, timeOut)
                if status == 'HEALTH_OK':
                    logging.getLogger(caseName).info(
                        "cluster recovered to HEALTH_OK")
                else:
                    logging.getLogger(caseName).error("%s runs failed"
                                                      % caseName)
                    exit(-1)
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
        clusterObj.updateCluster(avaiNode)
        logging.getLogger(caseName).info(
            "all osds needed on node %s created successfully"
            % avaiNode.gethostName())
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
    return 1
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    #client = clusterObj.getClients()[0]
    # stop osd process and start with ceph-osd -i
    clusterObj.initOsdProcess(caseName)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: kill osd on two nodes 10 times")
    for i in range(10):
        firOsdList = nodeList[0].getOsds()
        secOsdList = nodeList[1].getOsds()
        firOsdId = random.randint(0, len(firOsdList) - 1)
        secOsdId = random.randint(0, len(secOsdList) - 1)
        firOsdList[firOsdId].shutdown(caseName, nodeList[0])
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        status = clusterObj.getStatus(caseName, nodeList[0], timeOut)
        if status == 'HEALTH_OK':
            secOsdList[secOsdId].shutdown(caseName, nodeList[1])
            for client in clusterObj.getClients():
                client.checkIOError(caseName)
            status = clusterObj.getStatus(caseName, nodeList[0], timeOut)
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            exit(-1)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info(
                "shutdown osd on two nodes successfully")
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeList[0], timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info("kill in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                exit(-1)
        firOsdList[firOsdId].start(caseName, nodeList[0])
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        secOsdList[secOsdId].start(caseName, nodeList[1])
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        returnCode = firOsdList[firOsdId].checkIfOsdStart(caseName, nodeList[0])
        tryCount = 0
        while returnCode == 0 and tryCount < 10:
            returnCode = firOsdList[firOsdId].checkIfOsdStart(
                caseName, nodeList[0])
            tryCount += 1
        if tryCount == 10:
            logging.getLogger(caseName).error("%s cannot start"
                                              % firOsdList[firOsdId].getid())
        returnCode = secOsdList[secOsdId].checkIfOsdStart(caseName, nodeList[1])
        tryCount = 0
        while returnCode == 0 and tryCount < 10:
            returnCode = secOsdList[secOsdId].checkIfOsdStart(
                caseName, nodeList[1])
            tryCount += 1
        if tryCount == 10:
            logging.getLogger(caseName).error("%s cannot start"
                                              % secOsdList[secOsdId].getid())
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startRBDIO(caseName, client, imageNum, poolName)
    logging.getLogger(caseName).info("%s runs complete" % caseName)
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    monitors = clusterObj.getMonitors()
    monitors[0].setMonPid(caseName)
    monitors[0].shutdown(caseName)
    sleep(30)
    #TBD: check if io process still exists
    '''
    if client.checkIOProcess(caseName, pidList) == 'Error':
        logging.getLogger(caseName).error("some process is wrong")
    '''
    monitors[0].start(caseName)
    monitors[0].checkIfMonStart(caseName)
    sleep(30)
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info(
            "stop mon service on %s in cluster successfully"
            % nodeObj.gethostName())
    else:
        logging.getLogger(caseName).error("status is %s" % status)
        logging.getLogger(caseName).error("%s runs failed" % caseName)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info("stop in cluster successfully")
        else:
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            exit(-1)
    #logging.getLogger(caseName).info("\nstop IO from clients")
    #sleep(60)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    '''
    for client in clusterObj.getClients():
        base.stopIO(caseName, client)
    '''
    logging.getLogger(caseName).info("\ncase runs complete")
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    nonLeaderMon = clusterObj.getFirstNonLeaderMon()
    nonLeaderMon.shutdown(caseName)
    nonLeaderMon.start(caseName)
    logging.getLogger(caseName).info("\nStep 2: kill non-leader mon 10 times")
    for i in range(10):
        nonLeaderMon.setMonPid(caseName)
        nonLeaderMon.forceKill(caseName)
        sleep(30)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        # check monitor quorum status
        leaderIdBefore = nonLeaderMon.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("the current quorum leader is %s"
                                         % leaderIdBefore)
        # create another rbd and start IO
        #clusterObj.createImg(caseName, size='10G', pool=poolName,
        #                     imageName=newImageName)
        #pid = client.writeRbdFio(caseName, newImageName, poolName)
        '''
        pidList = []
        pidList.append(pid)
        if client.checkIOProcess(caseName, pidList) == 'Error':
            logging.getLogger(caseName).error("IO cannot start")
            exit
        '''
        # start the killed mon again
        nonLeaderMon.start(caseName)
        nonLeaderMon.checkIfMonStart(caseName)
        sleep(30)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info(
                "stop mon service on %s in cluster successfully"
                % nonLeaderMon.gethostName())
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info("stop in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                exit(-1)
        # check IO status
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        # check monitor quorum status
        leaderIdAfter = nonLeaderMon.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s"
                                         % leaderIdAfter)
        if leaderIdBefore == leaderIdAfter:
            logging.getLogger(caseName).info("the leader mon is not impacted")
        else:
            logging.getLogger(caseName).error(
                "the leader mon is not the initial one")
            exit(-1)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
def main(args):
    caseName = os.path.basename(inspect.getfile(
        inspect.currentframe())).split('.')[0]
    logging.getLogger(caseName).info(caseDescription)
    clusterObj = base.getClusterObj(caseName, args)
    logging.getLogger(caseName).info(
        "start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName), timeOut)
    if status == 'HEALTH_OK':
        logging.getLogger(caseName).info("health status is OK")
    else:
        logging.getLogger(caseName).error("health status is error")
        exit(-1)
    #client = clusterObj.getClients()[0]
    nodeObj = clusterObj.getFirstAvaNode(caseName)
    logging.getLogger(caseName).info("\nStep 1: Check IO from clients")
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    sleep(60)
    logging.getLogger(caseName).info("\nStep 2: kill leader mon 10 times")
    leaderMonFir = clusterObj.getLeaderMon()
    leaderMonSec = clusterObj.getLeaderMon()
    leaderMonFir.shutdown(caseName)
    leaderMonFir.start(caseName)
    leaderMonSec.shutdown(caseName)
    leaderMonSec.start(caseName)
    for i in range(10):
        leaderMonFir.setMonPid(caseName)
        leaderMonFir.forceKill(caseName)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        #TBD: add try
        sleep(30)
        # check monitor quorum status
        leaderId = leaderMonFir.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s" % leaderId)
        clusterObj.setLeaderMon(leaderId)
        leaderMonSec.setMonPid(caseName)
        leaderMonSec.forceKill(caseName)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        #TBD: add try
        sleep(30)
        # check monitor quorum status
        leaderId = leaderMonSec.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s" % leaderId)
        clusterObj.setLeaderMon(leaderId)
        # start the second killed leader mon again
        leaderMonSec.start(caseName)
        leaderMonSec.checkIfMonStart(caseName)
        #TBD: add try
        sleep(60)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info(
                "stop mon service on %s in cluster successfully"
                % nodeObj.gethostName())
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info("stop in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                exit(-1)
        # check IO status
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        # check monitor quorum status
        leaderId = leaderMonSec.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s" % leaderId)
        if leaderId == leaderMonSec.gethostName():
            logging.getLogger(caseName).info("%s is back" % leaderId)
        else:
            logging.getLogger(caseName).error(
                "leader monitor %s is not back" % leaderId)
            exit(-1)
        clusterObj.setLeaderMon(leaderId)
        # start the first killed leader mon again
        leaderMonFir.start(caseName)
        leaderMonFir.checkIfMonStart(caseName)
        #TBD: add try
        sleep(60)
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            logging.getLogger(caseName).info(
                "stop mon service on %s in cluster successfully"
                % nodeObj.gethostName())
        else:
            logging.getLogger(caseName).error("status is %s" % status)
            logging.getLogger(caseName).error("%s runs failed" % caseName)
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                logging.getLogger(caseName).info("stop in cluster successfully")
            else:
                logging.getLogger(caseName).error("%s runs failed" % caseName)
                exit(-1)
        # check IO status
        for client in clusterObj.getClients():
            client.checkIOError(caseName)
        # check monitor quorum status
        leaderId = leaderMonFir.getQuorumLeader(caseName)
        logging.getLogger(caseName).info("now the leader mon is %s" % leaderId)
        if leaderId == leaderMonFir.gethostName():
            logging.getLogger(caseName).info("%s is back" % leaderId)
        else:
            logging.getLogger(caseName).error(
                "leader monitor %s is not back" % leaderId)
            exit(-1)
        clusterObj.setLeaderMon(leaderId)
    for client in clusterObj.getClients():
        if client.checkIOProcess(caseName) == "error":
            base.startRBDIO(caseName, client, imageNum, poolName)
    for client in clusterObj.getClients():
        client.checkIOError(caseName)
    logging.getLogger(caseName).info("case runs complete")
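# A minimal entry-point sketch, assuming each case file is executed
# directly and forwards its command-line arguments to main(); the exact
# argument format is whatever base.getClusterObj expects (an assumption).
if __name__ == '__main__':
    import sys
    main(sys.argv[1:])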