def main(args):
    """Run the pause/resume-OSD case against the cluster described by *args*.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Require ``HEALTH_OK`` before doing anything (exit ``-1`` otherwise).
    2. Check client IO and start IO on clients whose IO process check
       returns ``"error"``.
    3. Pause the OSDs, query the status again and, if it is not
       ``HEALTH_OK``, query once more before giving up with exit ``-1``.
    4. Resume the OSDs and check client IO again.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)

    clusterObj = base.getClusterObj(caseName, args)
    nodeObj = clusterObj.getFirstAvaNode(caseName)

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    log.info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if status != 'HEALTH_OK':
        log.error("health status is error")
        exit(-1)
    log.info("health status is OK")

    log.info("\nStep1: start IO from clients")
    verify_client_io()
    restart_missing_io()
    sleep(60)

    log.info("\nStep2: pause all osds")
    clusterObj.pauseOsd(caseName)
    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if status == 'HEALTH_OK':
        log.info("pause cluster successfully")
    else:
        log.error("status is %s" % status)
        log.error("print log for another 10 minutes")
        # Second and last chance: query the status one more time.
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status != 'HEALTH_OK':
            log.error("%s runs failed" % caseName)
            exit(-1)
        log.info("resume cluster successfully")
    verify_client_io()
    restart_missing_io()

    log.info("\nStep3: resume all osds")
    clusterObj.resumeOsd(caseName)
    verify_client_io()
    restart_missing_io()
    log.info("\nCase runs successfully")
def main(args):
    """Kill and restart every OSD on every node while client IO is running.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Re-initialise the OSD processes and require ``HEALTH_OK``
       (exit ``-1`` otherwise).
    2. Check client IO and start IO on clients whose IO process check
       returns ``"error"``.
    3. For each OSD of each node: record the OSD pids, shut the OSD down,
       check client IO, start the OSD, poll until it reports started, then
       query the cluster status (one extra query on failure, exit ``-1`` if
       that also fails).
    4. Check client IO a final time.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)
    log.info("the timeout is %d" % timeOut)

    clusterObj = base.getClusterObj(caseName, args)
    clusterObj.initOsdProcess(caseName)
    nodeList = clusterObj.getNodes()

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    log.info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if status != 'HEALTH_OK':
        log.error("health status is error")
        exit(-1)
    log.info("health status is OK")

    log.info("\nStep 1: start IO from clients")
    verify_client_io()
    restart_missing_io()
    sleep(60)

    log.info("\nStep 2: stop osd and check IO")
    for nodeObj in nodeList:
        log.info("\nNow operate osd on %s" % (nodeObj.gethostName()))
        for osdObj in nodeObj.getOsds():
            log.info("\nNow operate " + osdObj.getid())

            # Stop the OSD service.
            log.info("Set the " + osdObj.getid() + " pid for kill")
            nodeObj.setOsdPid(caseName)
            log.info("shutdown " + osdObj.getid() + " by kill")
            osdObj.shutdown(caseName, nodeObj)
            verify_client_io()

            # Start the OSD service again and poll for it: one initial
            # check plus at most 10 re-checks while the check returns 0.
            log.info("start " + osdObj.getid())
            osdObj.start(caseName, nodeObj)
            returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
            tryCount = 0
            while returnCode == 0 and tryCount < 10:
                returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
                tryCount += 1
            # Decide on the poll result, not on the attempt counter: the
            # counter also reaches 10 when the very last poll succeeds,
            # which previously produced a false "cannot start" error.
            if returnCode == 0:
                log.error("%s cannot start" % osdObj.getid())

            # Check ceph health.
            sleep(30)
            verify_client_io()
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                log.info("stop %s in cluster successfully" % osdObj.getid())
            else:
                log.error("status is %s" % status)
                log.error("print log for another 10 minutes")
                status = clusterObj.getStatus(caseName, nodeObj, timeOut)
                if status != 'HEALTH_OK':
                    log.error("%s runs failed" % caseName)
                    exit(-1)
                log.info("stop %s in cluster successfully" % osdObj.getid())
                # NOTE(review): this break (kept from the original) skips
                # the remaining OSDs of this node after a late recovery --
                # confirm that is intended.
                break

    verify_client_io()
    restart_missing_io()
    log.info("%s runs complete" % caseName)
def main(args):
    """Mark every OSD out of and back into the cluster while client IO runs.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Require ``HEALTH_OK`` (exit ``-1`` otherwise).
    2. Check client IO and start IO on clients whose IO process check
       returns ``"error"``.
    3. For each OSD of each node: ``outCluster`` it, check client IO, record
       the OSD pids, ``inCluster`` it, then query the cluster status (one
       extra query on failure, exit ``-1`` if that also fails).
    4. Check client IO a final time.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)

    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    log.info("start to check cluster status before case running")
    first_node = clusterObj.getFirstAvaNode(caseName)
    status = clusterObj.getStatus(caseName, first_node, timeOut)
    if status != 'HEALTH_OK':
        log.error("health status is error")
        exit(-1)
    log.info("health status is OK")

    log.info("\nStep 1: start IO from clients")
    verify_client_io()
    restart_missing_io()
    sleep(60)

    log.info("\nStep 2: Out the osd and check IO")
    for nodeObj in nodeList:
        for osdObj in nodeObj.getOsds():
            # Take the OSD out of the cluster.
            log.info("\nNow operate " + nodeObj.gethostName())
            log.info(len(nodeObj.getOsds()))
            log.info("\nNow operate " + osdObj.getid())
            log.info("out " + osdObj.getid())
            osdObj.outCluster(caseName, nodeObj)
            log.info("check if IO error")
            verify_client_io()

            # Record the pids, then bring the OSD back in.
            log.info("Set the " + osdObj.getid() + " pid for kill")
            nodeObj.setOsdPid(caseName)
            osdObj.inCluster(caseName, nodeObj)

            # Check ceph health.
            sleep(30)
            verify_client_io()
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status == 'HEALTH_OK':
                log.info("stop %s in cluster successfully" % osdObj.getid())
                continue
            log.error("status is %s" % status)
            log.error("print log for another 10 minutes")
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status != 'HEALTH_OK':
                log.error("%s runs failed" % caseName)
                exit(-1)
            log.info("stop %s in cluster successfully" % osdObj.getid())
            # NOTE(review): leaves the per-node OSD loop after a late
            # recovery, so the node's remaining OSDs are not exercised --
            # confirm that is intended.
            break

    verify_client_io()
    restart_missing_io()
    log.info("%s runs complete" % caseName)
def main(args):
    """Force-kill every OSD and monitor, restart them and verify recovery.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Re-initialise the OSD processes, check client IO and start IO on
       clients whose IO process check returns ``"error"``.
    2. Require ``HEALTH_OK`` (exit ``-1`` otherwise).
    3. Force-kill all OSDs on all nodes; shut down and start every monitor,
       then record each monitor pid and force-kill it.
    4. Check client IO, start all OSDs and monitors again and wait 600 s.
    5. Check client IO, query the cluster status again and exit ``-1``
       unless it is ``HEALTH_OK``; finally stop IO on every client.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)

    clusterObj = base.getClusterObj(caseName, args)
    clusterObj.initOsdProcess(caseName)

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    def require_health_ok():
        # Query the cluster status and exit -1 unless it is HEALTH_OK.
        status = clusterObj.getStatus(caseName,
                                      clusterObj.getFirstAvaNode(caseName),
                                      timeOut)
        if status != 'HEALTH_OK':
            log.error("health status is error")
            exit(-1)
        log.info("health status is OK")

    verify_client_io()
    restart_missing_io()
    sleep(60)

    log.info("start to check cluster status before case running")
    require_health_ok()

    # Fault injection: kill every OSD, bounce every monitor, then kill the
    # monitors as well.
    for nodeObj in clusterObj.getNodes():
        nodeObj.setOsdPid(caseName)
        for osdObj in nodeObj.getOsds():
            osdObj.forceKill(caseName, nodeObj)
    for monObj in clusterObj.getMonitors():
        monObj.shutdown(caseName)
        monObj.start(caseName)
    for monObj in clusterObj.getMonitors():
        monObj.setMonPid(caseName)
        monObj.forceKill(caseName)

    # TBD: check IO
    verify_client_io()
    restart_missing_io()

    # Recovery: bring OSDs and monitors back.
    for nodeObj in clusterObj.getNodes():
        for osdObj in nodeObj.getOsds():
            osdObj.start(caseName, nodeObj)
    for monObj in clusterObj.getMonitors():
        monObj.start(caseName)

    log.info("sleep 10 mins to wait cluster recover")
    sleep(600)
    verify_client_io()

    # The status must be queried again here.  The original re-tested the
    # value read before the fault injection, which is always HEALTH_OK at
    # this point, so the recovery check could never fail.
    require_health_ok()

    for client in clusterObj.getClients():
        base.stopIO(caseName, client)
    log.info("case runs complete")
def main(args):
    """Stop and restart the first monitor while client IO is running.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Require ``HEALTH_OK`` (exit ``-1`` otherwise).
    2. Check client IO and start IO on clients whose IO process check
       returns ``"error"``.
    3. Record the pid of the first monitor, shut it down, wait, start it
       again and call its start check.
    4. Query the cluster status (one extra query on failure, exit ``-1`` if
       that also fails) and check client IO a final time.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)

    clusterObj = base.getClusterObj(caseName, args)

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    log.info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if status != 'HEALTH_OK':
        log.error("health status is error")
        exit(-1)
    log.info("health status is OK")

    nodeObj = clusterObj.getFirstAvaNode(caseName)
    verify_client_io()
    restart_missing_io()
    sleep(60)

    # Bounce the first monitor in the list.
    first_mon = clusterObj.getMonitors()[0]
    first_mon.setMonPid(caseName)
    first_mon.shutdown(caseName)
    sleep(30)
    # TBD: check if the IO process still exists
    first_mon.start(caseName)
    first_mon.checkIfMonStart(caseName)
    sleep(30)

    status = clusterObj.getStatus(caseName, nodeObj, timeOut)
    if status == 'HEALTH_OK':
        log.info("stop mon service on %s in cluster successfully"
                 % nodeObj.gethostName())
    else:
        log.error("status is %s" % status)
        log.error("%s runs failed" % caseName)
        # One more status query before really giving up.
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status != 'HEALTH_OK':
            log.error("%s runs failed" % caseName)
            exit(-1)
        log.info("stop in cluster successfully")

    verify_client_io()
    restart_missing_io()
    log.info("\ncase runs complete")
def main(args):
    """Force-kill up to three OSDs per node at once and verify recovery.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Re-initialise the OSD processes and require ``HEALTH_OK``
       (exit ``-1`` otherwise).
    2. Check client IO and start IO on clients whose IO process check
       returns ``"error"``.
    3. For each node: record the OSD pids, force-kill the first three OSDs,
       check client IO, start them again, poll each until it reports
       started, then query the cluster status (one extra query on failure,
       exit ``-1`` if that also fails).
    4. Check client IO a final time.

    Changes from the original: the three copy-pasted polling loops are one
    helper; "cannot start" is logged only when the OSD is still not started
    after the last poll (and the "starte" typo is gone); nodes with fewer
    than three OSDs no longer raise ``IndexError`` mid-fault.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)

    clusterObj = base.getClusterObj(caseName, args)
    nodeList = clusterObj.getNodes()
    # Stop the OSD processes and start them with ceph-osd -i.
    clusterObj.initOsdProcess(caseName)

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    def wait_for_osd(osdObj, nodeObj):
        # One initial check plus at most 10 re-checks while the check
        # returns 0; log an error if it is still 0 afterwards.
        returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
        tryCount = 0
        while returnCode == 0 and tryCount < 10:
            returnCode = osdObj.checkIfOsdStart(caseName, nodeObj)
            tryCount += 1
        # Test the result rather than the counter: the counter also hits
        # 10 when the final poll succeeds.
        if returnCode == 0:
            log.error("%s cannot start" % osdObj.getid())

    log.info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if status != 'HEALTH_OK':
        log.error("health status is error")
        exit(-1)
    log.info("health status is OK")

    log.info("\nStep1: start IO from clients")
    verify_client_io()
    restart_missing_io()
    sleep(60)

    log.info("\nStep2: kill three osds ")
    for nodeObj in nodeList:
        # First three OSDs of the node (fewer if the node has fewer).
        targets = nodeObj.getOsds()[:3]
        log.info("\nNow operate " + nodeObj.gethostName())

        # Stop the OSD services.
        nodeObj.setOsdPid(caseName)
        log.info("shutdown three osds on node " + nodeObj.gethostName())
        for osdObj in targets:
            osdObj.forceKill(caseName, nodeObj)
        verify_client_io()

        # Start the OSD services again and wait for each of them.
        log.info("start osd on node " + nodeObj.gethostName())
        for osdObj in targets:
            osdObj.start(caseName, nodeObj)
        for osdObj in targets:
            wait_for_osd(osdObj, nodeObj)

        # Check ceph health.
        sleep(30)
        verify_client_io()
        status = clusterObj.getStatus(caseName, nodeObj, timeOut)
        if status == 'HEALTH_OK':
            log.info("stop three osds in cluster successfully")
        else:
            log.error("status is %s" % status)
            log.error("%s runs failed" % caseName)
            # One more status query before really giving up.
            status = clusterObj.getStatus(caseName, nodeObj, timeOut)
            if status != 'HEALTH_OK':
                log.error("%s runs failed" % caseName)
                exit(-1)
            log.info("kill in cluster successfully")

    verify_client_io()
    restart_missing_io()
    log.info("%s runs complete" % caseName)
def main(args):
    """Delete and re-create the OSDs of one node ten times under client IO.

    The case name is taken from this file's base name and doubles as the
    logger name and as the first argument of every harness call.  Flow:

    1. Require ``HEALTH_OK`` (exit ``-1`` otherwise).
    2. Check client IO and start IO on clients whose IO process check
       returns ``"error"``; upload the script to the first available node
       and force-kill / user-start each of its OSDs.
    3. Ten times: delete every OSD of the node (remembering its disk and
       checking the cluster status after each deletion), update the cluster
       object, then create an OSD on each remembered disk (again checking
       the status after each) and update the cluster object.

    Returns ``1`` on completion; exits with ``-1`` when a status check
    fails twice in a row.

    ``base``, ``caseDescription``, ``timeOut`` and ``sleep`` come from the
    enclosing module.
    """
    script_path = inspect.getfile(inspect.currentframe())
    caseName = os.path.basename(script_path).split('.')[0]
    log = logging.getLogger(caseName)
    log.info(caseDescription)

    clusterObj = base.getClusterObj(caseName, args)
    avaiNode = clusterObj.getFirstAvaNode(caseName)

    def verify_client_io():
        # Ask every client to check for IO errors.
        for client in clusterObj.getClients():
            client.checkIOError(caseName)

    def restart_missing_io():
        # Start IO again on clients whose IO process check reports "error".
        for client in clusterObj.getClients():
            if client.checkIOProcess(caseName) == "error":
                base.startIO(caseName, client, 'nbd')

    def recheck_or_exit(status):
        # Failure path shared by the delete and create phases: log the bad
        # status, query once more and exit -1 if it is still not HEALTH_OK.
        log.error("status is %s" % status)
        log.error("%s runs failed" % caseName)
        status = clusterObj.getStatus(caseName, avaiNode, timeOut)
        if status != 'HEALTH_OK':
            log.error("%s runs failed" % caseName)
            exit(-1)
        log.info("stop in cluster successfully")

    log.info("start to check cluster status before case running")
    status = clusterObj.getStatus(caseName,
                                  clusterObj.getFirstAvaNode(caseName),
                                  timeOut)
    if status != 'HEALTH_OK':
        log.error("health status is error")
        exit(-1)
    log.info("health status is OK")

    log.info("\nStep1: start IO from clients")
    verify_client_io()
    restart_missing_io()

    avaiNode.uploadScript(caseName)
    for osdObj in avaiNode.getOsds():
        osdObj.forceKill(caseName, avaiNode)
        osdObj.userStart(caseName, avaiNode)
    sleep(60)

    log.info("\nStep2: remove osd and create them 10 times")
    for _ in range(10):
        avaiNode.setOsdDisk(caseName)
        disks = []

        log.info("start to delete osd on node %s " % avaiNode.gethostName())
        for osdObj in avaiNode.getOsds():
            disks.append(osdObj.getDisk())
            osdObj.delete(caseName, avaiNode)
            # The status is sampled before the 600 s wait and judged after.
            status = clusterObj.getStatus(caseName, avaiNode, timeOut)
            log.info("sleep 600s to wait the pg transfer successfully")
            sleep(600)
            if status == 'HEALTH_OK':
                log.info("%s delete succesfully" % osdObj.getid())
            else:
                recheck_or_exit(status)
            verify_client_io()
        clusterObj.updateCluster(avaiNode)
        log.info("all osds on node %s delete succesfully"
                 % avaiNode.gethostName())

        log.info("start to create osd on node %s " % avaiNode.gethostName())
        for disk in disks:
            avaiNode.createOsd(caseName, disk)
            status = clusterObj.getStatus(caseName, avaiNode, timeOut)
            if status == 'HEALTH_OK':
                # NOTE(review): osdObj is left over from an earlier loop,
                # so this names a previously handled OSD, not the one just
                # created -- confirm which id should be logged.
                log.info("%s create succesfully" % osdObj.getid())
            else:
                recheck_or_exit(status)
            verify_client_io()
            restart_missing_io()
        clusterObj.updateCluster(avaiNode)
        log.info("all osd need to create on node %s create succesfully"
                 % avaiNode.gethostName())

    log.info("case runs complete")
    return 1