def test_poolsvc(self): """ Test svc arg during pool create. :avocado: tags=pool,svc """ # parameters used in pool create createmode = self.params.get("mode", '/run/createtests/createmode/*/') createuid = os.geteuid() creategid = os.getegid() createsetid = self.params.get("setname", '/run/createtests/createset/') createsize = self.params.get("size", '/run/createtests/createsize/') createsvc = self.params.get("svc", '/run/createtests/createsvc/*/') expected_result = createsvc[1] try: # initialize a python pool object then create the underlying # daos storage self.pool = DaosPool(self.context) self.pool.create(createmode, createuid, creategid, createsize, createsetid, None, None, createsvc[0]) self.pool.connect(1 << 1) # checking returned rank list for server more than 1 iterator = 0 while (int(self.pool.svc.rl_ranks[iterator]) > 0 and int(self.pool.svc.rl_ranks[iterator]) <= createsvc[0] and int(self.pool.svc.rl_ranks[iterator]) != 999999): iterator += 1 if iterator != createsvc[0]: self.fail("Length of Returned Rank list is not equal to " "the number of Pool Service members.\n") rank_list = [] for iterator in range(createsvc[0]): rank_list.append(int(self.pool.svc.rl_ranks[iterator])) if len(rank_list) != len(set(rank_list)): self.fail("Duplicate values in returned rank list") if createsvc[0] == 3: self.pool.disconnect() cmd = ('{0} kill-leader --uuid={1}'.format( self.daosctl, self.pool.get_uuid_str())) process.system(cmd) self.pool.connect(1 << 1) self.pool.disconnect() server = DaosServer(self.context, self.server_group, 2) server.kill(1) self.pool.exclude([2]) self.pool.connect(1 << 1) if expected_result in ['FAIL']: self.fail("Test was expected to fail but it passed.\n") except DaosApiError as excep: print(excep) print(traceback.format_exc()) if expected_result == 'PASS': self.fail("Test was expected to pass but it failed.\n")
def test_poolsvc(self): """ Test svc arg during pool create. :avocado: tags=pool,svc """ # parameters used in pool create createmode = self.params.get("mode", '/run/createtests/createmode/*/') createuid = os.geteuid() creategid = os.getegid() createsetid = self.params.get("setname", '/run/createtests/createset/') createsize = self.params.get("size", '/run/createtests/createsize/') createsvc = self.params.get("svc", '/run/createtests/createsvc/*/') expected_result = createsvc[1] try: # initialize a python pool object then create the underlying # daos storage self.POOL = DaosPool(self.Context) self.POOL.create(createmode, createuid, creategid, createsize, createsetid, None, None, createsvc[0]) self.POOL.connect(1 << 1) if createsvc[0] == 3: self.POOL.disconnect() cmd = ('{0} kill-leader --uuid={1}'.format( self.daosctl, self.POOL.get_uuid_str())) process.system(cmd) time.sleep(5) self.POOL.connect(1 << 1) self.POOL.disconnect() server = DaosServer(self.Context, self.server_group, 1) server.kill(1) time.sleep(5) self.POOL.connect(1 << 1) if expected_result in ['FAIL']: self.fail("Test was expected to fail but it passed.\n") # cleanup the pool self.POOL.disconnect() self.POOL.destroy(1) self.POOL = None except ValueError as e: print(e) print(traceback.format_exc()) if expected_result == 'PASS': self.fail("Test was expected to pass but it failed.\n")
def test_destroy_while_rebuilding(self): """ :avocado: tags=pool,pooldestroy,rebuild,desreb """ try: print("\nsetup complete, starting test\n") # create a server object that references one of our pool target hosts # and then kill it svr_to_kill = int(self.params.get("rank_to_kill", '/run/testparams/ranks/')) server = DaosServer(self.context, bytes(self.server_group), svr_to_kill) print("created server ") # BUG if you don't connect the rebuild doesn't start correctly self.pool.connect(1 << 1) status = self.pool.pool_query() if not status.pi_ntargets == len(self.hostlist): self.fail("target count wrong.\n") if not status.pi_ndisabled == 0: self.fail("disabled target count wrong.\n") print("connect ") time.sleep(1) server.kill(1) print("killed server ") # exclude the target from the dead server self.pool.exclude([svr_to_kill]) print("exclude target ") #self.pool.disconnect() #print "disconnect " # the rebuild won't take long since there is no data so do # the destroy quickly self.pool.destroy(1) print("destroy ") except DaosApiError as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
def test_destroy_while_rebuilding(self): """ :avocado: tags=pool,pooldestroy,rebuild,desreb """ try: print("\nsetup complete, starting test\n") # create a server object that references one of our pool target hosts # and then kill it svr_to_kill = int(self.params.get("rank_to_kill", '/run/testparams/ranks/')) server = DaosServer(self.context, bytes(self.server_group), svr_to_kill) print("created server ") # BUG if you don't connect the rebuild doesn't start correctly self.pool.connect(1 << 1) status = self.pool.pool_query() if not status.pi_ntargets == len(self.hostlist_servers): self.fail("target count wrong.\n") if not status.pi_ndisabled == 0: self.fail("disabled target count wrong.\n") print("connect ") time.sleep(1) server.kill(1) print("killed server ") # exclude the target from the dead server self.pool.exclude([svr_to_kill]) print("exclude target ") #self.pool.disconnect() #print "disconnect " # the rebuild won't take long since there is no data so do # the destroy quickly self.pool.destroy(1) print("destroy ") except DaosApiError as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
def start_rebuild(self, server_group, rank, daos_log): """Kill a specific server rank using this pool. Args: server_group (str): daos server group name rank (int): daos server rank to kill daos_log (DaosLog): object for logging messages """ msg = "Killing DAOS server {} (rank {})".format(server_group, rank) self.log.info(msg) daos_log.info(msg) server = DaosServer(self.context, server_group, rank) server.kill(1) msg = "Excluding server rank {} from pool {}".format(rank, self.uuid) self.log.info(msg) daos_log.info(msg) self.pool.exclude([rank])
def start_rebuild(self, server_group, rank, daos_log): """Kill a specific server rank using this pool. Args: server_group (str): daos server group name rank (int): daos server rank to kill daos_log (DaosLog): object for logging messages Returns: bool: True if the server has been killed and the rank has been excluded from the pool; False if the pool is undefined """ msg = "Killing DAOS server {} (rank {})".format(server_group, rank) self.log.info(msg) daos_log.info(msg) server = DaosServer(self.context, server_group, rank) server.kill(1) return self.exclude(rank, daos_log)
def start_rebuild(self, ranks, daos_log): """Kill the specific server ranks using this pool. Args: ranks (list): a list of daos server ranks (int) to kill daos_log (DaosLog): object for logging messages Returns: bool: True if the server ranks have been killed and the ranks have been excluded from the pool; False if the pool is undefined """ msg = "Killing DAOS ranks {} from server group {}".format( ranks, self.name.value) self.log.info(msg) daos_log.info(msg) for rank in ranks: server = DaosServer(self.context, self.name.value, rank) server.kill(1) return self.exclude(ranks, daos_log)
def kill_server(server_group, context, rank, pool, log=None): """Kill a specific server rank. Args: server_group (str): daos server group name context (DaosContext): the context to use to create the DaosServer rank (int): daos server rank to kill pool (DaosPool): the DaosPool from which to exclude the rank log (DaosLog|None): object for logging messages Returns: None """ if log: log.info("Killing DAOS server {} (rank {})".format(server_group, rank)) server = DaosServer(context, server_group, rank) server.kill(1) if log: log.info("Excluding server rank {}".format(rank)) pool.exclude([rank])
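A minimal usage sketch for the kill_server() helper above. The method name, the hard-coded rank, and the d_log attribute are hypothetical; it assumes the calling test already has a created and connected pool, and reuses the pool_query()/rs_done polling pattern from the rebuild tests in this section.

import time

def kill_rank_and_wait(test, rank):
    """Hypothetical convenience wrapper around kill_server() above.

    'test' is assumed to be a test instance exposing context, pool,
    server_group and d_log attributes, as in the surrounding snippets.
    """
    kill_server(test.server_group, test.context, rank, test.pool, test.d_log)
    # poll until the rebuild triggered by the exclude reports completion,
    # mirroring the pool_query()/rs_done loops used by the rebuild tests
    while True:
        test.pool.pool_query()
        if test.pool.pool_info.pi_rebuild_st.rs_done == 1:
            break
        time.sleep(2)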
def test_rebuild_no_capacity(self): """ :avocado: tags=pool,rebuild,nocap """ try: print("\nsetup complete, starting test\n") # create a server object that references one of our pool target hosts # and then kill it svr_to_kill = int( self.params.get("rank_to_kill", '/run/testparams/ranks/')) sh = DaosServer(self.CONTEXT, bytes(self.server_group), svr_to_kill) time.sleep(1) sh.kill(1) # exclude the target from the dead server self.POOL.exclude([svr_to_kill]) # exclude should trigger rebuild, check self.POOL.connect(1 << 1) status = self.POOL.pool_query() if not status.pi_ntargets == len(self.host_list): self.fail("target count wrong.\n") if not status.pi_ndisabled == 1: self.fail("disabled target count wrong.\n") # the pool should be too full to start a rebuild so # expecting an error # not sure yet specifically what error if status.pi_rebuild_st.rs_errno == 0: self.fail("expecting rebuild to fail but it didn't.\n") except ValueError as e: print(e) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
def test_rebuild_no_capacity(self): """ :avocado: tags=pool,rebuild,nocap """ try: print("\nsetup complete, starting test\n") # create a server object that references one of our pool target hosts # and then kill it svr_to_kill = int(self.params.get("rank_to_kill", '/run/testparams/ranks/')) d_server = DaosServer(self.context, bytes(self.server_group), svr_to_kill) time.sleep(1) d_server.kill(1) # exclude the target from the dead server self.pool.exclude([svr_to_kill]) # exclude should trigger rebuild, check self.pool.connect(1 << 1) status = self.pool.pool_query() if not status.pi_ntargets == len(self.hostlist): self.fail("target count wrong.\n") if not status.pi_ndisabled == 1: self.fail("disabled target count wrong.\n") # the pool should be too full to start a rebuild so # expecting an error # not sure yet specifically what error if status.pi_rebuild_st.rs_errno == 0: self.fail("expecting rebuild to fail but it didn't.\n") except DaosApiError as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
def test_rebuild_with_io(self): """ Test ID: Rebuild-003 Test Description: Trigger a rebuild while I/O is ongoing. Use Cases: -- single pool, single client performing continous read/write/verify sequence while failure/rebuild is triggered in another process :avocado: tags=pool,rebuild,rebuildwithio """ # the rebuild tests need to redo this stuff each time so not in setup # as it usually would be server_group = self.params.get("name", '/server_config/', 'daos_server') self.hostlist_servers = self.params.get("test_machines", '/run/hosts/') hostfile_servers = write_host_file.write_host_file( self.hostlist_servers, self.workdir) try: self.agent_sessions = agent_utils.run_agent(self.basepath, self.hostlist_servers) server_utils.run_server(hostfile_servers, server_group, self.basepath) # use the uid/gid of the user running the test, these should # be perfectly valid createuid = os.geteuid() creategid = os.getegid() # parameters used in pool create that are in yaml createmode = self.params.get("mode", '/run/testparams/createmode/') createsetid = self.params.get("setname", '/run/testparams/createset/') createsize = self.params.get("size", '/run/testparams/createsize/') # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.context) pool.create(createmode, createuid, creategid, createsize, createsetid, None) pool.connect(1 << 1) container = DaosContainer(self.context) container.create(pool.handle) container.open() # get pool status and make sure it all looks good before we start pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail("Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version # do I/O for 30 seconds dummy_bw = io_utilities.continuous_io(container, 30) # trigger the rebuild rank = self.params.get("rank", '/run/testparams/ranks/*') server = DaosServer(self.context, server_group, rank) server.kill(1) pool.exclude([rank]) # do another 30 seconds of I/O, # waiting for some improvements in server bootstrap # at which point we can move the I/O to a separate client and # really pound it with I/O dummy_bw = io_utilities.continuous_io(container, 30) # wait for the rebuild to finish while True: pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check rebuild statistics if pool.pool_info.pi_ndisabled != 1: self.fail("Number of disabled targets reporting incorrectly: {}" .format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0: self.fail("No objects have been rebuilt.") if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0: self.fail("No records have been rebuilt.") except (ValueError, DaosApiError) as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: # wait for the I/O process to finish try: server_utils.stop_server(hosts=self.hostlist_servers) os.remove(hostfile_servers) # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist_servers) 
finally: if self.agent_sessions: agent_utils.stop_agent(self.agent_sessions) server_utils.kill_server(self.hostlist_servers)
def test_multipool_rebuild(self): """ Test ID: Rebuild-002 Test Description: Expand on the basic test by rebuilding 2 pools at once. Use Cases: -- multipool rebuild, single client, various object and record counds :avocado: tags=pool,rebuild,rebuildmulti """ try: # initialize python pool object then create the underlying # daos storage, the way the code is now the pools should be # on the same storage and have the same service leader pool1 = DaosPool(self.context) pool2 = DaosPool(self.context) pool1.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) pool2.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) # want an open connection during rebuild pool1.connect(1 << 1) pool2.connect(1 << 1) # create containers container1 = DaosContainer(self.context) container1.create(pool1.handle) container2 = DaosContainer(self.context) container2.create(pool2.handle) # now open them container1.open() container2.open() # Putting the same data in both pools, at least for now to simplify # checking its correct saved_data = [] for _objc in range(self.objcount): obj = None for _recc in range(self.reccount): # make some stuff up and write dkey = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) akey = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) data = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(self.size))) # Used DAOS_OC_R1S_SPEC_RANK # 1 replica with specified rank obj, txn = container1.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=15) obj, txn = container2.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=15) saved_data.append((obj, dkey, akey, data, txn)) # read the data back and make sure its correct containers data2 = container1.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail( "Wrote data P1, read it back, didn't match\n") data2 = container2.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail( "Wrote data P2, read it back, didn't match\n") # kill a server server = DaosServer(self.context, self.server_group, self.rank) server.kill(1) # temporarily, the exclude of a failed target must be done # manually pool1.exclude([self.rank]) pool2.exclude([self.rank]) # check that rebuild finishes, no errors, progress data as # know it to be. Check pool 1 first then we'll check 2 below. while True: pool1.pool_query() if pool1.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true, if pool1.pool_info.pi_ndisabled != 1: self.fail( "P1 number disabled targets reporting incorrectly: {}". 
format(pool1.pool_info.pi_ndisabled)) if pool1.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("P1 rebuild error reported: {}".format( pool1.pool_info.pi_rebuild_st.rs_errno)) if pool1.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("P1 rebuilt objs not as expected: {0} {1}".format( pool1.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool1.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount * self.objcount)): self.fail("P1 rebuilt recs not as expected: {0} {1}".format( pool1.pool_info.pi_rebuild_st.rs_rec_nr, self.reccount * self.objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") # now check the other pool while True: pool2.pool_query() if pool2.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true if pool2.pool_info.pi_ndisabled != 1: self.fail( "Number disabled targets reporting incorrectly: {}".format( pool2.pool_info.pi_ndisabled)) if pool2.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool2.pool_info.pi_rebuild_st.rs_errno)) if pool2.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("Rebuilt objs not as expected: {0} {1}".format( pool2.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool2.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount * self.objcount)): self.fail("Rebuilt recs not as expected: {0} {1}".format( pool2.pool_info.pi_rebuild_st.rs_rec_nr, (self.reccount * self.objcount))) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as excp: print(excp) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: server_utils.stop_server(hosts=self.hostlist_servers) check_for_pool.cleanup_pools(self.hostlist_servers) server_utils.kill_server(self.hostlist_servers)
def test_rebuild_with_io(self): """ Test ID: Rebuild-003 Test Description: Trigger a rebuild while I/O is ongoing. Use Cases: -- single pool, single client performing continous read/write/verify sequence while failure/rebuild is triggered in another process :avocado: tags=pool,rebuild,rebuildwithio """ # the rebuild tests need to redo this stuff each time so not in setup # as it usually would be server_group = self.params.get("name", '/server_config/', 'daos_server') basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../") self.hostlist = self.params.get("test_machines", '/run/hosts/') hostfile = write_host_file.write_host_file(self.hostlist, self.workdir) try: self.agent_sessions = AgentUtils.run_agent(basepath, self.hostlist) server_utils.run_server(hostfile, server_group, basepath) # use the uid/gid of the user running the test, these should # be perfectly valid createuid = os.geteuid() creategid = os.getegid() # parameters used in pool create that are in yaml createmode = self.params.get("mode", '/run/testparams/createmode/') createsetid = self.params.get("setname", '/run/testparams/createset/') createsize = self.params.get("size", '/run/testparams/createsize/') # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.context) pool.create(createmode, createuid, creategid, createsize, createsetid, None) pool.connect(1 << 1) container = DaosContainer(self.context) container.create(pool.handle) container.open() # get pool status and make sure it all looks good before we start pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail("Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") dummy_pool_version = pool.pool_info.pi_rebuild_st.rs_version # do I/O for 30 seconds dummy_bw = io_utilities.continuous_io(container, 30) # trigger the rebuild rank = self.params.get("rank", '/run/testparams/ranks/*') server = DaosServer(self.context, server_group, rank) server.kill(1) pool.exclude([rank]) # do another 30 seconds of I/O, # waiting for some improvements in server bootstrap # at which point we can move the I/O to a separate client and # really pound it with I/O dummy_bw = io_utilities.continuous_io(container, 30) # wait for the rebuild to finish while True: pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check rebuild statistics if pool.pool_info.pi_ndisabled != 1: self.fail("Number of disabled targets reporting incorrectly: {}" .format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr <= 0: self.fail("No objects have been rebuilt.") if pool.pool_info.pi_rebuild_st.rs_rec_nr <= 0: self.fail("No records have been rebuilt.") except (ValueError, DaosApiError) as excep: print(excep) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: # wait for the I/O process to finish try: server_utils.stop_server(hosts=self.hostlist) os.remove(hostfile) # really make sure everything is gone check_for_pool.cleanup_pools(self.hostlist) finally: 
if self.agent_sessions: AgentUtils.stop_agent(self.hostlist, self.agent_sessions) server_utils.kill_server(self.hostlist)
def test_simple_rebuild(self): """ Test ID: Rebuild-001 Test Description: The most basic rebuild test. Use Cases: -- single pool rebuild, single client, various reord/object counts :avocado: tags=pool,rebuild,rebuildsimple """ try: # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.context) pool.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) # want an open connection during rebuild pool.connect(1 << 1) # get pool status we want to test later pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail( "Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") # create a container container = DaosContainer(self.context) container.create(pool.handle) # now open it container.open() saved_data = [] for _objc in range(self.objcount): obj = None for _recc in range(self.reccount): # make some stuff up and write dkey = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) akey = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) data = (''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(self.size))) obj, txn = container.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=16) saved_data.append((obj, dkey, akey, data, txn)) # read the data back and make sure its correct data2 = container.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail("Write data 1, read it back, didn't match\n") # kill a server that has server = DaosServer(self.context, self.server_group, self.rank) server.kill(1) # temporarily, the exclude of a failed target must be done manually pool.exclude([self.rank]) while True: # get the pool/rebuild status again pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) if pool.pool_info.pi_ndisabled != 1: self.fail( "Number of disabled targets reporting incorrectly: {}". format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("Rebuilt objs not as expected: {0} {1}".format( pool.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount * self.objcount)): self.fail("Rebuilt recs not as expected: {0} {1}".format( pool.pool_info.pi_rebuild_st.rs_rec_nr, self.reccount * self.objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as excp: print(excp) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
def test_multipool_rebuild(self): """ Test ID: Rebuild-002 Test Description: Expand on the basic test by rebuilding 2 pools at once. Use Cases: -- multipool rebuild, single client, various object and record counds :avocado: tags=pool,rebuild,rebuildmulti """ try: # initialize python pool object then create the underlying # daos storage, the way the code is now the pools should be # on the same storage and have the same service leader pool1 = DaosPool(self.context) pool2 = DaosPool(self.context) pool1.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) pool2.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) # want an open connection during rebuild pool1.connect(1 << 1) pool2.connect(1 << 1) # create containers container1 = DaosContainer(self.context) container1.create(pool1.handle) container2 = DaosContainer(self.context) container2.create(pool2.handle) # now open them container1.open() container2.open() # Putting the same data in both pools, at least for now to simplify # checking its correct saved_data = [] for _objc in range(self.objcount): obj = None for _recc in range(self.reccount): # make some stuff up and write dkey = ( ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) akey = ( ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) data = ( ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(self.size))) # Used DAOS_OC_R1S_SPEC_RANK # 1 replica with specified rank obj, txn = container1.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=15) obj, txn = container2.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=15) saved_data.append((obj, dkey, akey, data, txn)) # read the data back and make sure its correct containers data2 = container1.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail("Wrote data P1, read it back, didn't match\n") data2 = container2.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail("Wrote data P2, read it back, didn't match\n") # kill a server server = DaosServer(self.context, self.server_group, self.rank) server.kill(1) # temporarily, the exclude of a failed target must be done # manually pool1.exclude([self.rank]) pool2.exclude([self.rank]) # check that rebuild finishes, no errors, progress data as # know it to be. Check pool 1 first then we'll check 2 below. 
while True: pool1.pool_query() if pool1.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true, if pool1.pool_info.pi_ndisabled != 1: self.fail("P1 number disabled targets reporting incorrectly: {}" .format(pool1.pool_info.pi_ndisabled)) if pool1.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("P1 rebuild error reported: {}" .format(pool1.pool_info.pi_rebuild_st.rs_errno)) if pool1.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("P1 rebuilt objs not as expected: {0} {1}" .format(pool1.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool1.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount*self.objcount)): self.fail("P1 rebuilt recs not as expected: {0} {1}" .format(pool1.pool_info.pi_rebuild_st.rs_rec_nr, self.reccount*self.objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") # now check the other pool while True: pool2.pool_query() if pool2.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true if pool2.pool_info.pi_ndisabled != 1: self.fail("Number disabled targets reporting incorrectly: {}" .format(pool2.pool_info.pi_ndisabled)) if pool2.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}" .format(pool2.pool_info.pi_rebuild_st.rs_errno)) if pool2.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("Rebuilt objs not as expected: {0} {1}" .format(pool2.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool2.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount*self.objcount)): self.fail("Rebuilt recs not as expected: {0} {1}". format(pool2.pool_info.pi_rebuild_st.rs_rec_nr, (self.reccount*self.objcount))) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as excp: print (excp) print (traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
def test_simple_rebuild(self): """ Test ID: Rebuild-001 Test Description: The most basic rebuild test. Use Cases: -- single pool rebuild, single client, various reord/object counts :avocado: tags=pool,rebuild,rebuildsimple """ try: # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.context) pool.create(self.createmode, self.createuid, self.creategid, self.createsize, self.createsetid) # want an open connection during rebuild pool.connect(1 << 1) # get pool status we want to test later pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail("Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") # create a container container = DaosContainer(self.context) container.create(pool.handle) # now open it container.open() saved_data = [] for _objc in range(self.objcount): obj = None for _recc in range(self.reccount): # make some stuff up and write dkey = ( ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) akey = ( ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(5))) data = (''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(self.size))) obj, txn = container.write_an_obj(data, len(data), dkey, akey, obj, self.rank, obj_cls=16) saved_data.append((obj, dkey, akey, data, txn)) # read the data back and make sure its correct data2 = container.read_an_obj(self.size, dkey, akey, obj, txn) if data != data2.value: self.fail("Write data 1, read it back, didn't match\n") # kill a server that has server = DaosServer(self.context, self.server_group, self.rank) server.kill(1) # temporarily, the exclude of a failed target must be done manually pool.exclude([self.rank]) while True: # get the pool/rebuild status again pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) if pool.pool_info.pi_ndisabled != 1: self.fail("Number of disabled targets reporting incorrectly: {}" .format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}" .format(pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr != self.objcount: self.fail("Rebuilt objs not as expected: {0} {1}" .format(pool.pool_info.pi_rebuild_st.rs_obj_nr, self.objcount)) if (pool.pool_info.pi_rebuild_st.rs_rec_nr != (self.reccount*self.objcount)): self.fail("Rebuilt recs not as expected: {0} {1}" .format(pool.pool_info.pi_rebuild_st.rs_rec_nr, self.reccount*self.objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as excp: print (excp) print (traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n")
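The wait-for-rebuild polling loop above recurs in every rebuild test in this section; a small helper along these lines could replace it. The name wait_for_rebuild, the interval argument, the timeout handling, and the boolean return value are additions for illustration; only the pool_query()/rs_done polling pattern is taken from the tests themselves.

import time

def wait_for_rebuild(pool, interval=2, timeout=None):
    """Poll pool_query() until the pool's rebuild status reports completion.

    Hypothetical helper sketch: returns True once rs_done == 1, or False if
    the optional timeout (in seconds) expires first.
    """
    elapsed = 0
    while True:
        pool.pool_query()
        if pool.pool_info.pi_rebuild_st.rs_done == 1:
            return True
        if timeout is not None and elapsed >= timeout:
            return False
        time.sleep(interval)
        elapsed += interval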
def test_poolsvc(self): """ Test svc arg during pool create. :avocado: tags=pool,svc """ # parameters used in pool create createmode = self.params.get("mode", '/run/createtests/createmode/*/') createuid = os.geteuid() creategid = os.getegid() createsetid = self.params.get("setname", '/run/createtests/createset/') createsize = self.params.get("size", '/run/createtests/createsize/') createsvc = self.params.get("svc", '/run/createtests/createsvc/*/') expected_result = createsvc[1] try: # initialize a python pool object then create the underlying # daos storage self.pool = DaosPool(self.context) self.pool.create(createmode, createuid, creategid, createsize, createsetid, None, None, createsvc[0]) self.pool.connect(1 << 1) # checking returned rank list for server more than 1 i = 0 while ( int(self.pool.svc.rl_ranks[i]) > 0 and int(self.pool.svc.rl_ranks[i]) <= createsvc[0] and int(self.pool.svc.rl_ranks[i]) != 999999 ): i += 1 if i != createsvc[0]: self.fail("Length of Returned Rank list is not equal to " "the number of Pool Service members.\n") rank_list = [] for j in range(createsvc[0]): rank_list.append(int(self.pool.svc.rl_ranks[j])) if len(rank_list) != len(set(rank_list)): self.fail("Duplicate values in returned rank list") if createsvc[0] == 3: self.pool.disconnect() cmd = ('{0} kill-leader --uuid={1}' .format(self.daosctl, self.pool.get_uuid_str())) process.system(cmd) self.pool.connect(1 << 1) self.pool.disconnect() server = DaosServer(self.context, self.server_group, 2) server.kill(1) self.pool.exclude([2]) self.pool.connect(1 << 1) if expected_result in ['FAIL']: self.fail("Test was expected to fail but it passed.\n") except DaosApiError as excep: print(excep) print(traceback.format_exc()) if expected_result == 'PASS': self.fail("Test was expected to pass but it failed.\n")
def test_multipool_rebuild(self): """ Test ID: Rebuild-002 Test Description: Expand on the basic test by rebuilding 2 pools at once. Use Cases: -- multipool rebuild, single client, various object and record counds :avocado: tags=pool,rebuild,rebuildmulti """ # the rebuild tests need to redo this stuff each time so not in setup # as it usually would be setid = self.params.get("setname", '/run/testparams/setnames/') server_group = self.params.get("server_group", '/server/', 'daos_server') basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../") tmp = self.build_paths['PREFIX'] + '/tmp' self.hostlist = self.params.get("test_machines", '/run/hosts/') hostfile = WriteHostFile.WriteHostFile(self.hostlist, tmp) try: ServerUtils.runServer(hostfile, server_group, basepath) # use the uid/gid of the user running the test, these should # be perfectly valid createuid = os.geteuid() creategid = os.getegid() # parameters used in pool create that are in yaml createmode = self.params.get("mode", '/run/testparams/createmode/') createsetid = self.params.get("setname", '/run/testparams/createset/') createsize = self.params.get("size", '/run/testparams/createsize/') # initialize python pool object then create the underlying # daos storage, the way the code is now the pools should be # on the same storage and have the same service leader pool1 = DaosPool(self.Context) pool2 = DaosPool(self.Context) pool1.create(createmode, createuid, creategid, createsize, createsetid, None) pool2.create(createmode, createuid, creategid, createsize, createsetid, None) # want an open connection during rebuild pool1.connect(1 << 1) pool2.connect(1 << 1) # create containers container1 = DaosContainer(self.Context) container1.create(pool1.handle) container2 = DaosContainer(self.Context) container2.create(pool2.handle) # now open them container1.open() container2.open() # how many objects and records are we creating objcount = self.params.get("objcount", '/run/testparams/numobjects/*') reccount = self.params.get("reccount", '/run/testparams/numrecords/*') if objcount == 0: reccount = 0 # which rank to write to and kill rank = self.params.get("rank", '/run/testparams/ranks/*') # how much data to write with each key size = self.params.get("size", '/run/testparams/datasize/') # Putting the same data in both pools, at least for now to simplify # checking its correct saved_data = [] for i in range(0, objcount): obj = None for j in range(0, reccount): # make some stuff up and write dkey = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5)) akey = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5)) data = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(size)) obj, tx = container1.write_an_obj(data, len(data), dkey, akey, obj, rank) obj, tx = container2.write_an_obj(data, len(data), dkey, akey, obj, rank) saved_data.append((obj, dkey, akey, data, tx)) # read the data back and make sure its correct # containers data2 = container1.read_an_obj(size, dkey, akey, obj, tx) if data != data2.value: self.fail( "Wrote data P1, read it back, didn't match\n") # containers data2 = container2.read_an_obj(size, dkey, akey, obj, tx) if data != data2.value: self.fail( "Wrote data P2, read it back, didn't match\n") # kill a server server = DaosServer(self.Context, server_group, rank) server.kill(1) # temporarily, the exclude of a failed target must be done # manually pool1.exclude([rank]) pool2.exclude([rank]) # check that rebuild finishes, no errors, progress 
data as # know it to be. Check pool 1 first then we'll check 2 below. while True: pool1.pool_query() if pool1.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true, if pool1.pool_info.pi_ndisabled != 1: self.fail( "P1 number disabled targets reporting incorrectly: {}". format(pool1.pool_info.pi_ndisabled)) if pool1.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("P1 rebuild error reported: {}".format( pool1.pool_info.pi_rebuild_st.rs_errno)) if pool1.pool_info.pi_rebuild_st.rs_obj_nr != objcount: self.fail("P1 rebuilt objs not as expected: {0} {1}".format( pool1.pool_info.pi_rebuild_st.rs_obj_nr, objcount)) if pool1.pool_info.pi_rebuild_st.rs_rec_nr != (reccount * objcount): self.fail("P1 rebuilt recs not as expected: {0} {1}".format( pool1.pool_info.pi_rebuild_st.rs_rec_nr, reccount * objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container1.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") # now check the other pool while True: pool2.pool_query() if pool2.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) # check there are no errors and other data matches what we # apriori know to be true if pool2.pool_info.pi_ndisabled != 1: self.fail( "Number disabled targets reporting incorrectly: {}".format( pool2.pool_info.pi_ndisabled)) if pool2.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool2.pool_info.pi_rebuild_st.rs_errno)) if pool2.pool_info.pi_rebuild_st.rs_obj_nr != objcount: self.fail("Rebuilt objs not as expected: {0} {1}".format( pool2.pool_info.pi_rebuild_st.rs_obj_nr, objcount)) if pool2.pool_info.pi_rebuild_st.rs_rec_nr != (reccount * objcount): self.fail("Rebuilt recs not as expected: {0} {1}".format( pool2.pool_info.pi_rebuild_st.rs_rec_nr, (reccount * objcount))) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container2.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as e: print(e) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: ServerUtils.stopServer(hosts=self.hostlist) os.remove(hostfile) CheckForPool.CleanupPools(self.hostlist) ServerUtils.killServer(self.hostlist)
def test_simple_rebuild(self): """ Test ID: Rebuild-001 Test Description: The most basic rebuild test. Use Cases: -- single pool rebuild, single client, various reord/object counts :avocado: tags=pool,rebuild,rebuildsimple """ # the rebuild tests need to redo this stuff each time so not in setup # as it usually would be setid = self.params.get("setname", '/run/testparams/setnames/') server_group = self.params.get("server_group", '/server/', 'daos_server') basepath = os.path.normpath(self.build_paths['PREFIX'] + "/../") tmp = self.build_paths['PREFIX'] + '/tmp' self.hostlist = self.params.get("test_machines", '/run/hosts/') hostfile = WriteHostFile.WriteHostFile(self.hostlist, tmp) try: ServerUtils.runServer(hostfile, server_group, basepath) # use the uid/gid of the user running the test, these should # be perfectly valid createuid = os.geteuid() creategid = os.getegid() # parameters used in pool create that are in yaml createmode = self.params.get("mode", '/run/testparams/createmode/') createsetid = self.params.get("setname", '/run/testparams/createset/') createsize = self.params.get("size", '/run/testparams/createsize/') # initialize a python pool object then create the underlying # daos storage pool = DaosPool(self.Context) pool.create(createmode, createuid, creategid, createsize, createsetid, None) # want an open connection during rebuild pool.connect(1 << 1) # get pool status we want to test later pool.pool_query() if pool.pool_info.pi_ndisabled != 0: self.fail( "Number of disabled targets reporting incorrectly.\n") if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error but rebuild hasn't run.\n") if pool.pool_info.pi_rebuild_st.rs_done != 1: self.fail("Rebuild is running but device hasn't failed yet.\n") if pool.pool_info.pi_rebuild_st.rs_obj_nr != 0: self.fail("Rebuilt objs not zero.\n") if pool.pool_info.pi_rebuild_st.rs_rec_nr != 0: self.fail("Rebuilt recs not zero.\n") pool_version = pool.pool_info.pi_rebuild_st.rs_version # create a container container = DaosContainer(self.Context) container.create(pool.handle) # now open it container.open() # how many objects and records are we creating objcount = self.params.get("objcount", '/run/testparams/numobjects/*') reccount = self.params.get("reccount", '/run/testparams/numrecords/*') if objcount == 0: reccount = 0 # which rank to write to and kill rank = self.params.get("rank", '/run/testparams/ranks/*') # how much data to write with each key size = self.params.get("size", '/run/testparams/datasize/') saved_data = [] for i in range(0, objcount): obj = None for j in range(0, reccount): # make some stuff up and write dkey = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5)) akey = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(5)) data = ''.join( random.choice(string.ascii_uppercase + string.digits) for _ in range(size)) obj, tx = container.write_an_obj(data, len(data), dkey, akey, obj, rank) saved_data.append((obj, dkey, akey, data, tx)) # read the data back and make sure its correct data2 = container.read_an_obj(size, dkey, akey, obj, tx) if data != data2.value: self.fail("Write data 1, read it back, didn't match\n") # kill a server that has server = DaosServer(self.Context, server_group, rank) server.kill(1) # temporarily, the exclude of a failed target must be done # manually pool.exclude([rank]) while True: # get the pool/rebuild status again pool.pool_query() if pool.pool_info.pi_rebuild_st.rs_done == 1: break else: time.sleep(2) if 
pool.pool_info.pi_ndisabled != 1: self.fail( "Number of disabled targets reporting incorrectly: {}". format(pool.pool_info.pi_ndisabled)) if pool.pool_info.pi_rebuild_st.rs_errno != 0: self.fail("Rebuild error reported: {}".format( pool.pool_info.pi_rebuild_st.rs_errno)) if pool.pool_info.pi_rebuild_st.rs_obj_nr != objcount: self.fail("Rebuilt objs not as expected: {0} {1}".format( pool.pool_info.pi_rebuild_st.rs_obj_nr, objcount)) if pool.pool_info.pi_rebuild_st.rs_rec_nr != (reccount * objcount): self.fail("Rebuilt recs not as expected: {0} {1}".format( pool.pool_info.pi_rebuild_st.rs_rec_nr, reccount * objcount)) # now that the rebuild finished verify the records are correct for tup in saved_data: data2 = container.read_an_obj(len(tup[3]), tup[1], tup[2], tup[0], tup[4]) if tup[3] != data2.value: self.fail("after rebuild data didn't check out") except DaosApiError as e: print(e) print(traceback.format_exc()) self.fail("Expecting to pass but test has failed.\n") finally: try: ServerUtils.stopServer(hosts=self.hostlist) os.remove(hostfile) # really make sure everything is gone CheckForPool.CleanupPools(self.hostlist) finally: ServerUtils.killServer(self.hostlist)
def test_poolsvc(self): """ Test svc arg during pool create. :avocado: tags=all,pool,pr,medium,svc """ # parameters used in pool create createmode = self.params.get("mode", '/run/createtests/createmode/*/') createuid = os.geteuid() creategid = os.getegid() createsetid = self.params.get("setname", '/run/createtests/createset/') createsize = self.params.get("size", '/run/createtests/createsize/') createsvc = self.params.get("svc", '/run/createtests/createsvc/*/') expected_result = createsvc[1] try: # initialize a python pool object then create the underlying # daos storage self.pool = DaosPool(self.context) self.pool.create(createmode, createuid, creategid, createsize, createsetid, None, None, createsvc[0]) self.pool.connect(1 << 1) # checking returned rank list for server more than 1 iterator = 0 while (int(self.pool.svc.rl_ranks[iterator]) > 0 and int(self.pool.svc.rl_ranks[iterator]) <= createsvc[0] and int(self.pool.svc.rl_ranks[iterator]) != 999999): iterator += 1 if iterator != createsvc[0]: self.fail("Length of Returned Rank list is not equal to " "the number of Pool Service members.\n") rank_list = [] for iterator in range(createsvc[0]): rank_list.append(int(self.pool.svc.rl_ranks[iterator])) if len(rank_list) != len(set(rank_list)): self.fail("Duplicate values in returned rank list") self.pool.pool_query() leader = self.pool.pool_info.pi_leader if createsvc[0] == 3: # kill pool leader and exclude it self.pool.pool_svc_stop() self.pool.exclude([leader]) # perform pool disconnect, try connect again and disconnect self.pool.disconnect() self.pool.connect(1 << 1) self.pool.disconnect() # kill another server which is not a leader and exclude it server = DaosServer(self.context, self.server_group, 3) server.kill(1) self.pool.exclude([3]) # perform pool connect self.pool.connect(1 << 1) if expected_result in ['FAIL']: self.fail("Test was expected to fail but it passed.\n") except DaosApiError as excep: print(excep) print(traceback.format_exc()) if expected_result == 'PASS': self.fail("Test was expected to pass but it failed.\n")
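The service rank-list checks in the svc tests above could be factored into a helper like the following. The name get_svc_ranks and the raised exception are hypothetical; the helper only reads pool.svc.rl_ranks and repeats the duplicate-rank check the tests already perform.

def get_svc_ranks(pool, svc_count):
    """Return the first svc_count pool service ranks reported after create.

    Hypothetical helper distilled from the checks above; raises ValueError
    if the returned list contains duplicate ranks.
    """
    ranks = [int(pool.svc.rl_ranks[idx]) for idx in range(svc_count)]
    if len(ranks) != len(set(ranks)):
        raise ValueError(
            "duplicate values in returned rank list: {}".format(ranks))
    return ranks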