def get_job_manager_command(self):
    """Get the MPI job manager command for IOR.

    Fails the test if the IOR API is unsupported or if MPICH is not
    installed on the client hosts.

    Returns:
        Mpirun: the job manager object used to launch the IOR command
            (the original docstring incorrectly described the return
            value as a path string)

    """
    # Only MPIIO, DAOS, and POSIX IOR APIs are supported; self.fail()
    # raises, so execution never continues past an unsupported API.
    if self.ior_cmd.api.value not in ["MPIIO", "DAOS", "POSIX"]:
        self.fail("Unsupported IOR API")

    # Initialize MpioUtils to locate the MPICH installation
    mpio_util = MpioUtils()
    if not mpio_util.mpich_installed(self.hostlist_clients):
        self.fail("Exiting Test: Mpich not installed")

    mpirun_path = os.path.join(mpio_util.mpichinstall, "bin")
    return Mpirun(self.ior_cmd, mpirun_path)
def get_job_manager_command(self, manager):
    """Get the MPI job manager command for Mdtest.

    Args:
        manager (str): name of the job manager to use; "MPICH" selects
            Mpirun (failing the test if MPICH is not installed), any
            other value selects Orterun from the OpenMPI prefix.

    Returns:
        JobManager: the object for the mpi job manager command

    """
    # Initialize MpioUtils if mdtest needs to be run using mpich
    if manager == "MPICH":
        mpio_util = MpioUtils()
        if not mpio_util.mpich_installed(self.hostlist_clients):
            self.fail("Exiting Test: Mpich not installed")
        path = os.path.join(mpio_util.mpichinstall, "bin")
        return Mpirun(self.mdtest_cmd, path)

    # Default to orterun from the OpenMPI installation
    path = os.path.join(self.ompi_prefix, "bin")
    return Orterun(self.mdtest_cmd, path)
def test_rebuild_container_create(self):
    """Jira ID: DAOS-1168.

    Test Description:
        Configure 4 servers and 1 client with 1 or 2 pools and a pool
        service leader quantity of 2. Add 1 container to the first pool
        configured with 3 replicas. Populate the container with 1GB of
        objects. Exclude a server that has shards of this object and
        verify that rebuild is initiated. While rebuild is active, create
        1000 additional containers in the same pool or the second pool
        (when available). Finally verify that rebuild completes and the
        pool info indicates the correct number of rebuilt objects and
        records. Also confirm that all 1000 additional containers created
        during rebuild are accessible.

    Use Cases:
        Basic rebuild of container objects of array values with
        sufficient numbers of rebuild targets and no available rebuild
        targets.

    :avocado: tags=all,medium,full_regression,rebuild,rebuildcontcreate
    """
    # Get test params
    targets = self.params.get("targets", "/run/server_config/*")
    pool_qty = self.params.get("pools", "/run/test/*")
    loop_qty = self.params.get("loops", "/run/test/*")
    cont_qty = self.params.get("containers", "/run/test/*")
    cont_obj_cls = self.params.get("container_obj_class", "/run/test/*")
    rank = self.params.get("rank", "/run/test/*")
    use_ior = self.params.get("use_ior", "/run/test/*", False)
    node_qty = len(self.hostlist_servers)

    # Get pool params - pools are created later, once per loop
    self.pool = []
    for index in range(pool_qty):
        self.pool.append(TestPool(self.context, self.log))
        self.pool[-1].get_params(self)

    if use_ior:
        # Get ior params and set up the mpirun command once; the same
        # command object is reused (with updated flags) in every loop
        mpirun_path = os.path.join(self.ompi_prefix, "bin")
        mpirun = Mpirun(IorCommand(), mpirun_path)
        mpirun.job.get_params(self)
        mpirun.setup_command(
            mpirun.job.get_default_env("mpirun", self.tmp),
            self.hostfile_clients, len(self.hostlist_clients))

    # Cancel any tests with tickets already assigned
    if rank == 1 or rank == 2:
        self.cancelForTicket("DAOS-2434")

    # Per-loop error counters; summed at the end for the overall verdict
    errors = [0 for _ in range(loop_qty)]
    for loop in range(loop_qty):
        # Log the start of the loop
        loop_id = "LOOP {}/{}".format(loop + 1, loop_qty)
        self.log.info("%s", "-" * 80)
        self.log.info("%s: Starting loop", loop_id)

        # Start this loop with a fresh list of containers
        self.container = []

        # Create the requested number of pools and record the pool/rebuild
        # info expected for a healthy, freshly created pool
        info_checks = []
        rebuild_checks = []
        for pool in self.pool:
            pool.create()
            info_checks.append(
                {
                    "pi_uuid": pool.uuid,
                    "pi_ntargets": node_qty * targets,
                    "pi_nnodes": node_qty,
                    "pi_ndisabled": 0,
                }
            )
            rebuild_checks.append(
                {
                    "rs_errno": 0,
                    "rs_done": 1,
                    "rs_obj_nr": 0,
                    "rs_rec_nr": 0,
                }
            )

        # Check the pool info
        status = True
        for index, pool in enumerate(self.pool):
            status &= pool.check_pool_info(**info_checks[index])
            status &= pool.check_rebuild_status(**rebuild_checks[index])
            pool.display_pool_daos_space("after creation")
        self.assertTrue(
            status,
            "Error verifying pool info prior to excluding rank {}".format(
                rank))

        # Create a container with 1GB of data in the first pool
        if use_ior:
            # Write flags: verbose, write, write-check, one segment, keep
            # the file after completion so it can be re-read after rebuild
            mpirun.job.flags.update("-v -w -W -G 1 -k", "ior.flags")
            mpirun.job.daos_destroy.update(False, "ior.daos_destroy")
            mpirun.job.set_daos_params(self.server_group, self.pool[0])
            self.log.info(
                "%s: Running IOR on pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, mpirun.job.daos_cont.value)
            self.run_ior(loop_id, mpirun)
        else:
            # Populate a TestContainer directly: 8 objects x 64 records
            # x 1MiB per record written to the target rank
            self.container.append(TestContainer(self.pool[0]))
            self.container[-1].get_params(self)
            self.container[-1].create()
            self.log.info(
                "%s: Writing to pool %s to fill container %s with data",
                loop_id, self.pool[0].uuid, self.container[-1].uuid)
            self.container[-1].object_qty.value = 8
            self.container[-1].record_qty.value = 64
            self.container[-1].data_size.value = 1024 * 1024
            self.container[-1].write_objects(rank, cont_obj_cls, False)
            rank_list = self.container[-1].get_target_rank_lists(
                " after writing data")
            self.container[-1].get_target_rank_count(rank, rank_list)

        # Display the updated pool space usage
        for pool in self.pool:
            pool.display_pool_daos_space("after container creation")

        # Exclude the first rank from the first pool to initiate rebuild
        self.pool[0].start_rebuild([rank], self.d_log)

        # Wait for rebuild to start
        self.pool[0].wait_for_rebuild(True, 1)

        # Create additional containers in the last pool while rebuild is
        # active (same pool when only one pool is configured)
        start_index = len(self.container)
        self.add_containers_during_rebuild(
            loop_id, cont_qty, self.pool[0], self.pool[-1])

        # Confirm rebuild completes
        self.pool[0].wait_for_rebuild(False, 1)

        # Check the pool info - update the expected values for the first
        # pool to reflect the excluded rank and a completed rebuild
        info_checks[0]["pi_ndisabled"] += targets
        rebuild_checks[0]["rs_done"] = 1
        rebuild_checks[0]["rs_obj_nr"] = ">=0"
        rebuild_checks[0]["rs_rec_nr"] = ">=0"
        for index, pool in enumerate(self.pool):
            status &= pool.check_pool_info(**info_checks[index])
            status &= pool.check_rebuild_status(**rebuild_checks[index])
        self.assertTrue(status, "Error verifying pool info after rebuild")

        # Verify that each of the created containers exist by opening them
        for index in range(start_index, len(self.container)):
            count = "{}/{}".format(
                index - start_index + 1, len(self.container) - start_index)
            if not self.access_container(loop_id, index, count):
                errors[loop] += 1

        # Destroy the containers created during rebuild
        for index in range(start_index, len(self.container)):
            self.container[index].destroy()

        # Read the data from the container created before rebuild
        if use_ior:
            self.log.info(
                "%s: Running IOR on pool %s to verify container %s",
                loop_id, self.pool[0].uuid, mpirun.job.daos_cont.value)
            # Read flags: verbose, read, read-check, one segment; fsync
            # and destroy the container when done
            mpirun.job.flags.update("-v -r -R -G 1 -E", "ior.flags")
            mpirun.job.daos_destroy.update(True, "ior.daos_destroy")
            self.run_ior(loop_id, mpirun)
        else:
            self.log.info(
                "%s: Reading pool %s to verify container %s",
                loop_id, self.pool[0].uuid, self.container[0].uuid)
            self.assertTrue(
                self.container[0].read_objects(),
                "Error verifying data written before rebuild")
            self.container[0].destroy()

        # Destroy the pools so the next loop starts from scratch
        for pool in self.pool:
            pool.destroy(1)

        self.log.info(
            "%s: Loop %s", loop_id,
            "passed" if errors[loop] == 0 else "failed")
    self.log.info("Test %s", "passed" if sum(errors) == 0 else "failed")