Example #1
0
 def daos_racer_thread(self):
     """Build, configure and run a daos_racer command on the first client.

     The command object is stored in self.daos_racer before it is
     configured. Note that this method only runs the command; it does not
     create a thread itself.
     """
     racer = DaosRacerCommand(
         self.bin, self.hostlist_clients[0], self.dmg_command)
     self.daos_racer = racer
     # Load the command parameters through the test object
     racer.get_params(self)
     # Environment is derived from the first server manager
     racer_env = racer.get_environment(self.server_managers[0])
     racer.set_environment(racer_env)
     racer.run()
Example #2
0
File: multi.py Project: liw/daos
    def test_daos_racer(self):
        """JIRA-3855: daos_racer/consistency checker test.

        Test Description:
            daos_racer issues many simultaneous, conflicting I/O requests and,
            once finished, verifies that all replicas of a given object are
            consistent.

            Run daos_racer for 5-10 minutes or so on 3-way replicated object
            data (at least 6 servers) and verify the object replicas.

        Use Cases:
            Running simultaneous, conflicting I/O requests.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,large
        :avocado: tags=io,daosracer
        :avocado: tags=daosracer_multi
        """
        dmg = self.get_dmg_command()

        # At least one client host is needed to run daos_racer on
        client_hosts = self.hostlist_clients
        no_client_msg = "This test requires one client: {}".format(
            client_hosts)
        self.assertGreater(len(client_hosts), 0, no_client_msg)

        racer = DaosRacerCommand(self.bin, client_hosts[0], dmg)
        racer.get_params(self)
        racer_env = racer.get_environment(self.server_managers[0])
        racer.set_environment(racer_env)
        racer.run()
Example #3
0
def create_racer_cmdline(self, job_spec):
    """Create the srun cmdline to run daos_racer.

    The daos_racer parameters are read from the ``/run/<job_spec>/*``
    namespace and the daos_racer log file name is built from the test log
    directory, the test name and job_spec.

    Args:
        self (obj): soak obj
        job_spec (str): name of the daos_racer job; selects the parameter
            namespace and is also used as the log name
    Returns:
        cmd(list): a one-item list of ``[cmdlines, log_name]`` where
            cmdlines is the list of command strings and log_name is job_spec

    """
    racer_namespace = os.path.join(os.sep, "run", job_spec, "*")
    daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                  self.dmg_command)
    daos_racer.namespace = racer_namespace
    daos_racer.get_params(self)
    # The ${SLURM_*} placeholders are left as literal text in the log path;
    # presumably they are expanded when the job script runs - TODO confirm.
    racer_log = os.path.join(
        self.test_log_dir,
        self.test_name + "_" + job_spec + "_${SLURM_JOB_NODELIST}_"
        "${SLURM_JOB_ID}_" + "racer_log")
    env = daos_racer.get_environment(self.server_managers[0], racer_log)
    daos_racer.set_environment(env)
    srun_cmds = [
        # daos_racer command line
        str(daos_racer),
        # record the exit code of the daos_racer command
        "status=$?",
    ]
    commands = [[srun_cmds, job_spec]]
    self.log.info("<<DAOS racer cmdlines>>:")
    for cmd in srun_cmds:
        self.log.info("%s", cmd)
    return commands
Example #4
0
    def test_parallel(self):
        """JIRA-8445: multi-client daos_racer/consistency checker test.

        Test Description:
            daos_racer generates many simultaneous, conflicting I/O requests.
            It exercises both replicated and EC objects and verifies data
            consistency. The run duration depends on the parameters in the
            test yaml configuration file.

        Use Cases:
            Running simultaneous, conflicting I/O requests.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,large
        :avocado: tags=io,daosracer,daos_racer_parallel
        """
        # Build the daos_racer command and load its parameters
        racer_cmd = DaosRacerCommand(
            self.bin, self.hostlist_clients[0], self.get_dmg_command())
        racer_cmd.get_params(self)

        # Configure the job manager: all client hosts, with as many
        # processes as there are client hosts
        manager = self.job_manager
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(len(self.hostlist_clients))
        racer_env = racer_cmd.get_environment(self.server_managers[0])
        manager.assign_environment(racer_env)
        manager.job = racer_cmd
        manager.check_results_list = ["<stderr>"]
        manager.timeout = racer_cmd.clush_timeout.value
        self.log.info("Multi-process command: %s", str(manager))

        # Run daos_racer through the job manager and fail on a command error
        try:
            manager.run()
        except CommandFailure as error:
            self.log.error("DAOS Racer Failed: %s", str(error))
            self.fail("Test was expected to pass but it failed.\n")

        self.log.info("Test passed!")
Example #5
0
def create_racer_cmdline(self, job_spec, pool):
    """Create the srun cmdline to run daos_racer.

    The daos_racer parameters are read from the ``/run/<job_spec>/*``
    namespace. The command is pointed at the given pool and at the last
    container in self.container after add_containers() has been called.

    Args:
        self (obj): soak obj
        job_spec (str): name of the daos_racer job; selects the parameter
            namespace and is also used as the log name
        pool (obj):   TestPool obj
    Returns:
        cmd(list): a one-item list of ``[cmdlines, log_name]`` where
            cmdlines is the list of command strings and log_name is job_spec

    """
    racer_namespace = "/run/{}/*".format(job_spec)
    daos_racer = DaosRacerCommand(
        self.bin, self.hostlist_clients[0], self.dmg_command)
    daos_racer.namespace = racer_namespace
    daos_racer.get_params(self)
    # The ${SLURM_*} placeholders are left as literal text in the log path;
    # presumably they are expanded when the job script runs - TODO confirm.
    racer_log = os.path.join(
        self.test_log_dir,
        self.test_name + "_" + job_spec + "_${SLURM_JOB_NODELIST}_"
        "${SLURM_JOB_ID}_" + "racer_log")
    env = daos_racer.get_environment(self.server_managers[0], racer_log)
    daos_racer.set_environment(env)
    daos_racer.pool_uuid.update(pool.uuid)
    # presumably appends the new container to self.container - verify
    add_containers(self, pool, path=racer_namespace)
    daos_racer.cont_uuid.update(self.container[-1].uuid)
    srun_cmds = [
        # daos_racer command line
        str(daos_racer),
        # record the exit code of the daos_racer command
        "status=$?",
    ]
    commands = [[srun_cmds, job_spec]]
    self.log.info("<<DAOS racer cmdlines>>:")
    for cmd in srun_cmds:
        self.log.info("%s", cmd)
    return commands
Example #6
0
class OSAOnlineReintegration(OSAUtils):
    # pylint: disable=too-many-ancestors
    """Online Server Addition online re-integration test class.

    Test Class Description:
        This test runs the daos_server Online reintegration test cases:
        a rank is excluded (or stopped and restarted) and then reintegrated
        while IOR, and optionally daos_racer, is running.

    :avocado: recursive
    """

    def setUp(self):
        """Set up for test case."""
        super().setUp()
        self.dmg_command = self.get_dmg_command()
        self.daos_command = DaosCommand(self.bin)
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None
        # NOTE(review): presumably makes failing dmg commands raise an
        # exception instead of returning a result - confirm.
        self.dmg_command.exit_status_exception = True

    def daos_racer_thread(self):
        """Run daos_racer on the first client host.

        Used as the target of the thread created in
        run_online_reintegration_test() when racer is True.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def run_online_reintegration_test(self, num_pool, racer=False,
                                      server_boot=False,
                                      oclass=None):
        """Exclude and reintegrate a random rank while IOR is writing.

        For each pool an IOR write thread is started, the rank is excluded
        (or stopped and restarted when server_boot is True), the rank is
        reintegrated and the pool version is checked after each step.
        Afterwards IOR reads the data back and a container check is run.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            racer (bool) : run daos_racer in a separate thread while the
                           test executes. Defaults to False.
            server_boot (bool) : Perform system stop/start on a rank.
                                 Defaults to False.
            oclass (str) : daos object class string (eg: "RP_2G8").
                           Defaults to None, in which case
                           self.ior_cmd.dfs_oclass.value is used.
        """
        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value
        test_seq = self.ior_test_sequence[0]
        # Create a pool
        pool = {}
        # Highest rank that may be selected.
        # NOTE(review): assumes two ranks per server host - TODO confirm.
        exclude_servers = (len(self.hostlist_servers) * 2) - 1

        # Exclude one rank : other than rank 0.
        rank = random.randint(1, exclude_servers) #nosec

        # Start the daos_racer thread
        if racer is True:
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
            daos_racer_thread.start()
            # Wait 30 seconds before creating the pools
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = add_pool(self, connect=False)
            pool[val].set_property("reclaim", "disabled")

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            threads = []
            self.pool = pool[val]
            # Instantiate aggregation
            # NOTE(review): test_during_aggregation is not set in setUp() of
            # this class; presumably it is initialized by OSAUtils - verify.
            if self.test_during_aggregation is True:
                for _ in range(0, 2):
                    self.run_ior_thread("Write", oclass, test_seq)
                self.delete_extra_container(self.pool)
            # The following thread runs while performing osa operations.
            threads.append(threading.Thread(target=self.run_ior_thread,
                                            kwargs={"action": "Write",
                                                    "oclass": oclass,
                                                    "test": test_seq}))

            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(1)
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            if server_boot is False:
                output = self.dmg_command.pool_exclude(
                    self.pool.uuid, rank)
            else:
                # Stop the rank, wait on the rebuild, then start it again
                output = self.dmg_command.system_stop(ranks=rank, force=True)
                self.pool.wait_for_rebuild(False)
                self.log.info(output)
                output = self.dmg_command.system_start(ranks=rank)

            self.print_and_assert_on_rebuild_failure(output)
            pver_exclude = self.get_pool_version()

            self.log.info("Pool Version after exclude %s", pver_exclude)
            # Check pool version incremented after pool exclude
            # pver_exclude should be greater than
            # pver_begin + 8 targets.
            self.assertTrue(pver_exclude > (pver_begin + 8),
                            "Pool Version Error:  After exclude")
            output = self.dmg_command.pool_reintegrate(self.pool.uuid,
                                                       rank)
            self.print_and_assert_on_rebuild_failure(output)

            pver_reint = self.get_pool_version()
            self.log.info("Pool Version after reintegrate %d", pver_reint)
            # Check pool version incremented after pool reintegrate
            self.assertTrue(pver_reint > (pver_exclude + 1),
                            "Pool Version Error:  After reintegrate")
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()
                # NOTE(review): out_queue is not created in setUp() of this
                # class; presumably it comes from OSAUtils - verify.
                if not self.out_queue.empty():
                    self.assert_on_exception()

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        if racer is True:
            daos_racer_thread.join()

        # Final pass over every pool: report space, read the IOR data back
        # and run a container check on the first container of the pool.
        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            self.run_ior_thread("Read", oclass, test_seq)
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)

    @skipForTicket("DAOS-7420")
    def test_osa_online_reintegration(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_basic
        """
        self.log.info("Online Reintegration : Basic test")
        self.run_online_reintegration_test(1)

    @skipForTicket("DAOS-7195")
    def test_osa_online_reintegration_server_stop(self):
        """Test ID: DAOS-5920.
        Test Description: Validate Online Reintegration with server stop
        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_srv_stop
        """
        self.log.info("Online Reintegration : System stop/start")
        self.run_online_reintegration_test(1, server_boot=True)

    @skipForTicket("DAOS-7420")
    def test_osa_online_reintegration_without_csum(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration
        without checksum

        :avocado: tags=all,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_without_csum
        """
        self.log.info("Online Reintegration : No Checksum")
        # Read the checksum setting from the yaml before running the test
        self.test_with_checksum = self.params.get("test_with_checksum",
                                                  '/run/checksum/*')
        self.run_online_reintegration_test(1)

    @skipForTicket("DAOS-7996")
    def test_osa_online_reintegration_with_aggregation(self):
        """Test ID: DAOS-6715
        Test Description: Reintegrate rank while aggregation
        is happening in parallel

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_aggregation
        """
        # Read the aggregation setting from the yaml before running the test
        self.test_during_aggregation = self.params.get("test_with_aggregation",
                                                       '/run/aggregation/*')
        self.log.info("Online Reintegration : Aggregation")
        self.run_online_reintegration_test(1)

    @skipForTicket("DAOS-7996")
    def test_osa_online_reintegration_oclass(self):
        """Test ID: DAOS-6715
        Test Description: Reintegrate rank with different
        object class

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=online_reintegration,online_reintegration_oclass
        """
        self.log.info("Online Reintegration : Object Class")
        # Run the full reintegration scenario once per configured oclass
        for oclass in self.test_oclass:
            self.run_online_reintegration_test(1, oclass=oclass)
class OSAOnlineReintegration(OSAUtils):
    # pylint: disable=too-many-ancestors
    """Online Server Addition online re-integration test class.

    Test Class Description:
        This test runs the daos_server Online reintegration test cases:
        a rank is excluded (or stopped and restarted) and then reintegrated
        while IOR, and optionally daos_racer, is running.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super().setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    def daos_racer_thread(self):
        """Run daos_racer on the first client host.

        Used as the target of the thread created in
        run_online_reintegration_test() when racer is True.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def run_online_reintegration_test(self,
                                      num_pool,
                                      racer=False,
                                      server_boot=False):
        """Exclude and reintegrate a random rank while IOR is running.

        For each pool, IOR threads are started for every combination of
        object class, api, test sequence and flags. The rank is then excluded
        (or stopped and restarted when server_boot is True) and reintegrated,
        with the pool version checked after each step.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            racer (bool) : run daos_racer in a separate thread while the
                           test executes. Defaults to False.
            server_boot (bool) : Perform system stop/start on a rank.
                                 Defaults to False.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}
        # Highest rank that may be selected.
        # NOTE(review): assumes two ranks per server host - TODO confirm.
        exclude_servers = (len(self.hostlist_servers) * 2) - 1

        # Exclude one rank : other than rank 0.
        rank = random.randint(1, exclude_servers)

        # Start the daos_racer thread
        if racer is True:
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                # Threads for this IOR argument combination only. Starting
                # the accumulated list instead would call start() again on
                # the threads of earlier combinations, which raises
                # RuntimeError.
                new_threads = [
                    threading.Thread(target=self.ior_thread,
                                     kwargs={
                                         "pool": pool[val],
                                         "oclass": oclass,
                                         "api": api,
                                         "test": test,
                                         "flags": flags,
                                         "results": self.out_queue
                                     })
                    for _ in range(0, num_jobs)
                ]
                # Launch the IOR threads
                for thrd in new_threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(1)
                threads.extend(new_threads)
            self.pool = pool[val]
            time.sleep(5)
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            if server_boot is False:
                output = self.dmg_command.pool_exclude(self.pool.uuid, rank)
            else:
                # Stop the rank, wait on the rebuild, then start it again
                output = self.dmg_command.system_stop(ranks=rank)
                self.pool.wait_for_rebuild(True)
                self.log.info(output)
                output = self.dmg_command.system_start(ranks=rank)

            self.log.info(output)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()
            pver_exclude = self.get_pool_version()
            time.sleep(5)

            self.log.info("Pool Version after exclude %s", pver_exclude)
            # Check pool version incremented after pool exclude
            # pver_exclude should be greater than
            # pver_begin + 8 targets.
            self.assertTrue(pver_exclude > (pver_begin + 8),
                            "Pool Version Error:  After exclude")
            output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank)
            self.log.info(output)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()

            pver_reint = self.get_pool_version()
            self.log.info("Pool Version after reintegrate %d", pver_reint)
            # Check pool version incremented after pool reintegrate
            self.assertTrue(pver_reint > (pver_exclude + 1),
                            "Pool Version Error:  After reintegrate")
            # Wait to finish the threads; each join gives up after 20
            # seconds even if the thread is still running.
            for thrd in threads:
                thrd.join(timeout=20)

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        if racer is True:
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)

    @skipForTicket("DAOS-6573")
    def test_osa_online_reintegration(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa
        :avocado: tags=online_reintegration
        """
        # Perform reintegration testing with 1 pool.
        for pool_num in range(1, 2):
            self.run_online_reintegration_test(pool_num)

    @skipForTicket("DAOS-6766, DAOS-6783")
    def test_osa_online_reintegration_server_stop(self):
        """Test ID: DAOS-5920.
        Test Description: Validate Online Reintegration with server stop
        :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa
        :avocado: tags=online_reintegration_srv_stop
        """
        self.run_online_reintegration_test(1, server_boot=True)
Example #8
0
    def verify_client_run(self, exp_iface, env):
        """Run daos_racer as a libdaos client and check the interface used.

        The port_rcv_data counters are sampled before and after the client
        run, a per-host/per-interface table is logged, and the client log is
        checked through get_log_info().

        Args:
            exp_iface (str): expected interface to check.
            env (bool): add OFI_INTERFACE variable to exported variables of
                client command.

        Returns:
            bool: True when the number of interfaces without traffic differs
                from len(self.interfaces); False when they are equal.

        """
        client_hosts = self.agent_managers[0].hosts
        counter = "port_rcv_data"

        # Counter snapshot taken before the client runs
        rcv_before = self.get_port_cnt(client_hosts, counter)

        # daos_racer needs a dmg command object
        dmg = self.get_dmg_command()

        # daos_racer acts as the client, run from the first agent host
        racer = DaosRacerCommand(self.bin, client_hosts[0], dmg)
        racer.get_params(self)

        # Optionally export OFI_INTERFACE with the client command
        if env:
            racer.update_env_names(["OFI_INTERFACE"])

        # Log file name encodes the expected interface and the env flag
        log_file = "daos_racer_{}_{}.log".format(exp_iface, env)

        # FI_LOG_LEVEL gives more info on device issues
        racer_env = racer.get_environment(self.server_managers[0], log_file)
        racer_env["FI_LOG_LEVEL"] = "info"
        racer_env["D_LOG_MASK"] = "INFO,object=ERR,placement=ERR"
        racer.set_environment(racer_env)

        racer.run()

        # Counter snapshot taken after the client has run
        rcv_after = self.get_port_cnt(client_hosts, counter)

        self.log.info("Client interface port_rcv_data counters")
        row = "%16s  %9s  %9s  %9s  %s"
        rule = "-" * 9
        self.log.info(row, "Host(s)", "Interface", "Before", "After", "Difference")
        self.log.info(row, "-" * 16, rule, rule, rule, rule)

        idle = set()
        for iface in sorted(rcv_before):
            hosts_before = rcv_before[iface]
            for host in sorted(hosts_before):
                before = hosts_before[host][1][counter]
                try:
                    after = rcv_after[iface][host][1][counter]
                    delta = int(after) - int(before)
                except (KeyError, ValueError) as error:
                    # Missing or non-numeric counter: report as no traffic
                    after = "Error"
                    delta = "Unknown - {}".format(error)
                    idle.add(iface)
                else:
                    if delta <= 0:
                        idle.add(iface)
                self.log.info(row, host, iface, before, after, delta)

        # Check the client log as well to prevent false positives
        log_ok = self.get_log_info(
            client_hosts, exp_iface, env, get_log_file(log_file))
        self.assertTrue(log_ok)

        # Report every interface that showed no received data
        for iface in idle:
            self.log.info("No client traffic seen through device: %s", iface)
        return len(idle) != len(self.interfaces)
Example #9
0
class OSAOnlineParallelTest(OSAUtils):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server online drain,reintegration,
    extend test cases in parallel.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case."""
        super(OSAOnlineParallelTest, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    def daos_racer_thread(self, results):
        """Run daos_racer on the first client host.

        Args:
            results (queue): queue that receives a message once the
                daos_racer run() call has returned.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()
        results.put("Daos Racer Started")

    def dmg_thread(self, action, action_args, results):
        """Run one OSA related dmg pool command.

        Args:
            action (str): suffix of the dmg pool method to call, e.g.
                "drain", "exclude" or "reintegrate" (dmg.pool_<action>).
            action_args (dict): keyword arguments for each action, e.g.
                {action: {"pool": pool uuid,
                          "rank": rank,
                          "tgt_idx": target index string or None}}
            results (queue): dmg command output queue; receives a message
                when the command raises CommandFailure.
        """
        # Each thread works on its own shallow copy of the dmg command
        dmg = copy.copy(self.dmg_command)
        try:
            if action == "reintegrate":
                # Delay the reintegrate so the other commands are issued first
                time.sleep(60)
            # For each action, read the values from the
            # dictionary.
            # example {"exclude" : {"pool": pool uuid, "rank": rank
            #                       "tgt_idx": t_string}}
            # getattr is used to obtain the method in dmg object.
            # eg: dmg -> pool_exclude method, then pass arguments like
            # pool, rank, tgt_idx to the pool_exclude method.
            getattr(dmg, "pool_{}".format(action))(**action_args[action])
        except CommandFailure:
            results.put("{} failed".format(action_args[action]))
        # Future enhancement for extend
        # elif action == "extend":
        #    dmg.pool_extend(puuid, (rank + 2))

    def run_online_parallel_test(self, num_pool, racer=False):
        """Run multiple OSA commands / IO in parallel.

        For every pool, IOR threads and dmg drain/exclude/reintegrate
        threads are started for each IOR parameter combination. After all
        pools have been processed the rebuild state and the final pool
        version are verified and the pools are destroyed.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            racer (bool) : run daos_racer in a separate thread while the
                           test executes. Defaults to False.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Create a pool
        pool = {}

        # Exclude target : two adjacent random targets (target idx : 0-7)
        first_target = random.randint(0, 6)
        t_string = "{},{}".format(first_target, first_target + 1)

        # Rank 2 is drained; targets on rank 3 are excluded and reintegrated.
        rank = 2

        # Start the daos_racer thread
        if racer is True:
            kwargs = {"results": self.ds_racer_queue}
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread,
                                                 kwargs=kwargs)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context,
                                 dmg_command=self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            # Action dictionary with OSA dmg command parameters
            action_args = {
                "drain": {
                    "pool": self.pool.uuid,
                    "rank": rank,
                    "tgt_idx": None
                },
                "exclude": {
                    "pool": self.pool.uuid,
                    "rank": (rank + 1),
                    "tgt_idx": t_string
                },
                "reintegrate": {
                    "pool": self.pool.uuid,
                    "rank": (rank + 1),
                    "tgt_idx": t_string
                }
            }
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                # Threads for this parameter combination only. Starting the
                # accumulated list instead would call start() again on the
                # threads of earlier combinations, which raises RuntimeError.
                job_threads = []
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    job_threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
                for action in sorted(action_args):
                    # Add dmg threads
                    job_threads.append(
                        threading.Thread(target=self.dmg_thread,
                                         kwargs={
                                             "action": action,
                                             "action_args": action_args,
                                             "results": self.out_queue
                                         }))

                # Launch the IOR and dmg threads
                for thrd in job_threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(2)
                threads.extend(job_threads)

                # Wait to finish the threads.
                # NOTE(review): join() gives up after 20 seconds, while the
                # reintegrate dmg thread sleeps 60 seconds before issuing its
                # command, so it may still be running afterwards.
                for thrd in threads:
                    thrd.join(timeout=20)

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        if racer is True:
            daos_racer_thread.join()

        # Verify and destroy the pools only after every pool has been
        # processed; doing this inside the loop above would destroy all
        # pools at the end of the first pool's iteration.
        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            # Point self.pool at the pool being verified
            self.pool = pool[val]
            pool[val].display_pool_daos_space(display_string)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()

            pver_end = self.get_pool_version()
            self.log.info("Pool Version at the End %s", pver_end)
            self.assertTrue(pver_end == 25,
                            "Pool Version Error:  at the end")
            pool[val].destroy()

    @skipForTicket("DAOS-6107")
    def test_osa_online_parallel_test(self):
        """
        JIRA ID: DAOS-4752

        Test Description: Runs multiple OSA commands/IO in parallel

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2,osa
        :avocado: tags=osa_parallel,online_parallel
        """
        self.run_online_parallel_test(1)
Example #10
0
class OSAOnlineParallelTest(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server online drain,reintegration,
    extend test cases in parallel.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case.

        Reads the test parameters, recreates the client hostfile and creates
        the queues the worker threads use to report back.
        """
        super(OSAOnlineParallelTest, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys", '/run/dkeys/*')
        self.no_of_akeys = self.params.get("no_of_akeys", '/run/akeys/*')
        self.record_length = self.params.get("length", '/run/record/*')
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        # Failure messages queued by the IOR and dmg worker threads
        self.out_queue = queue.Queue()
        # Message queued by the daos_racer thread once it has finished
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def _wait_for_pool_version(self, minimum, interval, attempts=21):
        """Poll the version of self.pool until it reaches a minimum value.

        Args:
            minimum (int): stop polling once the version is >= this value.
            interval (int): seconds to sleep between two queries.
            attempts (int, optional): maximum number of queries.
                Defaults to 21.

        Returns:
            int: the last pool version that was read.

        """
        version = self.get_pool_version()
        for _ in range(attempts - 1):
            if version >= minimum:
                break
            # Only sleep when another query is really needed
            time.sleep(interval)
            version = self.get_pool_version()
        return version

    def _fail_on_thread_errors(self):
        """Fail the test if a worker thread queued a failure message.

        Must be called after the worker threads have been joined, so that
        nothing else is writing to self.out_queue.
        """
        errors = []
        while not self.out_queue.empty():
            errors.append(self.out_queue.get())
        if errors:
            self.fail("Thread failures detected: {}".format(errors))

    def daos_racer_thread(self, results):
        """Run daos_racer (thread target).

        Args:
            results (queue): queue that receives a message after
                daos_racer.run() has returned.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()
        # Queued only once run() has returned
        results.put("Daos Racer Started")

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR job against the pool (thread target).

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence; index 2 is used as the transfer
                size and index 3 as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results

        Returns:
            None
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            # Name the hosts that were actually checked; the previous
            # message indexed the hostfile attribute instead.
            message = ("Exiting Test : Mpich not installed on :"
                       " {}".format(self.hostlist_clients))
            # An exception raised in a worker thread does not reach the
            # main thread, so report through the queue as well.
            results.put(message)
            self.fail(message)
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command; every job gets its
        # own freshly generated container uuid.
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure:
            results.put("FAIL")

    def dmg_thread(self, action, action_args, results):
        """Run one OSA related dmg pool command (thread target).

        Args:
            action (str): dmg pool action; the dmg method "pool_<action>"
                is called.
            action_args (dict): keyword arguments per action, e.g.
                {"exclude": {"pool": uuid, "rank": rank,
                             "tgt_idx": t_string}}
            results (queue): dmg command output queue.
        """
        dmg = copy.copy(self.dmg_command)
        try:
            if action == "reintegrate":
                # NOTE(review): presumably delays the reintegrate until
                # the exclude started in parallel is done - confirm.
                time.sleep(60)
            # getattr is used to obtain the method in dmg object.
            # eg: dmg -> pool_exclude method, then pass the arguments
            # stored for that action to the pool_exclude method.
            getattr(dmg, "pool_{}".format(action))(**action_args[action])
        except CommandFailure:
            results.put("{} failed".format(action_args[action]))
        # Future enhancement for extend
        # elif action == "extend":
        #    dmg.pool_extend(puuid, (rank + 2))

    def run_online_parallel_test(self, num_pool):
        """Run multiple OSA commands / IO in parallel.

        Args:
            num_pool (int) : total pools to create for testing purposes.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        pool = {}

        # Exclude target : two adjacent targets picked at random
        # (target idx : 0-7)
        first_target = random.randint(0, 6)
        t_string = "{},{}".format(first_target, first_target + 1)

        # Rank 2 is drained, the next rank is excluded and reintegrated.
        rank = 2

        # Start the daos_racer thread
        daos_racer_thread = threading.Thread(
            target=self.daos_racer_thread,
            kwargs={"results": self.ds_racer_queue})
        daos_racer_thread.start()
        time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context,
                                 dmg_command=self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()

        # Drain, exclude and reintegrate while IOR is running
        for val in range(0, num_pool):
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)

            # Action dictionary with OSA dmg command parameters
            action_args = {
                "drain": {"pool": self.pool.uuid, "rank": rank,
                          "tgt_idx": None},
                "exclude": {"pool": self.pool.uuid, "rank": (rank + 1),
                            "tgt_idx": t_string},
                "reintegrate": {"pool": self.pool.uuid, "rank": (rank + 1),
                                "tgt_idx": t_string}
            }

            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                threads = []
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(threading.Thread(
                        target=self.ior_thread,
                        kwargs={"pool": pool[val],
                                "oclass": oclass,
                                "api": api,
                                "test": test,
                                "flags": flags,
                                "results": self.out_queue}))
                for action in sorted(action_args):
                    # Add dmg threads
                    threads.append(threading.Thread(
                        target=self.dmg_thread,
                        kwargs={"action": action,
                                "action_args": action_args,
                                "results": self.out_queue}))

                # Launch the IOR and dmg threads
                for thrd in threads:
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    time.sleep(3)

                # Wait to finish the threads
                for thrd in threads:
                    thrd.join()

                # Failures are only reported through the queue
                self._fail_on_thread_errors()

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        daos_racer_thread.join()

        # Verify and destroy the pools only after every pool has been
        # exercised; doing this inside the loop above destroyed pools that
        # were still to be tested.
        for val in range(0, num_pool):
            # get_pool_version() queries self.pool
            self.pool = pool[val]
            display_string = "Pool{} space at the End".format(val)
            self.pool.display_pool_daos_space(display_string)
            # NOTE(review): polling stops at a version above 23 while the
            # assertion below requires exactly 25 - confirm the threshold.
            pver_end = self._wait_for_pool_version(minimum=24, interval=10)
            self.log.info("Pool Version at the End %s", pver_end)
            self.assertEqual(pver_end, 25,
                             "Pool Version Error:  at the end")
            self.pool.destroy()

    @skipForTicket("DAOS-5877")
    def test_osa_online_parallel_test(self):
        """
        JIRA ID: DAOS-4752

        Test Description: Runs multiple OSA commands/IO in parallel

        :avocado: tags=all,pr,hw,large,osa,osa_parallel,online_parallel
        """
        self.run_online_parallel_test(1)
Example #11
0
    def verify_client_run(self, exp_iface, env):
        """Run daos_racer as a client and check the interface it used.

        The "port_rcv_data" counters of the hfi device mapped to exp_iface
        are read before and after the run, and the client log is checked
        through get_log_info().

        Args:
            exp_iface (str): expected interface to check ("ib0" or "ib1").
            env (bool): add OFI_INTERFACE variable to exported variables of
                client command.

        Returns:
            bool: True if the last computed counter difference is positive,
                False otherwise.

        """
        device = {"ib0": "hfi1_0", "ib1": "hfi1_1"}[exp_iface]
        counter = "port_rcv_data"

        # Counter values of the hfi device before the client runs
        rcv_before = self.get_port_cnt(self.hostlist_clients, device, counter)

        # daos_racer needs the dmg command (and its config file)
        dmg = self.get_dmg_command()

        # daos_racer plays the role of the client
        racer = DaosRacerCommand(self.bin, self.hostlist_clients[0], dmg)
        racer.get_params(self)

        # Export OFI_INTERFACE as well when requested
        if env:
            racer.update_env_names(["OFI_INTERFACE"])

        # Dedicated log file per interface / env combination
        log_name = "daos_racer_{}_{}.log".format(exp_iface, env)

        # FI_LOG_LEVEL gives more information on device issues
        environment = racer.get_environment(self.server_managers[0], log_name)
        environment["FI_LOG_LEVEL"] = "info"
        racer.set_environment(environment)

        racer.run()

        # Counter values of the hfi device after the client ran
        rcv_after = self.get_port_cnt(self.hostlist_clients, device, counter)

        # Only the difference computed last is used for the final status
        delta = 0
        for before, after in zip(rcv_before.values(), rcv_after.values()):
            delta = int(after) - int(before)
            self.log.info("Port [%s] count difference: %s", exp_iface, delta)

        # Read daos.log to verify device used and prevent false positives
        log_ok = self.get_log_info(self.hostlist_clients, exp_iface, env,
                                   get_log_file(log_name))
        self.assertTrue(log_ok)

        # Data must have gone through the device
        if delta > 0:
            return True
        self.log.info("No traffic seen through device: %s", exp_iface)
        return False
Example #12
0
class OSAOnlineExtend(TestWithServers):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Extend test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case.

        Reads the test parameters, recreates the client hostfile and creates
        the queues the worker threads use to report back.
        """
        super(OSAOnlineExtend, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        # Both attributes are read from the same "obj_class" parameter
        self.ior_daos_oclass = self.params.get("obj_class",
                                               '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get(
            "obj_class", '/run/ior/iorflags/*')
        # Start an additional server.
        self.extra_servers = self.params.get("test_servers",
                                             "/run/extra_servers/*")
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        # Failure messages queued by the IOR worker threads
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool_version_value

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def _wait_for_pool_version(self, minimum, interval, attempts=21):
        """Poll the version of self.pool until it reaches a minimum value.

        Args:
            minimum (int): stop polling once the version is >= this value.
            interval (int): seconds to sleep between two queries.
            attempts (int, optional): maximum number of queries.
                Defaults to 21.

        Returns:
            int: the last pool version that was read.

        """
        version = self.get_pool_version()
        for _ in range(attempts - 1):
            if version >= minimum:
                break
            # Only sleep when another query is really needed
            time.sleep(interval)
            version = self.get_pool_version()
        return version

    def _fail_on_thread_errors(self):
        """Fail the test if a worker thread queued a failure message.

        Must be called after the worker threads have been joined, so that
        nothing else is writing to self.out_queue.
        """
        errors = []
        while not self.out_queue.empty():
            errors.append(self.out_queue.get())
        if errors:
            self.fail("Thread failures detected: {}".format(errors))

    def daos_racer_thread(self):
        """Run daos_racer (thread target)."""
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR job against the pool (thread target).

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR API
            test (list): IOR test sequence; index 2 is used as the transfer
                size and index 3 as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results
        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            message = "Exiting Test: Mpich not installed"
            # An exception raised in a worker thread does not reach the
            # main thread, so report through the queue as well.
            results.put(message)
            self.fail(message)
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command; every job gets its
        # own freshly generated container uuid.
        manager = Mpirun(ior_cmd, mpitype="mpich")
        manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure:
            results.put("FAIL")

    def run_online_extend_test(self, num_pool):
        """Extend pools with an additional rank while IOR is running.

        Args:
            num_pool (int): total pools to create for testing purposes.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        pool = {}

        # Extend one of the ranks (or server)
        # rank index starts from zero, so the rank passed to the extend is
        # the current number of servers
        rank = len(self.hostlist_servers)

        # Start the daos_racer thread
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
        daos_racer_thread.start()
        time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value /
                                           num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()

        # Extend the pool_uuid, rank and targets
        for val in range(0, num_pool):
            # Keep every IOR thread started for this pool so that all of
            # them, not only those of the last parameter combination, are
            # joined below.
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                for _ in range(0, num_jobs):
                    # Launch a thread for these IOR arguments
                    thrd = threading.Thread(
                        target=self.ior_thread,
                        kwargs={"pool": pool[val],
                                "oclass": oclass,
                                "api": api,
                                "test": test,
                                "flags": flags,
                                "results": self.out_queue})
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    threads.append(thrd)
                    time.sleep(5)
            self.pool = pool[val]
            scm_size = self.pool.scm_size
            nvme_size = self.pool.nvme_size
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()

            # Start the additional servers and extend the pool
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)
            # Give sometime for the additional server to come up.
            time.sleep(5)
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_extend(self.pool.uuid,
                                                  rank, scm_size,
                                                  nvme_size)
            self.log.info(output)

            pver_extend = self._wait_for_pool_version(
                minimum=pver_begin + 1, interval=15)

            self.log.info("Pool Version after extend %s", pver_extend)
            # Check pool version incremented after pool extend
            self.assertGreater(pver_extend, pver_begin,
                               "Pool Version Error:  After extend")
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # IOR failures are only reported through the queue
            self._fail_on_thread_errors()

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after the extend.
        daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-5869")
    def test_osa_online_extend(self):
        """Test ID: DAOS-4751
        Test Description: Validate Online extend

        :avocado: tags=all,pr,hw,large,osa,osa_extend,online_extend
        """
        # Perform extend testing with 1 to 2 pools
        self.run_online_extend_test(1)
Example #13
0
class OSAOnlineReintegration(TestWithServers):
    # pylint: disable=too-many-ancestors
    """Online Server Addition online re-integration test class.

    Test Class Description:
        This test runs the daos_server Online reintegration test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case.

        Reads the test parameters, recreates the client hostfile and creates
        the queues the worker threads use to report back.
        """
        super(OSAOnlineReintegration, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.no_of_dkeys = self.params.get("no_of_dkeys", '/run/dkeys/*')
        self.no_of_akeys = self.params.get("no_of_akeys", '/run/akeys/*')
        self.record_length = self.params.get("length", '/run/record/*')
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        # Failure messages queued by the IOR worker threads
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    @fail_on(CommandFailure)
    def get_pool_leader(self):
        """Get the pool leader.

        Returns:
            int: pool leader number

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["leader"])

    @fail_on(CommandFailure)
    def get_pool_version(self):
        """Get the pool version.

        Returns:
            int: pool version number

        """
        data = self.dmg_command.pool_query(self.pool.uuid)
        return int(data["version"])

    def _wait_for_pool_version(self, minimum, interval, attempts=21):
        """Poll the version of self.pool until it reaches a minimum value.

        Args:
            minimum (int): stop polling once the version is >= this value.
            interval (int): seconds to sleep between two queries.
            attempts (int, optional): maximum number of queries.
                Defaults to 21.

        Returns:
            int: the last pool version that was read.

        """
        version = self.get_pool_version()
        for _ in range(attempts - 1):
            if version >= minimum:
                break
            # Only sleep when another query is really needed
            time.sleep(interval)
            version = self.get_pool_version()
        return version

    def _fail_on_thread_errors(self):
        """Fail the test if a worker thread queued a failure message.

        Must be called after the worker threads have been joined, so that
        nothing else is writing to self.out_queue.
        """
        errors = []
        while not self.out_queue.empty():
            errors.append(self.out_queue.get())
        if errors:
            self.fail("Thread failures detected: {}".format(errors))

    def daos_racer_thread(self):
        """Run daos_racer (thread target)."""
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def ior_thread(self, pool, oclass, api, test, flags, results):
        """Run a single IOR job against the pool (thread target).

        Args:
            pool (object): pool handle
            oclass (str): IOR object class
            api (str): IOR api
            test (list): IOR test sequence; index 2 is used as the transfer
                size and index 3 as the block size
            flags (str): IOR flags
            results (queue): queue for returning thread results

        """
        processes = self.params.get("slots", "/run/ior/clientslots/*")
        mpio_util = MpioUtils()
        if mpio_util.mpich_installed(self.hostlist_clients) is False:
            # Name the hosts that were actually checked; the previous
            # message indexed the hostfile attribute instead.
            message = ("Exiting Test : Mpich not installed on :"
                       " {}".format(self.hostlist_clients))
            # An exception raised in a worker thread does not reach the
            # main thread, so report through the queue as well.
            results.put(message)
            self.fail(message)
        self.pool = pool
        # Define the arguments for the ior_runner_thread method
        ior_cmd = IorCommand()
        ior_cmd.get_params(self)
        ior_cmd.set_daos_params(self.server_group, self.pool)
        ior_cmd.dfs_oclass.update(oclass)
        ior_cmd.api.update(api)
        ior_cmd.transfer_size.update(test[2])
        ior_cmd.block_size.update(test[3])
        ior_cmd.flags.update(flags)

        # Define the job manager for the IOR command. self.job_manager is
        # still assigned, but this thread only works through its local
        # reference: several IOR threads run concurrently and each of them
        # overwrites the shared attribute.
        manager = Mpirun(ior_cmd, mpitype="mpich")
        self.job_manager = manager
        # Every job gets its own freshly generated container uuid
        manager.job.dfs_cont.update(str(uuid.uuid4()))
        env = ior_cmd.get_default_env(str(manager))
        manager.assign_hosts(self.hostlist_clients, self.workdir, None)
        manager.assign_processes(processes)
        manager.assign_environment(env, True)

        # run IOR Command
        try:
            manager.run()
        except CommandFailure:
            results.put("FAIL")

    def run_online_reintegration_test(self, num_pool):
        """Exclude and reintegrate targets of a rank while IOR is running.

        Args:
            num_pool (int) : total pools to create for testing purposes.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        pool = {}
        exclude_servers = len(self.hostlist_servers) - 1

        # Exclude target : two adjacent targets picked at random
        # (target idx : 0-7)
        first_target = random.randint(0, 6)
        target_list = [first_target, first_target + 1]
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Exclude one rank : other than rank 0.
        rank = random.randint(1, exclude_servers)

        # Start the daos_racer thread
        daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
        daos_racer_thread.start()
        time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()

        # Exclude and reintegrate the pool_uuid, rank and targets
        for val in range(0, num_pool):
            # Keep every IOR thread started for this pool so that all of
            # them, not only those of the last parameter combination, are
            # joined below.
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                for _ in range(0, num_jobs):
                    # Launch a thread for these IOR arguments
                    thrd = threading.Thread(
                        target=self.ior_thread,
                        kwargs={
                            "pool": pool[val],
                            "oclass": oclass,
                            "api": api,
                            "test": test,
                            "flags": flags,
                            "results": self.out_queue
                        })
                    self.log.info("Thread : %s", thrd)
                    thrd.start()
                    threads.append(thrd)
                    time.sleep(5)
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_exclude(self.pool.uuid, rank,
                                                   t_string)
            self.log.info(output)

            # The version has to grow by more than the number of excluded
            # targets
            exclude_floor = pver_begin + len(target_list)
            pver_exclude = self._wait_for_pool_version(
                minimum=exclude_floor + 1, interval=10)

            self.log.info("Pool Version after exclude %s", pver_exclude)
            # Check pool version incremented after pool exclude
            self.assertGreater(pver_exclude, exclude_floor,
                               "Pool Version Error:  After exclude")
            output = self.dmg_command.pool_reintegrate(self.pool.uuid, rank,
                                                       t_string)
            self.log.info(output)

            pver_reint = self._wait_for_pool_version(
                minimum=pver_exclude + 2, interval=10)

            self.log.info("Pool Version after reintegrate %d", pver_reint)
            # Check pool version incremented after pool reintegrate
            self.assertGreater(pver_reint, pver_exclude + 1,
                               "Pool Version Error:  After reintegrate")
            # Wait to finish the threads
            for thrd in threads:
                thrd.join()

            # IOR failures are only reported through the queue
            self._fail_on_thread_errors()

        # Check data consistency for IOR in future
        # Presently, we are running daos_racer in parallel
        # to IOR and checking the data consistency only
        # for the daos_racer objects after exclude
        # and reintegration.
        daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    def test_osa_online_reintegration(self):
        """Test ID: DAOS-5075.

        Test Description: Validate Online Reintegration

        :avocado: tags=all,pr,hw,large,osa,online_reintegration,DAOS_5610
        """
        # Perform reintegration testing with 1 pool.
        for pool_num in range(1, 2):
            self.run_online_reintegration_test(pool_num)
Example #14
0
class OSAOnlineExtend(OSAUtils):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: This test runs
    daos_server Online Extend test cases.

    :avocado: recursive
    """
    def setUp(self):
        """Set up for test case.

        Reads the IOR settings and the extra server list from the test
        parameters, rewrites the client hostfile without slots and creates
        the queues used by the worker threads.
        """
        super(OSAOnlineExtend, self).setUp()
        self.dmg_command = self.get_dmg_command()
        self.ior_flags = self.params.get("ior_flags", '/run/ior/iorflags/*')
        self.ior_apis = self.params.get("ior_api", '/run/ior/iorflags/*')
        self.ior_test_sequence = self.params.get("ior_test_sequence",
                                                 '/run/ior/iorflags/*')
        # NOTE(review): both object class attributes are read from the same
        # "obj_class" key - confirm this is intended.
        self.ior_daos_oclass = self.params.get("obj_class",
                                               '/run/ior/iorflags/*')
        self.ior_dfs_oclass = self.params.get("obj_class",
                                              '/run/ior/iorflags/*')
        # Start an additional server.
        self.extra_servers = self.params.get("test_servers",
                                             "/run/extra_servers/*")
        # Recreate the client hostfile without slots defined
        self.hostfile_clients = write_host_file(self.hostlist_clients,
                                                self.workdir, None)
        self.pool = None
        self.out_queue = queue.Queue()
        self.ds_racer_queue = queue.Queue()
        self.daos_racer = None

    def daos_racer_thread(self):
        """Start the daos_racer thread.

        Builds a DaosRacerCommand for the first client host, configures it
        from the test parameters and the first server manager's environment,
        then runs it. Intended to be used as a threading.Thread target.
        """
        self.daos_racer = DaosRacerCommand(self.bin, self.hostlist_clients[0],
                                           self.dmg_command)
        self.daos_racer.get_params(self)
        self.daos_racer.set_environment(
            self.daos_racer.get_environment(self.server_managers[0]))
        self.daos_racer.run()

    def run_online_extend_test(self, num_pool, racer=False):
        """Run the Online extend while IOR threads are active.

        Args:
            num_pool (int): total pools to create for testing purposes.
            racer (bool): when True, run daos_racer in a background thread
                for the duration of the test. Defaults to False.
        """
        num_jobs = self.params.get("no_parallel_job", '/run/ior/*')
        # Pools created for this test, keyed by their index
        pool = {}

        # Extend one of the ranks 4 and 5
        rank = [4, 5]

        # Start the daos_racer thread
        if racer is True:
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(self.context, self.get_dmg_command())
            pool[val].get_params(self)
            # Split total SCM and NVME size for creating multiple pools.
            pool[val].scm_size.value = int(pool[val].scm_size.value / num_pool)
            pool[val].nvme_size.value = int(pool[val].nvme_size.value /
                                            num_pool)
            pool[val].create()

        # Extend each pool with the extra ranks while IOR is running
        for val in range(0, num_pool):
            threads = []
            for oclass, api, test, flags in product(self.ior_dfs_oclass,
                                                    self.ior_apis,
                                                    self.ior_test_sequence,
                                                    self.ior_flags):
                for _ in range(0, num_jobs):
                    # Add a thread for these IOR arguments
                    threads.append(
                        threading.Thread(target=self.ior_thread,
                                         kwargs={
                                             "pool": pool[val],
                                             "oclass": oclass,
                                             "api": api,
                                             "test": test,
                                             "flags": flags,
                                             "results": self.out_queue
                                         }))
            # Launch the IOR threads once, after every argument combination
            # has been queued. A Thread object may only be started once, so
            # launching from inside the loop above would call start() again
            # on already-started threads (RuntimeError) as soon as there is
            # a second combination.
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(1)
            self.pool = pool[val]
            scm_size = self.pool.scm_size
            nvme_size = self.pool.nvme_size
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()

            # Start the additional servers and extend the pool
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)
            # Give sometime for the additional server to come up.
            time.sleep(25)
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_extend(self.pool.uuid, rank,
                                                  scm_size, nvme_size)
            self.log.info(output)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()

            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %s", pver_extend)
            # Check pool version incremented after pool extend
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")
            # Wait (at most 20 seconds each) for the IOR threads to finish
            for thrd in threads:
                thrd.join(timeout=20)

        # Check data consistency for IOR in future
        # Presently, daos_racer is optionally run in parallel
        # to IOR; here we only wait for it to complete.
        if racer is True:
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            pool[val].destroy()

    @skipForTicket("DAOS-5869")
    def test_osa_online_extend(self):
        """Test ID: DAOS-4751
        Test Description: Validate Online extend

        :avocado: tags=all,pr,daily_regression,hw,medium,ib2
        :avocado: tags=osa,osa_extend,online_extend
        """
        # Perform extend testing with 1 pool
        self.run_online_extend_test(1)
Example #15
0
class OSAOnlineExtend(OSAUtils):
    # pylint: disable=too-many-ancestors
    """
    Test Class Description: Online Extend test cases that
    extend a pool while client applications are running.

    :avocado: recursive
    """
    def setUp(self):
        """Prepare commands, test parameters and the client hostfile."""
        super().setUp()
        self.dmg_command = self.get_dmg_command()
        self.daos_command = DaosCommand(self.bin)
        self.ior_test_sequence = self.params.get(
            "ior_test_sequence", '/run/ior/iorflags/*')
        self.test_oclass = self.params.get("oclass", '/run/test_obj_class/*')
        self.ranks = self.params.get("rank_list", '/run/test_ranks/*')
        # Servers that are started later, right before the extend.
        self.extra_servers = self.params.get(
            "test_servers", "/run/extra_servers/*")
        # Rewrite the client hostfile with no slots defined.
        self.hostfile_clients = write_host_file(
            self.hostlist_clients, self.workdir, None)
        self.pool = None
        self.dmg_command.exit_status_exception = True
        self.daos_racer = None

    def daos_racer_thread(self):
        """Thread target: configure and run daos_racer on the first client."""
        racer_cmd = DaosRacerCommand(
            self.bin, self.hostlist_clients[0], self.dmg_command)
        self.daos_racer = racer_cmd
        self.daos_racer.get_params(self)
        racer_env = self.daos_racer.get_environment(self.server_managers[0])
        self.daos_racer.set_environment(racer_env)
        self.daos_racer.run()

    def run_online_extend_test(self,
                               num_pool,
                               racer=False,
                               oclass=None,
                               app_name="ior"):
        """Extend pools while an application thread is running.

            Args:
             num_pool(int) : number of pools to create for the test.
             racer(bool) : also run daos_racer in a background thread.
                           Defaults to False.
             oclass(str) : Object Class (eg: RP_2G1, etc). When None the
                           dfs_oclass value of self.ior_cmd is used.
             app_name(str) : "ior" runs the IOR thread; any other value
                             runs the mdtest thread. Defaults to ior.
        """
        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value
        test_seq = self.ior_test_sequence[0]

        # Optionally start daos_racer in the background and let it ramp up.
        racer_thread = None
        if racer is True:
            racer_thread = threading.Thread(target=self.daos_racer_thread)
            racer_thread.start()
            time.sleep(30)

        # Create every pool up front.
        pools = []
        for _ in range(0, num_pool):
            new_pool = TestPool(context=self.context,
                                dmg_command=self.get_dmg_command(),
                                label_generator=self.label_generator)
            new_pool.get_params(self)
            new_pool.create()
            new_pool.set_property("reclaim", "disabled")
            pools.append(new_pool)

        # Extend each pool in turn while the application is running.
        for current_pool in pools:
            self.pool = current_pool

            # Bring up the additional servers before extending.
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)
            if self.test_during_aggregation is True:
                for _ in range(0, 2):
                    self.run_ior_thread("Write", oclass, test_seq)
                self.delete_extra_container(self.pool)

            # Application thread that runs during the osa operation.
            if app_name == "ior":
                worker = threading.Thread(target=self.run_ior_thread,
                                          kwargs={
                                              "action": "Write",
                                              "oclass": oclass,
                                              "test": test_seq
                                          })
            else:
                worker = threading.Thread(target=self.run_mdtest_thread)
            workers = [worker]

            # Query the system up to 10 times; fail if the status check
            # never passes.
            for _ in range(0, 10):
                scan_info = self.get_dmg_command().system_query()
                if check_system_query_status(scan_info):
                    break
            else:
                self.fail("One or more servers not in expected status")

            # Start the IOR or mdtest thread.
            for job in workers:
                self.log.info("Thread : %s", job)
                job.start()
                time.sleep(1)

            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            extend_output = self.dmg_command.pool_extend(
                self.pool.uuid, self.ranks)
            self.print_and_assert_on_rebuild_failure(extend_output)

            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %s", pver_extend)
            # The pool version must have grown as a result of the extend.
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")
            # Wait for the application threads and surface their errors.
            for job in workers:
                job.join()
                if self.out_queue.empty():
                    continue
                self.assert_on_exception()

        # daos_racer, when requested, runs in parallel to the application;
        # wait for it to complete before the final checks.
        if racer_thread is not None:
            racer_thread.join()

        # Read the data back and run a container check on every pool.
        for index, current_pool in enumerate(pools):
            self.pool = current_pool
            self.pool.display_pool_daos_space(
                "Pool{} space at the End".format(index))
            self.run_ior_thread("Read", oclass, test_seq)
            self.container = self.pool_cont_dict[self.pool][0]
            check_output = self.daos_command.container_check(
                pool=self.pool.uuid, cont=self.container.uuid)
            self.log.info(check_output)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend(self):
        """Test ID: DAOS-4751
        Test Description: Online extend of a single pool with
        checksum enabled.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_with_csum
        """
        self.log.info("Online Extend : With Checksum")
        self.run_online_extend_test(1)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_without_checksum(self):
        """Test ID: DAOS-6645
        Test Description: Online extend of a single pool without
        checksum enabled.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_without_csum
        """
        self.log.info("Online Extend : Without Checksum")
        # Take the checksum setting from the test parameters.
        self.test_with_checksum = self.params.get(
            "test_with_checksum", '/run/checksum/*')
        self.run_online_extend_test(1)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_oclass(self):
        """Test ID: DAOS-6645
        Test Description: Online extend of a single pool using the
        first object class from the test parameters.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_oclass
        """
        self.log.info("Online Extend : Oclass")
        first_oclass = self.test_oclass[0]
        self.run_online_extend_test(1, oclass=first_oclass)

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_mdtest(self):
        """Test ID: DAOS-6645
        Test Description: Online extend of a single pool while
        mdtest is running.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_mdtest
        """
        self.log.info("Online Extend : Mdtest")
        self.run_online_extend_test(1, app_name="mdtest")

    @skipForTicket("DAOS-7195,DAOS-7955")
    def test_osa_online_extend_with_aggregation(self):
        """Test ID: DAOS-6645
        Test Description: Online extend of a single pool with
        aggregation on.

        :avocado: tags=all,pr,daily_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=osa,checksum
        :avocado: tags=osa_extend,online_extend,online_extend_with_aggregation
        """
        self.log.info("Online Extend : Aggregation")
        # Take the aggregation setting from the test parameters.
        self.test_during_aggregation = self.params.get(
            "test_with_aggregation", '/run/aggregation/*')
        self.run_online_extend_test(1)