Example #1
    def test_ior_crash(self):
        """Jira ID: DAOS-4332.
           Jira ID: DAOS-9946.

        Test Description:
            Verify DAOS server does not need to be restarted when an application crashes.

        Use Cases:
            Run IOR Write.
            Kill IOR process in the middle of Write.
            Verify DAOS engines did not crash.
            Run IOR Write, Read.
            Kill IOR process in the middle of Read.
            Verify DAOS engines did not crash.
            Run IOR Write, Read, CheckRead.
            Verify IOR completes successfully.

        :avocado: tags=all,full_regression
        :avocado: tags=hw,medium,ib2
        :avocado: tags=daosio,ior,dfs
        :avocado: tags=ior_crash
        """
        # Run IOR and crash it in the middle of Write
        self.run_ior_with_pool()
        self.check_subprocess_status()
        time.sleep(self.ior_cmd.sw_deadline.value / 2)
        self.stop_ior()

        # Verify engines did not crash
        scan_info = self.dmg.system_query(verbose=True)
        if not check_system_query_status(scan_info):
            self.fail("One or more engines crashed")

        # Run IOR and crash it in the middle of Read.
        # Must wait for Write to complete first.
        self.run_ior_with_pool()
        time.sleep(self.ior_cmd.sw_deadline.value * 1.5)
        self.check_subprocess_status("read")
        self.stop_ior()

        # Verify engines did not crash
        scan_info = self.dmg.system_query(verbose=True)
        if not check_system_query_status(scan_info):
            self.fail("One or more engines crashed")

        # Run IOR and verify it completes successfully
        self.run_ior_with_pool()
        self.job_manager.wait()

        # Verify engines did not crash
        scan_info = self.dmg.system_query(verbose=True)
        if not check_system_query_status(scan_info):
            self.fail("One or more engines crashed")
Example #2
    def test_crashior(self):
        """Jira ID: DAOS-4332.

        Test Description:
            DAOS server does not need to be restarted when the application
            crashes.

        Use Cases:
            Run IOR over dfuse.
            Cancel IOR in the middle of io.
            Check daos server does not need to be restarted when the
            application crashes.

        :avocado: tags=all,daosio,hw,medium,ib2,full_regression,crashior
        """
        # run ior and crash it during write process
        self.run_ior_with_pool()
        # check if ior write has started
        self.check_subprocess_status()
        # allow 50 secs of write to happen
        time.sleep(50)
        # kill ior process in the middle of IO
        self.stop_ior()

        # obtain server rank info using 'dmg system query -v'
        scan_info = self.dmg.system_query(verbose=True)
        # check for any crashed servers after killing ior in the middle
        if not check_system_query_status(scan_info):
            self.fail("One or more server crashed")

        # run ior again and crash it during read process
        self.run_ior_with_pool()
        # allow the write phase to finish (stonewalling limit is 100 secs),
        # plus an extra 5 secs for the read phase to begin
        time.sleep(105)
        # check if ior read process started
        self.check_subprocess_status("read")
        # kill ior process in middle of read process
        self.stop_ior()

        # obtain server rank info using 'dmg system query -v'
        scan_info = self.dmg.system_query(verbose=True)
        # check for any crashed servers after killing ior in the middle
        if not check_system_query_status(scan_info):
            self.fail("One or more server crashed")

        # run ior again if everything goes well till now and allow it to
        # complete without killing in the middle this time to check
        # if io goes as expected after crashing it previously
        self.run_ior_with_pool()
        self.job_manager.wait()
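
The fixed time.sleep(50) and time.sleep(105) waits above are tied to the 100-second stonewalling limit and can be brittle if the limit changes. A hedged sketch of a polling alternative, assuming check_subprocess_status() raises or fails while the requested phase has not started; the real harness may signal this differently:

import time

def wait_for_ior_phase(test, phase="write", timeout=120, interval=5):
    """Poll until the IOR subprocess reaches `phase` or `timeout` elapses."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            test.check_subprocess_status(phase)
            return True
        except Exception:
            # Phase has not been reached yet; keep polling.
            time.sleep(interval)
    return False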
Example #3
    def run_offline_extend_test(self, num_pool, data=False, oclass=None):
        """Run the offline extend without data.

        Args:
            num_pool (int) : total pools to create for testing purposes.
            data (bool) : whether pool has no data or to create
                          some data in pool. Defaults to False.
            oclass (list) : list of daos object class (eg: "RP_2G8")
        """
        # Create a pool
        pool = {}
        if oclass is None:
            oclass = []
            oclass.append(self.ior_cmd.dfs_oclass.value)

        self.log.info(oclass[0])

        for val in range(0, num_pool):
            # Perform IOR write using the oclass list
            if val < len(oclass):
                index = val
            else:
                index = 0
            pool[val] = TestPool(self.context, dmg_command=self.dmg_command)
            pool[val].get_params(self)
            pool[val].create()
            self.pool = pool[val]
            test_seq = self.ior_test_sequence[0]
            self.pool.set_property("reclaim", "disabled")
            if data:
                self.run_ior_thread("Write", oclass[index], test_seq)
                self.run_mdtest_thread()
                if self.test_during_aggregation is True:
                    self.run_ior_thread("Write", oclass[index], test_seq)
        # Start the additional servers and extend the pool
        self.log.info("Extra Servers = %s", self.extra_servers)
        self.start_additional_servers(self.extra_servers)
        # Give some time for the additional servers to come up.
        for retry in range(0, 10):
            scan_info = self.get_dmg_command().system_query()
            if not check_system_query_status(scan_info):
                if retry == 9:
                    self.fail("One or more servers not in expected status")
            else:
                break

        for rank_index, rank_val in enumerate(self.rank):
            # If fewer pools than ranks were created, extend only pool 0.
            # Otherwise, extend one pool per rank in self.rank.
            if num_pool >= len(self.rank):
                val = rank_index
            else:
                val = 0
            self.pool = pool[val]
            scm_size = self.pool.scm_size
            nvme_size = self.pool.nvme_size
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            # Enable aggregation for multiple pool testing only.
            if self.test_during_aggregation is True and (num_pool > 1):
                self.delete_extra_container(self.pool)
            output = self.dmg_command.pool_extend(self.pool.uuid, rank_val,
                                                  scm_size, nvme_size)
            self.print_and_assert_on_rebuild_failure(output)

            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %d", pver_extend)
            # Check pool version incremented after pool extend
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")

            display_string = "Pool{} space at the End".format(val)
            pool[val].display_pool_daos_space(display_string)

            if data:
                # Perform the IOR read using the same
                # daos object class used for write.
                if val < len(oclass):
                    index = val
                else:
                    index = 0
                self.run_ior_thread("Read", oclass[index], test_seq)
                self.run_mdtest_thread()
                self.container = self.pool_cont_dict[self.pool][0]
                kwargs = {"pool": self.pool.uuid, "cont": self.container.uuid}
                output = self.daos_command.container_check(**kwargs)
                self.log.info(output)
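
The ten-iteration query loop above (and its twins in the other examples) retries without any delay between attempts, so all ten queries can fire within a second or two. A hedged sketch of the same check with an explicit back-off; the retry count and wait are illustrative defaults, not values taken from the DAOS harness:

import time

def wait_for_ranks_joined(dmg, retries=10, wait_time=5):
    """Poll 'dmg system query' until all ranks look healthy (sketch only)."""
    for _ in range(retries):
        scan_info = dmg.system_query()
        if check_system_query_status(scan_info):
            return True
        time.sleep(wait_time)
    return False

# Possible usage in place of the inline loop:
#     if not wait_for_ranks_joined(self.get_dmg_command()):
#         self.fail("One or more servers not in expected status")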
Example #4
    def run_nvme_pool_extend(self, num_pool, oclass=None):
        """Run Pool Extend
        Args:
            num_pool (int) : total pools to create for testing purposes.
            oclass (str) : object class (eg: RP_2G8,etc)
                           Defaults to None.
        """
        self.pool = []
        total_servers = len(self.hostlist_servers) * 2
        self.log.info("Total Daos Servers (Initial): %d", total_servers)
        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value

        for val in range(0, num_pool):
            # Create a pool
            self.pool.append(self.get_pool())
            self.pool[-1].set_property("reclaim", "disabled")

        # On each pool (max 3), extend the ranks,
        # e.g. ranks 4,5; 6,7; 8,9.
        for val in range(0, num_pool):
            test = self.ior_test_sequence[val]
            threads = []
            threads.append(threading.Thread(target=self.run_ior_thread,
                                            kwargs={"action": "Write",
                                                    "oclass": oclass,
                                                    "test": test,
                                                    "pool": self.pool[val]}))
            # Launch the IOR threads
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(1)

            self.pool[val].display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()

            # Start the additional servers and extend the pool
            if val == 0:
                self.log.info("Extra Servers = %s", self.extra_servers)
                self.start_additional_servers(self.extra_servers)
                # Check the system map extra servers are in joined state.
                for retry in range(0, 10):
                    scan_info = self.get_dmg_command().system_query()
                    if not check_system_query_status(scan_info):
                        if retry == 9:
                            self.fail("One/More servers status not correct")
                    else:
                        break
            self.log.info("Pool Version at the beginning %s", pver_begin)
            # Extend ranks (4,5), (6,7), (8,9)
            ranks_extended = "{},{}".format((val * 2) + 4, (val * 2) + 5)
            output = self.dmg_command.pool_extend(self.pool[val].uuid,
                                                  ranks_extended)
            self.print_and_assert_on_rebuild_failure(output)
            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %s", pver_extend)
            # Check pool version incremented after pool extend
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")
            # Wait for the threads to finish
            for thrd in threads:
                thrd.join()
                if not self.out_queue.empty():
                    self.assert_on_exception()
            # Verify the data after pool extend
            self.run_ior_thread("Read", oclass, test)
            # Get the pool space at the end of the test
            display_string = "Pool{} space at the End".format(val)
            self.pool[val].display_pool_daos_space(display_string)
            self.container = self.pool_cont_dict[self.pool[val]][0]
            kwargs = {"pool": self.pool[val].uuid,
                      "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
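
The rank pairs named in the comments follow directly from the arithmetic on val; a quick standalone check of the formula used above (purely illustrative):

for val in range(3):
    ranks_extended = "{},{}".format((val * 2) + 4, (val * 2) + 5)
    print(val, "->", ranks_extended)
# Prints: 0 -> 4,5   1 -> 6,7   2 -> 8,9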
Example #5
    def run_offline_parallel_test(self, num_pool, data=False, oclass=None):
        """Run multiple OSA commands in parallel with or without data.
            Args:
            num_pool (int) : total pools to create for testing purposes.
            data (bool) : whether pool has no data or to create
                          some data in pool. Defaults to False.
            oclass (str) : Daos object class (RP_2G1,etc)
        """
        # Create a pool
        pool = {}
        pool_uuid = []
        target_list = []
        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value

        # Exclude two consecutive random targets (target idx: 0-7)
        n = random.randint(0, 6)
        target_list.append(n)
        target_list.append(n + 1)
        t_string = "{},{}".format(target_list[0], target_list[1])

        # Exclude rank 2.
        rank = 2

        test_seq = self.ior_test_sequence[0]
        for val in range(0, num_pool):
            pool[val] = TestPool(self.context,
                                 dmg_command=self.get_dmg_command())
            pool[val].get_params(self)
            pool[val].create()
            pool_uuid.append(pool[val].uuid)
            self.pool = pool[val]
            self.pool.set_property("reclaim", "disabled")
            if data:
                self.run_ior_thread("Write", oclass, test_seq)
                if oclass != "S1":
                    self.run_mdtest_thread()
                # if self.test_during_aggregation is set,
                # Create another container and run the IOR
                # command using the second container.
                if self.test_during_aggregation is True:
                    self.run_ior_thread("Write", oclass, test_seq)

        # Start the additional servers and extend the pool
        self.log.info("Extra Servers = %s", self.extra_servers)
        self.start_additional_servers(self.extra_servers)
        # Give some time for the additional servers to come up.
        for retry in range(0, 10):
            scan_info = self.get_dmg_command().system_query()
            if not check_system_query_status(scan_info):
                if retry == 9:
                    self.fail("One or more servers not in expected status")
            else:
                break

        # Exclude and reintegrate the pool_uuid, rank and targets
        threads = []
        for val in range(0, num_pool):
            self.pool = pool[val]
            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            # If we need to trigger aggregation on pool 1, delete
            # the second container which has IOR data.
            if self.test_during_aggregation is True and val == 0:
                self.delete_extra_container(self.pool)
            # Create the dmg command threads here. The thread list is
            # initialized once, before the loop, so that threads from
            # every pool are joined below.
            # Action dictionary with OSA dmg command parameters
            action_args = {
                "drain": {
                    "pool": self.pool.uuid,
                    "rank": rank,
                    "tgt_idx": None
                },
                "exclude": {
                    "pool": self.pool.uuid,
                    "rank": (rank + 1),
                    "tgt_idx": t_string
                },
                "reintegrate": {
                    "pool": self.pool.uuid,
                    "rank": (rank + 1),
                    "tgt_idx": t_string
                },
                "extend": {
                    "pool": self.pool.uuid,
                    "ranks": (rank + 2),
                    "scm_size": self.pool.scm_size,
                    "nvme_size": self.pool.nvme_size
                }
            }
            for action in sorted(action_args):
                # Add a dmg thread
                process = threading.Thread(target=self.dmg_thread,
                                           kwargs={
                                               "action": action,
                                               "action_args": action_args,
                                               "results": self.out_queue
                                           })
                process.start()
                threads.append(process)

        # Wait for the threads to finish
        for thrd in threads:
            thrd.join()
            time.sleep(5)

        # Check the queue for any failure.
        tmp_list = list(self.out_queue.queue)
        for failure in tmp_list:
            if "FAIL" in failure:
                self.fail("Test failed : {0}".format(failure))

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            pool[val].display_pool_daos_space(display_string)
            self.is_rebuild_done(3)
            self.assert_on_rebuild_failure()
            pver_end = self.get_pool_version()
            self.log.info("Pool Version at the End %s", pver_end)
            self.assertTrue(pver_end >= 26, "Pool Version Error:  at the end")
        if data:
            self.run_ior_thread("Read", oclass, test_seq)
            if oclass != "S1":
                self.run_mdtest_thread()
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid, "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
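
The failure check above reads list(self.out_queue.queue), which inspects items without consuming them, so a later check would see the same entries. A hedged sketch that drains the queue instead; it assumes the queue holds plain strings, as in this example:

import queue

def drain_failures(out_queue):
    """Pop and return all 'FAIL' entries from a results queue."""
    failures = []
    while True:
        try:
            msg = out_queue.get_nowait()
        except queue.Empty:
            break
        if "FAIL" in str(msg):
            failures.append(msg)
    return failures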
Example #6
    def test_nvme_server_restart(self):
        """Jira ID: DAOS-2650.

        Test Description:
            Test will run IOR with non-standard transfer sizes for different
            sets of pool sizes. The purpose is to verify I/O transactions to
            SCM and NVMe for different pool sizes when servers are restarted
            after a write.

        Use Cases:
            (1) Run IOR with different sets of transfer sizes where the
            first transfer size is < 4096 and the next is > 4096. Verify
            the data after the servers are restarted.
            (2) Repeat case (1) with the maximum nvme pool size that can
            be created.
            (3) Run IOR with different sets of transfer sizes where the
            transfer size is > 4096 throughout. Verify the data after the
            servers are restarted.
            (4) Repeat case (3) with the maximum nvme pool size that can
            be created.

        :avocado: tags=all,full_regression,hw,large,daosio,nvme_server_restart
        """
        # Test params
        tests = self.params.get("ior_sequence", '/run/ior/*')
        processes = self.params.get("np", '/run/ior/*')
        transfer_size = self.params.get("tsize", '/run/ior/transfersize/*/')
        flag_write = self.params.get("write", '/run/ior/*/')
        flag_read = self.params.get("read", '/run/ior/*/')
        block_size = self.ior_cmd.block_size.value

        # Loop for every IOR object type
        for ior_param in tests:
            # Create and connect to a pool
            self.pool = TestPool(
                self.context, dmg_command=self.get_dmg_command())
            self.pool.get_params(self)

            # update pool sizes
            self.pool.scm_size.update(ior_param[0])
            self.pool.nvme_size.update(ior_param[1])

            # Create a pool
            self.pool.create()

            # get pool info
            self.pool.get_info()

            for tsize in transfer_size:
                # Run ior with the parameters specified for this pass
                self.ior_cmd.transfer_size.update(tsize)
                self.ior_cmd.flags.update(flag_write)
                # if transfer size is less than 1K,
                # update block size to 32K to keep it small
                if tsize <= 1000:
                    self.ior_cmd.block_size.update(32000)
                else:
                    self.ior_cmd.block_size.update(block_size)
                self.ior_cmd.set_daos_params(self.server_group, self.pool)
                self.run_ior(self.get_ior_job_manager_command(), processes)

                # Stop all servers
                self.get_dmg_command().system_stop(True)

                # Start all servers
                self.get_dmg_command().system_start()

                # check if all servers started as expected
                scan_info = self.get_dmg_command().get_output("system_query")
                if not check_system_query_status(scan_info):
                    self.fail("One or more servers crashed")

                # read all the data written before server restart
                self.ior_cmd.flags.update(flag_read)
                self.run_ior(self.get_ior_job_manager_command(), processes)

            # destroy pool
            self.destroy_pools(self.pool)
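
The stop/start/verify sequence in the inner loop above repeats across several of these tests. A minimal sketch factoring it out, using only the dmg calls that appear in these examples; a real refactor would likely live on a shared test base class with richer error handling:

def restart_servers_and_verify(test):
    """Restart all servers and fail the test if any rank crashed (sketch)."""
    dmg = test.get_dmg_command()
    dmg.system_stop(True)   # force-stop all ranks
    dmg.system_start()      # bring them back up
    scan_info = dmg.system_query()
    if not check_system_query_status(scan_info):
        test.fail("One or more servers crashed")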
Example #7
    def test_ioaggregation(self):
        """Jira ID: DAOS-4332.

        Test Description:
            Verify Aggregation across system shutdown.

        Use Cases:
            Create Pool.
            Create Container.
            Run IOR and keep the written data.
            Capture Free space available after first ior write.
            Create snapshot and obtain the epoch id.
            Write to the same ior file and same amount of data,
            without overwriting the previous data.
            Capture free space again, after second ior write.
            Capture Highest epoch ID before snapshot destroy.
            Destroy the snapshot which was created.
            Shut down the servers and restart them again.
            After servers have successfully restarted, look for
            aggregation to finish by checking the free space available
            and the value of the highest epoch, which should be higher
            than the value of the highest epoch before snapshot destroy.
            If the current free space equals the free space after the
            first ior write, pass; otherwise fail the test after waiting
            for 4 attempts.

        :avocado: tags=all,daosio,hw,small,full_regression,ioaggregation
        """
        # update ior signature option
        self.ior_cmd.signature.update("123")
        # run ior write process
        self.run_ior_with_pool()

        # capture free space before taking the snapshot
        self.get_nvme_free_space()

        # create snapshot
        self.container.create_snap()

        # write to same ior file again
        self.ior_cmd.signature.update("456")
        self.run_ior_with_pool(create_cont=False)

        # capture free space after second ior write
        free_space_before_snap_destroy = self.get_nvme_free_space()

        # obtain highest epoch before snapshot destroy via container query
        kwargs = {
            "pool": self.pool.uuid,
            "cont": self.container.uuid
        }
        highest_epc_before_snap_destroy = self.highest_epoch(kwargs)

        # delete snapshot
        self.container.destroy_snap(epc=self.container.epoch)

        # Shutdown the servers and restart
        self.get_dmg_command().system_stop(True)
        time.sleep(5)
        self.get_dmg_command().system_start()

        # check if all servers started as expected
        scan_info = self.get_dmg_command().system_query()
        if not check_system_query_status(scan_info):
            self.fail("One or more servers crashed")

        # Now check if the space is returned and the highest epoch value
        # is higher than the value just before snapshot destroy.
        counter = 1
        returned_space = (self.get_nvme_free_space() -
                          free_space_before_snap_destroy)
        while returned_space < int(self.ior_cmd.block_size.value) or \
                highest_epc_before_snap_destroy >= self.highest_epoch(kwargs):
            # try to wait for 4 x 60 secs for aggregation to be completed or
            # else exit the test with a failure.
            if counter > 4:
                self.log.info("Free space before snapshot destroy: %s",
                              free_space_before_snap_destroy)
                self.log.info("Free space when test terminated: %s",
                              self.get_nvme_free_space())
                self.log.info("Highest Epoch before IO Aggregation: %s",
                              highest_epc_before_snap_destroy)
                self.log.info("Highest Epoch when test terminated: %s",
                              self.highest_epoch(kwargs))
                self.fail("Aggregation did not complete as expected")
            time.sleep(60)
            returned_space = (self.get_nvme_free_space() -
                              free_space_before_snap_destroy)
            counter += 1
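
The counter loop above interleaves the success condition with failure logging. A hedged sketch of the same wait as a boolean helper, assuming the same accessors (get_nvme_free_space, highest_epoch) and the same success condition as the test; the caller would log details and fail when it returns False:

import time

def aggregation_complete(test, kwargs, baseline_space, baseline_epoch,
                         min_returned_space, attempts=4, interval=60):
    """Return True once space is reclaimed and the highest epoch advances."""
    for _ in range(attempts):
        returned = test.get_nvme_free_space() - baseline_space
        if (returned >= min_returned_space
                and test.highest_epoch(kwargs) > baseline_epoch):
            return True
        time.sleep(interval)
    return False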
Example #8
    def run_online_extend_test(self,
                               num_pool,
                               racer=False,
                               oclass=None,
                               app_name="ior"):
        """Run the Online extend without data.
            Args:
             num_pool(int) : total pools to create for testing purposes.
             racer(bool) : Run the testing along with daos_racer.
                           Defaults to False.
             oclass(str) : Object Class (eg: RP_2G1, etc). Default to None.
             app_name(str) : App (ior or mdtest) to run during the testing.
                             Defaults to ior.
        """
        # Pool dictionary
        pool = {}

        if oclass is None:
            oclass = self.ior_cmd.dfs_oclass.value
        test_seq = self.ior_test_sequence[0]

        # Start the daos_racer thread
        if racer is True:
            daos_racer_thread = threading.Thread(target=self.daos_racer_thread)
            daos_racer_thread.start()
            time.sleep(30)

        for val in range(0, num_pool):
            pool[val] = TestPool(context=self.context,
                                 dmg_command=self.get_dmg_command(),
                                 label_generator=self.label_generator)
            pool[val].get_params(self)
            pool[val].create()
            pool[val].set_property("reclaim", "disabled")

        # Extend the pool_uuid, rank and targets
        for val in range(0, num_pool):
            threads = []
            self.pool = pool[val]

            # Start the additional servers and extend the pool
            self.log.info("Extra Servers = %s", self.extra_servers)
            self.start_additional_servers(self.extra_servers)
            if self.test_during_aggregation is True:
                for _ in range(0, 2):
                    self.run_ior_thread("Write", oclass, test_seq)
                self.delete_extra_container(self.pool)
            # The following thread runs while performing osa operations.
            if app_name == "ior":
                threads.append(
                    threading.Thread(target=self.run_ior_thread,
                                     kwargs={
                                         "action": "Write",
                                         "oclass": oclass,
                                         "test": test_seq
                                     }))
            else:
                threads.append(threading.Thread(target=self.run_mdtest_thread))
            # Make sure system map has all ranks in joined state.
            for retry in range(0, 10):
                scan_info = self.get_dmg_command().system_query()
                if not check_system_query_status(scan_info):
                    if retry == 9:
                        self.fail("One or more servers not in expected status")
                else:
                    break

            # Launch the IOR or mdtest thread
            for thrd in threads:
                self.log.info("Thread : %s", thrd)
                thrd.start()
                time.sleep(1)

            self.pool.display_pool_daos_space("Pool space: Beginning")
            pver_begin = self.get_pool_version()
            self.log.info("Pool Version at the beginning %s", pver_begin)
            output = self.dmg_command.pool_extend(self.pool.uuid, self.ranks)
            self.print_and_assert_on_rebuild_failure(output)

            pver_extend = self.get_pool_version()
            self.log.info("Pool Version after extend %s", pver_extend)
            # Check pool version incremented after pool extend
            self.assertTrue(pver_extend > pver_begin,
                            "Pool Version Error:  After extend")
            # Wait for the threads to finish
            for thrd in threads:
                thrd.join()
                if not self.out_queue.empty():
                    self.assert_on_exception()

        # Data consistency for IOR will be checked in the future.
        # Presently, daos_racer runs in parallel with IOR and data
        # consistency is checked only for the daos_racer objects after
        # exclude and reintegration.
        if racer is True:
            daos_racer_thread.join()

        for val in range(0, num_pool):
            display_string = "Pool{} space at the End".format(val)
            self.pool = pool[val]
            self.pool.display_pool_daos_space(display_string)
            self.run_ior_thread("Read", oclass, test_seq)
            self.container = self.pool_cont_dict[self.pool][0]
            kwargs = {"pool": self.pool.uuid, "cont": self.container.uuid}
            output = self.daos_command.container_check(**kwargs)
            self.log.info(output)
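
All of the OSA examples above share one skeleton: run a workload in a background thread, perform the administrative operation, then join the thread and surface any queued failure. A self-contained sketch of that pattern with illustrative names; out_queue mirrors the self.out_queue used in the examples:

import queue
import threading

out_queue = queue.Queue()

def workload(results):
    """Stand-in for an IOR / mdtest / daos_racer run."""
    try:
        pass  # run the I/O workload here
    except Exception as err:
        results.put("FAIL - {}".format(err))

thread = threading.Thread(target=workload, kwargs={"results": out_queue})
thread.start()
# ... perform pool extend / exclude / reintegrate here ...
thread.join()
if not out_queue.empty():
    raise AssertionError(out_queue.get())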