Ejemplo n.º 1
0
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        A reserved pool and container are created and populated with a data
        file first. Jobs are then executed in passes until the configured
        test timeout expires, or after a single pass when the test name
        contains "smoke". Finally the reserved data is copied back out of
        the reserved container and compared with the file originally written.

        Args:
            test_param (str): yaml path prefix of the test params; "*" is
                appended to it for every parameter lookup

        Raises:
            SoakTestError: if the reserved container write or read fails, or
                if the soak log directories cannot be removed

        """
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        self.check_errors = []
        self.used = []
        param_path = test_param + "*"
        test_to = self.params.get("test_timeout", param_path)
        self.test_name = self.params.get("name", param_path)
        single_test_pool = self.params.get(
            "single_test_pool", param_path, True)
        self.dmg_command.copy_certificates(get_log_file("daosCA/certs"),
                                           self.hostlist_clients)
        self.dmg_command.copy_configuration(self.hostlist_clients)
        harassers = self.params.get("harasserlist", param_path)
        job_list = self.params.get("joblist", param_path)
        # Harassers only run when the yaml supplies a non-empty list
        run_harasser = bool(harassers)
        harasserlist = []
        if run_harasser:
            self.log.info("<< Initial harasser list = %s>>", harassers)
            # Work on a copy so the configured list can be reused later
            harasserlist = harassers[:]
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        # Create the reserved container
        resv_cont = self.get_container(self.pool[0],
                                       "/run/container_reserved/*", True)
        # Populate the reserved container with a 500MB file
        initial_resv_file = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                         "initial", "resv_file")
        try:
            reserved_file_copy(self,
                               initial_resv_file,
                               self.pool[0],
                               resv_cont,
                               num_bytes=500000000,
                               cmd="write")
        except CommandFailure as error:
            raise SoakTestError(
                "<<FAILED: Soak reserved container write failed>>") from error

        # Create a single job pool shared by every pass
        if single_test_pool:
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join(pool.uuid for pool in self.pool))

        # Cleanup soak log directories before test on all client nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Soak directories not removed "
                "from clients>>: {}".format(self.hostlist_clients))
        # Cleanup the same directories on the test node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            try:
                run_command("rm -rf {}".format(log_dir), timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed>>".format(
                        log_dir)) from error

        # Initialize time; test_to is multiplied by 3600 to get seconds
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            if not single_test_pool:
                # Create a fresh job pool for this pass
                add_pools(self, ["pool_jobs"])
                self.log.info("Current pools: %s",
                              " ".join(pool.uuid for pool in self.pool))
            # Initialize harassers
            if run_harasser:
                # Start over from the configured list once it is exhausted
                if not harasserlist:
                    harasserlist = harassers[:]
                harasser = harasserlist.pop(0)
                self.harasser_args = {}
                self.harasser_results = {}
                self.harassers, self.offline_harassers = get_harassers(
                    harasser)
            try:
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.container = []
            # Remove the test pools from self.pool; preserving reserved pool
            if not single_test_pool:
                self.soak_errors.extend(self.destroy_pools(self.pool[1]))
                self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join(pool.uuid for pool in self.pool))
            # Fail if the pool/containers did not clean up correctly
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            if self.loop == 1 and run_harasser:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # Verify reserved container data
        final_resv_file = os.path.join(os.environ["DAOS_TEST_LOG_DIR"],
                                       "final", "resv_file")
        try:
            reserved_file_copy(self, final_resv_file, self.pool[0], resv_cont)
        except CommandFailure as error:
            raise SoakTestError(
                "<<FAILED: Soak reserved container read failed>>") from error

        # A falsy cmp() result is recorded as a data verification error
        if not cmp(initial_resv_file, final_resv_file):
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        for resv_file in [initial_resv_file, final_resv_file]:
            if os.path.isfile(resv_file):
                # Name of the parent directory: "initial" or "final"
                file_name = os.path.split(os.path.dirname(resv_file))[-1]
                # Save a copy of the POSIX file in self.outputsoakdir
                copy_cmd = "cp -p {} {}/{}_resv_file".format(
                    resv_file, self.outputsoakdir, file_name)
                try:
                    run_command(copy_cmd, timeout=30)
                except DaosTestError:
                    # Archiving is best effort; record it and keep going
                    self.soak_errors.append(
                        "Reserved data file {} failed to archive".format(
                            resv_file))
                # The local file is removed whether or not the copy worked
                os.remove(resv_file)
        self.container.append(resv_cont)
        # Gather the daos logs from the client nodes
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - start_time))
Ejemplo n.º 2
0
    def run_soak(self, test_param):
        """Run the soak test specified by the test params.

        A reserved pool and container are created and populated with objects
        first. Each pass then creates a job pool, executes the jobs and
        destroys the job pool and containers again, until the configured test
        timeout expires or after a single pass when the test name contains
        "smoke". Finally the objects in the reserved container are read back.

        Args:
            test_param (str): yaml path prefix of the test params; "*" is
                appended to it for every parameter lookup

        Raises:
            SoakTestError: if the soak log directories cannot be removed

        """
        self.soak_results = {}
        self.pool = []
        self.container = []
        self.harasser_results = {}
        self.harasser_args = {}
        self.all_failed_jobs = []
        self.all_failed_harassers = []
        self.soak_errors = []
        param_path = test_param + "*"
        test_to = self.params.get("test_timeout", param_path)
        self.job_timeout = self.params.get("job_timeout", param_path)
        self.test_name = self.params.get("name", param_path)
        self.nodesperjob = self.params.get("nodesperjob", param_path)
        self.taskspernode = self.params.get("taskspernode", param_path)
        harassers = self.params.get("harasserlist", param_path)
        job_list = self.params.get("joblist", param_path)
        rank = self.params.get("rank", "/run/container_reserved/*")
        obj_class = self.params.get("oclass", "/run/container_reserved/*")
        # Harassers only run when the yaml supplies a non-empty list
        run_harasser = bool(harassers)
        harasserlist = []
        if run_harasser:
            harasserlist = get_harassers(harassers)
            # Work on a copy so the full list can be restored later
            self.harassers = harasserlist[:]
            self.log.info("<< Initial harrasser list = %s>>",
                          " ".join(self.harassers))
        # Create the reserved pool with data
        # self.pool is a list of all the pools used in soak
        # self.pool[0] will always be the reserved pool
        add_pools(self, ["pool_reserved"])
        self.pool[0].connect()

        # Create the container and populate with a known data
        # TO-DO: use IOR to write and later read verify the data
        resv_cont = self.get_container(self.pool[0],
                                       "/run/container_reserved/*", True)
        resv_cont.write_objects(rank, obj_class)

        # Cleanup soak log directories before test on all client nodes
        result = slurm_utils.srun(NodeSet.fromlist(self.hostlist_clients),
                                  "rm -rf {}".format(self.log_dir),
                                  self.srun_params)
        if result.exit_status > 0:
            raise SoakTestError(
                "<<FAILED: Soak directories not removed "
                "from clients>>: {}".format(self.hostlist_clients))
        # Cleanup the same directories on the test node
        for log_dir in [self.log_dir, self.sharedlog_dir]:
            try:
                run_command("rm -rf {}".format(log_dir), timeout=30)
            except DaosTestError as error:
                raise SoakTestError(
                    "<<FAILED: Soak directory {} was not removed {}>>".format(
                        log_dir, error))

        # Initialize time; test_to is multiplied by 3600 to get seconds
        start_time = time.time()
        self.test_timeout = int(3600 * test_to)
        self.end_time = start_time + self.test_timeout
        self.log.info("<<START %s >> at %s", self.test_name, time.ctime())
        while time.time() < self.end_time:
            # Start new pass
            start_loop_time = time.time()
            self.log.info("<<SOAK LOOP %s: time until done %s>>", self.loop,
                          DDHHMMSS_format(self.end_time - time.time()))
            # Create pool for jobs
            add_pools(self, ["pool_jobs"])
            self.log.info("Current pools: %s",
                          " ".join(pool.uuid for pool in self.pool))
            # Restore the full harasser list once self.harassers is empty
            if run_harasser and not self.harassers:
                self.harasser_results = {}
                self.harasser_args = {}
                self.harassers = harasserlist[:]
            try:
                self.execute_jobs(job_list, self.pool[1])
            except SoakTestError as error:
                self.fail(error)
            # Check space after jobs done
            for pool in self.pool:
                self.dmg_command.pool_query(pool.uuid)
            self.soak_errors.extend(self.destroy_containers(self.container))
            self.soak_errors.extend(self.destroy_pools(self.pool[1]))
            # Remove the test pools from self.pool; preserving reserved pool
            self.container = []
            self.pool = [self.pool[0]]
            self.log.info("Current pools: %s",
                          " ".join(pool.uuid for pool in self.pool))
            # Fail if the pool/containers did not clean up correctly
            self.assertEqual(len(self.soak_errors), 0,
                             "\n".join(self.soak_errors))
            # Break out of loop if smoke
            if "smoke" in self.test_name:
                break
            loop_time = time.time() - start_loop_time
            self.log.info("<<LOOP %s completed in %s at %s>>", self.loop,
                          DDHHMMSS_format(loop_time), time.ctime())
            # Initialize harasser loop time from first pass loop time
            # NOTE(review): this checks the current self.harassers list, not
            # run_harasser; confirm that is intended if the list can be
            # emptied while the pass is running
            if self.loop == 1 and self.harassers:
                self.harasser_loop_time = loop_time
            self.loop += 1
        # TO-DO: use IOR
        # A falsy read_objects() result is recorded as a verification error
        if not resv_cont.read_objects():
            self.soak_errors.append("Data verification error on reserved pool"
                                    " after SOAK completed")
        self.container.append(resv_cont)
        # Gather the daos logs from the client nodes
        self.log.info("<<<<SOAK TOTAL TEST TIME = %s>>>>",
                      DDHHMMSS_format(time.time() - start_time))