def job_completion(self, job_id_list):
    """Wait for job completion and cleanup.

    Blocks until every submitted slurm job has reported a result, launching
    online harassers at spaced intervals while waiting, then collects remote
    logs and resets the per-pass result state.

    Args:
        job_id_list (list): IDs of each job submitted to slurm.

    Returns:
        list: IDs of each job that failed in slurm (jobs whose status was
            neither COMPLETED nor CANCELLED).

    """
    self.log.info(
        "<<Job Completion - %s >> at %s", self.test_name, time.ctime())
    harasser_interval = 0
    failed_harasser_msg = None
    harasser_timer = time.time()
    # loop time exists after the first pass; no harassers in the first pass
    if self.harasser_loop_time and self.harassers:
        harasser_interval = self.harasser_loop_time / (
            len(self.harassers) + 3)
    # If there is nothing to do; exit
    if job_id_list:
        # wait for all the jobs to finish
        while len(self.soak_results) < len(job_id_list):
            # wait for the jobs to complete.
            # enter tearDown before hitting the avocado timeout
            if time.time() > self.end_time:
                self.log.info(
                    "<< SOAK test timeout in Job Completion at %s >>",
                    time.ctime())
                for job in job_id_list:
                    _ = slurm_utils.cancel_jobs(int(job))
            # launch harassers if enabled;
            # one harasser at a time starting on pass2
            if self.harassers:
                if self.loop >= 2 and (
                        time.time() > harasser_timer + harasser_interval):
                    harasser = self.harassers.pop(0)
                    harasser_timer += harasser_interval
                    failed_harasser_msg = self.launch_harasser(
                        harasser, self.pool)
            time.sleep(5)
        if failed_harasser_msg is not None:
            self.all_failed_harassers.append(failed_harasser_msg)
        # check for JobStatus = COMPLETED or CANCELLED (i.e. TEST TO)
        # Iterate over a snapshot: soak_results is populated outside this
        # method while we wait (see the while-loop above), so iterating the
        # live dict view could raise "dictionary changed size during
        # iteration" if a late result lands here.
        for job, result in list(self.soak_results.items()):
            if result in ["COMPLETED", "CANCELLED"]:
                job_id_list.remove(int(job))
            else:
                self.log.info(
                    "<< Job %s failed with status %s>>", job, result)
        # gather all the logfiles for this pass and cleanup test nodes
        try:
            get_remote_logs(self)
        except SoakTestError as error:
            # best-effort: a failed log copy should not fail the pass
            self.log.info("Remote copy failed with %s", error)
        self.soak_results = {}
    return job_id_list
def job_completion(self, job_id_list):
    """Wait for job completion and cleanup.

    Blocks until every submitted slurm job has reported a result, then
    collects remote logs and resets the per-pass result state.

    Args:
        job_id_list (list): IDs of each job submitted to slurm.

    Returns:
        list: IDs of each job that failed in slurm (jobs whose status was
            neither COMPLETED nor CANCELLED).

    """
    self.log.info(
        "<<Job Completion - %s >> at %s", self.test_name, time.ctime())
    # If there is nothing to do; exit
    if job_id_list:
        # wait for all the jobs to finish
        while len(self.soak_results) < len(job_id_list):
            # wait for the jobs to complete.
            # enter tearDown before hitting the avocado timeout
            if time.time() > self.end_time:
                self.log.info(
                    "<< SOAK test timeout in Job Completion at %s >>",
                    time.ctime())
                for job in job_id_list:
                    _ = slurm_utils.cancel_jobs(int(job))
            time.sleep(5)
        # check for JobStatus = COMPLETED or CANCELLED (i.e. TEST TO)
        # Iterate over a snapshot: soak_results is populated outside this
        # method while we wait (see the while-loop above), so iterating the
        # live dict view could raise "dictionary changed size during
        # iteration" if a late result lands here.
        for job, result in list(self.soak_results.items()):
            if result in ["COMPLETED", "CANCELLED"]:
                job_id_list.remove(int(job))
            else:
                self.log.info(
                    "<< Job %s failed with status %s>>", job, result)
        # gather all the logfiles for this pass and cleanup test nodes
        try:
            get_remote_logs(self)
        except SoakTestError as error:
            # best-effort: a failed log copy should not fail the pass
            self.log.info("Remote copy failed with %s", error)
        self.soak_results = {}
    return job_id_list
def job_completion(self, job_id_list):
    """Wait for job completion and cleanup.

    Blocks until every submitted slurm job has reported a result, running
    monitor checks and online harassers while waiting, then runs any offline
    harassers, checks journalctl events over the wait window, collects remote
    logs, and resets per-pass state.

    Args:
        job_id_list: IDs of each job submitted to slurm

    Returns:
        failed_job_id_list: IDs of each job that failed in slurm

    """
    self.log.info(
        "<<Job Completion - %s >> at %s", self.test_name, time.ctime())
    harasser_interval = 0
    failed_harasser_msg = None
    harasser_timer = time.time()
    # first monitor check fires on the first pass through the wait loop
    check_time = datetime.now()
    event_check_messages = []
    # start of the journalctl window examined after the jobs finish
    since = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # loop time exists after the first pass; no harassers in the first pass
    if self.harasser_loop_time and self.harassers:
        harasser_interval = self.harasser_loop_time / (
            len(self.harassers) + 1)
    # If there is nothing to do; exit
    if job_id_list:
        # wait for all the jobs to finish
        while len(self.soak_results) < len(job_id_list):
            # wait for the jobs to complete.
            # enter tearDown before hitting the avocado timeout
            if time.time() > self.end_time:
                self.log.info(
                    "<< SOAK test timeout in Job Completion at %s >>",
                    time.ctime())
                for job in job_id_list:
                    _ = slurm_utils.cancel_jobs(int(job))
            # monitor events every 15 min
            if datetime.now() > check_time:
                run_monitor_check(self)
                check_time = datetime.now() + timedelta(minutes=15)
            # launch harassers if enabled;
            # one harasser at a time starting on pass2
            if self.harassers:
                if self.loop >= 2 and (
                        time.time() > (harasser_timer + harasser_interval)):
                    harasser = self.harassers.pop(0)
                    harasser_timer += harasser_interval
                    failed_harasser_msg = self.launch_harasser(
                        harasser, self.pool)
            time.sleep(5)
        if time.time() < self.end_time:
            # Run any offline harassers after first loop
            if self.offline_harassers and self.loop >= 1:
                for offline_harasser in self.offline_harassers:
                    # only launch if at least ~3 minutes of test time remain
                    if time.time() + int(180) < self.end_time:
                        failed_harasser_msg = self.launch_harasser(
                            offline_harasser, self.pool)
                        # wait 2 minutes to issue next harasser
                        time.sleep(120)
        # check journalctl for events;
        until = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        event_check_messages = run_event_check(self, since, until)
        self.check_errors.extend(event_check_messages)
        run_monitor_check(self)
        # init harasser list when all jobs are done
        self.harassers = []
        self.offline_harassers = []
        # NOTE(review): only the most recent harasser failure is recorded;
        # earlier failures within this pass are overwritten — confirm intended
        if failed_harasser_msg is not None:
            self.all_failed_harassers.append(failed_harasser_msg)
        # check for JobStatus = COMPLETED or CANCELLED (i.e. TEST TO)
        # snapshot the dict in case results are still being recorded
        for job, result in list(self.soak_results.items()):
            if result in ["COMPLETED", "CANCELLED"]:
                job_id_list.remove(int(job))
            else:
                self.log.info(
                    "<< Job %s failed with status %s>>", job, result)
        # gather all the logfiles for this pass and cleanup test nodes
        try:
            get_remote_logs(self)
        except SoakTestError as error:
            # best-effort: a failed log copy should not fail the pass
            self.log.info("Remote copy failed with %s", error)
        self.soak_results = {}
    return job_id_list