def cleanup(self) -> None:
    """Remove the log and batch files produced by this manager.

    If the `RunManager` is not running, the generated ``run_script.py``
    file is deleted as well.
    """
    from adaptive_scheduler.utils import (
        _delete_old_ipython_profiles,
        _remove_or_move_files,
    )

    sched = self.scheduler

    # Best effort: the run-script may already have been removed.
    with suppress(FileNotFoundError):
        if self.status() != "running":
            os.remove(sched.run_script)

    running = set(sched.queue().keys())
    if sched.executor_type == "ipyparallel":
        # Only profiles belonging to still-queued/running jobs are kept.
        _delete_old_ipython_profiles(running)

    names = self.job_names
    # Collect log, output, and batch file name templates (in that order).
    fnames: list[str] = [sched.log_fname(n) for n in names]
    for n in names:
        fnames.extend(sched.output_fnames(n))
    fnames.extend(sched.batch_fname(n) for n in names)

    # Expand the job-id placeholder into a glob and gather every match.
    to_remove = [
        match
        for f in fnames
        for match in glob.glob(f.replace(sched._JOB_ID_VARIABLE, "*"))
    ]
    _remove_or_move_files(
        to_remove, True, self.move_old_logs_to, "Removing logs and batch files"
    )
def cleanup(
    job_names: list[str],
    scheduler: BaseScheduler,
    with_progress_bar: bool = True,
    move_to: str | None = None,
) -> None:
    """Cleanup the scheduler log files and batch files.

    Parameters
    ----------
    job_names : list
        List of job names.
    scheduler : `~adaptive_scheduler.scheduler.BaseScheduler`
        A scheduler instance from `adaptive_scheduler.scheduler`.
    with_progress_bar : bool, default: True
        Display a progress bar using `tqdm`.
    move_to : str, default: None
        Move the file to a different directory.
        If None the file is removed.
    """
    # NOTE(review): the previous docstring documented a ``log_file_folder``
    # parameter that does not exist in the signature; it has been removed.
    to_rm = _get_all_files(job_names, scheduler)
    _remove_or_move_files(
        to_rm, with_progress_bar, move_to, "Removing logs and batch files"
    )
async def _manage(self) -> None:
    """Run forever: detect failed jobs, cancel them, and clean their files.

    Each cycle refreshes the database, matches logs against ``self.error``,
    cancels the matching jobs, removes (or moves) their files, records what
    was done, and then sleeps for ``self.interval`` seconds.
    """
    while True:
        try:
            self.database_manager.update()
            failures = logs_with_string_or_condition(
                self.error, self.database_manager
            )
            doomed_jobs: list[str] = []
            stale_files: list[str] = []
            for name, files in failures:
                doomed_jobs.append(name)
                stale_files.extend(files)
            self.scheduler.cancel(
                doomed_jobs,
                with_progress_bar=False,
                max_tries=self.max_cancel_tries,
            )
            _remove_or_move_files(
                stale_files, with_progress_bar=False, move_to=self.move_to
            )
            # Keep a running record of everything this manager acted on.
            self.cancelled.extend(doomed_jobs)
            self.deleted.extend(stale_files)
            await asyncio.sleep(self.interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))
async def _manage(self) -> None:
    """Run forever: cancel failed jobs and remove their log files.

    Each cycle fetches the scheduler queue, refreshes the database with it,
    matches logs against ``self.error``, cancels the matching jobs that are
    currently in the queue, removes (or moves) their files, and sleeps for
    ``self.interval`` seconds.
    """
    # NOTE: an ``async def`` should annotate the *awaited* result, so the
    # previous ``-> Coroutine`` annotation was incorrect; this returns None.
    while True:
        try:
            queue = self.scheduler.queue()
            self.database_manager.update(queue)
            failed_jobs = logs_with_string_or_condition(
                self.error, self.database_manager, self.scheduler
            )
            to_cancel: list[str] = []
            to_delete: list[str] = []
            # Cancel/delete only the processes/logs that are running now.
            for job_id in queue:
                if job_id in failed_jobs:
                    job_name, fnames = failed_jobs[job_id]
                    to_cancel.append(job_name)
                    to_delete.extend(fnames)
            self.scheduler.cancel(
                to_cancel, with_progress_bar=False, max_tries=self.max_cancel_tries
            )
            _remove_or_move_files(
                to_delete, with_progress_bar=False, move_to=self.move_to
            )
            self.cancelled += to_cancel
            self.deleted += to_delete
            await asyncio.sleep(self.interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))
async def manage_killer(
    job_names: List[str],
    error: Union[str, callable, None] = "srun: error:",
    interval: int = 600,
    max_cancel_tries: int = 5,
    move_to: Optional[str] = None,
) -> Coroutine:
    """Periodically cancel jobs whose logs match *error* and clean up their files.

    Runs forever: every *interval* seconds it scans the logs of *job_names*
    for the error string/condition, cancels matching jobs (up to
    *max_cancel_tries* attempts), and removes — or moves to *move_to* —
    their ``<name>-<job_id>.out`` files.

    NOTE(review): ``queue()`` and ``cancel()`` are not defined in this block;
    presumably module-level helpers — verify against the enclosing module.
    """
    # It seems like tasks that print the error message do not always stop working
    # I think it only stops working when the error happens on a node where the logger runs.
    from adaptive_scheduler.utils import (
        _remove_or_move_files,
        logs_with_string_or_condition,
    )

    while True:
        try:
            # Mapping of job name -> job ids whose logs matched the error
            # (assumed from the .get(job_name, []) lookup below — TODO confirm).
            failed_jobs = logs_with_string_or_condition(job_names, error)
            to_cancel = []
            to_delete = []
            # get cancel/delete only the processes/logs that are running now
            for job_id, info in queue().items():
                job_name = info["name"]
                if job_id in failed_jobs.get(job_name, []):
                    to_cancel.append(job_name)
                    to_delete.append(f"{job_name}-{job_id}.out")
            cancel(to_cancel, with_progress_bar=False, max_tries=max_cancel_tries)
            _remove_or_move_files(to_delete, with_progress_bar=False, move_to=move_to)
            await asyncio.sleep(interval)
        except concurrent.futures.CancelledError:
            # Propagate task cancellation after logging it.
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            # Best-effort loop: log unexpected errors and keep watching.
            log.exception("got exception in kill manager", exception=str(e))
async def manage_killer(
    job_names: List[str],
    scheduler: BaseScheduler,
    error: Union[str, Callable[[List[str]], bool]] = "srun: error:",
    interval: int = 600,
    max_cancel_tries: int = 5,
    move_to: Optional[str] = None,
    db_fname: str = "running.json",
) -> Coroutine:
    """Watch the queue forever; cancel failed jobs and clean up their logs.

    Every *interval* seconds the logs in *db_fname* are matched against
    *error*; currently-queued jobs that match are cancelled (up to
    *max_cancel_tries* attempts) and their files removed or moved to
    *move_to*.
    """
    # It seems like tasks that print the error message do not always stop working
    # I think it only stops working when the error happens on a node where the logger runs.
    from adaptive_scheduler.utils import _remove_or_move_files

    while True:
        try:
            failures = logs_with_string_or_condition(error, db_fname, scheduler)
            doomed_jobs = []
            stale_files = []
            # get cancel/delete only the processes/logs that are running now
            for job_id in scheduler.queue():
                if job_id not in failures:
                    continue
                name, files = failures[job_id]
                doomed_jobs.append(name)
                stale_files.extend(files)
            scheduler.cancel(
                doomed_jobs, with_progress_bar=False, max_tries=max_cancel_tries
            )
            _remove_or_move_files(
                stale_files, with_progress_bar=False, move_to=move_to
            )
            await asyncio.sleep(interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))