Example #1
    def cleanup(self) -> None:
        """Cleanup the log and batch files.

        If the `RunManager` is not running, the ``run_script.py`` file
        will also be removed.
        """
        from adaptive_scheduler.utils import (
            _delete_old_ipython_profiles,
            _remove_or_move_files,
        )

        scheduler = self.scheduler
        with suppress(FileNotFoundError):
            if self.status() != "running":
                os.remove(scheduler.run_script)

        # Jobs that are still present in the scheduler's queue.
        running_job_ids = set(scheduler.queue().keys())
        if scheduler.executor_type == "ipyparallel":
            _delete_old_ipython_profiles(running_job_ids)

        log_fnames = [scheduler.log_fname(name) for name in self.job_names]
        # `output_fnames` returns a list of names per job, so flatten the result.
        output_fnames = [
            scheduler.output_fnames(name) for name in self.job_names
        ]
        output_fnames = sum(output_fnames, [])
        batch_fnames = [scheduler.batch_fname(name) for name in self.job_names]
        fnames = log_fnames + output_fnames + batch_fnames
        # Expand the job-ID placeholder into all matching files on disk.
        to_rm = [
            glob.glob(f.replace(scheduler._JOB_ID_VARIABLE, "*"))
            for f in fnames
        ]
        to_rm = sum(to_rm, [])
        _remove_or_move_files(to_rm, True, self.move_old_logs_to,
                              "Removing logs and batch files")
Example #2
def cleanup(
    job_names: list[str],
    scheduler: BaseScheduler,
    with_progress_bar: bool = True,
    move_to: str | None = None,
) -> None:
    """Cleanup the scheduler log-files files.

    Parameters
    ----------
    job_names : list
        List of job names.
    scheduler : `~adaptive_scheduler.scheduler.BaseScheduler`
        A scheduler instance from `adaptive_scheduler.scheduler`.
    with_progress_bar : bool, default: True
        Display a progress bar using `tqdm`.
    move_to : str, default: None
        Move the file to a different directory.
        If None the file is removed.
    log_file_folder : str, default: ''
        The folder in which to delete the log-files.
    """

    to_rm = _get_all_files(job_names, scheduler)

    _remove_or_move_files(to_rm, with_progress_bar, move_to,
                          "Removing logs and batch files")
Example #3
    async def _manage(self) -> None:
        while True:
            try:
                self.database_manager.update()

                failed_jobs = logs_with_string_or_condition(
                    self.error, self.database_manager)

                # Collect the job names to cancel and their log files to delete.
                to_cancel: list[str] = []
                to_delete: list[str] = []
                for job_name, fnames in failed_jobs:
                    to_cancel.append(job_name)
                    to_delete.extend(fnames)

                self.scheduler.cancel(to_cancel,
                                      with_progress_bar=False,
                                      max_tries=self.max_cancel_tries)
                _remove_or_move_files(to_delete,
                                      with_progress_bar=False,
                                      move_to=self.move_to)
                self.cancelled.extend(to_cancel)
                self.deleted.extend(to_delete)
                await asyncio.sleep(self.interval)
            except concurrent.futures.CancelledError:
                log.info("task was cancelled because of a CancelledError")
                raise
            except Exception as e:
                log.exception("got exception in kill manager",
                              exception=str(e))
Example #4
    async def _manage(self) -> None:
        while True:
            try:
                queue = self.scheduler.queue()
                self.database_manager.update(queue)

                failed_jobs = logs_with_string_or_condition(
                    self.error, self.database_manager, self.scheduler)
                to_cancel = []
                to_delete = []

                # Cancel/delete only the processes/logs of jobs that are running now.
                for job_id in queue.keys():
                    if job_id in failed_jobs:
                        job_name, fnames = failed_jobs[job_id]
                        to_cancel.append(job_name)
                        to_delete += fnames

                self.scheduler.cancel(to_cancel,
                                      with_progress_bar=False,
                                      max_tries=self.max_cancel_tries)
                _remove_or_move_files(to_delete,
                                      with_progress_bar=False,
                                      move_to=self.move_to)
                self.cancelled += to_cancel
                self.deleted += to_delete
                await asyncio.sleep(self.interval)
            except concurrent.futures.CancelledError:
                log.info("task was cancelled because of a CancelledError")
                raise
            except Exception as e:
                log.exception("got exception in kill manager",
                              exception=str(e))
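
Both `_manage` variants above implement the same poll-cancel-sleep loop inside a manager object. Such a coroutine is typically run as a background asyncio task; a sketch of that pattern, where `kill_manager` stands in for a hypothetical manager instance:

import asyncio

async def main() -> None:
    # `kill_manager` is hypothetical; in practice the manager class starts this task.
    task = asyncio.create_task(kill_manager._manage())
    await asyncio.sleep(3600)  # do other work while the manager polls in the background
    task.cancel()  # raises CancelledError inside _manage, which logs and re-raises

asyncio.run(main())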
Example #5
async def manage_killer(
    job_names: List[str],
    error: Union[str, Callable, None] = "srun: error:",
    interval: int = 600,
    max_cancel_tries: int = 5,
    move_to: Optional[str] = None,
) -> None:
    # It seems that tasks which print the error message do not always stop working;
    # I think a task only stops when the error happens on the node where the logger runs.
    from adaptive_scheduler.utils import (
        _remove_or_move_files,
        logs_with_string_or_condition,
    )

    while True:
        try:
            failed_jobs = logs_with_string_or_condition(job_names, error)
            to_cancel = []
            to_delete = []

            # Cancel/delete only the processes/logs of jobs that are running now.
            # (`queue` and `cancel` are module-level helpers from the original
            # file, not shown in this snippet.)
            for job_id, info in queue().items():
                job_name = info["name"]
                if job_id in failed_jobs.get(job_name, []):
                    to_cancel.append(job_name)
                    to_delete.append(f"{job_name}-{job_id}.out")

            cancel(to_cancel, with_progress_bar=False, max_tries=max_cancel_tries)
            _remove_or_move_files(to_delete, with_progress_bar=False, move_to=move_to)
            await asyncio.sleep(interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))
Example #6
async def manage_killer(
    job_names: List[str],
    scheduler: BaseScheduler,
    error: Union[str, Callable[[List[str]], bool]] = "srun: error:",
    interval: int = 600,
    max_cancel_tries: int = 5,
    move_to: Optional[str] = None,
    db_fname: str = "running.json",
) -> None:
    # It seems that tasks which print the error message do not always stop working;
    # I think a task only stops when the error happens on the node where the logger runs.
    from adaptive_scheduler.utils import _remove_or_move_files

    while True:
        try:
            failed_jobs = logs_with_string_or_condition(
                error, db_fname, scheduler)
            to_cancel = []
            to_delete = []

            # Cancel/delete only the processes/logs of jobs that are running now.
            for job_id in scheduler.queue().keys():
                if job_id in failed_jobs:
                    job_name, fnames = failed_jobs[job_id]
                    to_cancel.append(job_name)
                    to_delete += fnames

            scheduler.cancel(to_cancel,
                             with_progress_bar=False,
                             max_tries=max_cancel_tries)
            _remove_or_move_files(to_delete,
                                  with_progress_bar=False,
                                  move_to=move_to)
            await asyncio.sleep(interval)
        except concurrent.futures.CancelledError:
            log.info("task was cancelled because of a CancelledError")
            raise
        except Exception as e:
            log.exception("got exception in kill manager", exception=str(e))