Example #1
    def check_mgmt_db(self):
        """Create errors for all entries in management db that did not complete"""
        base_proc_types = [
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

        entries = db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.unknown, const.Status.failed],
            blocked_ids=self.canceled_running,
        )

        for entry in entries:
            self.errors.append(
                Error(
                    "Slurm task",
                    "Run {} did not complete task {} "
                    "(Status {}, JobId {}".format(
                        entry.run_name,
                        const.ProcessType(entry.proc_type),
                        const.Status(entry.status),
                        entry.job_id,
                    ),
                ))
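
The method above is easier to follow as a stand-alone sketch: filter the entries that never completed and turn each one into a readable error record. Entry and collect_incomplete below are illustrative stand-ins, not part of the workflow codebase.

from collections import namedtuple

# Illustrative stand-in for a management-db row
Entry = namedtuple("Entry", ["run_name", "proc_type", "status", "job_id"])

def collect_incomplete(entries, failed_states=("failed", "unknown")):
    """Build an error message for every entry that did not complete."""
    errors = []
    for entry in entries:
        if entry.status in failed_states:
            errors.append(
                "Run {} did not complete task {} (Status {}, JobId {})".format(
                    entry.run_name, entry.proc_type, entry.status, entry.job_id
                )
            )
    return errors

print(collect_incomplete([Entry("REL01", "EMOD3D", "failed", 1234)]))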
Example #2
    def cancel_running(self, proc_types: List[const.ProcessType]):
        """Looks for any running task of the specified process types
        and attempts to cancel one of each.
        """
        # Get all running jobs in the mgmt db
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        entries = db.command_builder(allowed_tasks=proc_types,
                                     allowed_states=[const.Status.running])

        # Cancel one for each process type
        for entry in entries:
            if entry.proc_type in proc_types:
                print(
                    f"Checkpoint testing: Cancelling job-id {entry.job_id} "
                    "for {entry.run_name} and process type {entry.proc_type}")

                out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)

                print("Scancel out: ", out, err)
                if "error" not in out.lower() and "error" not in err.lower():
                    self.canceled_running.append(str(entry.job_id))
                    proc_types.remove(entry.proc_type)
                    print("Cancelled job-id {}".format(entry.job_id))

        return proc_types
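
The core of cancel_running is the bookkeeping that cancels at most one job per process type. Here is a hedged, stand-alone sketch of that logic with a fake cancel_job callable so it runs without a scheduler; all names below are illustrative.

def cancel_one_of_each(entries, proc_types, cancel_job):
    """Cancel at most one (job_id, proc_type) entry per requested process type."""
    cancelled, remaining = [], list(proc_types)
    for job_id, proc_type in entries:
        if proc_type in remaining:
            out, err = cancel_job(job_id)
            # Only count the cancellation if the scheduler did not report an error
            if "error" not in out.lower() and "error" not in err.lower():
                cancelled.append(str(job_id))
                remaining.remove(proc_type)
    return cancelled, remaining

fake_cancel = lambda job_id: ("job {} cancelled".format(job_id), "")
print(cancel_one_of_each([(11, "HF"), (12, "HF"), (13, "BB")], ["HF", "BB"], fake_cancel))
# (['11', '13'], [])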
Example #3
def update_db(root_folder, process, status, run_name, job_id, error):
    """Update the database with the given values"""

    entry = SchedulerTask(run_name, process, status, job_id, error)
    database = MgmtDB(root_folder)

    # This is intended for manual use, so the retry limit is set well above
    # any value reasonable for automated submissions
    database.update_entries_live([entry], 256)
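
A possible invocation of update_db, assuming the qcore constants used throughout these examples are importable and that SchedulerTask accepts the raw enum values; the path and realisation name are placeholders.

update_db(
    root_folder="/path/to/cybershake_root",      # placeholder path
    process=const.ProcessType.EMOD3D.value,      # assumed to take the int value
    status=const.Status.failed.value,
    run_name="ExampleFault_REL01",               # placeholder realisation
    job_id=1234567,
    error="manually marked as failed",
)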
Example #4
def create_mgmt_db(realisations, db_file, srf_files=[]):
    mgmt_db = MgmtDB.init_db(
        db_file,
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     "slurm_mgmt.db.sql"),
    )
    mgmt_db.populate(realisations, srf_files)

    return mgmt_db
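
A possible call to create_mgmt_db, with placeholder realisation names and SRF paths; it assumes only what is visible above (MgmtDB.init_db and populate).

realisations = ["ExampleFault_REL01", "ExampleFault_REL02"]   # placeholders
srf_files = [
    "/path/to/ExampleFault_REL01.srf",
    "/path/to/ExampleFault_REL02.srf",
]
mgmt_db = create_mgmt_db(realisations, "/path/to/slurm_mgmt.db", srf_files)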
Example #5
    def check_completed(self):
        """Checks all simulations that have completed"""
        base_proc_types = [const.ProcessType.IM_calculation]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
        entries = db.command_builder(allowed_tasks=base_proc_types,
                                     allowed_states=[const.Status.completed])

        completed_sims = [sim_t.run_name for sim_t in entries]

        # Only check the ones that haven't been checked already
        completed_new = set(completed_sims) - (self._sim_passed
                                               | self._sim_failed)

        for sim in completed_new:
            result = self.check_sim_result(
                os.path.join(self.runs_dir,
                             sim_struct.get_fault_from_realisation(sim), sim))

            if not result:
                self._sim_failed.add(sim)

                if self._stop_on_error:
                    print("Quitting as the following errors occured: ")
                    self.print_errors()
                    return False
                else:
                    print("The following error occured for simulation {}:".
                          format(sim))
                    print("ERROR: {}, {}\n".format(self.errors[-1].location,
                                                   self.errors[-1].error))

            else:
                self._sim_passed.add(sim)

        print("Passed/Failed/Total simulations: {}/{}/{}, ".format(
            len(self._sim_passed), len(self._sim_failed), len(self.sim_dirs)))

        return True
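
The bookkeeping in check_completed (only look at simulations not already marked passed or failed) can be shown as a stand-alone sketch; names below are illustrative.

def check_new_completions(completed_sims, passed, failed, check_fn):
    """Check only the completed simulations that have not been seen before."""
    for sim in set(completed_sims) - (passed | failed):
        (passed if check_fn(sim) else failed).add(sim)
    return passed, failed

passed, failed = set(), set()
check_new_completions(["REL01", "REL02"], passed, failed, lambda sim: sim != "REL02")
print(passed, failed)  # {'REL01'} {'REL02'}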
Example #6
    def check_mgmt_db_progress(self):
        """Checks auto submit progress in the management db"""
        base_proc_types = [
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ]
        db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

        total_count = len(db.command_builder(allowed_tasks=base_proc_types))

        comp_count = len(
            db.command_builder(allowed_tasks=base_proc_types,
                               allowed_states=[const.Status.completed]))

        failed_count = len(
            db.command_builder(
                allowed_tasks=base_proc_types,
                allowed_states=[const.Status.failed, const.Status.unknown],
            ))

        return total_count, comp_count, failed_count
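
check_mgmt_db_progress only returns the three counts; a hypothetical consumer could format them into a one-line summary, for example:

def format_progress(total, completed, failed):
    """Turn (total, completed, failed) counts into a short progress line."""
    finished = completed + failed
    pct = 100.0 * finished / total if total else 0.0
    return "{}/{} tasks finished ({} failed, {:.1f}%)".format(finished, total, failed, pct)

print(format_progress(40, 25, 3))  # 28/40 tasks finished (3 failed, 70.0%)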
Example #7
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each HPC, get the (job id, status) pairs in the queue and store them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check {Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    queued_tasks[task.split()[0]] = task.split()[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    f"Over 200 tasks were found in the queue. Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:

            queue_logger.info(
                "In progress tasks in mgmt db:"
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This will prevent race conditions if the failure/completion state file is made and picked up before the job actually finishes
                    # Most notably this happens on Kisti
                    # The queued and running states are allowed
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # Check for jobs that match the alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = f"fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error}"
                            elif entry_retries >= max_retries:
                                msg = f"@here fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error} and met the retry cap"
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
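
queue_monitor_loop runs while a module-level keepAlive flag is truthy. One way (an assumption, not necessarily what the real module does) to make Ctrl-C exit cleanly between iterations is a SIGINT handler that clears the flag instead of raising KeyboardInterrupt mid-update:

import signal

keepAlive = True

def _request_shutdown(signum, frame):
    # Clear the flag so the while-loop exits after the current iteration
    global keepAlive
    keepAlive = False

signal.signal(signal.SIGINT, _request_shutdown)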
Example #8
def run_main_submit_loop(
        root_folder: str,
        n_runs: Dict[str, int],
        rels_to_run: str,
        given_tasks_to_run: List[const.ProcessType],
        sleep_time: int,
        models_tuple: Tuple[est.EstModel],
        main_logger: Logger = qclogging.get_basic_logger(),
        cycle_timeout=1,
):
    mgmt_queue_folder = sim_struct.get_mgmt_db_queue(root_folder)
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    root_params_file = os.path.join(sim_struct.get_runs_dir(root_folder),
                                    "root_params.yaml")
    config = utils.load_yaml(root_params_file)
    main_logger.info("Loaded root params file: {}".format(root_params_file))
    # Default values

    hf_seed = config["hf"].get(const.RootParams.seed.value,
                               const.HF_DEFAULT_SEED)
    main_logger.debug("hf_seed set to {}".format(hf_seed))

    main_logger.debug(
        f"extended_period set to {config['ims']['extended_period']}")

    time_since_something_happened = cycle_timeout

    while time_since_something_happened > 0:
        main_logger.debug("time_since_something_happened is now {}".format(
            time_since_something_happened))
        time_since_something_happened -= 1
        # Get items in the mgmt queue, have to get a snapshot instead of
        # checking the directory real-time to prevent timing issues,
        # which can result in dual-submission
        mgmt_queue_entries = os.listdir(mgmt_queue_folder)

        # Get in progress tasks in the db and the HPC queue
        n_tasks_to_run = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=True, target_machine=hpc)
            except EnvironmentError as e:
                main_logger.critical(e)
                n_tasks_to_run[hpc] = 0
            else:
                n_tasks_to_run[hpc] = n_runs[hpc] - len(squeued_tasks)
                if len(squeued_tasks) > 0:
                    main_logger.debug(
                        "There was at least one job in squeue, resetting timeout"
                    )
                    time_since_something_happened = cycle_timeout

        # Gets all runnable tasks based on mgmt db state
        runnable_tasks = mgmt_db.get_runnable_tasks(
            rels_to_run,
            sum(n_runs.values()),
            os.listdir(sim_struct.get_mgmt_db_queue(root_folder)),
            given_tasks_to_run,
            main_logger,
        )
        if len(runnable_tasks) > 0:
            time_since_something_happened = cycle_timeout
            main_logger.info("Number of runnable tasks: {}".format(
                len(runnable_tasks)))
            main_logger.debug(
                "There was at least one runnable task, resetting timeout")
        else:
            main_logger.debug("No runnable_tasks")

        # Select the first ntask_to_run that are not waiting
        # for mgmt db updates (i.e. items in the queue)
        tasks_to_run, task_counter = [], {key: 0 for key in HPC}
        for cur_proc_type, cur_run_name, retries in runnable_tasks:

            cur_hpc = get_target_machine(cur_proc_type)
            # Add task if limit has not been reached and there are no
            # outstanding mgmt db updates
            if (not shared_automated_workflow.check_mgmt_queue(
                    mgmt_queue_entries, cur_run_name, cur_proc_type) and
                    task_counter.get(cur_hpc, 0) < n_tasks_to_run[cur_hpc]):
                tasks_to_run.append((cur_proc_type, cur_run_name, retries))
                task_counter[cur_hpc] += 1

            # Open to better suggestions
            # Break if enough tasks for each HPC have been added
            if all(
                    task_counter.get(hpc, 0) >= n_tasks_to_run[hpc]
                    for hpc in n_tasks_to_run
            ):
                break

        if len(tasks_to_run) > 0:
            main_logger.info("Tasks to run this iteration: " + ", ".join([
                "{}-{}".format(entry[1],
                               const.ProcessType(entry[0]).str_value)
                for entry in tasks_to_run
            ]))
        else:
            main_logger.debug("No tasks to run this iteration")

        # Submit the runnable tasks
        for proc_type, run_name, retries in tasks_to_run:

            # Special handling for merge-ts
            if proc_type == const.ProcessType.merge_ts.value:
                # Check if clean up has already run
                if mgmt_db.is_task_complete([
                        const.ProcessType.clean_up.value,
                        run_name,
                        const.Status.completed.str_value,
                ]):
                    # If clean_up has already run, then we should set it to
                    # be run again after merge_ts has run
                    shared_automated_workflow.add_to_queue(
                        mgmt_queue_folder,
                        run_name,
                        const.ProcessType.clean_up.value,
                        const.Status.created.value,
                        logger=main_logger,
                    )

            # submit the job
            submit_task(
                sim_struct.get_sim_dir(root_folder, run_name),
                proc_type,
                run_name,
                root_folder,
                main_logger,
                retries=retries,
                hf_seed=hf_seed,
                models=models_tuple,
            )
        main_logger.debug("Sleeping for {} second(s)".format(sleep_time))
        time.sleep(sleep_time)
    main_logger.info(
        "Nothing was running or ready to run last cycle, exiting now")